Talk:Using GIZA++

From Apertium
Revision as of 21:04, 9 August 2008 by Francis Tyers (talk | contribs) (→‎Welsh to English (classic))
(diff) ← Older revision | Latest revision (diff) | Newer revision → (diff)
Jump to navigation Jump to search

First try

 629  cat es-text.txt | grep -v '^<' > es-text.t
 630  mv es-text.t es-text.txt 
 633  cat it-text.txt | grep -v '<' > it
 634  mv it it-text.txt 
 721  cat es-text.txt | apertium-destxt | lt-proc ~/local/share/apertium/apertium-es-it/es-it.automorf.bin | 
      apertium-tagger -g ~/local/share/apertium/apertium-es-it/es-it.prob | apertium-retxt > es-text.tagged.txt &
 525  cat it-text.txt | apertium-destxt | lt-proc ~/local/share/apertium/apertium-es-it/it-es.automorf.bin |
      apertium-tagger -g ~/local/share/apertium/apertium-es-it/it-es.prob | apertium-retxt > it-text.tagged.txt &
 550  ~fsanchez/GIZA++-v2/plain2snt.out es/es-text.tagged.txt it/it-text.tagged.txt
 551  ~fsanchez/GIZA++-v2/mkcls-v2/mkcls -m2 -pes/es-text.tagged.txt -c50 -Ves/es-text.tagged.vcb.classes opt >& es/mkcls1.log
 555  ~fsanchez/GIZA++-v2/mkcls-v2/mkcls -m2 -pit/it-text.tagged.txt -c50 -Vit/it-text.tagged.vcb.classes opt >& it/mkcls1.log
 564  ~fsanchez/GIZA++-v2/GIZA++ -S es/es-text.tagged.vcb -T it/it-text.tagged.vcb -C es/es-text.tagged_it-text.tagged.snt -p0 0.98 -o es-it.aligned >& alignment.log

Second try

 629  cat es-text.txt | grep -v '^<' > es-text.t
 630  mv es-text.t es-text.txt 
 633  cat it-text.txt | grep -v '<' > it
 634  mv it it-text.txt 
 550  ~fsanchez/GIZA++-v2/plain2snt.out es/es-text.txt it/it-text.txt
 930  ~fsanchez/GIZA++-v2/mkcls-v2/mkcls -m2 -pes/es-text.txt -c50 -Ves/es-text.vcb.classes opt >& es/mkcls1.log
 932  ~fsanchez/GIZA++-v2/mkcls-v2/mkcls -m2 -pit/it-text.txt -c50 -Vit/it-text.vcb.classes opt >& it/mkcls1.log

Welsh to English (classic)

  774  cat cy-clean.txt | python -c "import sys, codecs; sys.stdout = codecs.getwriter('utf-8')(sys.stdout); sys.stdin = codecs.getreader('utf-8')(sys.stdin); print sys.stdin.read().lower();" > cy.crp.txt &
  775  cat en-clean.txt | python -c "import sys, codecs; sys.stdout = codecs.getwriter('utf-8')(sys.stdout); sys.stdin = codecs.getreader('utf-8')(sys.stdin); print sys.stdin.read().lower();" > en.crp.txt &
  781  plain2snt.out cy.crp.txt en.crp.txt 
  783  snt2cooc.out cy.crp.vcb en.crp.vcb cy.crp_en.crp.snt > cy-en.model.cooc
  784  trainGIZA++.sh cy.crp.vcb en.crp.vcb cy.crp_en.crp.snt