Difference between revisions of "Talk:Using GIZA++"

From Apertium
Jump to navigation Jump to search
 
(3 intermediate revisions by the same user not shown)
Line 10: Line 10:
apertium-tagger -g ~/local/share/apertium/apertium-es-it/it-es.prob | apertium-retxt > it-text.tagged.txt &
apertium-tagger -g ~/local/share/apertium/apertium-es-it/it-es.prob | apertium-retxt > it-text.tagged.txt &
550 ~fsanchez/GIZA++-v2/plain2snt.out es/es-text.tagged.txt it/it-text.tagged.txt
550 ~fsanchez/GIZA++-v2/plain2snt.out es/es-text.tagged.txt it/it-text.tagged.txt
551 ~fsanchez/GIZA++-v2/mkcls-v2/mkcls -m2 -pes/es-text.txt -c50 -Ves/es-text.vcb.classes opt >& es/mkcls1.log
551 ~fsanchez/GIZA++-v2/mkcls-v2/mkcls -m2 -pes/es-text.tagged.txt -c50 -Ves/es-text.tagged.vcb.classes opt >& es/mkcls1.log
555 ~fsanchez/GIZA++-v2/mkcls-v2/mkcls -m2 -pit/it-text.txt -c50 -Vit/it-text.vcb.classes opt >& it/mkcls1.log
555 ~fsanchez/GIZA++-v2/mkcls-v2/mkcls -m2 -pit/it-text.tagged.txt -c50 -Vit/it-text.tagged.vcb.classes opt >& it/mkcls1.log
564 ~fsanchez/GIZA++-v2/GIZA++ -S es/es-text.tagged.vcb -T it/it-text.tagged.vcb -C es/es-text.tagged_it-text.tagged.snt -p0 0.98 -o es-it.aligned >& alignment.log
564 ~fsanchez/GIZA++-v2/GIZA++ -S es/es-text.tagged.vcb -T it/it-text.tagged.vcb -C es/es-text.tagged_it-text.tagged.snt -p0 0.98 -o es-it.aligned >& alignment.log


Line 23: Line 23:
930 ~fsanchez/GIZA++-v2/mkcls-v2/mkcls -m2 -pes/es-text.txt -c50 -Ves/es-text.vcb.classes opt >& es/mkcls1.log
930 ~fsanchez/GIZA++-v2/mkcls-v2/mkcls -m2 -pes/es-text.txt -c50 -Ves/es-text.vcb.classes opt >& es/mkcls1.log
932 ~fsanchez/GIZA++-v2/mkcls-v2/mkcls -m2 -pit/it-text.txt -c50 -Vit/it-text.vcb.classes opt >& it/mkcls1.log
932 ~fsanchez/GIZA++-v2/mkcls-v2/mkcls -m2 -pit/it-text.txt -c50 -Vit/it-text.vcb.classes opt >& it/mkcls1.log

==Welsh to English (classic)==

<pre>
774 cat cy-clean.txt | python -c "import sys, codecs; sys.stdout = codecs.getwriter('utf-8')(sys.stdout); sys.stdin = codecs.getreader('utf-8')(sys.stdin); print sys.stdin.read().lower();" > cy.crp.txt &
775 cat en-clean.txt | python -c "import sys, codecs; sys.stdout = codecs.getwriter('utf-8')(sys.stdout); sys.stdin = codecs.getreader('utf-8')(sys.stdin); print sys.stdin.read().lower();" > en.crp.txt &
781 plain2snt.out cy.crp.txt en.crp.txt
783 snt2cooc.out cy.crp.vcb en.crp.vcb cy.crp_en.crp.snt > cy-en.model.cooc
784 trainGIZA++.sh cy.crp.vcb en.crp.vcb cy.crp_en.crp.snt


</pre>

Latest revision as of 21:04, 9 August 2008

First try

 629  cat es-text.txt | grep -v '^<' > es-text.t
 630  mv es-text.t es-text.txt 
 633  cat it-text.txt | grep -v '<' > it
 634  mv it it-text.txt 
 721  cat es-text.txt | apertium-destxt | lt-proc ~/local/share/apertium/apertium-es-it/es-it.automorf.bin | 
      apertium-tagger -g ~/local/share/apertium/apertium-es-it/es-it.prob | apertium-retxt > es-text.tagged.txt &
 525  cat it-text.txt | apertium-destxt | lt-proc ~/local/share/apertium/apertium-es-it/it-es.automorf.bin |
      apertium-tagger -g ~/local/share/apertium/apertium-es-it/it-es.prob | apertium-retxt > it-text.tagged.txt &
 550  ~fsanchez/GIZA++-v2/plain2snt.out es/es-text.tagged.txt it/it-text.tagged.txt
 551  ~fsanchez/GIZA++-v2/mkcls-v2/mkcls -m2 -pes/es-text.tagged.txt -c50 -Ves/es-text.tagged.vcb.classes opt >& es/mkcls1.log
 555  ~fsanchez/GIZA++-v2/mkcls-v2/mkcls -m2 -pit/it-text.tagged.txt -c50 -Vit/it-text.tagged.vcb.classes opt >& it/mkcls1.log
 564  ~fsanchez/GIZA++-v2/GIZA++ -S es/es-text.tagged.vcb -T it/it-text.tagged.vcb -C es/es-text.tagged_it-text.tagged.snt -p0 0.98 -o es-it.aligned >& alignment.log

Second try

 629  cat es-text.txt | grep -v '^<' > es-text.t
 630  mv es-text.t es-text.txt 
 633  cat it-text.txt | grep -v '<' > it
 634  mv it it-text.txt 
 550  ~fsanchez/GIZA++-v2/plain2snt.out es/es-text.txt it/it-text.txt
 930  ~fsanchez/GIZA++-v2/mkcls-v2/mkcls -m2 -pes/es-text.txt -c50 -Ves/es-text.vcb.classes opt >& es/mkcls1.log
 932  ~fsanchez/GIZA++-v2/mkcls-v2/mkcls -m2 -pit/it-text.txt -c50 -Vit/it-text.vcb.classes opt >& it/mkcls1.log

Welsh to English (classic)

  774  cat cy-clean.txt | python -c "import sys, codecs; sys.stdout = codecs.getwriter('utf-8')(sys.stdout); sys.stdin = codecs.getreader('utf-8')(sys.stdin); print sys.stdin.read().lower();" > cy.crp.txt &
  775  cat en-clean.txt | python -c "import sys, codecs; sys.stdout = codecs.getwriter('utf-8')(sys.stdout); sys.stdin = codecs.getreader('utf-8')(sys.stdin); print sys.stdin.read().lower();" > en.crp.txt &
  781  plain2snt.out cy.crp.txt en.crp.txt 
  783  snt2cooc.out cy.crp.vcb en.crp.vcb cy.crp_en.crp.snt > cy-en.model.cooc
  784  trainGIZA++.sh cy.crp.vcb en.crp.vcb cy.crp_en.crp.snt