Difference between revisions of "User:Francis Tyers/Experiments"

From Apertium
Jump to navigation Jump to search
Line 1: Line 1:
 
{|class=wikitable
 
{|class=wikitable
! Language pair !! Corpus !! Lines !! W. (src) !! SL cov. || W. (train) !! Words (test) !! Words >1 trad. !! Avg. trad / word !!
+
! Language pair !! Corpus !! Lines !! W. (src) !! SL cov. || W. (train) !! Words (test) !! Uniq. tokens >1 trad. !! Avg. trad / word !!
 
|-
 
|-
 
| br-fr || oab || || || || || || ||
 
| br-fr || oab || || || || || || ||
 
|-
 
|-
| en-es || europarl || 1,467,708 || 30,154,098 || 98.08% || - || - || - || -
+
| en-es || europarl || 1,467,708 || 30,154,098 || 98.08% || - || - || 2,082 || 1.08
 
|-
 
|-
| eu-es || opendata.euskadi.net || 765,115 || 10,190,079 || 91.70% || - || - || 1,806 || 1.30
+
| eu-es || opendata.euskadi.net || 765,115 || 10,190,079 || 91.70% || - || - || 1,806 || 1.30
 
|-
 
|-
 
| mk-en || setimes || || || || || || ||
 
| mk-en || setimes || || || || || || ||

Revision as of 09:48, 22 June 2012

Language pair Corpus Lines W. (src) SL cov. W. (train) Words (test) Uniq. tokens >1 trad. Avg. trad / word
br-fr oab
en-es europarl 1,467,708 30,154,098 98.08% - - 2,082 1.08
eu-es opendata.euskadi.net 765,115 10,190,079 91.70% - - 1,806 1.30
mk-en setimes
sh-mk setimes

Basque→Spanish

 2081  cat europako_testuak_memoria_2010.tmx | iconv -f utf-16 -t utf-8 > europako_testuak_memoria_2010.tmx.u8
 2082  cat 2010_memo_orokorra.tmx | iconv -f utf-16 -t utf-8 > 2010_memo_orokorra.tmx.u8
 2088  python3 process-tmx.py europako_testuak_memoria_2010.tmx.u8 > europako_testuak_memoria_2010.txt
 2090  python3 process-tmx.py 2010_memo_orokorra.tmx.u8 > 2010_memo_orokorra.txt
 2091  cat 2010_memo_orokorra.txt | grep '^es' | cut -f2- > 2010_memo_orokorra.es.txt
 2092  cat 2010_memo_orokorra.txt | grep '^eu' | cut -f2- > 2010_memo_orokorra.eu.txt
 2094  cat europako_testuak_memoria_2010.txt | grep '^es' | cut -f2- > europako_testuak_memoria_2010.es.txt
 2095  cat europako_testuak_memoria_2010.txt | grep '^eu' | cut -f2- > europako_testuak_memoria_2010.eu.txt
 2099  cat europako_testuak_memoria_2010.es.txt 2010_memo_orokorra.es.txt > opendata.es
 2100  cat europako_testuak_memoria_2010.eu.txt 2010_memo_orokorra.eu.txt > opendata.eu


$ wc -l opendata.e*
   782325 opendata.es
   782325 opendata.eu


 2114  perl /home/fran/local/bin/scripts-20120109-1229/training/clean-corpus-n.perl opendata eu es opendata.clean 1 80

 2117  cat opendata.clean.eu |apertium-destxt | apertium -f none -d ~/source/apertium-eu-es/ eu-es-pretransfer > opendata.tagged.eu
 2126  cat opendata.clean.es |apertium-destxt | apertium -f none -d ~/source/apertium-eu-es/ es-eu-pretransfer > opendata.tagged.es &


 2132  seq 1 771238 > opendata.lines
 2133  paste opendata.lines opendata.tagged.eu opendata.tagged.es | grep '<' | cut -f1 > opendata.lines.new
 2134  paste opendata.lines opendata.tagged.eu opendata.tagged.es | grep '<' | cut -f2 > opendata.tagged.eu.new
 2135  paste opendata.lines opendata.tagged.eu opendata.tagged.es | grep '<' | cut -f3 > opendata.tagged.es.new

 2137  mv opendata.lines.new opendata.lines
 2138  mv opendata.tagged.es.new opendata.tagged.es
 2139  mv opendata.tagged.eu.new opendata.tagged.eu

 2146  cat opendata.tagged.eu | lt-proc -b ~/source/apertium-eu-es/eu-es.autobil.bin  >/tmp/eu-es.bil1

 2148  cat opendata.tagged.eu | lt-proc -b ~/source/apertium-eu-es/eu-es.autobil-noRL.bin  >/tmp/eu-es.bil2

$ tail -n 1 /tmp/*.poly
==> /tmp/eu-es.bil1.poly <==
1.00240014637

==> /tmp/eu-es.bil2.poly <==
1.3015831681

 2191  mv /tmp/eu-es.bil2 opendata.biltrans.eu-es

 2258  cat opendata.tagged.es | python /home/fran/source/apertium-lex-tools/scripts/process-tagger-output.py es > opendata.token.es
 2007  cat opendata.tagged.eu |  python /home/fran/source/apertium-lex-tools/scripts/process-tagger-output.py eu > opendata.token.eu
 2014  cat opendata.biltrans.eu-es | python /home/fran/source/apertium-lex-tools/scripts/process-biltrans-output.py > opendata.token.eu-es &

$ nohup perl ~/local/bin/scripts-20120109-1229/training/train-model.perl -scripts-root-dir \
 /home/fran/local/bin/scripts-20120109-1229/ -root-dir . -corpus opendata.token -f eu -e es -alignment grow-diag-final-and \
 -reordering msd-bidirectional-fe  -lm 0:5:/home/fran/corpora/europarl/europarl.lm:0 >log 2>&1 &

 2011  paste opendata.lines opendata.token.eu opendata.token.es  | grep -v '\*.*\*.*\*.*\*.*\*.*\*.*\*.*\*.*\*.*\*' | cut -f1 > opendata.lines.new&
 2013  paste opendata.lines opendata.token.eu opendata.token.es  | grep -v '\*.*\*.*\*.*\*.*\*.*\*.*\*.*\*.*\*.*\*' | cut -f2 > opendata.eu.new &
 2014  paste opendata.lines opendata.token.eu opendata.token.es  | grep -v '\*.*\*.*\*.*\*.*\*.*\*.*\*.*\*.*\*.*\*' | cut -f3 > opendata.es.new &


 2017  mv opendata.lines.new opendata.lines
 2018  mv opendata.es.new opendata.token.es
 2019  mv opendata.eu.new opendata.token.eu


 2032  paste opendata.lines opendata.token.eu opendata.token.es  | grep -v '<sent>.*<sent>.*<sent>.*<sent>.*<sent>.*<sent>' | cut -f1 > opendata.lines.new
 2033  paste opendata.lines opendata.token.eu opendata.token.es  | grep -v '<sent>.*<sent>.*<sent>.*<sent>.*<sent>.*<sent>' | cut -f2 > opendata.eu.new &
 2034  paste opendata.lines opendata.token.eu opendata.token.es  | grep -v '<sent>.*<sent>.*<sent>.*<sent>.*<sent>.*<sent>' | cut -f3 > opendata.es.new &
 2035  mv opendata.lines.new opendata.lines
 2036  mv opendata.es.new opendata.token.es
 2037  mv opendata.eu.new opendata.token.eu