Difference between revisions of "User:Francis Tyers/Experiments"
Jump to navigation
Jump to search
Line 65: | Line 65: | ||
2014 cat opendata.biltrans.eu-es | python /home/fran/source/apertium-lex-tools/scripts/process-biltrans-output.py > opendata.token.eu-es & |
2014 cat opendata.biltrans.eu-es | python /home/fran/source/apertium-lex-tools/scripts/process-biltrans-output.py > opendata.token.eu-es & |
||
$ |
$ nohup perl ~/local/bin/scripts-20120109-1229/training/train-model.perl -scripts-root-dir \ |
||
/home/fran/local/bin/scripts-20120109-1229/ -root-dir . -corpus opendata.token -f eu -e es -alignment grow-diag-final-and \ |
|||
-reordering msd-bidirectional-fe -lm 0:5:/home/fran/corpora/europarl/europarl.lm:0 >log 2>&1 & |
-reordering msd-bidirectional-fe -lm 0:5:/home/fran/corpora/europarl/europarl.lm:0 >log 2>&1 & |
||
Line 77: | Line 77: | ||
2018 mv opendata.es.new opendata.token.es |
2018 mv opendata.es.new opendata.token.es |
||
2019 mv opendata.eu.new opendata.token.eu |
2019 mv opendata.eu.new opendata.token.eu |
||
2032 paste opendata.lines opendata.token.eu opendata.token.es | grep -v '<sent>.*<sent>.*<sent>.*<sent>.*<sent>.*<sent>' | cut -f1 > opendata.lines.new |
|||
2033 paste opendata.lines opendata.token.eu opendata.token.es | grep -v '<sent>.*<sent>.*<sent>.*<sent>.*<sent>.*<sent>' | cut -f2 > opendata.eu.new & |
|||
2034 paste opendata.lines opendata.token.eu opendata.token.es | grep -v '<sent>.*<sent>.*<sent>.*<sent>.*<sent>.*<sent>' | cut -f3 > opendata.es.new & |
|||
2035 mv opendata.lines.new opendata.lines |
|||
2036 mv opendata.es.new opendata.token.es |
|||
2037 mv opendata.eu.new opendata.token.eu |
|||
</pre> |
</pre> |
Revision as of 08:32, 22 June 2012
| Language pair | Corpus | Lines | W. (src) | SL cov. | W. (train) | Words (test) | Words >1 trad. | Avg. trad / word |
|---|---|---|---|---|---|---|---|---|
| br-fr | oab | | | | | | | |
| en-es | europarl | 1,467,708 | 30,154,098 | 98.08% | - | - | - | - |
| eu-es | opendata.euskadi.net | 765,115 | 10,190,079 | 91.70% | - | - | 1,806 | 1.30 |
| mk-en | setimes | | | | | | | |
| sh-mk | setimes | | | | | | | |
Basque→Spanish
2081 cat europako_testuak_memoria_2010.tmx | iconv -f utf-16 -t utf-8 > europako_testuak_memoria_2010.tmx.u8
2082 cat 2010_memo_orokorra.tmx | iconv -f utf-16 -t utf-8 > 2010_memo_orokorra.tmx.u8
2088 python3 process-tmx.py europako_testuak_memoria_2010.tmx.u8 > europako_testuak_memoria_2010.txt
2090 python3 process-tmx.py 2010_memo_orokorra.tmx.u8 > 2010_memo_orokorra.txt
2091 cat 2010_memo_orokorra.txt | grep '^es' | cut -f2- > 2010_memo_orokorra.es.txt
2092 cat 2010_memo_orokorra.txt | grep '^eu' | cut -f2- > 2010_memo_orokorra.eu.txt
2094 cat europako_testuak_memoria_2010.txt | grep '^es' | cut -f2- > europako_testuak_memoria_2010.es.txt
2095 cat europako_testuak_memoria_2010.txt | grep '^eu' | cut -f2- > europako_testuak_memoria_2010.eu.txt
2099 cat europako_testuak_memoria_2010.es.txt 2010_memo_orokorra.es.txt > opendata.es
2100 cat europako_testuak_memoria_2010.eu.txt 2010_memo_orokorra.eu.txt > opendata.eu
$ wc -l opendata.e*
782325 opendata.es
782325 opendata.eu
2114 perl /home/fran/local/bin/scripts-20120109-1229/training/clean-corpus-n.perl opendata eu es opendata.clean 1 80
2117 cat opendata.clean.eu |apertium-destxt | apertium -f none -d ~/source/apertium-eu-es/ eu-es-pretransfer > opendata.tagged.eu
2126 cat opendata.clean.es |apertium-destxt | apertium -f none -d ~/source/apertium-eu-es/ es-eu-pretransfer > opendata.tagged.es &
2132 seq 1 771238 > opendata.lines
2133 paste opendata.lines opendata.tagged.eu opendata.tagged.es | grep '<' | cut -f1 > opendata.lines.new
2134 paste opendata.lines opendata.tagged.eu opendata.tagged.es | grep '<' | cut -f2 > opendata.tagged.eu.new
2135 paste opendata.lines opendata.tagged.eu opendata.tagged.es | grep '<' | cut -f3 > opendata.tagged.es.new
2137 mv opendata.lines.new opendata.lines
2138 mv opendata.tagged.es.new opendata.tagged.es
2139 mv opendata.tagged.eu.new opendata.tagged.eu
2146 cat opendata.tagged.eu | lt-proc -b ~/source/apertium-eu-es/eu-es.autobil.bin >/tmp/eu-es.bil1
2148 cat opendata.tagged.eu | lt-proc -b ~/source/apertium-eu-es/eu-es.autobil-noRL.bin >/tmp/eu-es.bil2
$ tail -n 1 /tmp/*.poly
==> /tmp/eu-es.bil1.poly <==
1.00240014637
==> /tmp/eu-es.bil2.poly <==
1.3015831681
2191 mv /tmp/eu-es.bil2 opendata.biltrans.eu-es
2258 cat opendata.tagged.es | python /home/fran/source/apertium-lex-tools/scripts/process-tagger-output.py es > opendata.token.es
2007 cat opendata.tagged.eu | python /home/fran/source/apertium-lex-tools/scripts/process-tagger-output.py eu > opendata.token.eu
2014 cat opendata.biltrans.eu-es | python /home/fran/source/apertium-lex-tools/scripts/process-biltrans-output.py > opendata.token.eu-es &
$ nohup perl ~/local/bin/scripts-20120109-1229/training/train-model.perl -scripts-root-dir \
/home/fran/local/bin/scripts-20120109-1229/ -root-dir . -corpus opendata.token -f eu -e es -alignment grow-diag-final-and \
-reordering msd-bidirectional-fe -lm 0:5:/home/fran/corpora/europarl/europarl.lm:0 >log 2>&1 &
2011 paste opendata.lines opendata.token.eu opendata.token.es | grep -v '\*.*\*.*\*.*\*.*\*.*\*.*\*.*\*.*\*.*\*' | cut -f1 > opendata.lines.new&
2013 paste opendata.lines opendata.token.eu opendata.token.es | grep -v '\*.*\*.*\*.*\*.*\*.*\*.*\*.*\*.*\*.*\*' | cut -f2 > opendata.eu.new &
2014 paste opendata.lines opendata.token.eu opendata.token.es | grep -v '\*.*\*.*\*.*\*.*\*.*\*.*\*.*\*.*\*.*\*' | cut -f3 > opendata.es.new &
2017 mv opendata.lines.new opendata.lines
2018 mv opendata.es.new opendata.token.es
2019 mv opendata.eu.new opendata.token.eu
2032 paste opendata.lines opendata.token.eu opendata.token.es | grep -v '<sent>.*<sent>.*<sent>.*<sent>.*<sent>.*<sent>' | cut -f1 > opendata.lines.new
2033 paste opendata.lines opendata.token.eu opendata.token.es | grep -v '<sent>.*<sent>.*<sent>.*<sent>.*<sent>.*<sent>' | cut -f2 > opendata.eu.new &
2034 paste opendata.lines opendata.token.eu opendata.token.es | grep -v '<sent>.*<sent>.*<sent>.*<sent>.*<sent>.*<sent>' | cut -f3 > opendata.es.new &
2035 mv opendata.lines.new opendata.lines
2036 mv opendata.es.new opendata.token.es
2037 mv opendata.eu.new opendata.token.eu