Difference between revisions of "User:Francis Tyers/Experiments"

From Apertium
Jump to navigation Jump to search
Line 27: Line 27:
! Pair !! freq !! tlm !! ling !! alig !! rules<br/>(c>1.5) !! rules<br/>(c>2.0) !! rules<br/>(c>2.5) !! rules<br/>(c>3.0) !! rules<br/>(c>3.5) !! rules<br/>(c>4.0)
! Pair !! freq !! tlm !! ling !! alig !! rules<br/>(c>1.5) !! rules<br/>(c>2.0) !! rules<br/>(c>2.5) !! rules<br/>(c>3.0) !! rules<br/>(c>3.5) !! rules<br/>(c>4.0)
|-
|-
| br-fr || <small>&mdash;</small><br/>[55.4, 61.2] || <small>&mdash;</small><br/> || <small>168</small><br/>[55.4, 61.1] || <small>115</small><br/>[31.6, 36.9] || <small>221</small><br/>[31.0, 36.6] ||align="center"| <small>213</small><br/>[30.7, 36.3] || <small>159</small><br/>[29.9, 35.2] || <small>150</small><br/>[29.7, 35.1] || <small>135</small><br/>[30.6, 36.0] || <small>135</small><br/>[30.6, 36.0]
| br-fr || <small>&mdash;</small><br/>[55.4, 61.2] || <small>&mdash;</small><br/>[42.8, 48.8] || <small>168</small><br/>[55.4, 61.1] || <small>115</small><br/>[31.6, 36.9] || <small>221</small><br/>[31.0, 36.6] ||align="center"| <small>213</small><br/>[30.7, 36.3] || <small>159</small><br/>[29.9, 35.2] || <small>150</small><br/>[29.7, 35.1] || <small>135</small><br/>[30.6, 36.0] || <small>135</small><br/>[30.6, 36.0]


|-
|-

Revision as of 19:07, 1 August 2012

Pair Corpus Lines W. (src) SL cov. Extracted Extracted (%) L. (train) L. (test) Uniq. tokens >1 trad. Avg. trad / word
br-fr oab 57,305 702,328 94.47% 4,668 8.32 2,668 1,000 603 1.07
en-es europarl 1,467,708 30,154,098 98.08% 312,162 22.18 310,162 1,000 2,082 1.08
eu-es opendata.euskadi.net 765,115 10,190,079 91.70% 87,907 11.48 85,907 1,000 1,806 1.30
mk-en setimes 190,493 4,259,338 92.17% 19,747 10.94 17,747 1,000 13,134 1.86
sh-mk setimes


EAMT results

LER

is the "crispiness" ratio, the amount of times an alternative translation is seen in a given context compared to the default translation. So, a of 2.0 means that the translation appears twice as frequently as the default.

Pair freq tlm ling alig rules
(c>1.5)
rules
(c>2.0)
rules
(c>2.5)
rules
(c>3.0)
rules
(c>3.5)
rules
(c>4.0)
br-fr
[55.4, 61.2]

[42.8, 48.8]
168
[55.4, 61.1]
115
[31.6, 36.9]
221
[31.0, 36.6]
213
[30.7, 36.3]
159
[29.9, 35.2]
150
[29.7, 35.1]
135
[30.6, 36.0]
135
[30.6, 36.0]
en-es

667
[21.3, 25.4]
630
[10.4, 13.8]
2881
[9.4, 12.6]
2728
[9.2, 12.3]
1683
[9.1, 12.2]
1578
[8.9, 12.0]
1242
[9.0, 12.2]
652
[9.2, 12.3]
eu-es

697
[50.4, 55.6]
598
[23.2, 27.7]
2253
[26.0, 30.5]
2088
[23.9, 28.5]
1382
[23.4, 28.1]
1266
[23.1, 27.8]
1022
[23.1, 27.8]
995
[23.3, 27.9]
mk-en

1385
[35.4, 38.2]
1079
[29.4, 32.2]
1684
[29.1, 32.1]
1635
[29.2, 32.1]
1323
[29.3, 32.3]
1271
[29.3, 32.2]
1198
[29.4, 32.3]
1079

Processing

Basque→Spanish

 2081  cat europako_testuak_memoria_2010.tmx | iconv -f utf-16 -t utf-8 > europako_testuak_memoria_2010.tmx.u8
 2082  cat 2010_memo_orokorra.tmx | iconv -f utf-16 -t utf-8 > 2010_memo_orokorra.tmx.u8
 2088  python3 process-tmx.py europako_testuak_memoria_2010.tmx.u8 > europako_testuak_memoria_2010.txt
 2090  python3 process-tmx.py 2010_memo_orokorra.tmx.u8 > 2010_memo_orokorra.txt
 2091  cat 2010_memo_orokorra.txt | grep '^es' | cut -f2- > 2010_memo_orokorra.es.txt
 2092  cat 2010_memo_orokorra.txt | grep '^eu' | cut -f2- > 2010_memo_orokorra.eu.txt
 2094  cat europako_testuak_memoria_2010.txt | grep '^es' | cut -f2- > europako_testuak_memoria_2010.es.txt
 2095  cat europako_testuak_memoria_2010.txt | grep '^eu' | cut -f2- > europako_testuak_memoria_2010.eu.txt
 2099  cat europako_testuak_memoria_2010.es.txt 2010_memo_orokorra.es.txt > opendata.es
 2100  cat europako_testuak_memoria_2010.eu.txt 2010_memo_orokorra.eu.txt > opendata.eu


$ wc -l opendata.e*
   782325 opendata.es
   782325 opendata.eu


 2114  perl /home/fran/local/bin/scripts-20120109-1229/training/clean-corpus-n.perl opendata eu es opendata.clean 1 40

 2117  cat opendata.clean.eu |apertium-destxt | apertium -f none -d ~/source/apertium-eu-es/ eu-es-pretransfer > opendata.tagged.eu
 2126  cat opendata.clean.es |apertium-destxt | apertium -f none -d ~/source/apertium-eu-es/ es-eu-pretransfer > opendata.tagged.es &


 2132  seq 1 771238 > opendata.lines
 2133  paste opendata.lines opendata.tagged.eu opendata.tagged.es | grep '<' | cut -f1 > opendata.lines.new
 2134  paste opendata.lines opendata.tagged.eu opendata.tagged.es | grep '<' | cut -f2 > opendata.tagged.eu.new
 2135  paste opendata.lines opendata.tagged.eu opendata.tagged.es | grep '<' | cut -f3 > opendata.tagged.es.new

 2137  mv opendata.lines.new opendata.lines
 2138  mv opendata.tagged.es.new opendata.tagged.es
 2139  mv opendata.tagged.eu.new opendata.tagged.eu

 2146  cat opendata.tagged.eu | lt-proc -b ~/source/apertium-eu-es/eu-es.autobil.bin  >/tmp/eu-es.bil1

 2148  cat opendata.tagged.eu | lt-proc -b ~/source/apertium-eu-es/eu-es.autobil-noRL.bin  >/tmp/eu-es.bil2

$ tail -n 1 /tmp/*.poly
==> /tmp/eu-es.bil1.poly <==
1.00240014637

==> /tmp/eu-es.bil2.poly <==
1.3015831681

 2191  mv /tmp/eu-es.bil2 opendata.biltrans.eu-es

 2258  cat opendata.tagged.es | python /home/fran/source/apertium-lex-tools/scripts/process-tagger-output.py es > opendata.token.es
 2007  cat opendata.tagged.eu |  python /home/fran/source/apertium-lex-tools/scripts/process-tagger-output.py eu > opendata.token.eu
 2014  cat opendata.biltrans.eu-es | python /home/fran/source/apertium-lex-tools/scripts/process-biltrans-output.py > opendata.token.eu-es &

$ nohup perl ~/local/bin/scripts-20120109-1229/training/train-model.perl -scripts-root-dir \
 /home/fran/local/bin/scripts-20120109-1229/ -root-dir . -corpus opendata.token -f eu -e es -alignment grow-diag-final-and \
 -reordering msd-bidirectional-fe  -lm 0:5:/home/fran/corpora/europarl/europarl.lm:0 >log 2>&1 &

 2011  paste opendata.lines opendata.token.eu opendata.token.es  | grep -v '\*.*\*.*\*.*\*.*\*.*\*.*\*.*\*.*\*.*\*' | cut -f1 > opendata.lines.new&
 2013  paste opendata.lines opendata.token.eu opendata.token.es  | grep -v '\*.*\*.*\*.*\*.*\*.*\*.*\*.*\*.*\*.*\*' | cut -f2 > opendata.eu.new &
 2014  paste opendata.lines opendata.token.eu opendata.token.es  | grep -v '\*.*\*.*\*.*\*.*\*.*\*.*\*.*\*.*\*.*\*' | cut -f3 > opendata.es.new &


 2017  mv opendata.lines.new opendata.lines
 2018  mv opendata.es.new opendata.token.es
 2019  mv opendata.eu.new opendata.token.eu


 2032  paste opendata.lines opendata.token.eu opendata.token.es  | grep -v '<sent>.*<sent>.*<sent>.*<sent>.*<sent>.*<sent>' | cut -f1 > opendata.lines.new
 2033  paste opendata.lines opendata.token.eu opendata.token.es  | grep -v '<sent>.*<sent>.*<sent>.*<sent>.*<sent>.*<sent>' | cut -f2 > opendata.eu.new &
 2034  paste opendata.lines opendata.token.eu opendata.token.es  | grep -v '<sent>.*<sent>.*<sent>.*<sent>.*<sent>.*<sent>' | cut -f3 > opendata.es.new &
 2035  mv opendata.lines.new opendata.lines
 2036  mv opendata.es.new opendata.token.es
 2037  mv opendata.eu.new opendata.token.eu

 2055  cat opendata.token.es | sed 's/ *$//g' > opendata.token.es.new
 2056  cat opendata.token.eu | sed 's/ *$//g' > opendata.token.eu.new
 2057  mv opendata.token.es.new opendata.token.es
 2058  mv opendata.token.eu.new opendata.token.eu


English→Spanish


 2114  perl /home/fran/local/bin/scripts-20120109-1229/training/clean-corpus-n.perl europarl en es europarl.clean 1 40

 2056  cat europarl.clean.en | apertium-destxt | apertium -f none -d ~/source/apertium-en-es en-es-pretransfer > europarl.tagged.en &
 2057  cat europarl.clean.es | apertium-destxt | apertium -f none -d ~/source/apertium-en-es es-en-pretransfer > europarl.tagged.es &

 2073  paste europarl.lines europarl.tagged.en europarl.tagged.es | grep '<' | cut -f1 > europarl.lines.new
 2074  paste europarl.lines europarl.tagged.en europarl.tagged.es | grep '<' | cut -f2 > europarl.tagged.en.new
 2075  paste europarl.lines europarl.tagged.en europarl.tagged.es | grep '<' | cut -f3 > europarl.tagged.es.new

 2087  paste europarl.lines europarl.tagged.en europarl.tagged.es | grep -v '<sent>.*<sent>.*<sent>.*<sent>.*<sent>.*<sent>' | grep -v '\*.*\*.*\*.*\*.*\*.*\*.*\*.*\*.*\*.*\*' | cut -f1 >europarl.lines.new
 2088  bg
 2089  paste europarl.lines europarl.tagged.en europarl.tagged.es | grep -v '<sent>.*<sent>.*<sent>.*<sent>.*<sent>.*<sent>' | grep -v '\*.*\*.*\*.*\*.*\*.*\*.*\*.*\*.*\*.*\*' | cut -f2 >europarl.en.new&
 2090  paste europarl.lines europarl.tagged.en europarl.tagged.es | grep -v '<sent>.*<sent>.*<sent>.*<sent>.*<sent>.*<sent>' | grep -v '\*.*\*.*\*.*\*.*\*.*\*.*\*.*\*.*\*.*\*' | cut -f3 >europarl.es.new&

 2097  nohup cat europarl.tagged.en | python ~/source/apertium-lex-tools/scripts/process-tagger-output.py en > europarl.token.en &
 2098  nohup cat europarl.tagged.es | python ~/source/apertium-lex-tools/scripts/process-tagger-output.py es > europarl.token.es &
 2099  nohup cat europarl.biltrans.en-es | python ~/source/apertium-lex-tools/scripts/process-biltrans-output.py > europarl.token.en-es &

Macedonian→English




:%s/еfу/еѓу/g
:%s/аfа/аѓа/g
:%s/оfа/оѓа/g
:%s/уfе/уѓе/g
:%s/нfи/нѓи/g
:%s/Ѓиниfиќ/Ѓинѓиќ/g
:%s/еfе/еѓе/g
:%s/уfм/уѓм/g
:%s/рfи/рѓи/g
:%s/ fе / ѓе /g
:%s/рfе/рѓе/g
:%s/уfи/уѓи/g
:%s/ fу/ ѓу/g
:%s/Караfорѓевиќ/Караѓорѓевиќ/g
:%s/Холанfанец/Холанѓанец/g
:%s/реfаваат/реѓаваат/g
:%s/Швеfанката/Швеѓанката/g
:%s/Новозеланfани/Новозеланѓани/g
:%s/Мрfан/Мрѓан/g
:%s/Анfелка/Анѓелка/g
:%s/рfосаната/рѓосаната/g
:%s/оттуfуваоето/оттуѓуваоето/g
:%s/Енfел/Енѓел/g
:%s/Караfорѓевиќ/Караѓорѓевиќ/g
:%s/маfународната/маѓународната/g
:%s/Пеfа/Пеѓа/g
:%s/маfепсник/маѓепсник/g
:%s/Караfорѓе/Караѓорѓе/g
:%s/Граfевинар/Граѓевинар/g
:%s/Меfаши/Меѓаши/g
:%s/Ванfел/Ванѓел/g
:%s/Караfиќ/Караѓиќ/g
:%s/Анfели/Анѓели/g
:%s/саfи/саѓи/g
:%s/маfионичарски/маѓионичарски/g
:%s/Караfорѓевиќ/Караѓорѓевиќ/g
:%s/панаfур/панаѓур/g
:%s/Ѓерf/Ѓерѓ/g
:%s/Ѓинѓиf/Ѓинѓиѓ/g


 2042  paste setimes.mk setimes.en| grep -v '^(' | cut -f1 > setimes.mk.new
 2043  paste setimes.mk setimes.en| grep -v '^(' | cut -f2 > setimes.en.new
 2044  paste setimes.en.new setimes.mk.new | grep -v '^(' | cut -f1 > setimes.en
 2045  paste setimes.en.new setimes.mk.new | grep -v '^(' | cut -f2 > setimes.mk

perl /home/fran/local/bin/scripts-20120109-1229/training/clean-corpus-n.perl setimes mk en setimes.clean 1 40 


 2052  cat setimes.clean.mk | apertium-destxt | apertium -f none -d ~/source/apertium-mk-en/ mk-en-pretransfer > setimes.tagged.mk&
 2054  cat setimes.clean.en | apertium-destxt | apertium -f none -d ~/source/apertium-mk-en/ en-mk-pretransfer > setimes.tagged.en&

 2063  seq 1 190503 > setimes.lines
 2064  paste setimes.lines setimes.tagged.mk setimes.tagged.en | grep '<' | cut -f1 > setimes.lines.new
 2065  paste setimes.lines setimes.tagged.mk setimes.tagged.en | grep '<' | cut -f2 > setimes.mk.new
 2066  paste setimes.lines setimes.tagged.mk setimes.tagged.en | grep '<' | cut -f3 > setimes.en.new
 2067  mv setimes.en.new setimes.tagged.en
 2068  mv setimes.mk.new setimes.tagged.mk
 2069  mv setimes.lines.new setimes.lines

 2077  nohup cat setimes.tagged.mk | lt-proc -b ~/source/apertium-mk-en/mk-en.autobil.bin > setimes.biltrans.mk-en &


 2122  cat setimes.tagged.mk |  python ~/source/apertium-lex-tools/scripts/process-tagger-output.py mk > setimes.token.mk &
 2123  cat setimes.tagged.en |  python ~/source/apertium-lex-tools/scripts/process-tagger-output.py en > setimes.token.en &