Difference between revisions of "Talk:Scandinavian MT project"
Jump to navigation
Jump to search
(complete rerun) |
|||
(6 intermediate revisions by the same user not shown) | |||
Line 1: | Line 1: | ||
==Coverage== |
|||
Coverage on Wikipedia dumps ("w/o cmp" is with decompounding turned off, ie. without the -e switch to lt-proc). |
|||
Coverage on Wikipedia dumps ("no decomp" means decompounding is turned off, i.e. lt-proc is run without the -e switch; "ex-upper" excludes anything containing uppercase characters from all counts; the script is at the bottom of the page). |
|||
E.g. |
|||
===Results=== |
|||
<pre> |
<pre> |
||
$ ./scandicov.sh |
|||
bzcat ~/corpora/nnclean2.txt.bz2 \ |
|||
nno-swe |
|||
|tr ' ' '\n' \ |
|||
unk known tot cov % |
|||
|grep -m5113060 . \ |
|||
805040 5307135 6112175 86.8289 no decomp ex-upper: 88.251 |
|||
|apertium-deshtml \ |
|||
703448 5408727 6112175 88.491 with decomp ex-upper: 91.5592 |
|||
|lt-proc nno-dan.automorf.bin \ |
|||
|apertium-cleanstream -n \ |
|||
nob-swe |
|||
|awk 'BEGIN{OFS=FS="\t"} /^\^/{lu++} /\/\*/{u++} END{print "unk","known","tot","cov %";print u,lu-u,lu,100*(lu-u)/lu}' |
|||
unk known tot cov % |
|||
691359 5408402 6099761 88.6658 no decomp ex-upper: 90.0757 |
|||
579262 5520499 6099761 90.5035 with decomp ex-upper: 93.3714 |
|||
swe-nno |
|||
unk known tot cov % |
|||
914584 5167659 6082243 84.963 no decomp ex-upper: 87.082 |
|||
744040 5338204 6082244 87.767 with decomp ex-upper: 92.1901 |
|||
swe-nob |
|||
unk known tot cov % |
|||
907104 5175868 6082972 85.0878 no decomp ex-upper: 87.2664 |
|||
733559 5349414 6082973 87.9408 with decomp ex-upper: 92.4382 |
|||
dan-swe |
|||
unk known tot cov % |
|||
1135679 4775050 5910729 80.7861 no decomp ex-upper: 86.2313 |
|||
997339 4913390 5910729 83.1266 with decomp ex-upper: 90.5919 |
|||
swe-dan |
|||
unk known tot cov % |
|||
1120955 4760872 5881827 80.9421 no decomp ex-upper: 86.4166 |
|||
926361 4955466 5881827 84.2505 with decomp ex-upper: 92.2963 |
|||
dan-nno |
|||
unk known tot cov % |
|||
870318 5020221 5890539 85.2252 no decomp ex-upper: 87.512 |
|||
752064 5138475 5890539 87.2327 with decomp ex-upper: 91.4789 |
|||
dan-nob |
|||
unk known tot cov % |
|||
832984 5052919 5885903 85.8478 no decomp ex-upper: 88.22 |
|||
703618 5182285 5885903 88.0457 with decomp ex-upper: 92.308 |
|||
nno-dan |
|||
unk known tot cov % |
|||
707162 5295158 6002320 88.2185 no decomp ex-upper: 90.2369 |
|||
613734 5388586 6002320 89.7751 with decomp ex-upper: 93.4124 |
|||
nob-dan |
|||
unk known tot cov % |
|||
594784 5387514 5982298 90.0576 no decomp ex-upper: 91.7191 |
|||
492014 5490284 5982298 91.7755 with decomp ex-upper: 94.8719 |
|||
nno-nob |
|||
unk known tot cov % |
|||
704975 5364593 6069568 88.3851 no decomp ex-upper: 91.2337 |
|||
613334 5456234 6069568 89.8949 with decomp ex-upper: 94.2736 |
|||
nob-nno |
|||
unk known tot cov % |
|||
603139 5456170 6059309 90.0461 no decomp ex-upper: 92.6105 |
|||
502464 5556845 6059309 91.7076 with decomp ex-upper: 95.5668 |
|||
</pre> |
</pre> |
||
===script=== |
|||
{|class=wikitable |
|||
<pre> |
|||
! Direction !! w/o cmp !! regular |
|||
$ cat scandicov.sh |
|||
|- |
|||
#!/bin/bash |
|||
| nob-nno || 90.9% || 92.6% |
|||
|- |
|||
# kill process group: |
|||
| nob-dan || 89.8% || 91.5% |
|||
trap "kill -- -0" EXIT |
|||
|- |
|||
| nno-nob || 89.2% || 90.6% |
|||
sum () { |
|||
|- |
|||
awk -v note="$1" ' |
|||
| nno-dan || 87.4% || 88.8% |
|||
BEGIN{OFS=FS="\t"} |
|||
|- |
|||
/^\^/{w++} |
|||
| dan-nob || 85.1% || 86.4% |
|||
/\/\*/{u++} |
|||
|- |
|||
/\/\*.*[[:upper:]]/{Uu++} |
|||
| swe-dan || 80.4% || 83.7% |
|||
/[[:upper:]]/{Uw++} |
|||
|- |
|||
END{ |
|||
| dan-nno || 82.5% || 83.5% |
|||
lw=w-Uw |
|||
|- |
|||
lu=u-Uu |
|||
| dan-swe || 80.6% || 82.9% |
|||
print u,w-u,w,100*(w-u)/w, note,"ex-upper:",100*(lw-lu)/lw}' |
|||
|- |
|||
} |
|||
| nob-swe || 74.9% || 76.2% |
|||
|- |
|||
for ana in /l/a/*/apertium-{swe-nor,swe-dan,dan-nor,nno-nob}/*.automorf.bin; do |
|||
| nno-swe || 73.5% || 74.6% |
|||
f="$(basename "${ana}")" |
|||
|- |
|||
echo "${f%%.automorf.bin}" |
|||
| swe-nob || 69.2% || 72.1% |
|||
src="${f%%-*}" |
|||
|- |
|||
printf "unk\tknown\ttot\tcov %%\n" |
|||
| swe-nno || 69.1% || 71.9% |
|||
xzcat ~/corpora/"${src}".wikicov.xz | lt-proc "${ana}" \ |
|||
|} |
|||
| apertium-cleanstream -n \ |
|||
| sum " no decomp" & |
|||
xzcat ~/corpora/"${src}".wikicov.xz | lt-proc -e "${ana}" \ |
|||
| apertium-cleanstream -n \ |
|||
| sum "with decomp" & |
|||
wait |
|||
echo |
|||
done |
|||
</pre> |
|||
==Testvoc== |
|||
===Results=== |
|||
<pre> |
|||
1 ../apertium-dan-nor/nobackup/69419-dan-nno |
|||
30 ../apertium-dan-nor/nobackup/69419-dan-nob |
|||
37 ../apertium-dan-nor/nobackup/69419-nno-dan |
|||
26 ../apertium-dan-nor/nobackup/69419-nob-dan |
|||
94 total |
|||
136 ../apertium-nno-nob/nobackup/69419-nno-nob |
|||
71 ../apertium-nno-nob/nobackup/69419-nob-nno |
|||
207 total |
|||
157 ../apertium-swe-dan/nobackup/69419-dan-swe |
|||
276 ../apertium-swe-dan/nobackup/69419-swe-dan |
|||
433 total |
|||
0 ../apertium-swe-nor/nobackup/69419-nno-swe |
|||
0 ../apertium-swe-nor/nobackup/69419-nob-swe |
|||
0 ../apertium-swe-nor/nobackup/69419-swe-nno |
|||
0 ../apertium-swe-nor/nobackup/69419-swe-nob.tmp |
|||
0 total |
|||
</pre> |
|||
===script=== |
|||
<pre> |
|||
$ while sleep 30; do |
|||
for d in trunk/apertium-{swe-nor,dan-nor,nno-nob,swe-dan}; do |
|||
( cd $d && up && make langs && rev=$(svn info|grep ^Revision: |grep -o '[0-9]*') && |
|||
for m in modes/???-???.mode; do |
|||
p=${m##modes/} && p=${p%%.mode} && echo $p && |
|||
( test -f nobackup/${rev}-$p || |
|||
( dev/testvoc/generation.sh --hfst $p > nobackup/${rev}-$p.tmp |
|||
mv nobackup/${rev}-$p.tmp nobackup/${rev}-$p |
|||
) |
|||
) |
|||
done |
|||
) |
|||
done |
|||
done |
|||
</pre> |
Latest revision as of 18:20, 1 July 2016
Coverage[edit]
Coverage on Wikipedia dumps ("no decomp" means decompounding is turned off, i.e. lt-proc is run without the -e switch; "ex-upper" excludes anything containing uppercase characters from all counts; the script is at the bottom of the page).
Results[edit]
$ ./scandicov.sh
nno-swe
unk known tot cov %
805040 5307135 6112175 86.8289 no decomp ex-upper: 88.251
703448 5408727 6112175 88.491 with decomp ex-upper: 91.5592
nob-swe
unk known tot cov %
691359 5408402 6099761 88.6658 no decomp ex-upper: 90.0757
579262 5520499 6099761 90.5035 with decomp ex-upper: 93.3714
swe-nno
unk known tot cov %
914584 5167659 6082243 84.963 no decomp ex-upper: 87.082
744040 5338204 6082244 87.767 with decomp ex-upper: 92.1901
swe-nob
unk known tot cov %
907104 5175868 6082972 85.0878 no decomp ex-upper: 87.2664
733559 5349414 6082973 87.9408 with decomp ex-upper: 92.4382
dan-swe
unk known tot cov %
1135679 4775050 5910729 80.7861 no decomp ex-upper: 86.2313
997339 4913390 5910729 83.1266 with decomp ex-upper: 90.5919
swe-dan
unk known tot cov %
1120955 4760872 5881827 80.9421 no decomp ex-upper: 86.4166
926361 4955466 5881827 84.2505 with decomp ex-upper: 92.2963
dan-nno
unk known tot cov %
870318 5020221 5890539 85.2252 no decomp ex-upper: 87.512
752064 5138475 5890539 87.2327 with decomp ex-upper: 91.4789
dan-nob
unk known tot cov %
832984 5052919 5885903 85.8478 no decomp ex-upper: 88.22
703618 5182285 5885903 88.0457 with decomp ex-upper: 92.308
nno-dan
unk known tot cov %
707162 5295158 6002320 88.2185 no decomp ex-upper: 90.2369
613734 5388586 6002320 89.7751 with decomp ex-upper: 93.4124
nob-dan
unk known tot cov %
594784 5387514 5982298 90.0576 no decomp ex-upper: 91.7191
492014 5490284 5982298 91.7755 with decomp ex-upper: 94.8719
nno-nob
unk known tot cov %
704975 5364593 6069568 88.3851 no decomp ex-upper: 91.2337
613334 5456234 6069568 89.8949 with decomp ex-upper: 94.2736
nob-nno
unk known tot cov %
603139 5456170 6059309 90.0461 no decomp ex-upper: 92.6105
502464 5556845 6059309 91.7076 with decomp ex-upper: 95.5668
script[edit]
$ cat scandicov.sh
#!/bin/bash
# kill process group:
trap "kill -- -0" EXIT
sum () {
    awk -v note="$1" '
        BEGIN{OFS=FS="\t"}
        /^\^/{w++}
        /\/\*/{u++}
        /\/\*.*[[:upper:]]/{Uu++}
        /[[:upper:]]/{Uw++}
        END{
            lw=w-Uw
            lu=u-Uu
            print u,w-u,w,100*(w-u)/w, note,"ex-upper:",100*(lw-lu)/lw}'
}
for ana in /l/a/*/apertium-{swe-nor,swe-dan,dan-nor,nno-nob}/*.automorf.bin; do
    f="$(basename "${ana}")"
    echo "${f%%.automorf.bin}"
    src="${f%%-*}"
    printf "unk\tknown\ttot\tcov %%\n"
    xzcat ~/corpora/"${src}".wikicov.xz | lt-proc "${ana}" \
        | apertium-cleanstream -n \
        | sum " no decomp" &
    xzcat ~/corpora/"${src}".wikicov.xz | lt-proc -e "${ana}" \
        | apertium-cleanstream -n \
        | sum "with decomp" &
    wait
    echo
done
Testvoc[edit]
Results[edit]
1 ../apertium-dan-nor/nobackup/69419-dan-nno
30 ../apertium-dan-nor/nobackup/69419-dan-nob
37 ../apertium-dan-nor/nobackup/69419-nno-dan
26 ../apertium-dan-nor/nobackup/69419-nob-dan
94 total
136 ../apertium-nno-nob/nobackup/69419-nno-nob
71 ../apertium-nno-nob/nobackup/69419-nob-nno
207 total
157 ../apertium-swe-dan/nobackup/69419-dan-swe
276 ../apertium-swe-dan/nobackup/69419-swe-dan
433 total
0 ../apertium-swe-nor/nobackup/69419-nno-swe
0 ../apertium-swe-nor/nobackup/69419-nob-swe
0 ../apertium-swe-nor/nobackup/69419-swe-nno
0 ../apertium-swe-nor/nobackup/69419-swe-nob.tmp
0 total
script[edit]
$ while sleep 30; do
    for d in trunk/apertium-{swe-nor,dan-nor,nno-nob,swe-dan}; do
      ( cd $d && up && make langs && rev=$(svn info|grep ^Revision: |grep -o '[0-9]*') &&
        for m in modes/???-???.mode; do
          p=${m##modes/} && p=${p%%.mode} && echo $p &&
          ( test -f nobackup/${rev}-$p ||
            ( dev/testvoc/generation.sh --hfst $p > nobackup/${rev}-$p.tmp
              mv nobackup/${rev}-$p.tmp nobackup/${rev}-$p
            )
          )
        done
      )
    done
  done