Talk:Scandinavian MT project
Jump to navigation
Jump to search
Coverage[edit]
Coverage on Wikipedia dumps. "no decomp" means decompounding is turned off, i.e. lt-proc is run without the -e switch; "ex-upper" means that anything containing uppercase characters is excluded from all counts. The script that produced the numbers is listed in the "script" section below the results.
Results[edit]
$ ./scandicov.sh
nno-swe
unk known tot cov %
805040 5307135 6112175 86.8289 no decomp ex-upper: 88.251
703448 5408727 6112175 88.491 with decomp ex-upper: 91.5592

nob-swe
unk known tot cov %
691359 5408402 6099761 88.6658 no decomp ex-upper: 90.0757
579262 5520499 6099761 90.5035 with decomp ex-upper: 93.3714

swe-nno
unk known tot cov %
914584 5167659 6082243 84.963 no decomp ex-upper: 87.082
744040 5338204 6082244 87.767 with decomp ex-upper: 92.1901

swe-nob
unk known tot cov %
907104 5175868 6082972 85.0878 no decomp ex-upper: 87.2664
733559 5349414 6082973 87.9408 with decomp ex-upper: 92.4382

dan-swe
unk known tot cov %
1135679 4775050 5910729 80.7861 no decomp ex-upper: 86.2313
997339 4913390 5910729 83.1266 with decomp ex-upper: 90.5919

swe-dan
unk known tot cov %
1120955 4760872 5881827 80.9421 no decomp ex-upper: 86.4166
926361 4955466 5881827 84.2505 with decomp ex-upper: 92.2963

dan-nno
unk known tot cov %
870318 5020221 5890539 85.2252 no decomp ex-upper: 87.512
752064 5138475 5890539 87.2327 with decomp ex-upper: 91.4789

dan-nob
unk known tot cov %
832984 5052919 5885903 85.8478 no decomp ex-upper: 88.22
703618 5182285 5885903 88.0457 with decomp ex-upper: 92.308

nno-dan
unk known tot cov %
707162 5295158 6002320 88.2185 no decomp ex-upper: 90.2369
613734 5388586 6002320 89.7751 with decomp ex-upper: 93.4124

nob-dan
unk known tot cov %
594784 5387514 5982298 90.0576 no decomp ex-upper: 91.7191
492014 5490284 5982298 91.7755 with decomp ex-upper: 94.8719

nno-nob
unk known tot cov %
704975 5364593 6069568 88.3851 no decomp ex-upper: 91.2337
613334 5456234 6069568 89.8949 with decomp ex-upper: 94.2736

nob-nno
unk known tot cov %
603139 5456170 6059309 90.0461 no decomp ex-upper: 92.6105
502464 5556845 6059309 91.7076 with decomp ex-upper: 95.5668
script[edit]
#!/bin/bash
#
# scandicov.sh - coverage of each analyser on a Wikipedia corpus, measured
# once without and once with decompounding (lt-proc -e).
#
# Inputs:
#   /l/a/*/apertium-{swe-nor,swe-dan,dan-nor,nno-nob}/*.automorf.bin
#   ~/corpora/<src>.wikicov.xz, where <src> is the analyser file name up to
#   its first "-".
# Output (stdout), per analyser: its name, a header row, one tab-separated
# row per run, and a blank line.

# kill process group: on exit, signal everything in this script's process
# group so that no background pipeline is left running.
trap "kill -- -0" EXIT

#######################################
# Summarise a one-unit-per-line analysis stream read from stdin.
# Counters:
#   w  - lines starting with "^"
#   u  - lines containing "/*"
#   Uw - lines containing an uppercase character
#   Uu - lines containing "/*" followed later by an uppercase character
# Arguments: $1 - free-text note echoed in the output row
# Outputs:   one tab-separated row:
#              u, w-u, w, 100*(w-u)/w, note, "ex-upper:", 100*(lw-lu)/lw
#            with lw = w-Uw and lu = u-Uu. A percentage whose denominator
#            is zero is printed as 0 instead of aborting awk.
#######################################
sum () {
  awk -v note="$1" '
    # Percentage that tolerates an empty denominator.
    function pct(num, den) { return den ? 100 * num / den : 0 }

    BEGIN { OFS = FS = "\t" }

    /^\^/               { w++ }
    /\/\*/              { u++ }
    /\/\*.*[[:upper:]]/ { Uu++ }
    /[[:upper:]]/       { Uw++ }

    END {
      lw = w - Uw
      lu = u - Uu
      # "+ 0" makes never-incremented counters print as 0, not as "".
      print u + 0, w - u, w + 0, pct(w - u, w), note, "ex-upper:", pct(lw - lu, lw)
    }'
}

for ana in /l/a/*/apertium-{swe-nor,swe-dan,dan-nor,nno-nob}/*.automorf.bin; do
  # A glob without matches is left as the literal pattern; skip it.
  [[ -e "${ana}" ]] || continue

  f="$(basename "${ana}")"
  echo "${f%%.automorf.bin}"
  src="${f%%-*}"
  printf "unk\tknown\ttot\tcov %%\n"

  # Both runs read the same corpus and are started in parallel; their rows
  # are printed in whichever order the pipelines finish.
  xzcat ~/corpora/"${src}".wikicov.xz \
    | lt-proc "${ana}" \
    | apertium-cleanstream -n \
    | sum " no decomp" &
  xzcat ~/corpora/"${src}".wikicov.xz \
    | lt-proc -e "${ana}" \
    | apertium-cleanstream -n \
    | sum "with decomp" &
  wait
  echo
done
Testvoc[edit]
Results[edit]
1 ../apertium-dan-nor/nobackup/69419-dan-nno
30 ../apertium-dan-nor/nobackup/69419-dan-nob
37 ../apertium-dan-nor/nobackup/69419-nno-dan
26 ../apertium-dan-nor/nobackup/69419-nob-dan
94 total

136 ../apertium-nno-nob/nobackup/69419-nno-nob
71 ../apertium-nno-nob/nobackup/69419-nob-nno
207 total

157 ../apertium-swe-dan/nobackup/69419-dan-swe
276 ../apertium-swe-dan/nobackup/69419-swe-dan
433 total

0 ../apertium-swe-nor/nobackup/69419-nno-swe
0 ../apertium-swe-nor/nobackup/69419-nob-swe
0 ../apertium-swe-nor/nobackup/69419-swe-nno
0 ../apertium-swe-nor/nobackup/69419-swe-nob.tmp
0 total
script[edit]
# Every 30 seconds, for each language-pair checkout: update, rebuild, and run
# the generation testvoc for every mode that has no result file yet for the
# current revision. Results go to nobackup/<revision>-<mode>.
while sleep 30; do
  for d in trunk/apertium-{swe-nor,dan-nor,nno-nob,swe-dan}; do
    (
      # Subshell, so the cd does not leak into the next iteration.
      # NOTE(review): `up` is not defined in this snippet; presumably a local
      # alias or function that updates the working copy - confirm.
      cd "$d" && up && make langs || exit

      # Skip this checkout when no revision number can be extracted.
      rev=$(svn info | grep '^Revision: ' | grep -o '[0-9]*') || exit

      for m in modes/???-???.mode; do
        p=${m##modes/}
        p=${p%%.mode}
        echo "$p"
        if ! test -f "nobackup/${rev}-${p}"; then
          # Write to a .tmp name first and rename afterwards, so that a file
          # under the final name is never a partially written result.
          dev/testvoc/generation.sh --hfst "$p" > "nobackup/${rev}-${p}.tmp"
          mv "nobackup/${rev}-${p}.tmp" "nobackup/${rev}-${p}"
        fi
      done
    )
  done
done