Difference between revisions of "Talk:Scandinavian MT project"
Jump to navigation
Jump to search
(18 intermediate revisions by the same user not shown) | |||
Line 1: | Line 1: | ||
+ | ==Coverage== |
||
− | Cov on wikipedia (nocmp is with decompounding turned off): |
||
+ | |||
− | {|class=wikitable |
||
+ | Coverage on Wikipedia dumps ("no decomp" is with decompounding turned off, ie. without the -e switch to lt-proc; ex-upper is excluding anything with uppercase characters from all counts; script at bottom of page). |
||
− | ! Direction !! w/o cmp !! regular |
||
+ | ===Results=== |
||
− | |- |
||
+ | <pre> |
||
− | | nob-nno || 90.4% || |
||
+ | $ ./scandicov.sh |
||
− | |- |
||
+ | nno-swe |
||
− | | nno-nob || 89.2% || |
||
+ | unk known tot cov % |
||
− | |- |
||
+ | 805040 5307135 6112175 86.8289 no decomp ex-upper: 88.251 |
||
− | | dan-nob || 82.7% || |
||
+ | 703448 5408727 6112175 88.491 with decomp ex-upper: 91.5592 |
||
− | |- |
||
+ | |||
− | | dan-nno || 81.8% || |
||
+ | nob-swe |
||
− | |- |
||
+ | unk known tot cov % |
||
− | | dan-swe || 76.2% || |
||
+ | 691359 5408402 6099761 88.6658 no decomp ex-upper: 90.0757 |
||
− | |- |
||
+ | 579262 5520499 6099761 90.5035 with decomp ex-upper: 93.3714 |
||
− | | swe-dan || 80.4% || |
||
+ | |||
− | |- |
||
+ | swe-nno |
||
− | | nno-dan || || |
||
+ | unk known tot cov % |
||
− | |- |
||
+ | 914584 5167659 6082243 84.963 no decomp ex-upper: 87.082 |
||
− | | nob-dan || || |
||
+ | 744040 5338204 6082244 87.767 with decomp ex-upper: 92.1901 |
||
− | |- |
||
+ | |||
− | | swe-nno || || |
||
+ | swe-nob |
||
− | |- |
||
+ | unk known tot cov % |
||
− | | swe-nob || || |
||
+ | 907104 5175868 6082972 85.0878 no decomp ex-upper: 87.2664 |
||
− | |- |
||
+ | 733559 5349414 6082973 87.9408 with decomp ex-upper: 92.4382 |
||
− | | nno-swe || || |
||
+ | |||
− | |- |
||
− | + | dan-swe |
|
+ | unk known tot cov % |
||
− | |} |
||
+ | 1135679 4775050 5910729 80.7861 no decomp ex-upper: 86.2313 |
||
+ | 997339 4913390 5910729 83.1266 with decomp ex-upper: 90.5919 |
||
+ | |||
+ | swe-dan |
||
+ | unk known tot cov % |
||
+ | 1120955 4760872 5881827 80.9421 no decomp ex-upper: 86.4166 |
||
+ | 926361 4955466 5881827 84.2505 with decomp ex-upper: 92.2963 |
||
+ | |||
+ | dan-nno |
||
+ | unk known tot cov % |
||
+ | 870318 5020221 5890539 85.2252 no decomp ex-upper: 87.512 |
||
+ | 752064 5138475 5890539 87.2327 with decomp ex-upper: 91.4789 |
||
+ | |||
+ | dan-nob |
||
+ | unk known tot cov % |
||
+ | 832984 5052919 5885903 85.8478 no decomp ex-upper: 88.22 |
||
+ | 703618 5182285 5885903 88.0457 with decomp ex-upper: 92.308 |
||
+ | |||
+ | nno-dan |
||
+ | unk known tot cov % |
||
+ | 707162 5295158 6002320 88.2185 no decomp ex-upper: 90.2369 |
||
+ | 613734 5388586 6002320 89.7751 with decomp ex-upper: 93.4124 |
||
+ | |||
+ | nob-dan |
||
+ | unk known tot cov % |
||
+ | 594784 5387514 5982298 90.0576 no decomp ex-upper: 91.7191 |
||
+ | 492014 5490284 5982298 91.7755 with decomp ex-upper: 94.8719 |
||
+ | |||
+ | nno-nob |
||
+ | unk known tot cov % |
||
+ | 704975 5364593 6069568 88.3851 no decomp ex-upper: 91.2337 |
||
+ | 613334 5456234 6069568 89.8949 with decomp ex-upper: 94.2736 |
||
+ | |||
+ | nob-nno |
||
+ | unk known tot cov % |
||
+ | 603139 5456170 6059309 90.0461 no decomp ex-upper: 92.6105 |
||
+ | 502464 5556845 6059309 91.7076 with decomp ex-upper: 95.5668 |
||
+ | </pre> |
||
+ | |||
+ | ===script=== |
||
+ | <pre> |
||
+ | $ cat scandicov.sh |
||
+ | #!/bin/bash |
||
+ | |||
+ | # kill process group: |
||
+ | trap "kill -- -0" EXIT |
||
+ | |||
+ | sum () { |
||
+ | awk -v note="$1" ' |
||
+ | BEGIN{OFS=FS="\t"} |
||
+ | /^\^/{w++} |
||
+ | /\/\*/{u++} |
||
+ | /\/\*.*[[:upper:]]/{Uu++} |
||
+ | /[[:upper:]]/{Uw++} |
||
+ | END{ |
||
+ | lw=w-Uw |
||
+ | lu=u-Uu |
||
+ | print u,w-u,w,100*(w-u)/w, note,"ex-upper:",100*(lw-lu)/lw}' |
||
+ | } |
||
+ | |||
+ | for ana in /l/a/*/apertium-{swe-nor,swe-dan,dan-nor,nno-nob}/*.automorf.bin; do |
||
+ | f="$(basename "${ana}")" |
||
+ | echo "${f%%.automorf.bin}" |
||
+ | src="${f%%-*}" |
||
+ | printf "unk\tknown\ttot\tcov %%\n" |
||
+ | xzcat ~/corpora/"${src}".wikicov.xz | lt-proc "${ana}" \ |
||
+ | | apertium-cleanstream -n \ |
||
+ | | sum " no decomp" & |
||
+ | |||
+ | xzcat ~/corpora/"${src}".wikicov.xz | lt-proc -e "${ana}" \ |
||
+ | | apertium-cleanstream -n \ |
||
+ | | sum "with decomp" & |
||
+ | |||
+ | wait |
||
+ | echo |
||
+ | done |
||
+ | </pre> |
||
+ | ==Testvoc== |
||
+ | ===Results=== |
||
+ | <pre> |
||
+ | 1 ../apertium-dan-nor/nobackup/69419-dan-nno |
||
+ | 30 ../apertium-dan-nor/nobackup/69419-dan-nob |
||
+ | 37 ../apertium-dan-nor/nobackup/69419-nno-dan |
||
+ | 26 ../apertium-dan-nor/nobackup/69419-nob-dan |
||
+ | 94 total |
||
+ | |||
+ | 136 ../apertium-nno-nob/nobackup/69419-nno-nob |
||
+ | 71 ../apertium-nno-nob/nobackup/69419-nob-nno |
||
+ | 207 total |
||
+ | |||
+ | 157 ../apertium-swe-dan/nobackup/69419-dan-swe |
||
+ | 276 ../apertium-swe-dan/nobackup/69419-swe-dan |
||
+ | 433 total |
||
+ | |||
+ | 0 ../apertium-swe-nor/nobackup/69419-nno-swe |
||
+ | 0 ../apertium-swe-nor/nobackup/69419-nob-swe |
||
+ | 0 ../apertium-swe-nor/nobackup/69419-swe-nno |
||
+ | 0 ../apertium-swe-nor/nobackup/69419-swe-nob.tmp |
||
+ | 0 total |
||
+ | </pre> |
||
+ | |||
+ | ===script=== |
||
+ | <pre> |
||
+ | $ while sleep 30; do |
||
+ | for d in trunk/apertium-{swe-nor,dan-nor,nno-nob,swe-dan}; do |
||
+ | ( cd $d && up && make langs && rev=$(svn info|grep ^Revision: |grep -o '[0-9]*') && |
||
+ | for m in modes/???-???.mode; do |
||
+ | p=${m##modes/} && p=${p%%.mode} && echo $p && |
||
+ | ( test -f nobackup/${rev}-$p || |
||
+ | ( dev/testvoc/generation.sh --hfst $p > nobackup/${rev}-$p.tmp |
||
+ | mv nobackup/${rev}-$p.tmp nobackup/${rev}-$p |
||
+ | ) |
||
+ | ) |
||
+ | done |
||
+ | ) |
||
+ | done |
||
+ | done |
||
+ | </pre> |
Latest revision as of 18:20, 1 July 2016
Coverage[edit]
Coverage on Wikipedia dumps ("no decomp" means decompounding is turned off, i.e. lt-proc is run without the -e switch; "ex-upper" excludes anything containing uppercase characters from all counts; the script is listed below the results).
Results[edit]
$ ./scandicov.sh
nno-swe
unk known tot cov %
805040 5307135 6112175 86.8289 no decomp ex-upper: 88.251
703448 5408727 6112175 88.491 with decomp ex-upper: 91.5592

nob-swe
unk known tot cov %
691359 5408402 6099761 88.6658 no decomp ex-upper: 90.0757
579262 5520499 6099761 90.5035 with decomp ex-upper: 93.3714

swe-nno
unk known tot cov %
914584 5167659 6082243 84.963 no decomp ex-upper: 87.082
744040 5338204 6082244 87.767 with decomp ex-upper: 92.1901

swe-nob
unk known tot cov %
907104 5175868 6082972 85.0878 no decomp ex-upper: 87.2664
733559 5349414 6082973 87.9408 with decomp ex-upper: 92.4382

dan-swe
unk known tot cov %
1135679 4775050 5910729 80.7861 no decomp ex-upper: 86.2313
997339 4913390 5910729 83.1266 with decomp ex-upper: 90.5919

swe-dan
unk known tot cov %
1120955 4760872 5881827 80.9421 no decomp ex-upper: 86.4166
926361 4955466 5881827 84.2505 with decomp ex-upper: 92.2963

dan-nno
unk known tot cov %
870318 5020221 5890539 85.2252 no decomp ex-upper: 87.512
752064 5138475 5890539 87.2327 with decomp ex-upper: 91.4789

dan-nob
unk known tot cov %
832984 5052919 5885903 85.8478 no decomp ex-upper: 88.22
703618 5182285 5885903 88.0457 with decomp ex-upper: 92.308

nno-dan
unk known tot cov %
707162 5295158 6002320 88.2185 no decomp ex-upper: 90.2369
613734 5388586 6002320 89.7751 with decomp ex-upper: 93.4124

nob-dan
unk known tot cov %
594784 5387514 5982298 90.0576 no decomp ex-upper: 91.7191
492014 5490284 5982298 91.7755 with decomp ex-upper: 94.8719

nno-nob
unk known tot cov %
704975 5364593 6069568 88.3851 no decomp ex-upper: 91.2337
613334 5456234 6069568 89.8949 with decomp ex-upper: 94.2736

nob-nno
unk known tot cov %
603139 5456170 6059309 90.0461 no decomp ex-upper: 92.6105
502464 5556845 6059309 91.7076 with decomp ex-upper: 95.5668
script[edit]
#!/bin/bash
# scandicov.sh (shown on the wiki page as the output of `cat scandicov.sh`).
#
# For every *.automorf.bin analyser found in the language-pair directories
# listed in the loop below, pipe the source language's Wikipedia coverage
# corpus through lt-proc twice -- once without and once with the -e
# (decompounding) switch -- and print one tab-separated summary row per run.

# kill process group:
# On exit, signal process group 0 (the group this script runs in) so the
# background pipelines started below do not outlive the script.
# NOTE(review): this also signals the script itself and anything else that
# shares its process group.
trap "kill -- -0" EXIT

# sum NOTE
#
# Summarise an analysed stream read from stdin (as produced by
# `apertium-cleanstream -n`; presumably one lexical unit per line).
# Counted per line:
#   w   lines starting with "^"            (all tokens)
#   u   lines containing "/*"              (unknown tokens)
#   Uu  lines with an uppercase character after "/*"
#   Uw  lines containing any uppercase character
# Prints one tab-separated row:
#   unknown, known, total, coverage %, NOTE, "ex-upper:", coverage % with the
#   uppercase lines removed from both the unknown and the total count.
# A percentage whose denominator is zero (e.g. empty input) is printed as
# "n/a" instead of aborting awk with a division-by-zero error.
sum () {
  awk -v note="$1" '
    # Percentage of part in whole, or "n/a" when whole is zero.
    function pct(part, whole) {
      return whole ? 100 * part / whole : "n/a"
    }
    BEGIN { OFS = FS = "\t" }
    /^\^/ { w++ }
    /\/\*/ { u++ }
    /\/\*.*[[:upper:]]/ { Uu++ }
    /[[:upper:]]/ { Uw++ }
    END {
      # Force numeric context so counters that never fired print as 0
      # rather than as an empty field.
      w += 0
      u += 0
      lw = w - Uw
      lu = u - Uu
      print u, w - u, w, pct(w - u, w), note, "ex-upper:", pct(lw - lu, lw)
    }'
}

for ana in /l/a/*/apertium-{swe-nor,swe-dan,dan-nor,nno-nob}/*.automorf.bin; do
  # An unmatched glob is left as the literal pattern; skip such entries.
  [[ -e "${ana}" ]] || continue
  f="$(basename "${ana}")"
  echo "${f%%.automorf.bin}"
  # The corpus is chosen by the part of the file name before the first "-".
  src="${f%%-*}"
  printf "unk\tknown\ttot\tcov %%\n"
  # Both runs are started in the background and joined by the wait below.
  xzcat ~/corpora/"${src}".wikicov.xz | lt-proc "${ana}" \
    | apertium-cleanstream -n \
    | sum " no decomp" &

  xzcat ~/corpora/"${src}".wikicov.xz | lt-proc -e "${ana}" \
    | apertium-cleanstream -n \
    | sum "with decomp" &

  wait
  echo
done
Testvoc[edit]
Results[edit]
1 ../apertium-dan-nor/nobackup/69419-dan-nno
30 ../apertium-dan-nor/nobackup/69419-dan-nob
37 ../apertium-dan-nor/nobackup/69419-nno-dan
26 ../apertium-dan-nor/nobackup/69419-nob-dan
94 total

136 ../apertium-nno-nob/nobackup/69419-nno-nob
71 ../apertium-nno-nob/nobackup/69419-nob-nno
207 total

157 ../apertium-swe-dan/nobackup/69419-dan-swe
276 ../apertium-swe-dan/nobackup/69419-swe-dan
433 total

0 ../apertium-swe-nor/nobackup/69419-nno-swe
0 ../apertium-swe-nor/nobackup/69419-nob-swe
0 ../apertium-swe-nor/nobackup/69419-swe-nno
0 ../apertium-swe-nor/nobackup/69419-swe-nob.tmp
0 total
script[edit]
# Testvoc loop, meant to be entered at an interactive shell prompt.
# Every 30 seconds, for each language-pair checkout under trunk/: update and
# rebuild it, then run the generation testvoc for every modes/???-???.mode
# that has no result file yet for the current svn revision.
while sleep 30; do
  for d in trunk/apertium-{swe-nor,dan-nor,nno-nob,swe-dan}; do
    (
      # Subshell, so the cd does not leak into the next iteration.
      # NOTE(review): `up` is not defined in this snippet -- presumably a
      # local alias or function that updates the checkout; confirm before
      # reusing this loop elsewhere.
      cd "$d" && up && make langs &&
        rev=$(svn info | grep ^Revision: | grep -o '[0-9]*') &&
        for m in modes/???-???.mode; do
          p=${m##modes/} && p=${p%%.mode} && echo "$p" &&
            (
              test -f "nobackup/${rev}-${p}" ||
                (
                  # Write to a .tmp name and rename afterwards, so the final
                  # nobackup/REV-PAIR name only appears once generation.sh
                  # has ended; a run that is still in progress shows up as
                  # REV-PAIR.tmp.
                  dev/testvoc/generation.sh --hfst "$p" > "nobackup/${rev}-${p}.tmp"
                  mv "nobackup/${rev}-${p}.tmp" "nobackup/${rev}-${p}"
                )
            )
        done
    )
  done
done