Difference between revisions of "Talk:Scandinavian MT project"
Jump to navigation
Jump to search
(cov) |
|||
Line 5: | Line 5: | ||
nno-swe |
nno-swe |
||
unk known tot cov % |
unk known tot cov % |
||
876166 5251500 6127666 85.7015 no decomp ex-upper: 87.6831 |
|||
773219 5354447 6127666 87.3815 with decomp ex-upper: 91.0789 |
|||
nob-swe |
nob-swe |
||
unk known tot cov % |
unk known tot cov % |
||
744696 5369943 6114639 87.8211 no decomp ex-upper: 89.4771 |
|||
632117 5482522 6114639 89.6622 with decomp ex-upper: 92.8601 |
|||
swe-nno |
swe-nno |
||
unk known tot cov % |
unk known tot cov % |
||
992577 5282677 6275254 84.1827 no decomp ex-upper: 87.128 |
|||
813461 5461794 6275255 87.037 with decomp ex-upper: 92.1041 |
|||
swe-nob |
swe-nob |
||
unk known tot cov % |
unk known tot cov % |
||
979754 5295424 6275178 84.3868 no decomp ex-upper: 87.3452 |
|||
795866 5479313 6275179 87.3172 with decomp ex-upper: 92.4016 |
|||
dan-swe |
dan-swe |
||
unk known tot cov % |
unk known tot cov % |
||
⚫ | |||
1168922 4733019 5901941 80.1943 with decomp ex-upper: 87.5534 |
1168922 4733019 5901941 80.1943 with decomp ex-upper: 87.5534 |
||
⚫ | |||
swe-dan |
swe-dan |
||
unk known tot cov % |
unk known tot cov % |
||
1337817 4539056 5876873 77.2359 no decomp ex-upper: 81.2688 |
|||
1136015 4740858 5876873 80.6697 with decomp ex-upper: 88.3335 |
|||
dan-nno |
dan-nno |
||
Line 45: | Line 45: | ||
nno-dan |
nno-dan |
||
unk known tot cov % |
unk known tot cov % |
||
708015 5304680 6012695 88.2247 no decomp ex-upper: 90.2486 |
|||
614142 5398553 6012695 89.7859 with decomp ex-upper: 93.4224 |
|||
nob-dan |
nob-dan |
||
unk known tot cov % |
unk known tot cov % |
||
594853 5397002 5991855 90.0723 no decomp ex-upper: 91.7318 |
|||
492242 5499613 5991855 91.7848 with decomp ex-upper: 94.8782 |
|||
nno-nob |
nno-nob |
||
unk known tot cov % |
unk known tot cov % |
||
706329 5367221 6073550 88.3704 no decomp ex-upper: 91.2201 |
|||
614665 5458885 6073550 89.8796 with decomp ex-upper: 94.2577 |
|||
nob-nno |
nob-nno |
||
unk known tot cov % |
unk known tot cov % |
||
603978 5459984 6063962 90.0399 no decomp ex-upper: 92.6016 |
|||
503705 5560257 6063962 91.6935 with decomp ex-upper: 95.5487 |
|||
</pre> |
</pre> |
||
Revision as of 23:48, 13 May 2016
Coverage on Wikipedia dumps ("no decomp" means decompounding is turned off, i.e. lt-proc is run without the -e switch; "ex-upper" excludes anything containing uppercase characters from all counts; the script is at the bottom of the page).
$ ./scandicov.sh nno-swe unk known tot cov % 876166 5251500 6127666 85.7015 no decomp ex-upper: 87.6831 773219 5354447 6127666 87.3815 with decomp ex-upper: 91.0789 nob-swe unk known tot cov % 744696 5369943 6114639 87.8211 no decomp ex-upper: 89.4771 632117 5482522 6114639 89.6622 with decomp ex-upper: 92.8601 swe-nno unk known tot cov % 992577 5282677 6275254 84.1827 no decomp ex-upper: 87.128 813461 5461794 6275255 87.037 with decomp ex-upper: 92.1041 swe-nob unk known tot cov % 979754 5295424 6275178 84.3868 no decomp ex-upper: 87.3452 795866 5479313 6275179 87.3172 with decomp ex-upper: 92.4016 dan-swe unk known tot cov % 1168922 4733019 5901941 80.1943 with decomp ex-upper: 87.5534 1301693 4600248 5901941 77.9447 no decomp ex-upper: 82.9197 swe-dan unk known tot cov % 1337817 4539056 5876873 77.2359 no decomp ex-upper: 81.2688 1136015 4740858 5876873 80.6697 with decomp ex-upper: 88.3335 dan-nno unk known tot cov % 869064 5033026 5902090 85.2753 no decomp ex-upper: 87.573 750810 5151280 5902090 87.2789 with decomp ex-upper: 91.5197 dan-nob unk known tot cov % 831676 5065778 5897454 85.8977 no decomp ex-upper: 88.2809 702310 5195144 5897454 88.0913 with decomp ex-upper: 92.3478 nno-dan unk known tot cov % 708015 5304680 6012695 88.2247 no decomp ex-upper: 90.2486 614142 5398553 6012695 89.7859 with decomp ex-upper: 93.4224 nob-dan unk known tot cov % 594853 5397002 5991855 90.0723 no decomp ex-upper: 91.7318 492242 5499613 5991855 91.7848 with decomp ex-upper: 94.8782 nno-nob unk known tot cov % 706329 5367221 6073550 88.3704 no decomp ex-upper: 91.2201 614665 5458885 6073550 89.8796 with decomp ex-upper: 94.2577 nob-nno unk known tot cov % 603978 5459984 6063962 90.0399 no decomp ex-upper: 92.6016 503705 5560257 6063962 91.6935 with decomp ex-upper: 95.5487
$ cat scandicov.sh
#!/bin/bash
#
# scandicov.sh: print coverage tables for the Scandinavian language pairs.
#
# For every *.automorf.bin analyser found under
# /l/a/*/apertium-{swe-nor,swe-dan,dan-nor,nno-nob}/ the source-language
# corpus ~/corpora/<src>.wikicov.xz is analysed twice, once with plain
# lt-proc ("no decomp") and once with lt-proc -e ("with decomp"), and one
# summary row per run is printed below a tab-separated header.
#
# Requires: xzcat, lt-proc, apertium-cleanstream, awk, mktemp.

set -u
# Unmatched analyser globs expand to nothing instead of staying literal.
shopt -s nullglob

# Scratch directory for the per-run summary rows; created in main.
scandicov_tmpdir=

#######################################
# EXIT handler: stop background pipelines that are still running (e.g.
# after Ctrl-C) and remove the scratch directory. Only this script's own
# jobs are signalled, never the whole process group, so a caller that
# shares the group is left alone.
#######################################
cleanup () {
    local pids
    pids=$(jobs -p)
    if [[ -n "${pids}" ]]; then
        # Intentionally unquoted: one PID per word.
        # shellcheck disable=SC2086
        kill ${pids} 2>/dev/null
    fi
    if [[ -n "${scandicov_tmpdir}" ]]; then
        rm -rf -- "${scandicov_tmpdir}"
    fi
}

#######################################
# Summarise an analysis stream read from stdin as one tab-separated row:
#   unknown, known, total, coverage %, NOTE, "ex-upper:", coverage %
# where the last figure leaves out every line containing an uppercase
# character. Lines starting with "^" count as words; lines containing
# "/*" count as unknown (assumes one analysed word per line, which
# apertium-cleanstream -n presumably produces; verify against that tool).
# A percentage whose denominator is zero is printed as "n/a".
# Arguments: $1 - free-text note copied into the row
# Outputs:   the summary row on stdout
#######################################
sum () {
    awk -v note="$1" '
    BEGIN { OFS = FS = "\t" }

    /^\^/               { w++ }     # every word
    /\/\*/              { u++ }     # unknown words
    /\/\*.*[[:upper:]]/ { Uu++ }    # unknown, uppercase after the "/*"
    /[[:upper:]]/       { Uw++ }    # any line with an uppercase character

    END {
        lw = w - Uw
        lu = u - Uu
        # "+ 0" makes a counter that never fired print as 0, not "".
        print u + 0, w - u, w + 0, pct(w - u, w),
              note, "ex-upper:", pct(lw - lu, lw)
    }

    # Percentage of part in whole; "n/a" when whole is zero.
    function pct(part, whole) {
        return whole ? 100 * part / whole : "n/a"
    }'
}

#######################################
# Run both coverage measurements for every analyser and print the tables.
# The two pipelines of a pair run in parallel, but their rows are written
# to scratch files and printed after both finish, so "no decomp" always
# comes before "with decomp".
# Returns: non-zero if no analyser was found or the scratch directory
#          could not be created.
#######################################
main () {
    local ana pair src corpus
    local -a analysers=(
        /l/a/*/apertium-{swe-nor,swe-dan,dan-nor,nno-nob}/*.automorf.bin
    )

    if (( ${#analysers[@]} == 0 )); then
        printf '%s\n' "scandicov.sh: no *.automorf.bin analysers found under /l/a" >&2
        return 1
    fi

    scandicov_tmpdir=$(mktemp -d) || return
    trap cleanup EXIT

    for ana in "${analysers[@]}"; do
        pair=${ana##*/}
        pair=${pair%%.automorf.bin}
        src=${pair%%-*}
        corpus=~/corpora/"${src}".wikicov.xz

        echo "${pair}"
        if [[ ! -r "${corpus}" ]]; then
            printf '%s\n' "scandicov.sh: cannot read ${corpus}, skipping ${pair}" >&2
            echo
            continue
        fi
        printf "unk\tknown\ttot\tcov %%\n"

        xzcat "${corpus}" | lt-proc "${ana}" \
            | apertium-cleanstream -n \
            | sum " no decomp" > "${scandicov_tmpdir}/nodecomp" &
        xzcat "${corpus}" | lt-proc -e "${ana}" \
            | apertium-cleanstream -n \
            | sum "with decomp" > "${scandicov_tmpdir}/decomp" &
        wait

        cat "${scandicov_tmpdir}/nodecomp" "${scandicov_tmpdir}/decomp"
        echo
    done
}

main "$@"