Difference between revisions of "Talk:Scandinavian MT project"

From Apertium
Jump to navigation Jump to search
(cov)
Line 5: Line 5:
nno-swe
nno-swe
unk known tot cov %
unk known tot cov %
1106753 4819429 5926182 81.3244 no decomp ex-upper: 85.525
876166 5251500 6127666 85.7015 no decomp ex-upper: 87.6831
998089 4928093 5926182 83.158 with decomp ex-upper: 89.4046
773219 5354447 6127666 87.3815 with decomp ex-upper: 91.0789


nob-swe
nob-swe
unk known tot cov %
unk known tot cov %
958699 4962110 5920809 83.808 no decomp ex-upper: 87.9224
744696 5369943 6114639 87.8211 no decomp ex-upper: 89.4771
840577 5080232 5920809 85.803 with decomp ex-upper: 91.7045
632117 5482522 6114639 89.6622 with decomp ex-upper: 92.8601


swe-nno
swe-nno
unk known tot cov %
unk known tot cov %
1221786 4353355 5575141 78.0851 no decomp ex-upper: 82.4417
992577 5282677 6275254 84.1827 no decomp ex-upper: 87.128
1029606 4545535 5575141 81.5322 with decomp ex-upper: 89.4172
813461 5461794 6275255 87.037 with decomp ex-upper: 92.1041


swe-nob
swe-nob
unk known tot cov %
unk known tot cov %
1213665 4361421 5575086 78.2306 no decomp ex-upper: 82.5387
979754 5295424 6275178 84.3868 no decomp ex-upper: 87.3452
1016052 4559034 5575086 81.7751 with decomp ex-upper: 89.6631
795866 5479313 6275179 87.3172 with decomp ex-upper: 92.4016


dan-swe
dan-swe
unk known tot cov %
unk known tot cov %
1301693 4600248 5901941 77.9447 no decomp ex-upper: 82.9197
1168922 4733019 5901941 80.1943 with decomp ex-upper: 87.5534
1168922 4733019 5901941 80.1943 with decomp ex-upper: 87.5534
1301693 4600248 5901941 77.9447 no decomp ex-upper: 82.9197


swe-dan
swe-dan
unk known tot cov %
unk known tot cov %
1306501 4570368 5876869 77.7688 no decomp ex-upper: 81.9783
1337817 4539056 5876873 77.2359 no decomp ex-upper: 81.2688
1106278 4770591 5876869 81.1757 with decomp ex-upper: 88.8576
1136015 4740858 5876873 80.6697 with decomp ex-upper: 88.3335


dan-nno
dan-nno
Line 45: Line 45:
nno-dan
nno-dan
unk known tot cov %
unk known tot cov %
707963 5304732 6012695 88.2255 no decomp ex-upper: 90.2499
708015 5304680 6012695 88.2247 no decomp ex-upper: 90.2486
614090 5398605 6012695 89.7868 with decomp ex-upper: 93.4234
614142 5398553 6012695 89.7859 with decomp ex-upper: 93.4224


nob-dan
nob-dan
unk known tot cov %
unk known tot cov %
594818 5397037 5991855 90.0729 no decomp ex-upper: 91.7326
594853 5397002 5991855 90.0723 no decomp ex-upper: 91.7318
492207 5499648 5991855 91.7854 with decomp ex-upper: 94.8789
492242 5499613 5991855 91.7848 with decomp ex-upper: 94.8782


nno-nob
nno-nob
unk known tot cov %
unk known tot cov %
706322 5367265 6073587 88.3706 no decomp ex-upper: 91.2203
706329 5367221 6073550 88.3704 no decomp ex-upper: 91.2201
614658 5458929 6073587 89.8798 with decomp ex-upper: 94.2579
614665 5458885 6073550 89.8796 with decomp ex-upper: 94.2577


nob-nno
nob-nno
unk known tot cov %
unk known tot cov %
604062 5459945 6064007 90.0386 no decomp ex-upper: 92.5996
603978 5459984 6063962 90.0399 no decomp ex-upper: 92.6016
503789 5560218 6064007 91.6921 with decomp ex-upper: 95.5471
503705 5560257 6063962 91.6935 with decomp ex-upper: 95.5487
</pre>
</pre>



Revision as of 23:48, 13 May 2016

Coverage on Wikipedia dumps ("no decomp" means decompounding is turned off, i.e. lt-proc is run without the -e switch; "ex-upper" excludes anything containing uppercase characters from all counts; the script is at the bottom of the page).

$ ./scandicov.sh
nno-swe
unk     known   tot     cov %
876166  5251500 6127666 85.7015   no decomp     ex-upper:       87.6831
773219  5354447 6127666 87.3815 with decomp     ex-upper:       91.0789

nob-swe
unk     known   tot     cov %
744696  5369943 6114639 87.8211   no decomp     ex-upper:       89.4771
632117  5482522 6114639 89.6622 with decomp     ex-upper:       92.8601

swe-nno
unk     known   tot     cov %
992577  5282677 6275254 84.1827   no decomp     ex-upper:       87.128
813461  5461794 6275255 87.037  with decomp     ex-upper:       92.1041

swe-nob
unk     known   tot     cov %
979754  5295424 6275178 84.3868   no decomp     ex-upper:       87.3452
795866  5479313 6275179 87.3172 with decomp     ex-upper:       92.4016

dan-swe
unk     known   tot     cov %
1168922 4733019 5901941 80.1943 with decomp     ex-upper:       87.5534
1301693 4600248 5901941 77.9447   no decomp     ex-upper:       82.9197

swe-dan
unk     known   tot     cov %
1337817 4539056 5876873 77.2359   no decomp     ex-upper:       81.2688
1136015 4740858 5876873 80.6697 with decomp     ex-upper:       88.3335

dan-nno
unk     known   tot     cov %
869064  5033026 5902090 85.2753   no decomp     ex-upper:       87.573
750810  5151280 5902090 87.2789 with decomp     ex-upper:       91.5197

dan-nob
unk     known   tot     cov %
831676  5065778 5897454 85.8977   no decomp     ex-upper:       88.2809
702310  5195144 5897454 88.0913 with decomp     ex-upper:       92.3478

nno-dan
unk     known   tot     cov %
708015  5304680 6012695 88.2247   no decomp     ex-upper:       90.2486
614142  5398553 6012695 89.7859 with decomp     ex-upper:       93.4224

nob-dan
unk     known   tot     cov %
594853  5397002 5991855 90.0723   no decomp     ex-upper:       91.7318
492242  5499613 5991855 91.7848 with decomp     ex-upper:       94.8782

nno-nob
unk     known   tot     cov %
706329  5367221 6073550 88.3704   no decomp     ex-upper:       91.2201
614665  5458885 6073550 89.8796 with decomp     ex-upper:       94.2577

nob-nno
unk     known   tot     cov %
603978  5459984 6063962 90.0399   no decomp     ex-upper:       92.6016
503705  5560257 6063962 91.6935 with decomp     ex-upper:       95.5487
$ cat scandicov.sh
#!/bin/bash

# When the script exits, send the default signal (SIGTERM) to the whole
# process group: "--" ends option parsing and the pid "-0" addresses the
# caller's own group. This stops the backgrounded pipelines started in the
# loop below from lingering after the script is gone.
# NOTE(review): the signal also reaches this script itself and anything else
# sharing its process group (e.g. a wrapper script that launched it) -
# confirm that is acceptable for how this is invoked.
trap "kill -- -0" EXIT

# Summarise a stream of analyses read from stdin as ONE tab-separated line:
#
#   unknown  known  total  coverage%  <note>  ex-upper:  coverage%
#
# Counting rules (one input line per analysed word is expected):
#   - a line starting with "^" counts towards the total,
#   - a line containing "/*" counts as unknown,
#   - the final "ex-upper" percentage is the same coverage computed after
#     removing every line that contains an uppercase character.
#
# Arguments: $1 - free-text label copied verbatim into the fifth column.
# Outputs:   the summary line on stdout.
# Returns:   the exit status of awk.
sum () {
    awk -v note="$1" '
BEGIN {
    OFS = FS = "\t"
    # Start every counter at 0: a counter that is never incremented would
    # otherwise print as an empty string and leave a blank column.
    w = u = Uw = Uu = 0
}
/^\^/               { w++ }
/\/\*/              { u++ }
/\/\*.*[[:upper:]]/ { Uu++ }
/[[:upper:]]/       { Uw++ }
END {
    lw = w - Uw
    lu = u - Uu
    # Guard both divisions: empty input (w == 0) or input where every line
    # has an uppercase character (lw == 0) would abort awk with a
    # division-by-zero error. Report 0 coverage in those cases instead.
    cov  = (w  != 0) ? 100 * (w - u) / w     : 0
    lcov = (lw != 0) ? 100 * (lw - lu) / lw  : 0
    print u, w - u, w, cov, note, "ex-upper:", lcov
}'
}

# For every compiled analyser (*.automorf.bin) of the listed language pairs,
# print the pair name, a column header, and two coverage lines produced by
# the sum function: one with decompounding off and one with lt-proc -e.
for ana in /l/a/*/apertium-{swe-nor,swe-dan,dan-nor,nno-nob}/*.automorf.bin; do
    # A glob that matches nothing is left as the literal pattern (each brace
    # alternative is globbed separately), so skip entries that do not exist
    # instead of running the tools on a bogus path.
    if [[ ! -e "${ana}" ]]; then
        printf 'skipping %s: no such analyser\n' "${ana}" >&2
        continue
    fi

    f="$(basename "${ana}")"
    echo "${f%%.automorf.bin}"
    # Source language = everything before the first "-" in the file name;
    # it selects which corpus under ~/corpora is analysed.
    src="${f%%-*}"
    printf "unk\tknown\ttot\tcov %%\n"

    # The two runs execute concurrently in the background, so their result
    # lines may be printed in either order.
    xzcat ~/corpora/"${src}".wikicov.xz | lt-proc "${ana}" \
        | apertium-cleanstream -n \
        | sum "  no decomp" &

    xzcat ~/corpora/"${src}".wikicov.xz | lt-proc -e "${ana}" \
        | apertium-cleanstream -n \
        | sum "with decomp" &

    # Barrier: finish both runs before printing the blank separator line.
    wait
    echo
done