Talk:Scandinavian MT project

From Apertium
Revision as of 08:50, 29 April 2016 by Unhammer (talk | contribs)
Jump to navigation Jump to search

Coverage on Wikipedia dumps. "no decomp" means decompounding is turned off, i.e. lt-proc is run without the -e switch; "ex-upper" excludes every token containing an uppercase character from all counts. The script that produced these numbers is at the bottom of the page.

$ ./scandicov.sh
nno-swe
unk     known   tot     cov %
1106753 4819429 5926182 81.3244   no decomp     ex-upper:       85.525
998089  4928093 5926182 83.158  with decomp     ex-upper:       89.4046

nob-swe
unk     known   tot     cov %
958699  4962110 5920809 83.808    no decomp     ex-upper:       87.9224
840577  5080232 5920809 85.803  with decomp     ex-upper:       91.7045

swe-nno
unk     known   tot     cov %
1221786 4353355 5575141 78.0851   no decomp     ex-upper:       82.4417
1029606 4545535 5575141 81.5322 with decomp     ex-upper:       89.4172

swe-nob
unk     known   tot     cov %
1213665 4361421 5575086 78.2306   no decomp     ex-upper:       82.5387
1016052 4559034 5575086 81.7751 with decomp     ex-upper:       89.6631

dan-swe
unk     known   tot     cov %
1301693 4600248 5901941 77.9447   no decomp     ex-upper:       82.9197
1168922 4733019 5901941 80.1943 with decomp     ex-upper:       87.5534

swe-dan
unk     known   tot     cov %
1306501 4570368 5876869 77.7688   no decomp     ex-upper:       81.9783
1106278 4770591 5876869 81.1757 with decomp     ex-upper:       88.8576

dan-nno
unk     known   tot     cov %
869064  5033026 5902090 85.2753   no decomp     ex-upper:       87.573
750810  5151280 5902090 87.2789 with decomp     ex-upper:       91.5197

dan-nob
unk     known   tot     cov %
831676  5065778 5897454 85.8977   no decomp     ex-upper:       88.2809
702310  5195144 5897454 88.0913 with decomp     ex-upper:       92.3478

nno-dan
unk     known   tot     cov %
707963  5304732 6012695 88.2255   no decomp     ex-upper:       90.2499
614090  5398605 6012695 89.7868 with decomp     ex-upper:       93.4234

nob-dan
unk     known   tot     cov %
594818  5397037 5991855 90.0729   no decomp     ex-upper:       91.7326
492207  5499648 5991855 91.7854 with decomp     ex-upper:       94.8789

nno-nob
unk     known   tot     cov %
706322  5367265 6073587 88.3706   no decomp     ex-upper:       91.2203
614658  5458929 6073587 89.8798 with decomp     ex-upper:       94.2579

nob-nno
unk     known   tot     cov %
604062  5459945 6064007 90.0386   no decomp     ex-upper:       92.5996
503789  5560218 6064007 91.6921 with decomp     ex-upper:       95.5471
$ cat scandicov.sh
#!/bin/bash

# Kill the whole process group when this script exits (for any reason),
# so that child pipelines that are still running are terminated too.
# NOTE(review): the target is the entire process group of this shell --
# presumably the script is started directly from an interactive shell and
# therefore has a group of its own; confirm before calling it from
# another script, whose processes would share the group.
trap "kill -- -0" EXIT

# Summarise analyser coverage for a stream of lexical units.
#
# Input (stdin): one lexical unit per line. A line starting with "^" is
#   counted as a token; a line containing "/*" is counted as unknown.
# Arguments:
#   $1 - free-text label that is copied verbatim into the output row.
# Output (stdout): one tab-separated row
#   unknown  known  total  coverage%  <label>  ex-upper:  coverage%
#   where the last percentage leaves out every line that contains an
#   uppercase character. A percentage whose denominator is zero (for
#   example on empty input) is printed as "n/a" instead of aborting awk
#   with a division-by-zero error.
# NOTE(review): [[:upper:]] follows the awk implementation and locale in
#   use -- presumably a UTF-8 locale is expected for non-ASCII capitals.
sum () {
    awk -v note="$1" '
BEGIN { OFS = FS = "\t" }
/^\^/               { w++ }     # token
/\/\*/              { u++ }     # unknown token
/\/\*.*[[:upper:]]/ { Uu++ }    # unknown, uppercase after the "/*" mark
/[[:upper:]]/       { Uw++ }    # any line with an uppercase character
END {
    lw = w - Uw                 # tokens without uppercase
    lu = u - Uu                 # unknown tokens without uppercase
    cov  = (w  != 0) ? 100 * (w - u) / w    : "n/a"
    lcov = (lw != 0) ? 100 * (lw - lu) / lw : "n/a"
    # "+ 0" forces counters that were never incremented to print as 0
    # rather than as an empty string.
    print u + 0, w - u, w + 0, cov, note, "ex-upper:", lcov
}'
}

# For every analyser binary found in the language-pair directories, print
# the direction name, a column header, and two coverage rows for the
# source language's Wikipedia corpus: first without and then with
# decompounding (the -e switch to lt-proc).
for ana in /l/a/*/apertium-{swe-nor,swe-dan,dan-nor,nno-nob}/*.automorf.bin; do
    # Bash leaves an unmatched glob as the literal pattern; skip it rather
    # than handing a non-existent file to lt-proc.
    [[ -e "${ana}" ]] || continue

    f="$(basename "${ana}")"
    # Source language is the part of the file name before the first "-".
    src="${f%%-*}"
    corpus=~/corpora/"${src}.wikicov.xz"
    if [[ ! -r "${corpus}" ]]; then
        printf 'skipping %s: cannot read %s\n' "${f}" "${corpus}" >&2
        continue
    fi

    echo "${f%%.automorf.bin}"
    printf "unk\tknown\ttot\tcov %%\n"

    # Both runs still execute in parallel: the decompounding run lives in
    # a process substitution attached to fd 3, and its single output row
    # is only read once the plain run has printed its own row. This keeps
    # the row order fixed regardless of which run finishes first.
    {
        xzcat "${corpus}" | lt-proc "${ana}" \
            | apertium-cleanstream -n \
            | sum "  no decomp"
        # Blocks until the decompounding run has finished and closed fd 3.
        cat <&3
    } 3< <(xzcat "${corpus}" | lt-proc -e "${ana}" \
            | apertium-cleanstream -n \
            | sum "with decomp")

    echo
done