Difference between revisions of "Talk:Scandinavian MT project"

From Apertium
Jump to navigation Jump to search
(complete rerun)
Line 1: Line 1:
Coverage on Wikipedia dumps ("w/o cmp" is with decompounding turned off, ie. without the -e switch to lt-proc).
+
Coverage on Wikipedia dumps. "no decomp" means decompounding is turned off, i.e. lt-proc is run without the -e switch; "ex-upper" means that anything containing uppercase characters is excluded from all counts. The script is at the bottom of the page.
   
E.g.
 
 
<pre>
 
<pre>
  +
$ ./scandicov.sh
bzcat ~/corpora/nnclean2.txt.bz2 \
 
  +
nno-swe
|tr ' ' '\n' \
 
  +
unk known tot cov %
|grep -m5113060 . \
 
  +
1116709 4809475 5926184 81.1564 no decomp ex-upper: 85.524
|apertium-deshtml \
 
  +
1008045 4918139 5926184 82.99 with decomp ex-upper: 89.4046
|lt-proc nno-dan.automorf.bin \
 
  +
|apertium-cleanstream -n \
 
  +
nob-swe
|awk 'BEGIN{OFS=FS="\t"} /^\^/{lu++} /\/\*/{u++} END{print "unk","known","tot","cov %";print u,lu-u,lu,100*(lu-u)/lu}'
 
  +
unk known tot cov %
  +
971001 4949813 5920814 83.6002 no decomp ex-upper: 87.9034
  +
852860 5067954 5920814 85.5956 with decomp ex-upper: 91.7049
  +
  +
swe-nno
  +
unk known tot cov %
  +
1256170 4319483 5575653 77.4704 no decomp ex-upper: 81.5153
  +
1065995 4509658 5575653 80.8813 with decomp ex-upper: 88.6429
  +
  +
swe-nob
  +
unk known tot cov %
  +
1249097 4326501 5575598 77.5971 no decomp ex-upper: 81.5869
  +
1052948 4522650 5575598 81.1151 with decomp ex-upper: 88.8848
  +
  +
dan-swe
  +
unk known tot cov %
  +
1301693 4600248 5901941 77.9447 no decomp ex-upper: 82.9197
  +
1168922 4733019 5901941 80.1943 with decomp ex-upper: 87.5534
  +
  +
swe-dan
  +
unk known tot cov %
  +
1306501 4570368 5876869 77.7688 no decomp ex-upper: 81.9783
  +
1106278 4770591 5876869 81.1757 with decomp ex-upper: 88.8576
  +
  +
dan-nno
  +
unk known tot cov %
  +
869064 5033026 5902090 85.2753 no decomp ex-upper: 87.573
  +
750810 5151280 5902090 87.2789 with decomp ex-upper: 91.5197
  +
  +
dan-nob
  +
unk known tot cov %
  +
831676 5065778 5897454 85.8977 no decomp ex-upper: 88.2809
  +
702310 5195144 5897454 88.0913 with decomp ex-upper: 92.3478
  +
  +
nno-dan
  +
unk known tot cov %
  +
707963 5304732 6012695 88.2255 no decomp ex-upper: 90.2499
  +
614090 5398605 6012695 89.7868 with decomp ex-upper: 93.4234
  +
  +
nob-dan
  +
unk known tot cov %
  +
594818 5397037 5991855 90.0729 no decomp ex-upper: 91.7326
  +
492207 5499648 5991855 91.7854 with decomp ex-upper: 94.8789
  +
  +
nno-nob
  +
unk known tot cov %
  +
706322 5367265 6073587 88.3706 no decomp ex-upper: 91.2203
  +
614658 5458929 6073587 89.8798 with decomp ex-upper: 94.2579
  +
  +
nob-nno
  +
unk known tot cov %
  +
604062 5459945 6064007 90.0386 no decomp ex-upper: 92.5996
  +
503789 5560218 6064007 91.6921 with decomp ex-upper: 95.5471
 
</pre>
 
</pre>
   
  +
<pre>
{|class=wikitable
 
  +
$ cat scandicov.sh
! Direction !! w/o cmp !! regular
 
  +
#!/bin/bash
|-
 
  +
| nob-nno || 90.9% || 92.6%
 
  +
# kill process group:
|-
 
  +
trap "kill -- -0" EXIT
| nob-dan || 89.8% || 91.5%
 
  +
|-
 
  +
sum () {
| nno-nob || 89.2% || 90.6%
 
  +
awk -v note="$1" '
|-
 
  +
BEGIN{OFS=FS="\t"}
| nno-dan || 87.4% || 88.8%
 
  +
/^\^/{w++}
|-
 
  +
/\/\*/{u++}
| dan-nob || 85.1% || 86.4%
 
  +
/\/\*.*[[:upper:]]/{Uu++}
|-
 
  +
/[[:upper:]]/{Uw++}
| swe-dan || 80.4% || 83.7%
 
  +
END{
|-
 
  +
lw=w-Uw
| dan-nno || 82.5% || 83.5%
 
  +
lu=u-Uu
|-
 
  +
print u,w-u,w,100*(w-u)/w, note,"ex-upper:",100*(lw-lu)/lw}'
| dan-swe || 80.6% || 82.9%
 
  +
}
|-
 
  +
| nob-swe || 74.9% || 76.2%
 
  +
for ana in /l/a/*/apertium-{swe-nor,swe-dan,dan-nor,nno-nob}/*.automorf.bin; do
|-
 
  +
f="$(basename "${ana}")"
| nno-swe || 73.5% || 74.6%
 
  +
echo "${f%%.automorf.bin}"
|-
 
  +
src="${f%%-*}"
| swe-nob || 69.2% || 72.1%
 
  +
printf "unk\tknown\ttot\tcov %%\n"
|-
 
  +
xzcat ~/corpora/"${src}".wikicov.xz | lt-proc "${ana}" \
| swe-nno || 69.1% || 71.9%
 
  +
| apertium-cleanstream -n \
|}
 
  +
| sum " no decomp" &
  +
  +
xzcat ~/corpora/"${src}".wikicov.xz | lt-proc -e "${ana}" \
  +
| apertium-cleanstream -n \
  +
| sum "with decomp" &
  +
  +
wait
  +
echo
  +
done
  +
</pre>

Revision as of 09:03, 28 April 2016

Coverage on Wikipedia dumps. "no decomp" means decompounding is turned off, i.e. lt-proc is run without the -e switch; "ex-upper" means that anything containing uppercase characters is excluded from all counts. The script is at the bottom of the page.

$ ./scandicov.sh
nno-swe
unk     known   tot     cov %
1116709 4809475 5926184 81.1564   no decomp     ex-upper:       85.524
1008045 4918139 5926184 82.99   with decomp     ex-upper:       89.4046

nob-swe
unk     known   tot     cov %
971001  4949813 5920814 83.6002   no decomp     ex-upper:       87.9034
852860  5067954 5920814 85.5956 with decomp     ex-upper:       91.7049

swe-nno
unk     known   tot     cov %
1256170 4319483 5575653 77.4704   no decomp     ex-upper:       81.5153
1065995 4509658 5575653 80.8813 with decomp     ex-upper:       88.6429

swe-nob
unk     known   tot     cov %
1249097 4326501 5575598 77.5971   no decomp     ex-upper:       81.5869
1052948 4522650 5575598 81.1151 with decomp     ex-upper:       88.8848

dan-swe
unk     known   tot     cov %
1301693 4600248 5901941 77.9447   no decomp     ex-upper:       82.9197
1168922 4733019 5901941 80.1943 with decomp     ex-upper:       87.5534

swe-dan
unk     known   tot     cov %
1306501 4570368 5876869 77.7688   no decomp     ex-upper:       81.9783
1106278 4770591 5876869 81.1757 with decomp     ex-upper:       88.8576

dan-nno
unk     known   tot     cov %
869064  5033026 5902090 85.2753   no decomp     ex-upper:       87.573
750810  5151280 5902090 87.2789 with decomp     ex-upper:       91.5197

dan-nob
unk     known   tot     cov %
831676  5065778 5897454 85.8977   no decomp     ex-upper:       88.2809
702310  5195144 5897454 88.0913 with decomp     ex-upper:       92.3478

nno-dan
unk     known   tot     cov %
707963  5304732 6012695 88.2255   no decomp     ex-upper:       90.2499
614090  5398605 6012695 89.7868 with decomp     ex-upper:       93.4234

nob-dan
unk     known   tot     cov %
594818  5397037 5991855 90.0729   no decomp     ex-upper:       91.7326
492207  5499648 5991855 91.7854 with decomp     ex-upper:       94.8789

nno-nob
unk     known   tot     cov %
706322  5367265 6073587 88.3706   no decomp     ex-upper:       91.2203
614658  5458929 6073587 89.8798 with decomp     ex-upper:       94.2579

nob-nno
unk     known   tot     cov %
604062  5459945 6064007 90.0386   no decomp     ex-upper:       92.5996
503789  5560218 6064007 91.6921 with decomp     ex-upper:       95.5471
$ cat scandicov.sh
#!/bin/bash

# On exit, signal the whole process group so that the background
# pipelines started in the loop below cannot outlive this script.
# "kill -- -0" sends the default signal to process group 0, which is the
# group the calling shell belongs to.
# NOTE(review): that group includes this shell itself, and presumably
# also a non-interactive parent that shares the group -- confirm before
# calling this script from another script.
trap "kill -- -0" EXIT

#######################################
# Summarise analyser coverage from a stream holding one lexical unit
# per line (as produced by `apertium-cleanstream -n`).
#
# Counting rules (one awk pass over stdin):
#   w   lines that start with "^"              -> total words
#   u   lines that contain "/*"                -> unknown words
#   Uw  lines that contain an uppercase letter
#   Uu  lines with an uppercase letter somewhere after "/*"
#
# Arguments: $1 - free-text label copied verbatim into the output line
# Outputs:   one tab-separated line on stdout:
#              unk  known  tot  cov%  <label>  "ex-upper:"  cov%
#            where the last figure is the coverage after subtracting
#            the uppercase counts from both totals.
# Edge cases: a percentage whose denominator is zero (empty input, or
#            every line contains an uppercase letter) is reported as 0
#            instead of aborting awk with a division-by-zero error, and
#            counters that never fired are printed as 0, not as "".
#######################################
sum () {
    awk -v note="$1" '
        BEGIN { OFS = FS = "\t" }

        /^\^/                     { w++ }
        /\/\*/                    { u++ }
        /\/\*.*[[:upper:]]/       { Uu++ }
        /[[:upper:]]/             { Uw++ }

        END {
            lw = w - Uw
            lu = u - Uu
            # Guard both divisions: awk treats a zero divisor as fatal.
            cov  = w  ? 100 * (w - u) / w     : 0
            lcov = lw ? 100 * (lw - lu) / lw  : 0
            # "+ 0" forces numeric output for counters that were never set.
            print u + 0, w - u, w + 0, cov, note, "ex-upper:", lcov
        }'
}

# For every compiled analyser (*.automorf.bin) of the listed language
# pairs, print the pair name, a header row, and two coverage lines:
# one without and one with decompounding (lt-proc -e).
for ana in /l/a/*/apertium-{swe-nor,swe-dan,dan-nor,nno-nob}/*.automorf.bin; do
    # A pattern that matches nothing is left as the literal pattern
    # string; skip it instead of handing a bogus path to lt-proc.
    if [[ ! -e "${ana}" ]]; then
        printf 'skipping %s: no such file\n' "${ana}" >&2
        continue
    fi

    f="${ana##*/}"                        # file name without directory
    src="${f%%-*}"                        # source language = text before the first "-"
    corpus=~/corpora/"${src}".wikicov.xz  # corpus is selected by source language

    echo "${f%%.automorf.bin}"
    printf "unk\tknown\ttot\tcov %%\n"

    # Run the decompounding pass concurrently, but read its single result
    # line through fd 3 so that it is always printed after the
    # "no decomp" line, whichever pipeline finishes first.
    exec 3< <(xzcat "${corpus}" | lt-proc -e "${ana}" \
        | apertium-cleanstream -n \
        | sum "with decomp")

    xzcat "${corpus}" | lt-proc "${ana}" \
        | apertium-cleanstream -n \
        | sum "  no decomp"

    cat <&3     # blocks until the background pipeline has finished
    exec 3<&-
    echo
done