Difference between revisions of "Talk:Scandinavian MT project"
Jump to navigation
Jump to search
(complete rerun) |
|||
Line 1: | Line 1: | ||
Coverage on Wikipedia dumps (" |
Coverage on Wikipedia dumps ("no decomp" is with decompounding turned off, ie. without the -e switch to lt-proc; ex-upper is excluding anything with uppercase characters from all counts; script at bottom of page). |
||
E.g. |
|||
<pre> |
<pre> |
||
$ ./scandicov.sh |
|||
bzcat ~/corpora/nnclean2.txt.bz2 \ |
|||
nno-swe |
|||
|tr ' ' '\n' \ |
|||
unk known tot cov % |
|||
|grep -m5113060 . \ |
|||
1116709 4809475 5926184 81.1564 no decomp ex-upper: 85.524 |
|||
|apertium-deshtml \ |
|||
1008045 4918139 5926184 82.99 with decomp ex-upper: 89.4046 |
|||
|lt-proc nno-dan.automorf.bin \ |
|||
|apertium-cleanstream -n \ |
|||
nob-swe |
|||
|awk 'BEGIN{OFS=FS="\t"} /^\^/{lu++} /\/\*/{u++} END{print "unk","known","tot","cov %";print u,lu-u,lu,100*(lu-u)/lu}' |
|||
unk known tot cov % |
|||
971001 4949813 5920814 83.6002 no decomp ex-upper: 87.9034 |
|||
852860 5067954 5920814 85.5956 with decomp ex-upper: 91.7049 |
|||
swe-nno |
|||
unk known tot cov % |
|||
1256170 4319483 5575653 77.4704 no decomp ex-upper: 81.5153 |
|||
1065995 4509658 5575653 80.8813 with decomp ex-upper: 88.6429 |
|||
swe-nob |
|||
unk known tot cov % |
|||
1249097 4326501 5575598 77.5971 no decomp ex-upper: 81.5869 |
|||
1052948 4522650 5575598 81.1151 with decomp ex-upper: 88.8848 |
|||
dan-swe |
|||
unk known tot cov % |
|||
1301693 4600248 5901941 77.9447 no decomp ex-upper: 82.9197 |
|||
1168922 4733019 5901941 80.1943 with decomp ex-upper: 87.5534 |
|||
swe-dan |
|||
unk known tot cov % |
|||
1306501 4570368 5876869 77.7688 no decomp ex-upper: 81.9783 |
|||
1106278 4770591 5876869 81.1757 with decomp ex-upper: 88.8576 |
|||
dan-nno |
|||
unk known tot cov % |
|||
869064 5033026 5902090 85.2753 no decomp ex-upper: 87.573 |
|||
750810 5151280 5902090 87.2789 with decomp ex-upper: 91.5197 |
|||
dan-nob |
|||
unk known tot cov % |
|||
831676 5065778 5897454 85.8977 no decomp ex-upper: 88.2809 |
|||
702310 5195144 5897454 88.0913 with decomp ex-upper: 92.3478 |
|||
nno-dan |
|||
unk known tot cov % |
|||
707963 5304732 6012695 88.2255 no decomp ex-upper: 90.2499 |
|||
614090 5398605 6012695 89.7868 with decomp ex-upper: 93.4234 |
|||
nob-dan |
|||
unk known tot cov % |
|||
594818 5397037 5991855 90.0729 no decomp ex-upper: 91.7326 |
|||
492207 5499648 5991855 91.7854 with decomp ex-upper: 94.8789 |
|||
nno-nob |
|||
unk known tot cov % |
|||
706322 5367265 6073587 88.3706 no decomp ex-upper: 91.2203 |
|||
614658 5458929 6073587 89.8798 with decomp ex-upper: 94.2579 |
|||
nob-nno |
|||
unk known tot cov % |
|||
604062 5459945 6064007 90.0386 no decomp ex-upper: 92.5996 |
|||
503789 5560218 6064007 91.6921 with decomp ex-upper: 95.5471 |
|||
</pre> |
</pre> |
||
<pre> |
|||
{|class=wikitable |
|||
$ cat scandicov.sh |
|||
! Direction !! w/o cmp !! regular |
|||
#!/bin/bash |
|||
|- |
|||
| nob-nno || 90.9% || 92.6% |
|||
# kill process group: |
|||
|- |
|||
trap "kill -- -0" EXIT |
|||
| nob-dan || 89.8% || 91.5% |
|||
|- |
|||
sum () { |
|||
| nno-nob || 89.2% || 90.6% |
|||
awk -v note="$1" ' |
|||
|- |
|||
BEGIN{OFS=FS="\t"} |
|||
| nno-dan || 87.4% || 88.8% |
|||
/^\^/{w++} |
|||
|- |
|||
/\/\*/{u++} |
|||
| dan-nob || 85.1% || 86.4% |
|||
/\/\*.*[[:upper:]]/{Uu++} |
|||
|- |
|||
/[[:upper:]]/{Uw++} |
|||
| swe-dan || 80.4% || 83.7% |
|||
END{ |
|||
|- |
|||
lw=w-Uw |
|||
| dan-nno || 82.5% || 83.5% |
|||
lu=u-Uu |
|||
|- |
|||
print u,w-u,w,100*(w-u)/w, note,"ex-upper:",100*(lw-lu)/lw}' |
|||
| dan-swe || 80.6% || 82.9% |
|||
} |
|||
|- |
|||
| nob-swe || 74.9% || 76.2% |
|||
for ana in /l/a/*/apertium-{swe-nor,swe-dan,dan-nor,nno-nob}/*.automorf.bin; do |
|||
|- |
|||
f="$(basename "${ana}")" |
|||
| nno-swe || 73.5% || 74.6% |
|||
echo "${f%%.automorf.bin}" |
|||
|- |
|||
src="${f%%-*}" |
|||
| swe-nob || 69.2% || 72.1% |
|||
printf "unk\tknown\ttot\tcov %%\n" |
|||
|- |
|||
xzcat ~/corpora/"${src}".wikicov.xz | lt-proc "${ana}" \ |
|||
| swe-nno || 69.1% || 71.9% |
|||
| apertium-cleanstream -n \ |
|||
|} |
|||
| sum " no decomp" & |
|||
xzcat ~/corpora/"${src}".wikicov.xz | lt-proc -e "${ana}" \ |
|||
| apertium-cleanstream -n \ |
|||
| sum "with decomp" & |
|||
wait |
|||
echo |
|||
done |
|||
</pre> |
Revision as of 09:03, 28 April 2016
Coverage on Wikipedia dumps ("no decomp" means decompounding is turned off, i.e. without the -e switch to lt-proc; "ex-upper" excludes anything with uppercase characters from all counts; the script is at the bottom of the page).
$ ./scandicov.sh
nno-swe
unk known tot cov %
1116709 4809475 5926184 81.1564 no decomp ex-upper: 85.524
1008045 4918139 5926184 82.99 with decomp ex-upper: 89.4046
nob-swe
unk known tot cov %
971001 4949813 5920814 83.6002 no decomp ex-upper: 87.9034
852860 5067954 5920814 85.5956 with decomp ex-upper: 91.7049
swe-nno
unk known tot cov %
1256170 4319483 5575653 77.4704 no decomp ex-upper: 81.5153
1065995 4509658 5575653 80.8813 with decomp ex-upper: 88.6429
swe-nob
unk known tot cov %
1249097 4326501 5575598 77.5971 no decomp ex-upper: 81.5869
1052948 4522650 5575598 81.1151 with decomp ex-upper: 88.8848
dan-swe
unk known tot cov %
1301693 4600248 5901941 77.9447 no decomp ex-upper: 82.9197
1168922 4733019 5901941 80.1943 with decomp ex-upper: 87.5534
swe-dan
unk known tot cov %
1306501 4570368 5876869 77.7688 no decomp ex-upper: 81.9783
1106278 4770591 5876869 81.1757 with decomp ex-upper: 88.8576
dan-nno
unk known tot cov %
869064 5033026 5902090 85.2753 no decomp ex-upper: 87.573
750810 5151280 5902090 87.2789 with decomp ex-upper: 91.5197
dan-nob
unk known tot cov %
831676 5065778 5897454 85.8977 no decomp ex-upper: 88.2809
702310 5195144 5897454 88.0913 with decomp ex-upper: 92.3478
nno-dan
unk known tot cov %
707963 5304732 6012695 88.2255 no decomp ex-upper: 90.2499
614090 5398605 6012695 89.7868 with decomp ex-upper: 93.4234
nob-dan
unk known tot cov %
594818 5397037 5991855 90.0729 no decomp ex-upper: 91.7326
492207 5499648 5991855 91.7854 with decomp ex-upper: 94.8789
nno-nob
unk known tot cov %
706322 5367265 6073587 88.3706 no decomp ex-upper: 91.2203
614658 5458929 6073587 89.8798 with decomp ex-upper: 94.2579
nob-nno
unk known tot cov %
604062 5459945 6064007 90.0386 no decomp ex-upper: 92.5996
503789 5560218 6064007 91.6921 with decomp ex-upper: 95.5471
$ cat scandicov.sh
#!/bin/bash
# kill process group:
trap "kill -- -0" EXIT
sum () {
    awk -v note="$1" '
        BEGIN{OFS=FS="\t"}
        /^\^/{w++}
        /\/\*/{u++}
        /\/\*.*[[:upper:]]/{Uu++}
        /[[:upper:]]/{Uw++}
        END{
            lw=w-Uw
            lu=u-Uu
            print u,w-u,w,100*(w-u)/w, note,"ex-upper:",100*(lw-lu)/lw}'
}
for ana in /l/a/*/apertium-{swe-nor,swe-dan,dan-nor,nno-nob}/*.automorf.bin; do
    f="$(basename "${ana}")"
    echo "${f%%.automorf.bin}"
    src="${f%%-*}"
    printf "unk\tknown\ttot\tcov %%\n"
    xzcat ~/corpora/"${src}".wikicov.xz | lt-proc "${ana}" \
        | apertium-cleanstream -n \
        | sum " no decomp" &
    xzcat ~/corpora/"${src}".wikicov.xz | lt-proc -e "${ana}" \
        | apertium-cleanstream -n \
        | sum "with decomp" &
    wait
    echo
done