Difference between revisions of "Talk:Scandinavian MT project"

From Apertium
Jump to navigation Jump to search
 
(14 intermediate revisions by the same user not shown)
Line 1: Line 1:
  +
==Coverage==
Coverage on Wikipedia dumps ("w/o cmp" is with decompounding turned off):
 
   
  +
Coverage on Wikipedia dumps ("no decomp" is with decompounding turned off, i.e. without the -e switch to lt-proc; "ex-upper" excludes anything containing uppercase characters from all counts; the script is given further down the page).
{|class=wikitable
 
  +
===Results===
! Direction !! w/o cmp !! regular
 
  +
<pre>
|-
 
  +
$ ./scandicov.sh
| nob-nno || 90.4% || 92.2%
 
  +
nno-swe
|-
 
  +
unk known tot cov %
| nno-nob || 89.2% || 90.9%
 
  +
805040 5307135 6112175 86.8289 no decomp ex-upper: 88.251
|-
 
  +
703448 5408727 6112175 88.491 with decomp ex-upper: 91.5592
| dan-nob || 82.7% ||
 
  +
|-
 
  +
nob-swe
| dan-nno || 81.8% ||
 
  +
unk known tot cov %
|-
 
  +
691359 5408402 6099761 88.6658 no decomp ex-upper: 90.0757
| nno-dan || || 89.0%
 
  +
579262 5520499 6099761 90.5035 with decomp ex-upper: 93.3714
|-
 
  +
| nob-dan || ||
 
  +
swe-nno
|-
 
  +
unk known tot cov %
| dan-swe || 76.2% ||
 
  +
914584 5167659 6082243 84.963 no decomp ex-upper: 87.082
|-
 
  +
744040 5338204 6082244 87.767 with decomp ex-upper: 92.1901
| swe-dan || 80.4% || 83.7%
 
  +
|-
 
| swe-nno || ||
+
swe-nob
  +
unk known tot cov %
|-
 
  +
907104 5175868 6082972 85.0878 no decomp ex-upper: 87.2664
| swe-nob || ||
 
  +
733559 5349414 6082973 87.9408 with decomp ex-upper: 92.4382
|-
 
  +
| nno-swe || ||
 
  +
dan-swe
|-
 
  +
unk known tot cov %
| nob-swe || ||
 
  +
1135679 4775050 5910729 80.7861 no decomp ex-upper: 86.2313
|}
 
  +
997339 4913390 5910729 83.1266 with decomp ex-upper: 90.5919
  +
  +
swe-dan
  +
unk known tot cov %
  +
1120955 4760872 5881827 80.9421 no decomp ex-upper: 86.4166
  +
926361 4955466 5881827 84.2505 with decomp ex-upper: 92.2963
  +
  +
dan-nno
  +
unk known tot cov %
  +
870318 5020221 5890539 85.2252 no decomp ex-upper: 87.512
  +
752064 5138475 5890539 87.2327 with decomp ex-upper: 91.4789
  +
  +
dan-nob
  +
unk known tot cov %
  +
832984 5052919 5885903 85.8478 no decomp ex-upper: 88.22
  +
703618 5182285 5885903 88.0457 with decomp ex-upper: 92.308
  +
  +
nno-dan
  +
unk known tot cov %
  +
707162 5295158 6002320 88.2185 no decomp ex-upper: 90.2369
  +
613734 5388586 6002320 89.7751 with decomp ex-upper: 93.4124
  +
  +
nob-dan
  +
unk known tot cov %
  +
594784 5387514 5982298 90.0576 no decomp ex-upper: 91.7191
  +
492014 5490284 5982298 91.7755 with decomp ex-upper: 94.8719
  +
  +
nno-nob
  +
unk known tot cov %
  +
704975 5364593 6069568 88.3851 no decomp ex-upper: 91.2337
  +
613334 5456234 6069568 89.8949 with decomp ex-upper: 94.2736
  +
  +
nob-nno
  +
unk known tot cov %
  +
603139 5456170 6059309 90.0461 no decomp ex-upper: 92.6105
  +
502464 5556845 6059309 91.7076 with decomp ex-upper: 95.5668
  +
</pre>
  +
  +
===script===
  +
<pre>
  +
$ cat scandicov.sh
  +
#!/bin/bash
  +
  +
# kill process group:
  +
trap "kill -- -0" EXIT
  +
  +
sum () {
  +
awk -v note="$1" '
  +
BEGIN{OFS=FS="\t"}
  +
/^\^/{w++}
  +
/\/\*/{u++}
  +
/\/\*.*[[:upper:]]/{Uu++}
  +
/[[:upper:]]/{Uw++}
  +
END{
  +
lw=w-Uw
  +
lu=u-Uu
  +
print u,w-u,w,100*(w-u)/w, note,"ex-upper:",100*(lw-lu)/lw}'
  +
}
  +
  +
for ana in /l/a/*/apertium-{swe-nor,swe-dan,dan-nor,nno-nob}/*.automorf.bin; do
  +
f="$(basename "${ana}")"
  +
echo "${f%%.automorf.bin}"
  +
src="${f%%-*}"
  +
printf "unk\tknown\ttot\tcov %%\n"
  +
xzcat ~/corpora/"${src}".wikicov.xz | lt-proc "${ana}" \
  +
| apertium-cleanstream -n \
  +
| sum " no decomp" &
  +
  +
xzcat ~/corpora/"${src}".wikicov.xz | lt-proc -e "${ana}" \
  +
| apertium-cleanstream -n \
  +
| sum "with decomp" &
  +
  +
wait
  +
echo
  +
done
  +
</pre>
  +
==Testvoc==
  +
===Results===
  +
<pre>
  +
1 ../apertium-dan-nor/nobackup/69419-dan-nno
  +
30 ../apertium-dan-nor/nobackup/69419-dan-nob
  +
37 ../apertium-dan-nor/nobackup/69419-nno-dan
  +
26 ../apertium-dan-nor/nobackup/69419-nob-dan
  +
94 total
  +
  +
136 ../apertium-nno-nob/nobackup/69419-nno-nob
  +
71 ../apertium-nno-nob/nobackup/69419-nob-nno
  +
207 total
  +
  +
157 ../apertium-swe-dan/nobackup/69419-dan-swe
  +
276 ../apertium-swe-dan/nobackup/69419-swe-dan
  +
433 total
  +
  +
0 ../apertium-swe-nor/nobackup/69419-nno-swe
  +
0 ../apertium-swe-nor/nobackup/69419-nob-swe
  +
0 ../apertium-swe-nor/nobackup/69419-swe-nno
  +
0 ../apertium-swe-nor/nobackup/69419-swe-nob.tmp
  +
0 total
  +
</pre>
  +
  +
===script===
  +
<pre>
  +
$ while sleep 30; do
  +
for d in trunk/apertium-{swe-nor,dan-nor,nno-nob,swe-dan}; do
  +
( cd $d && up && make langs && rev=$(svn info|grep ^Revision: |grep -o '[0-9]*') &&
  +
for m in modes/???-???.mode; do
  +
p=${m##modes/} && p=${p%%.mode} && echo $p &&
  +
( test -f nobackup/${rev}-$p ||
  +
( dev/testvoc/generation.sh --hfst $p > nobackup/${rev}-$p.tmp
  +
mv nobackup/${rev}-$p.tmp nobackup/${rev}-$p
  +
)
  +
)
  +
done
  +
)
  +
done
  +
done
  +
</pre>

Latest revision as of 18:20, 1 July 2016

Coverage[edit]

Coverage on Wikipedia dumps ("no decomp" is with decompounding turned off, i.e. without the -e switch to lt-proc; "ex-upper" excludes anything containing uppercase characters from all counts; the script is given further down the page).

Results[edit]

$ ./scandicov.sh
nno-swe
unk     known   tot     cov %
805040  5307135 6112175 86.8289   no decomp     ex-upper:       88.251
703448  5408727 6112175 88.491  with decomp     ex-upper:       91.5592

nob-swe
unk     known   tot     cov %
691359  5408402 6099761 88.6658   no decomp     ex-upper:       90.0757
579262  5520499 6099761 90.5035 with decomp     ex-upper:       93.3714

swe-nno
unk     known   tot     cov %
914584  5167659 6082243 84.963    no decomp     ex-upper:       87.082
744040  5338204 6082244 87.767  with decomp     ex-upper:       92.1901

swe-nob
unk     known   tot     cov %
907104  5175868 6082972 85.0878   no decomp     ex-upper:       87.2664
733559  5349414 6082973 87.9408 with decomp     ex-upper:       92.4382

dan-swe
unk     known   tot     cov %
1135679 4775050 5910729 80.7861   no decomp     ex-upper:       86.2313
997339  4913390 5910729 83.1266 with decomp     ex-upper:       90.5919

swe-dan
unk     known   tot     cov %
1120955 4760872 5881827 80.9421   no decomp     ex-upper:       86.4166
926361  4955466 5881827 84.2505 with decomp     ex-upper:       92.2963

dan-nno
unk     known   tot     cov %
870318  5020221 5890539 85.2252   no decomp     ex-upper:       87.512
752064  5138475 5890539 87.2327 with decomp     ex-upper:       91.4789

dan-nob
unk     known   tot     cov %
832984  5052919 5885903 85.8478   no decomp     ex-upper:       88.22
703618  5182285 5885903 88.0457 with decomp     ex-upper:       92.308

nno-dan
unk     known   tot     cov %
707162  5295158 6002320 88.2185   no decomp     ex-upper:       90.2369
613734  5388586 6002320 89.7751 with decomp     ex-upper:       93.4124

nob-dan
unk     known   tot     cov %
594784  5387514 5982298 90.0576   no decomp     ex-upper:       91.7191
492014  5490284 5982298 91.7755 with decomp     ex-upper:       94.8719

nno-nob
unk     known   tot     cov %
704975  5364593 6069568 88.3851   no decomp     ex-upper:       91.2337
613334  5456234 6069568 89.8949 with decomp     ex-upper:       94.2736

nob-nno
unk     known   tot     cov %
603139  5456170 6059309 90.0461   no decomp     ex-upper:       92.6105
502464  5556845 6059309 91.7076 with decomp     ex-upper:       95.5668

script[edit]

$ cat scandicov.sh
#!/bin/bash

# Kill the process group when the script exits, so that pipelines the script
# has put in the background do not keep running after it is gone (e.g. when
# the script is interrupted part-way through a corpus).
# NOTE(review): the group presumably includes this script itself and, when it
# is launched from another non-interactive script, that caller too -- confirm
# this is acceptable before reusing the trap elsewhere.
trap "kill -- -0" EXIT

# Summarise analyser coverage for a stream read from stdin, one unit per line.
#
# Arguments: $1 - free-text note echoed back in the output line
# Input:     lines on stdin; a line starting with "^" counts as a token, a
#            line containing "/*" counts as unknown, and a line containing an
#            uppercase character is excluded from the "ex-upper" figure.
# Output:    one tab-separated line on stdout:
#              unknown  known  total  coverage%  note  "ex-upper:"  coverage%
#            where the last figure is the coverage over lines without
#            uppercase characters.
# Both percentages are reported as 0 when their denominator is 0 (e.g. empty
# input) instead of aborting awk with a division-by-zero error.
sum () {
    awk -v note="$1" '
BEGIN {
    OFS = FS = "\t"
    # Start every counter at 0 so that a category with no hits prints as
    # the number 0 rather than as an empty field.
    w = u = Uw = Uu = 0
}
/^\^/                { w++ }     # tokens
/\/\*/               { u++ }     # unknown tokens
/\/\*.*[[:upper:]]/  { Uu++ }    # unknown, with uppercase after the marker
/[[:upper:]]/        { Uw++ }    # anything with uppercase
END {
    lw = w - Uw
    lu = u - Uu
    cov  = (w  != 0) ? 100 * (w - u)   / w  : 0
    lcov = (lw != 0) ? 100 * (lw - lu) / lw : 0
    print u, w - u, w, cov, note, "ex-upper:", lcov
}'
}

# For every compiled analyser found under the four pair directories, print the
# analyser's name, a header row, and two summary rows from sum(): one with
# plain lt-proc and one with lt-proc -e (decompounding).  The corpus is picked
# from the first language code in the analyser's file name.
for ana in /l/a/*/apertium-{swe-nor,swe-dan,dan-nor,nno-nob}/*.automorf.bin; do
    # A glob that matches nothing is left as the literal pattern; skip it
    # instead of feeding a bogus path to lt-proc.
    [[ -e "${ana}" ]] || continue

    f="$(basename "${ana}")"
    src="${f%%-*}"
    corpus="${HOME}/corpora/${src}.wikicov.xz"
    if [[ ! -r "${corpus}" ]]; then
        printf 'skipping %s: cannot read %s\n' "${ana}" "${corpus}" >&2
        continue
    fi

    echo "${f%%.automorf.bin}"
    printf "unk\tknown\ttot\tcov %%\n"

    # Run the decompounding pass concurrently through a process substitution.
    # Its single summary line stays in the pipe until it is read below, so
    # the two rows always come out in the same order while both passes still
    # run in parallel.
    exec 3< <(xzcat "${corpus}" | lt-proc -e "${ana}" \
        | apertium-cleanstream -n \
        | sum "with decomp")

    xzcat "${corpus}" | lt-proc "${ana}" \
        | apertium-cleanstream -n \
        | sum "  no decomp"

    cat <&3      # blocks until the decompounding pass has finished
    exec 3<&-
    echo
done

Testvoc[edit]

Results[edit]

   1 ../apertium-dan-nor/nobackup/69419-dan-nno
  30 ../apertium-dan-nor/nobackup/69419-dan-nob
  37 ../apertium-dan-nor/nobackup/69419-nno-dan
  26 ../apertium-dan-nor/nobackup/69419-nob-dan
  94 total

  136 ../apertium-nno-nob/nobackup/69419-nno-nob
   71 ../apertium-nno-nob/nobackup/69419-nob-nno
  207 total

  157 ../apertium-swe-dan/nobackup/69419-dan-swe
  276 ../apertium-swe-dan/nobackup/69419-swe-dan
  433 total

0 ../apertium-swe-nor/nobackup/69419-nno-swe
0 ../apertium-swe-nor/nobackup/69419-nob-swe
0 ../apertium-swe-nor/nobackup/69419-swe-nno
0 ../apertium-swe-nor/nobackup/69419-swe-nob.tmp
0 total

script[edit]

$ while sleep 30; do
  # Every 30 seconds: update and rebuild each pair, then run generation
  # testvoc for every mode that has no result file for the current revision.
  for d in trunk/apertium-{swe-nor,dan-nor,nno-nob,swe-dan}; do
    # Subshell, so the cd does not carry over to the next directory.
    # NOTE(review): `up` is not defined in this snippet -- presumably a local
    # alias or function for `svn up`; confirm before running elsewhere.
    ( cd "$d" && up && make langs &&
      rev=$(svn info | grep '^Revision:' | grep -o '[0-9]*') &&
      for m in modes/???-???.mode; do
        # An unmatched glob stays literal; do not treat it as a mode.
        [ -e "$m" ] || continue
        p=${m##modes/} && p=${p%%.mode} && echo "$p" &&
        # A result file for this revision and pair means it was already run.
        if ! test -f "nobackup/${rev}-${p}"; then
          # Write to a .tmp name and rename afterwards, so a run still in
          # progress is never mistaken for a finished result.
          dev/testvoc/generation.sh --hfst "$p" > "nobackup/${rev}-${p}.tmp"
          mv "nobackup/${rev}-${p}.tmp" "nobackup/${rev}-${p}"
        fi
      done
    )
  done
done