Difference between revisions of "User:Francis Tyers/TLH"

From Apertium
Jump to navigation Jump to search
Line 80: Line 80:


==Tarea 4==
==Tarea 4==

$ prepare-corpus.sh LexEsp_Etq_Larga.cooked
4179 sentences
256 tags 16481 types 96961 tokens
1 15735 95.474% 69045 71.209%
2 689 4.181% 22621 23.330%
3 51 0.309% 4315 4.450%
4 3 0.018% 151 0.156%
5 3 0.018% 829 0.855%
Mean ambiguity A=1.361176

Entropy H(p)=5.488119

$ cooked2lex.pl < LexEsp_Etq_Larga-train-0.cooked > train.larga.lex
3761 sentences
254 tags 15431 types 87094 tokens
1 14742 95.535% 62581 71.855%
2 637 4.128% 20044 23.014%
3 47 0.305% 3620 4.156%
4 2 0.013% 96 0.110%
5 3 0.019% 753 0.865%
Mean ambiguity A=1.351161

Entropy H(p)=5.485330

$ cooked2ngram.pl < LexEsp_Etq_Larga-train-0.cooked > train.larga.ngrams
$ cooked2raw.pl LexEsp_Etq_Larga-0.cooked > LexEsp_Etq_Larga-0.raw
$ cooked2raw.pl < LexEsp_Etq_Larga-0.cooked > LexEsp_Etq_Larga-0.raw
$ t3 train.larga.ngrams train.larga.lex < LexEsp_Etq_Larga-0.raw > LexEsp_Etq_Larga-0.t3
[ 4 ms::1]
[ 4 ms::1] Trigram POS Tagger (c) Ingo Schröder, schroeder@informatik.uni-hamburg.de
[ 4 ms::1]
[ 2064 ms::1] model generated from 3761 sentences (thereof 43 one-word)
[ 2064 ms::1] found 11283 uni-, 15044 bi-, and 18762 trigram counts for the boundary tag
[ 12724 ms::1] computed smoothed transition probabilities
[ 13512 ms::1] built suffix tries with 29924 lowercase and 6743 uppercase nodes
[ 13532 ms::1] leaves/single/total LC: 7672 18878 29925
[ 13536 ms::1] leaves/single/total UC: 1320 4874 6744
[ 16329 ms::1] suffix probabilities smoothing done [theta 1.281e-02]
[ 12249377 ms::1] done

$ evaluate.pl LexEsp_Etq_Larga-0.cooked LexEsp_Etq_Larga-0.t3

418 sentences
LexEsp_Etq_Larga-0.t3 9412 455 95.389%

Revision as of 18:52, 23 December 2007

Tarea 1

$ evaluate.pl LexEsp-0.cooked LexEsp-0.t3 
418 sentences
         LexEsp-0.t3     9470      397  95.976%

Tarea 2

$ for i in `seq 1 9`; do 
    cat LexEsp-[1-$i].cooked > LexEsp-ejecucion$i.cooked; 
    cooked2lex.pl < LexEsp-ejecucion$i.cooked > train.$i.lex; 
    cooked2ngram.pl < LexEsp-ejecucion$i.cooked > train.$i.ngrams; 
    t3 train.$i.ngrams train.$i.lex < LexEsp-0.raw > LexEsp-0.$i.t3; 
    evaluate.pl LexEsp-0.cooked LexEsp-0.$i.t3 >> output ; 
done

$ wc -l LexEsp-ejecucion*.cooked
    418 LexEsp-ejecucion1.cooked
    836 LexEsp-ejecucion2.cooked
   1254 LexEsp-ejecucion3.cooked
   1672 LexEsp-ejecucion4.cooked
   2090 LexEsp-ejecucion5.cooked
   2508 LexEsp-ejecucion6.cooked
   2926 LexEsp-ejecucion7.cooked
   3344 LexEsp-ejecucion8.cooked
   3761 LexEsp-ejecucion9.cooked

$ cat output
418 sentences
       LexEsp-0.1.t3     8948      919  90.686%
418 sentences
       LexEsp-0.2.t3     9155      712  92.784%
418 sentences
       LexEsp-0.3.t3     9275      592  94.000%
418 sentences
       LexEsp-0.4.t3     9313      554  94.385%
418 sentences
       LexEsp-0.5.t3     9366      501  94.922%
418 sentences
       LexEsp-0.6.t3     9391      476  95.176%
418 sentences
       LexEsp-0.7.t3     9419      448  95.460%
418 sentences
       LexEsp-0.8.t3     9444      423  95.713%
418 sentences
       LexEsp-0.9.t3     9470      397  95.976%

Tarea 3

$ for i in `seq 1 10`; do 
    t3 -l $i train.ngrams train.lex < LexEsp-0.raw > LexEsp-0.l$i.t3; 
    evaluate.pl LexEsp-0.cooked LexEsp-0.l$i.t3 >> output.l; 
done

$ cat output.l
418 sentences
      LexEsp-0.l1.t3     9411      456  95.379%
418 sentences
      LexEsp-0.l2.t3     9466      401  95.936%
418 sentences
      LexEsp-0.l3.t3     9492      375  96.199%
418 sentences
      LexEsp-0.l4.t3     9490      377  96.179%
418 sentences
      LexEsp-0.l5.t3     9473      394  96.007%
418 sentences
      LexEsp-0.l6.t3     9477      390  96.047%
418 sentences
      LexEsp-0.l7.t3     9473      394  96.007%
418 sentences
      LexEsp-0.l8.t3     9470      397  95.976%
418 sentences
      LexEsp-0.l9.t3     9470      397  95.976%
418 sentences
     LexEsp-0.l10.t3     9470      397  95.976%

Tarea 4

$ prepare-corpus.sh LexEsp_Etq_Larga.cooked
4179 sentences
256 tags 16481 types 96961 tokens

 1     15735  95.474%     69045  71.209% 
 2       689   4.181%     22621  23.330% 
 3        51   0.309%      4315   4.450% 
 4         3   0.018%       151   0.156% 
 5         3   0.018%       829   0.855% 

Mean ambiguity A=1.361176

Entropy H(p)=5.488119

$ cooked2lex.pl < LexEsp_Etq_Larga-train-0.cooked > train.larga.lex
3761 sentences
254 tags 15431 types 87094 tokens

 1     14742  95.535%     62581  71.855% 
 2       637   4.128%     20044  23.014% 
 3        47   0.305%      3620   4.156% 
 4         2   0.013%        96   0.110% 
 5         3   0.019%       753   0.865% 

Mean ambiguity A=1.351161

Entropy H(p)=5.485330

$ cooked2ngram.pl < LexEsp_Etq_Larga-train-0.cooked > train.larga.ngrams
$ cooked2raw.pl LexEsp_Etq_Larga-0.cooked > LexEsp_Etq_Larga-0.raw
$ cooked2raw.pl < LexEsp_Etq_Larga-0.cooked > LexEsp_Etq_Larga-0.raw
$ t3 train.larga.ngrams train.larga.lex < LexEsp_Etq_Larga-0.raw > LexEsp_Etq_Larga-0.t3
[ 4 ms::1]
[ 4 ms::1] Trigram POS Tagger (c) Ingo Schröder, schroeder@informatik.uni-hamburg.de
[ 4 ms::1]
[ 2064 ms::1] model generated from 3761 sentences (thereof 43 one-word)
[ 2064 ms::1] found 11283 uni-, 15044 bi-, and 18762 trigram counts for the boundary tag
[ 12724 ms::1] computed smoothed transition probabilities
[ 13512 ms::1] built suffix tries with 29924 lowercase and 6743 uppercase nodes
[ 13532 ms::1] leaves/single/total LC: 7672 18878 29925
[ 13536 ms::1] leaves/single/total UC: 1320 4874 6744
[ 16329 ms::1] suffix probabilities smoothing done [theta 1.281e-02]
[ 12249377 ms::1] done

$ evaluate.pl LexEsp_Etq_Larga-0.cooked LexEsp_Etq_Larga-0.t3

418 sentences
LexEsp_Etq_Larga-0.t3 9412 455 95.389%