Talk:Lexical selection in target language
Revision as of 15:24, 30 October 2008 by Francis Tyers (talk | contribs) (New page: ==Ranker== Presuming that you have a language model in the following format: <pre> $ head test.ngrams 3086,1,last 1157,2,the last 1128,1,recent 703,1,recently 501,2,last year 301,2,in r...)
Ranker
Presuming that you have a language model in the following format (one n-gram per line: frequency, n-gram order, n-gram text):
$ head test.ngrams
3086,1,last
1157,2,the last
1128,1,recent
703,1,recently
501,2,last year
301,2,in recent
277,2,recent years
250,2,the recent
231,1,constantly
225,3,in the last
This code should provide a very rudimentary ranker for input sentences of the form specified on the main page. It is far from efficient.
#!/usr/bin/python2.5
# coding=utf-8
# -*- encoding: utf-8 -*-
import sys, codecs;
# Replace the standard output and error streams with UTF-8 codec writers so
# that text written to them is encoded as UTF-8.
# NOTE(review): sys.stdin is left unwrapped; under Python 2 the characters
# read from it later are presumably byte strings, and non-ASCII bytes may
# fail when written through these writers -- confirm.
sys.stdout = codecs.getwriter('utf-8')(sys.stdout);
sys.stderr = codecs.getwriter('utf-8')(sys.stderr);
def return_possibles(ambig):
    """Expand one ambiguity group read from standard input.

    Must be called after the opening '{' has been consumed. Characters are
    read one at a time from sys.stdin up to and including the closing '}';
    alternatives inside the group are separated by '|'.

    ambig -- dict whose key 1 holds the text accumulated before the group.
             It is modified in place.

    Returns the same dict. Key 1 has the first alternative appended to it,
    and keys 2, 3, ... each hold the original key-1 text followed by one of
    the remaining alternatives.

    If the input ends before a '}' is seen, the group is treated as closed
    at end of input.
    """
    prefix = ambig[1]
    count = 1
    c = sys.stdin.read(1)
    # read(1) returns '' at end of input. Stop there as well, otherwise an
    # unterminated group would keep appending '' forever.
    while c != '}' and c != '':
        if c == '|':
            # Start the next alternative from the shared left context.
            count = count + 1
            ambig[count] = prefix
        else:
            ambig[count] = ambig[count] + c
        c = sys.stdin.read(1)
    return ambig
def rank(ambig, lm):
    """Score each candidate string in ``ambig`` against the table ``lm``.

    ambig -- dict mapping a key to a candidate string.
    lm    -- dict mapping n-gram text to a numeric weight.

    Each candidate is split on single spaces into tokens. Every run of
    2, 3, 4 and 5 consecutive tokens is joined with spaces, and the weights
    of those found in ``lm`` are summed. Single tokens are not looked up on
    their own.

    Returns a list of ``(score, tokens)`` tuples, one per entry of
    ``ambig``, in the dict's iteration order.
    """
    results = []
    for key in ambig.keys():
        # NOTE(review): as written this replaces a single space with a
        # single space, i.e. it is a no-op; possibly a double-space collapse
        # that was lost in formatting -- confirm.
        tokens = ambig[key].replace(' ', ' ').split(' ')

        # All bigrams first, then trigrams, 4-grams and 5-grams, each
        # terminated by a newline.
        chunks = []
        for order in (2, 3, 4, 5):
            for start in range(0, len(tokens) - order + 1):
                chunks.append(' '.join(tokens[start:start + order]) + '\n')

        # The n-grams are re-split on newlines, so a token that itself
        # contains a newline breaks its n-gram into separate lookups.
        total = 0.0
        for gram in ''.join(chunks).split('\n'):
            gram = gram.strip()
            if gram in lm:
                total = total + lm[gram]

        results.append((total, tokens))
    return results
def load_lm(f):
    """Load an n-gram table from the file named ``f``.

    Every non-empty line is expected to look like ``count,order,gram``,
    for example ``1157,2,the last``.

    Returns a dict mapping the gram text to ``float(count) * float(order)``.

    Raises IndexError for a non-empty line with fewer than three fields and
    ValueError when the first two fields are not numeric.
    """
    # open() rather than file(): file() no longer exists in Python 3.
    # try/finally instead of `with` because the shebang targets Python 2.5.
    handle = open(f)
    try:
        contents = handle.read()
    finally:
        handle.close()

    lm = {}
    for line in contents.split('\n'):
        if line == '':
            continue
        # Split on the first two commas only, so that an n-gram which itself
        # contains a comma is kept whole.
        row = line.split(',', 2)
        lm[row[2]] = float(row[0]) * float(row[1])
    return lm
# Driver: copy standard input to standard output, replacing every
# '{alt1|alt2|...}' group with the alternative that rank() scores highest.
# Each group is ranked with the text read since the previous group as its
# left context; text after the group does not influence the choice.
lm = load_lm('test.ngrams')

ambig = {1: ''}   # key 1 collects the plain text seen since the last group
pieces = []       # output fragments, joined once at the end

c = sys.stdin.read(1)
while c != '':
    if c == '{':
        ambig = return_possibles(ambig)
        # Highest (score, tokens) tuple; ties fall back to comparing the
        # token lists, exactly as sorting and reversing the list would.
        best = max(rank(ambig, lm))
        pieces.append(' '.join(best[1]))
        ambig = {1: ''}
    else:
        ambig[1] = ambig[1] + c
    # The character following a group is examined on the next pass instead
    # of being appended blindly, so a group that directly follows another
    # one (e.g. '{a|b}{c|d}') is also expanded and ranked.
    c = sys.stdin.read(1)

# Flush whatever plain text followed the last group.
pieces.append(ambig[1])
output = ''.join(pieces)
sys.stdout.write(output)