Talk:Lexical selection in target language

From Apertium
Jump to navigation Jump to search

Ranker

Presuming that you have a language model in the following format (one n-gram per line, as comma-separated count, n-gram order, and n-gram text):

$ head test.ngrams 
3086,1,last
1157,2,the last
1128,1,recent
703,1,recently
501,2,last year
301,2,in recent
277,2,recent years
250,2,the recent
231,1,constantly
225,3,in the last

the following code provides a very rudimentary ranker for input sentences of the form specified on the main page. It is far from efficient.

#!/usr/bin/python2.5
# coding=utf-8
# -*- encoding: utf-8 -*-

import sys, codecs;

# Route everything written to stdout/stderr through a UTF-8 encoding writer.
_utf8_writer = codecs.getwriter('utf-8')
sys.stdout = _utf8_writer(sys.stdout)
sys.stderr = _utf8_writer(sys.stderr)

def return_possibles(ambig):
	"""Read the alternatives of one ``{a|b|...}`` group from stdin.

	Must be called right after the opening ``{`` has been consumed.  The
	characters up to and including the closing ``}`` are read from
	``sys.stdin``.

	ambig -- dict whose entry ``1`` holds the text preceding the group.  It is
	         modified in place: entry ``1`` gets the first alternative
	         appended, and entries ``2``, ``3``, ... are created as that same
	         preceding text followed by the second, third, ... alternative.

	Returns the same dict.  If stdin ends before ``}`` is seen, whatever has
	been collected so far is returned instead of looping forever.
	"""
	count = 1
	# Snapshot of the shared left context; every new alternative starts from it.
	unambig = ambig[1]

	c = sys.stdin.read(1)
	# read(1) returns '' at EOF and would keep doing so, hence the second test.
	while c != '}' and c != '':
		if c == '|':
			# Separator: open the next alternative.
			count = count + 1
			ambig[count] = unambig
		else:
			ambig[count] = ambig[count] + c
		c = sys.stdin.read(1)

	return ambig

def rank(ambig, lm, min_n=2, max_n=5):
	"""Score every candidate string in ``ambig`` against the n-gram model.

	ambig -- dict mapping any key to a candidate string.
	lm    -- dict mapping an n-gram (words joined by single spaces) to its
	         weight.
	min_n -- smallest n-gram order that is looked up (default 2).
	max_n -- largest n-gram order that is looked up (default 5).

	Returns a list with one ``(score, tokens)`` tuple per candidate, in the
	iteration order of ``ambig``.  ``score`` is the float sum of the weights
	of every n-gram occurrence of the candidate that is present in ``lm``.
	``tokens`` is the candidate with the first level of double spaces
	collapsed and then split on single spaces, so that ``' '.join(tokens)``
	reproduces the candidate text.
	"""
	ranked = []

	for key in ambig:
		text = ambig[key]
		# Token list handed back to the caller; it keeps empty tokens so that
		# re-joining it preserves leading/trailing spacing.
		x = text.replace('  ', ' ').split(' ')
		# Scoring uses real words only: split on any whitespace run so that
		# tabs, newlines or repeated spaces cannot produce empty or fused
		# tokens that miss the model or get counted twice.
		words = text.split()

		score = 0.0
		for n in range(min_n, max_n + 1):
			for start in range(len(words) - n + 1):
				gram = ' '.join(words[start:start + n])
				if gram in lm:
					score = score + lm[gram]

		ranked.append((score, x))

	return ranked

def load_lm(f):
	"""Load an n-gram model from the file named ``f``.

	Each non-empty line has three comma-separated fields; the third field is
	the n-gram text and may itself contain commas.

	Returns a dict mapping the n-gram text to
	``float(first field) * float(second field)``.

	Raises IOError/OSError if the file cannot be opened, IndexError if a
	non-empty line has fewer than three fields, and ValueError if one of the
	first two fields is not a number.
	"""
	lm = {}

	# open() instead of the Python-2-only file(); try/finally (rather than
	# 'with') keeps the script runnable on the Python 2.5 named in the shebang.
	lmfile = open(f)
	try:
		for line in lmfile:
			line = line.rstrip('\r\n')
			if not line:
				continue
			# Split at most twice so commas inside the n-gram are preserved.
			row = line.split(',', 2)
			lm[row[2]] = float(row[0]) * float(row[1])
	finally:
		lmfile.close()

	return lm

# Driver: copy stdin to stdout, replacing every {a|b|...} group by its
# best-scoring alternative according to the model in 'test.ngrams'.
lm = load_lm('test.ngrams')

# ambig[1] accumulates the plain text read since the last resolved group.
# return_possibles() uses it as the shared prefix of every alternative, so it
# is also the (left-only) context that rank() scores the alternatives in.
ambig = {1: ''}
resolved = []

c = sys.stdin.read(1)

while c != '':
	if c == '{':
		# Consume the group up to its closing '}' and expand the alternatives.
		ambig = return_possibles(ambig)
		# Highest score wins; ties are broken by comparing the token lists.
		best = max(rank(ambig, lm))
		resolved.append(' '.join(best[1]))

		# Start a fresh context for whatever follows the group.
		ambig = {1: ''}
		c = sys.stdin.read(1)
		# Re-examine the new character at the top of the loop: it may be EOF
		# or the '{' of a group that directly follows this one.
		continue

	ambig[1] = ambig[1] + c
	c = sys.stdin.read(1)

# Trailing text after the last group (or the whole input if there was none).
resolved.append(ambig[1])
output = ''.join(resolved)

sys.stdout.write(output)