Difference between revisions of "Running the monolingual rule learning"

From Apertium
Jump to navigation Jump to search
Line 1: Line 1:
* Install Apertium-lex-tools
* Install IRSTLM (http://sourceforge.net/projects/irstlm/)
* Train a target side language model (http://sourceforge.net/apps/mediawiki/irstlm/index.php?title=User_Manual)
Place the following Makefile in the folder where you want to run your training process:
Place the following Makefile in the folder where you want to run your training process:

Revision as of 21:40, 14 June 2013


Place the following Makefile in the folder where you want to run your training process:

# Delete a target whose recipe exits with an error. Most recipes in this
# Makefile write their output through `> $@`, so without this a failed run
# leaves a truncated file that looks up to date on the next invocation.
.DELETE_ON_ERROR:

# `all` names a goal, not a file; declaring it phony keeps a stray file
# called `all` in the working directory from masking it.
.PHONY: all

# Default goal: the compiled rule file produced by apertium-lrx-comp.
# Earlier goal list, kept for reference (no rule in this Makefile builds
# data/$(CORPUS).$(DIR).lrx):
#all: data/$(CORPUS).$(DIR).lrx data/$(CORPUS).$(DIR).freq.lrx
all: data/$(CORPUS).$(DIR).freq.lrx.bin
# Write the integers 1..N, one per line, where N is the number of lines in
# the source corpus.
data/$(CORPUS).$(DIR).lines: $(CORPUS).$(DIR).txt
	# mkdir -p succeeds if the directory already exists, so this cannot
	# collide with another rule creating data/ at the same time under -j.
	mkdir -p $(@D)
	seq $$(wc -l < $<) > $@
# Run the source corpus through apertium-destxt, the $(DIR)-pretransfer
# Apertium mode, and a bilingual-dictionary lookup (lt-proc -b).
data/$(CORPUS).$(DIR).biltrans: $(CORPUS).$(DIR).txt
	# mkdir -p succeeds if the directory already exists, so this cannot
	# collide with another rule creating data/ at the same time under -j.
	mkdir -p $(@D)
	apertium-destxt < $< \
	  | apertium -f none -d $(DATA) $(DIR)-pretransfer \
	  | lt-proc -b $(DATA)/$(AUTOBIL) > $@
# Number the .biltrans lines with `cat -n`, then pass them through three
# lex-tools filter scripts in sequence.
# The .lines file is listed as a prerequisite (so it gets built), although
# this recipe does not read it.
data/$(CORPUS).$(DIR).ambig: data/$(CORPUS).$(DIR).biltrans data/$(CORPUS).$(DIR).lines
	cat -n $< \
	  | python3 $(SCRIPTS)/trim-fertile-lines.py \
	  | python3 $(SCRIPTS)/biltrans-line-only-pos-ambig.py \
	  | python3 $(SCRIPTS)/biltrans-trim-uncovered.py > $@
# Feed the .ambig file to biltrans-to-multitrans-line-recursive.py on stdin
# and store what it prints.
# The input is redirected rather than piped through `cat`, so a failure to
# open it fails the recipe instead of handing the script an empty stream.
# NOTE(review): this is the only rule that calls `python` rather than
# `python3` - confirm which interpreter the script needs before aligning it.
data/$(CORPUS).$(DIR).multi: data/$(CORPUS).$(DIR).ambig
	python $(SCRIPTS)/biltrans-to-multitrans-line-recursive.py < $< > $@
# Run the .multi file through the $(DIR)-multi Apertium mode.
# The input is redirected rather than piped through `cat`, so a failure to
# open it fails the recipe instead of handing apertium an empty stream.
data/$(CORPUS).$(DIR).unranked: data/$(CORPUS).$(DIR).multi
	apertium -f none -d $(DATA) $(DIR)-multi < $< > $@
# Pass the .unranked file through irstlm-ranker-frac with the language
# model given by $(MODEL).
# The input is redirected rather than piped through `cat`, so a failure to
# open it fails the recipe instead of handing the ranker an empty stream.
data/$(CORPUS).$(DIR).ranked: data/$(CORPUS).$(DIR).unranked
	irstlm-ranker-frac $(MODEL) < $< > $@

# Join the .multi and .ranked files line by line with `paste` and keep only
# the first four tab-separated fields.
# $^ expands to the prerequisites in the order listed: .multi, then .ranked.
data/$(CORPUS).$(DIR).annotated: data/$(CORPUS).$(DIR).multi data/$(CORPUS).$(DIR).ranked
	paste $^ | cut -f1-4 > $@
# Run biltrans-extract-frac-freq.py on the .ambig and .annotated files
# (passed as arguments in that order) and store its output.
data/$(CORPUS).$(DIR).freq: data/$(CORPUS).$(DIR).ambig data/$(CORPUS).$(DIR).annotated
	python3 $(SCRIPTS)/biltrans-extract-frac-freq.py $^ > $@
# Run biltrans-count-patterns-ngrams.py on the .freq, .ambig and .annotated
# files (passed as arguments in that order) and store its output.
data/$(CORPUS).$(DIR).ngrams: data/$(CORPUS).$(DIR).freq data/$(CORPUS).$(DIR).ambig data/$(CORPUS).$(DIR).annotated
	python3 $(SCRIPTS)/biltrans-count-patterns-ngrams.py $^ > $@
# Run ngram-pruning-frac.py on the .freq and .ngrams files (passed as
# arguments in that order) and store its output.
# Nothing reachable from `all` depends on this target; build it by name.
data/$(CORPUS).$(DIR).patterns: data/$(CORPUS).$(DIR).freq data/$(CORPUS).$(DIR).ngrams
	python3 $(SCRIPTS)/ngram-pruning-frac.py $^ > $@
# Run extract-alig-lrx.py on the .freq file and store its output as the
# .freq.lrx rule file.
data/$(CORPUS).$(DIR).freq.lrx: data/$(CORPUS).$(DIR).freq
	python3 $(SCRIPTS)/extract-alig-lrx.py $(<) > $(@)
# Compile the .freq.lrx rule file with apertium-lrx-comp; the tool takes the
# input path and the output path as its two arguments.
data/$(CORPUS).$(DIR).freq.lrx.bin: data/$(CORPUS).$(DIR).freq.lrx
	apertium-lrx-comp $(<) $(@)

In the same folder, also place your source-side corpus, named $(CORPUS).$(DIR).txt (this is the input file the Makefile reads).

Set the Makefile variables as follows:

  • CORPUS denotes the base name of your corpus file, i.e. the part of the file name before .$(DIR).txt
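  • DIR stands for the language pair; it is part of the corpus file name and is also the prefix of the Apertium modes $(DIR)-pretransfer and $(DIR)-multi that the Makefile invokes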
  • DATA is the path to the language resources for the language pair
  • AUTOBIL is the file name of the binary bilingual dictionary for the language pair, given relative to DATA (the Makefile opens it as $(DATA)/$(AUTOBIL))
  • SCRIPTS denotes the path to the lex-tools scripts
  • MODEL is the path to the target side (binary) language model used for scoring the possible translations of ambiguous words