Running the monolingual rule learning

From Apertium
Revision as of 21:40, 14 June 2013 by Fpetkovski (talk | contribs)
Jump to navigation Jump to search


Place the following Makefile in the folder where you want to run your training process:

# Most recipes below write their result with "> $@", which creates the target
# even when the command fails. Remove such half-written targets on error so a
# later run does not mistake them for up-to-date files.
.DELETE_ON_ERROR:

# Alternative default goal, kept for reference.
# NOTE(review): no rule for data/$(CORPUS).$(DIR).lrx is visible in this
# Makefile — confirm one exists before enabling this line.
#all: data/$(CORPUS).$(DIR).lrx data/$(CORPUS).$(DIR).freq.lrx

# Default goal: build the compiled frequency-based rule file.
# `all` names a command, not a file; declaring it phony keeps a stray file
# called "all" in the working directory from short-circuiting the build.
.PHONY: all
all: data/$(CORPUS).$(DIR).freq.lrx.bin
# Write the numbers 1..N, one per line, where N is the line count of the corpus.
data/$(CORPUS).$(DIR).lines: $(CORPUS).$(DIR).txt
	# mkdir -p is safe when another rule creates the directory at the same
	# time (make -j); a test-then-mkdir sequence is not.
	mkdir -p $(@D)
	seq $$(wc -l < $<) > $@
# Deformat the corpus, run it through the $(DIR)-pretransfer mode found in
# $(DATA), then pass it to lt-proc -b with the dictionary $(DATA)/$(AUTOBIL).
data/$(CORPUS).$(DIR).biltrans: $(CORPUS).$(DIR).txt
	# mkdir -p is safe when another rule creates the directory at the same
	# time (make -j); a test-then-mkdir sequence is not.
	mkdir -p $(@D)
	apertium-destxt < $< | apertium -f none -d $(DATA) $(DIR)-pretransfer | lt-proc -b $(DATA)/$(AUTOBIL) > $@
# Build the .ambig file by piping the line-numbered (cat -n) .biltrans output
# through three python3 stages.
# NOTE(review): every stage invokes "$(SCRIPTS)/" with no script file name after
# the slash — the names appear to have been lost; restore them before running.
# NOTE(review): the .lines prerequisite is not referenced by the recipe —
# confirm whether it is actually needed here.
data/$(CORPUS).$(DIR).ambig: data/$(CORPUS).$(DIR).biltrans data/$(CORPUS).$(DIR).lines
	cat -n data/$(CORPUS).$(DIR).biltrans | python3 $(SCRIPTS)/ | python3 $(SCRIPTS)/ | python3 $(SCRIPTS)/ > $@
# Build the .multi file by feeding the .ambig file to a script under $(SCRIPTS).
# NOTE(review): the script file name after "$(SCRIPTS)/" is missing — restore it
# before running.
# NOTE(review): this is the only rule that calls "python" rather than "python3";
# confirm which interpreter the script requires.
data/$(CORPUS).$(DIR).multi: data/$(CORPUS).$(DIR).ambig
	cat $< | python $(SCRIPTS)/ > $@
# Run the .multi file through the $(DIR)-multi mode found in $(DATA) and store
# the output as the .unranked file.
data/$(CORPUS).$(DIR).unranked: data/$(CORPUS).$(DIR).multi
	apertium -f none -d $(DATA) $(DIR)-multi < $< > $@
# Feed the .unranked file to irstlm-ranker-frac, using $(MODEL) as its
# argument, and store the output as the .ranked file.
data/$(CORPUS).$(DIR).ranked: data/$(CORPUS).$(DIR).unranked
	irstlm-ranker-frac $(MODEL) < $< > $@

# Join the .multi and .ranked files line by line (in that order) and keep only
# the first four tab-separated fields.
data/$(CORPUS).$(DIR).annotated: data/$(CORPUS).$(DIR).multi data/$(CORPUS).$(DIR).ranked
	paste $^ | cut -f1-4 > $@
# Build the .freq file from the .ambig and .annotated files, passed as two
# arguments to a python3 script under $(SCRIPTS).
# NOTE(review): the script file name after "$(SCRIPTS)/" is missing — restore it
# before running.
data/$(CORPUS).$(DIR).freq: data/$(CORPUS).$(DIR).ambig data/$(CORPUS).$(DIR).annotated
	python3 $(SCRIPTS)/  data/$(CORPUS).$(DIR).ambig data/$(CORPUS).$(DIR).annotated > $@
# Build the .ngrams file from the .freq, .ambig and .annotated files, passed as
# three arguments to a python3 script under $(SCRIPTS).
# NOTE(review): the script file name after "$(SCRIPTS)/" is missing — restore it
# before running.
# Only the .patterns rule lists this file as a prerequisite, so the default
# `all` goal does not build it.
data/$(CORPUS).$(DIR).ngrams: data/$(CORPUS).$(DIR).freq data/$(CORPUS).$(DIR).ambig data/$(CORPUS).$(DIR).annotated
	python3 $(SCRIPTS)/ data/$(CORPUS).$(DIR).freq data/$(CORPUS).$(DIR).ambig data/$(CORPUS).$(DIR).annotated > $@
# Build the .patterns file from the .freq and .ngrams files, passed as two
# arguments to a python3 script under $(SCRIPTS).
# NOTE(review): the script file name after "$(SCRIPTS)/" is missing — restore it
# before running.
# No rule visible in this Makefile depends on this file; request it explicitly
# (make data/<corpus>.<dir>.patterns) to build it.
data/$(CORPUS).$(DIR).patterns: data/$(CORPUS).$(DIR).freq data/$(CORPUS).$(DIR).ngrams
	python3 $(SCRIPTS)/ data/$(CORPUS).$(DIR).freq data/$(CORPUS).$(DIR).ngrams > $@  
# Build the .freq.lrx rule file by passing the .freq file to a python3 script
# under $(SCRIPTS).
# NOTE(review): the script file name after "$(SCRIPTS)/" is missing — restore it
# before running.
data/$(CORPUS).$(DIR).freq.lrx:  data/$(CORPUS).$(DIR).freq
	python3 $(SCRIPTS)/ $< > $@
# Final product requested by `all`: run apertium-lrx-comp with the .freq.lrx
# file as its first argument and the .bin target as its second.
data/$(CORPUS).$(DIR).freq.lrx.bin: data/$(CORPUS).$(DIR).freq.lrx
	apertium-lrx-comp $< $@

Place your source-side corpus in the same folder, named $(CORPUS).$(DIR).txt — this is the input file the Makefile reads.

Set the Makefile variables as follows:

  • CORPUS denotes the base name of your corpus file
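  • DIR stands for the language pair; it is part of the corpus file name and is also the prefix of the modes the Makefile invokes ($(DIR)-pretransfer and $(DIR)-multi)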
  • DATA is the path to the language resources for the language pair
  • AUTOBIL is the path to the binary bilingual dictionary for the language pair, given relative to DATA (the Makefile opens $(DATA)/$(AUTOBIL))
  • SCRIPTS denotes the path to the lex-tools scripts
  • MODEL is the path to the target side (binary) language model used for scoring the possible translations of ambiguous words