Generating lexical-selection rules from monolingual corpora

This page describes how to generate lexical selection rules without relying on a parallel corpus.

Prerequisites

  • apertium-lex-tools
  • IRSTLM (used to build the binary target-language model and to score translations; see the sketch after this list)
  • A language pair (e.g. apertium-br-fr)
    • The language pair should have the following two modes:
      • -multi, which runs all the modules after lexical transfer (see apertium-mk-en/modes.xml)
      • -pretransfer, which runs all the modules up to lexical transfer (see apertium-mk-en/modes.xml)
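
The language model referred to below (e.g. en.blm) is a binary IRSTLM model over the target language. If you don't have one yet, the following is a minimal sketch of building a 5-gram model with the standard IRSTLM tools; the file names (corpus.en and the intermediates) are illustrative:

add-start-end.sh < corpus.en > corpus.en.se              # add the sentence markers IRSTLM expects
build-lm.sh -i corpus.en.se -n 5 -o corpus.en.ilm.gz     # estimate a 5-gram LM
compile-lm corpus.en.ilm.gz en.blm                       # compile it into the binary .blm format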

Annotation

Important: If you don't want to go through the whole process step by step, you can use the Makefiles provided in the last section of this page.

We're going to work through the example with the Europarl corpus and the English-Spanish pair in Apertium.

Assuming you have everything installed, the steps are as follows:

Take your corpus and make a tagged version of it:

cat europarl.en-es.es | apertium-destxt | apertium -f none -d ~/source/apertium/apertium-en-es en-es-pretransfer > europarl.en-es.es.tagged

Make an ambiguous version of your corpus and trim redundant tags:

cat europarl.en-es.es.tagged | python ~/source/apertium/apertium-lex-tools/multitrans ~/source/apertium/apertium-en-es/en-es.autobil -b -f -t -n > europarl.en-es.es.ambig

Next, generate all the possible disambiguation paths while trimming redundant tags:

cat europarl.en-es.es.tagged | ~/source/apertium/apertium-lex-tools/multitrans ~/source/apertium/apertium-en-es/en-es.autobil -m -f -t -n > europarl.en-es.es.multi-trimmed

Translate and score all possible disambiguation paths:

cat europarl.en-es.es.tagged | python ~/source/apertium/apertium-lex-tools/multitrans ~/source/apertium/apertium-en-es/en-es.autobil -m -f -n \
  | apertium -f none -d ~/source/apertium/apertium-en-es en-es-multi \
  | ~/source/apertium/apertium-lex-tools/irstlm-ranker ~/source/corpora/lm/en.blm europarl.en-es.es.multi-trimmed -f > europarl.en-es.es.annotated
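
Since these pipelines get long, it can help to wrap the whole annotation stage in a small shell script. The sketch below simply repeats the four commands above with the paths factored into variables, calling multitrans directly (as the Makefiles further down do); adjust the paths to your own checkouts:

#!/bin/bash
# annotation stage: tag, extract ambiguous entries, enumerate paths, translate and score
PAIRDIR=~/source/apertium/apertium-en-es
LEXTOOLS=~/source/apertium/apertium-lex-tools
LM=~/source/corpora/lm/en.blm
CORPUS=europarl.en-es.es

apertium-destxt < "$CORPUS" | apertium -f none -d "$PAIRDIR" en-es-pretransfer > "$CORPUS".tagged
"$LEXTOOLS"/multitrans "$PAIRDIR"/en-es.autobil -b -f -t -n < "$CORPUS".tagged > "$CORPUS".ambig
"$LEXTOOLS"/multitrans "$PAIRDIR"/en-es.autobil -m -f -t -n < "$CORPUS".tagged > "$CORPUS".multi-trimmed
"$LEXTOOLS"/multitrans "$PAIRDIR"/en-es.autobil -m -f -n < "$CORPUS".tagged \
  | apertium -f none -d "$PAIRDIR" en-es-multi \
  | "$LEXTOOLS"/irstlm-ranker "$LM" "$CORPUS".multi-trimmed -f > "$CORPUS".annotated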

Now we have a pseudo-parallel corpus where each possible translation is scored. We start by extracting a frequency lexicon:

	python3 ~/source/apertium/apertium-lex-tools/scripts/biltrans-extract-frac-freq.py europarl.en-es.es.ambig europarl.en-es.es.annotated > europarl.en-es.freq
	python3 ~/source/apertium/apertium-lex-tools/scripts/extract-alig-lrx.py europarl.en-es.freq > europarl.en-es.freq.lrx
	lrx-comp europarl.en-es.freq.lrx europarl.en-es.freq.lrx.bin

From here on, we have two paths we can choose. We can extract rules using a maximum entropy classifier, or we can extract rules based only on the scores provided by irstlm-ranker.

Direct rule extraction

When using this method, we continue directly by extracting ngrams from the pseudo-parallel corpus:

python3 ~/source/apertium/apertium-lex-tools/scripts/biltrans-count-patterns-ngrams.py europarl.en-es.freq europarl.en-es.es.ambig europarl.en-es.es.annotated > ngrams

Next, we prune the generated ngrams:

python3 ~/source/apertium/apertium-lex-tools/scripts/ngram-pruning-frac.py europarl.en-es.freq ngrams > patterns

Finally, we generate and compile lexical selection rules, thresholding them by their irstlm-ranker score:

crisphold=1;
python3 ~/source/apertium/apertium-lex-tools/scripts/ngrams-to-rules.py patterns $crisphold > patterns.lrx
lrx-comp patterns.lrx patterns.lrx.bin
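
The compiled rules are meant to be applied between bilingual dictionary lookup and structural transfer. A rough smoke test of the freshly compiled rules might look like the sketch below; the bare lrx-proc invocation is an assumption (some versions take additional flags), so check the lrx-proc usage message on your installation:

echo "this is a test sentence" | apertium-destxt \
  | apertium -f none -d ~/source/apertium/apertium-en-es en-es-pretransfer \
  | lt-proc -b ~/source/apertium/apertium-en-es/en-es.autobil.bin \
  | lrx-proc patterns.lrx.bin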

Maximum entropy rule extraction

When extracting rules using a maximum entropy criterion, we first extract features which we are going to feed to a classifier:

python3 ~/source/apertium/apertium-lex-tools/scripts/biltrans-count-patterns-frac-maxent.py europarl.en-es.freq \
  europarl.en-es.es.ambig europarl.en-es.es.annotated > events 2>ngrams

We then train classifiers which, as a side effect, score how much each ngram contributes to a given translation:

cat events | grep -v -e '\$ 0\.0 #' -e '\$ 0 #' > events.trimmed
cat events.trimmed | python ~/source/apertium/apertium-lex-tools/scripts/merge-all-lambdas.py ~/source/apertium/apertium-lex-tools/yasmet > all-lambdas
python3 ~/source/apertium-lex-tools/scripts/merge-ngrams-lambdas.py ngrams all-lambdas > rules-all

Next, we extract ngrams:

python3 ~/source/apertium-lex-tools/scripts/lambdas-to-rules.py europarl.en-es.freq rules-all > ngrams-all

we trim them:

python3 ~/source/apertium-lex-tools/scripts/ngram-pareto-trim.py ngrams-all > ngrams-trimmed

and generate lexical selection rules:

python3 ~/source/apertium-lex-tools/scripts/ngrams-to-rules-me.py ngrams-trimmed > europarl.en-es.lrx
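
As with the direct method, the resulting rule file can then be compiled for use in the translation pipeline; this assumes ngrams-to-rules-me.py writes the same lrx XML format that lrx-comp expects (the corresponding Makefile target below is called .lm.xml):

lrx-comp europarl.en-es.lrx europarl.en-es.lrx.bin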


Makefiles

Direct rule extraction

You can use this makefile to generate rules directly from the irstlm-ranker scores (the direct rule extraction method described above). Your corpus needs to be placed in the same folder as your makefile.

CORPUS=setimes
PAIR=mk-en
DATA=/home/philip/Apertium/apertium-mk-en
SL=mk
TL=en
TRAINING_LINES=10000
# threshold for ngrams-to-rules.py
THR=1
SCRIPTS=/home/philip/Apertium/apertium-lex-tools/scripts
MODEL=/home/philip/Apertium/corpora/language-models/en/setimes.en.5.blm
LEX_TOOLS=/home/philip/Apertium/apertium-lex-tools



AUTOBIL=$(SL)-$(TL).autobil.bin
DIR=$(SL)-$(TL)

all: data/$(CORPUS).$(DIR).freq.lrx.bin data/$(CORPUS).$(DIR).patterns.lrx

data/$(CORPUS).$(DIR).tagger: $(CORPUS).$(PAIR).$(SL)
	if [ ! -d data ]; then mkdir data; fi
	cat $(CORPUS).$(PAIR).$(SL) | head -n $(TRAINING_LINES)| sed 's/[^\.]$$/./g' | apertium-destxt | apertium -f none -d $(DATA) $(DIR)-tagger | apertium-pretransfer > $@
 
data/$(CORPUS).$(DIR).ambig: data/$(CORPUS).$(DIR).tagger
	cat data/$(CORPUS).$(DIR).tagger | $(LEX_TOOLS)/process-tagger-output $(DATA)/$(AUTOBIL) -b -t -f -n > $@

data/$(CORPUS).$(DIR).multi-trimmed: data/$(CORPUS).$(DIR).tagger
	cat data/$(CORPUS).$(DIR).tagger | $(LEX_TOOLS)/process-tagger-output $(DATA)/$(AUTOBIL) -m -t -f > $@

data/$(CORPUS).$(DIR).annotated: data/$(CORPUS).$(DIR).tagger data/$(CORPUS).$(DIR).multi-trimmed
	cat data/$(CORPUS).$(DIR).tagger | $(LEX_TOOLS)/process-tagger-output $(DATA)/$(AUTOBIL) -m -f | apertium -f none -d $(DATA) $(DIR)-multi | $(LEX_TOOLS)/irstlm-ranker $(MODEL) data/$(CORPUS).$(DIR).multi-trimmed -f 2>/dev/null | grep "|@|" | cut -f 1-3 > $@ 

data/$(CORPUS).$(DIR).freq: data/$(CORPUS).$(DIR).ambig data/$(CORPUS).$(DIR).annotated
	python3 $(SCRIPTS)/biltrans-extract-frac-freq.py data/$(CORPUS).$(DIR).ambig data/$(CORPUS).$(DIR).annotated > $@
 
data/$(CORPUS).$(DIR).freq.lrx:  data/$(CORPUS).$(DIR).freq
	python3 $(SCRIPTS)/extract-alig-lrx.py $< > $@

data/$(CORPUS).$(DIR).freq.lrx.bin: data/$(CORPUS).$(DIR).freq.lrx
	lrx-comp $< $@

data/$(CORPUS).$(DIR).ngrams: data/$(CORPUS).$(DIR).freq data/$(CORPUS).$(DIR).ambig data/$(CORPUS).$(DIR).annotated
	python3 $(SCRIPTS)/biltrans-count-patterns-ngrams.py data/$(CORPUS).$(DIR).freq data/$(CORPUS).$(DIR).ambig data/$(CORPUS).$(DIR).annotated > $@
 
data/$(CORPUS).$(DIR).patterns: data/$(CORPUS).$(DIR).freq data/$(CORPUS).$(DIR).ngrams
	python3 $(SCRIPTS)/ngram-pruning-frac.py data/$(CORPUS).$(DIR).freq data/$(CORPUS).$(DIR).ngrams > $@  
 
data/$(CORPUS).$(DIR).patterns.lrx:  data/$(CORPUS).$(DIR).patterns
	python3 $(SCRIPTS)/ngrams-to-rules.py $< $(THR) > $@


The corpus file needs to be named "basename"."language-pair"."source side". As an illustration, for this Makefile the corpus file is named setimes.mk-en.mk.

Set the Makefile variables as follows:

  • CORPUS denotes the base name of your corpus file
  • PAIR stands for the language pair
  • SL and TL stand for the source and target language
  • DATA is the path to the language resources for the language pair
  • SCRIPTS denotes the path to the lex-tools scripts
  • MODEL is the path to the target-side (binary) language model used for scoring the possible translations of ambiguous words

Finally, executing the Makefile will generate lexical selection rules for the specified language pair.
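
For example, with the listing above saved as Makefile in the same folder as the corpus file setimes.mk-en.mk, training and checking the final targets looks roughly like this:

make
ls data/setimes.mk-en.freq.lrx.bin data/setimes.mk-en.patterns.lrx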

Maximum entropy

You can use this makefile to generate rules using maximum entropy classifiers. Your corpus needs to be placed in the same folder as your makefile.

CORPUS=europarl
PAIR=en-es
DATA=/home/philip/source/apertium-en-es
SL=es
TL=en
MODEL=/home/philip/lm/en.blm
SCRIPTS=/home/philip/source/apertium-lex-tools/scripts
LEX_TOOLS=/home/philip/source/apertium-lex-tools
THR=1
TRAINING_LINES=10000


AUTOBIL=$(SL)-$(TL).autobil.bin
DIR=$(SL)-$(TL)
YASMET=$(LEX_TOOLS)/yasmet
all: data/$(CORPUS).$(DIR).freq.lrx.bin data/$(CORPUS).$(DIR).lm.xml

data/$(CORPUS).$(DIR).tagger: $(CORPUS).$(PAIR).$(SL)
	if [ ! -d data ]; then mkdir data; fi
	cat $(CORPUS).$(PAIR).$(SL) | head -n $(TRAINING_LINES) | apertium-destxt | apertium -f none -d $(DATA) $(DIR)-tagger | apertium-pretransfer > $@
 
data/$(CORPUS).$(DIR).ambig: data/$(CORPUS).$(DIR).tagger
	cat data/$(CORPUS).$(DIR).tagger | $(LEX_TOOLS)/multitrans $(DATA)/$(AUTOBIL) -b -t -f -n > $@

data/$(CORPUS).$(DIR).multi-trimmed: data/$(CORPUS).$(DIR).tagger
	cat data/$(CORPUS).$(DIR).tagger | $(LEX_TOOLS)/multitrans $(DATA)/$(AUTOBIL) -m -t -f > $@

data/$(CORPUS).$(DIR).annotated: data/$(CORPUS).$(DIR).tagger data/$(CORPUS).$(DIR).multi-trimmed
	cat data/$(CORPUS).$(DIR).tagger | $(LEX_TOOLS)/multitrans $(DATA)/$(AUTOBIL) -m -f | apertium -f none -d $(DATA) $(DIR)-multi | $(LEX_TOOLS)/irstlm-ranker $(MODEL) data/$(CORPUS).$(DIR).multi-trimmed -f 2>/dev/null > $@ 

data/$(CORPUS).$(DIR).freq: data/$(CORPUS).$(DIR).ambig data/$(CORPUS).$(DIR).annotated
	python3 $(SCRIPTS)/biltrans-extract-frac-freq.py data/$(CORPUS).$(DIR).ambig data/$(CORPUS).$(DIR).annotated > $@
 
data/$(CORPUS).$(DIR).freq.lrx:  data/$(CORPUS).$(DIR).freq
	python3 $(SCRIPTS)/extract-alig-lrx.py $< > $@

data/$(CORPUS).$(DIR).freq.lrx.bin: data/$(CORPUS).$(DIR).freq.lrx
	lrx-comp $< $@

data/$(CORPUS).$(DIR).events data/$(CORPUS).$(DIR).ngrams: data/$(CORPUS).$(DIR).ambig data/$(CORPUS).$(DIR).annotated data/$(CORPUS).$(DIR).freq
	python3 $(SCRIPTS)/biltrans-count-patterns-frac-maxent.py data/$(CORPUS).$(DIR).freq data/$(CORPUS).$(DIR).ambig data/$(CORPUS).$(DIR).annotated > data/$(CORPUS).$(DIR).events 2>data/$(CORPUS).$(DIR).ngrams

data/$(CORPUS).$(DIR).all-lambdas: data/$(CORPUS).$(DIR).events
	cat data/$(CORPUS).$(DIR).events | grep -v -e '\$$ 0\.0 #' -e '\$$ 0 #' > data/$(CORPUS).$(DIR).events.trimmed
	cat data/$(CORPUS).$(DIR).events.trimmed | python $(SCRIPTS)/merge-all-lambdas.py $(YASMET) > $@

data/$(CORPUS).$(DIR).rules-all: data/$(CORPUS).$(DIR).ngrams data/$(CORPUS).$(DIR).all-lambdas
	python3 $(SCRIPTS)/merge-ngrams-lambdas.py data/$(CORPUS).$(DIR).ngrams data/$(CORPUS).$(DIR).all-lambdas > $@

data/$(CORPUS).$(DIR).ngrams-all: data/$(CORPUS).$(DIR).freq data/$(CORPUS).$(DIR).rules-all
	python3 $(SCRIPTS)/lambdas-to-rules.py data/$(CORPUS).$(DIR).freq data/$(CORPUS).$(DIR).rules-all > $@

data/$(CORPUS).$(DIR).ngrams-trimmed: data/$(CORPUS).$(DIR).ngrams-all
	cat $< | python3 $(SCRIPTS)/ngram-pareto-trim.py > $@

data/$(CORPUS).$(DIR).lm.xml: data/$(CORPUS).$(DIR).ngrams-trimmed
	python3 $(SCRIPTS)/ngrams-to-rules-me.py data/$(CORPUS).$(DIR).ngrams-trimmed > $@

The corpus file needs to be named "basename"."language-pair"."source side". As an illustration, for this Makefile the corpus file is named europarl.en-es.es.

Set the Makefile variables as follows:

  • CORPUS denotes the base name of your corpus file
  • PAIR stands for the language pair
  • SL and TL stand for the source and target language
  • DATA is the path to the language resources for the language pair
  • SCRIPTS denotes the path to the lex-tools scripts
  • MODEL is the path to the target-side (binary) language model used for scoring the possible translations of ambiguous words

Finally, executing the Makefile will generate lexical selection rules for the specified language pair.
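
Note that in this makefile DIR (es-en) differs from PAIR (en-es), so the generated files are named after DIR. Running it and checking the final targets looks roughly like this:

make
ls data/europarl.es-en.freq.lrx.bin data/europarl.es-en.lm.xml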