Difference between revisions of "Running the monolingual rule learning"

From Apertium
Jump to navigation Jump to search
(Created page with 'Place the following Makefile in the folder where you want to run your training process: <pre> CORPUS=setimes DIR=sh-mk DATA=/home/philip/Apertium/apertium-sh-mk/ AUTOBIL=sh-mk.a…')
 
 
(11 intermediate revisions by 3 users not shown)
Line 1: Line 1:
Prerequisites:
* Install [[apertium-lex-tools]]
* Install IRSTLM (https://github.com/douglasbagnall/irstlm)
* Train a target side language model (http://hermes.fbk.eu/people/bertoldi/teaching/lab_2010-2011/img/irstlm-manual.pdf)
* The language pair must support the pretransfer and multi modes. See apertium-sh-mk/modes.xml
as a reference on how to add these modes if they do not exist.
Place the following Makefile in the folder where you want to run your training process:
Place the following Makefile in the folder where you want to run your training process:


Line 8: Line 14:
SCRIPTS=/home/philip/Apertium/apertium-lex-tools/scripts
SCRIPTS=/home/philip/Apertium/apertium-lex-tools/scripts
MODEL=/home/philip/Apertium/corpora/language-models/mk/setimes.mk.5.blm
MODEL=/home/philip/Apertium/corpora/language-models/mk/setimes.mk.5.blm
LEX_TOOLS=/home/philip/Apertium/apertium-lex-tools
THR=0

#all: data/$(CORPUS).$(DIR).lrx data/$(CORPUS).$(DIR).freq.lrx
#all: data/$(CORPUS).$(DIR).lrx data/$(CORPUS).$(DIR).freq.lrx
all: data/$(CORPUS).$(DIR).freq.lrx.bin
all: data/$(CORPUS).$(DIR).freq.lrx.bin data/$(CORPUS).$(DIR).patterns.lrx

data/$(CORPUS).$(DIR).lines: $(CORPUS).$(DIR).txt
data/$(CORPUS).$(DIR).tagger: $(CORPUS).$(DIR).txt
if [ ! -d data ]; then mkdir data; fi
if [ ! -d data ]; then mkdir data; fi
cat $(CORPUS).$(DIR).txt | sed 's/[^\.]$$/./g' | apertium-destxt | apertium -f none -d $(DATA) $(DIR)-tagger | apertium-pretransfer > $@
seq `cat $< | wc -l` > $@
data/$(CORPUS).$(DIR).biltrans: $(CORPUS).$(DIR).txt
data/$(CORPUS).$(DIR).ambig: data/$(CORPUS).$(DIR).tagger
cat data/$(CORPUS).$(DIR).tagger | $(LEX_TOOLS)/multitrans $(DATA)$(DIR).autobil.bin -b -t > $@
if [ ! -d data ]; then mkdir data; fi
cat $(CORPUS).$(DIR).txt | apertium-destxt | apertium -f none -d $(DATA) $(DIR)-pretransfer | lt-proc -b $(DATA)/$(AUTOBIL) > $@
data/$(CORPUS).$(DIR).ambig: data/$(CORPUS).$(DIR).biltrans data/$(CORPUS).$(DIR).lines
cat -n data/$(CORPUS).$(DIR).biltrans | python3 $(SCRIPTS)/trim-fertile-lines.py | python3 $(SCRIPTS)/biltrans-line-only-pos-ambig.py | python3 $(SCRIPTS)/biltrans-trim-uncovered.py > $@
data/$(CORPUS).$(DIR).multi: data/$(CORPUS).$(DIR).ambig
cat $< | python $(SCRIPTS)/biltrans-to-multitrans-line-recursive.py > $@
data/$(CORPUS).$(DIR).unranked: data/$(CORPUS).$(DIR).multi
cat $< | apertium -f none -d $(DATA) $(DIR)-multi > $@
data/$(CORPUS).$(DIR).ranked: data/$(CORPUS).$(DIR).unranked
cat $< | irstlm-ranker-frac $(MODEL) > $@


data/$(CORPUS).$(DIR).annotated: data/$(CORPUS).$(DIR).multi data/$(CORPUS).$(DIR).ranked
data/$(CORPUS).$(DIR).multi-trimmed: data/$(CORPUS).$(DIR).tagger
paste data/$(CORPUS).$(DIR).multi data/$(CORPUS).$(DIR).ranked | cut -f1-4 > $@
cat data/$(CORPUS).$(DIR).tagger | $(LEX_TOOLS)/multitrans $(DATA)$(DIR).autobil.bin -m -t > $@

data/$(CORPUS).$(DIR).ranked: data/$(CORPUS).$(DIR).tagger
cat $< | $(LEX_TOOLS)/multitrans $(DATA)$(DIR).autobil.bin -m | apertium -f none -d $(DATA) $(DIR)-multi | irstlm-ranker-frac $(MODEL) > $@

data/$(CORPUS).$(DIR).annotated: data/$(CORPUS).$(DIR).multi-trimmed data/$(CORPUS).$(DIR).ranked
paste data/$(CORPUS).$(DIR).multi-trimmed data/$(CORPUS).$(DIR).ranked | cut -f1-4 > $@
data/$(CORPUS).$(DIR).freq: data/$(CORPUS).$(DIR).ambig data/$(CORPUS).$(DIR).annotated
data/$(CORPUS).$(DIR).freq: data/$(CORPUS).$(DIR).ambig data/$(CORPUS).$(DIR).annotated
python3 $(SCRIPTS)/biltrans-extract-frac-freq.py data/$(CORPUS).$(DIR).ambig data/$(CORPUS).$(DIR).annotated > $@
python3 $(SCRIPTS)/biltrans-extract-frac-freq.py data/$(CORPUS).$(DIR).ambig data/$(CORPUS).$(DIR).annotated > $@
data/$(CORPUS).$(DIR).freq.lrx: data/$(CORPUS).$(DIR).freq
python3 $(SCRIPTS)/extract-alig-lrx.py $< > $@

data/$(CORPUS).$(DIR).freq.lrx.bin: data/$(CORPUS).$(DIR).freq.lrx
lrx-comp $< $@

data/$(CORPUS).$(DIR).ngrams: data/$(CORPUS).$(DIR).freq data/$(CORPUS).$(DIR).ambig data/$(CORPUS).$(DIR).annotated
data/$(CORPUS).$(DIR).ngrams: data/$(CORPUS).$(DIR).freq data/$(CORPUS).$(DIR).ambig data/$(CORPUS).$(DIR).annotated
python3 $(SCRIPTS)/biltrans-count-patterns-ngrams.py data/$(CORPUS).$(DIR).freq data/$(CORPUS).$(DIR).ambig data/$(CORPUS).$(DIR).annotated > $@
python3 $(SCRIPTS)/biltrans-count-patterns-ngrams.py data/$(CORPUS).$(DIR).freq data/$(CORPUS).$(DIR).ambig data/$(CORPUS).$(DIR).annotated > $@
Line 44: Line 51:
python3 $(SCRIPTS)/ngram-pruning-frac.py data/$(CORPUS).$(DIR).freq data/$(CORPUS).$(DIR).ngrams > $@
python3 $(SCRIPTS)/ngram-pruning-frac.py data/$(CORPUS).$(DIR).freq data/$(CORPUS).$(DIR).ngrams > $@
data/$(CORPUS).$(DIR).freq.lrx: data/$(CORPUS).$(DIR).freq
data/$(CORPUS).$(DIR).patterns.lrx: data/$(CORPUS).$(DIR).patterns
python3 $(SCRIPTS)/extract-alig-lrx.py $< > $@
python3 $(SCRIPTS)/ngrams-to-rules.py $< $(THR) > $@
data/$(CORPUS).$(DIR).freq.lrx.bin: data/$(CORPUS).$(DIR).freq.lrx
apertium-lrx-comp $< $@

</pre>
</pre>

In the same folder also place your source side corpus file. The corpus file needs to be named as "basename"."language-pair".txt. <br/>
As an illustration, in the Makefile example, the corpus file is named setimes.sh-mk.txt.

Set the Makefile variables as follows: <br/>
* CORPUS denotes the base name of your corpus file
* DIR stands for the language pair
* DATA is the path to the language resources for the language pair
* AUTOBIL is the path to binary bilingual dictionary for the language pair
* SCRIPTS denotes the path to the lex-tools scripts
* MODEL is the path to the target side (binary) language model used for scoring the possible translations of ambiguous words

Finally, executing the Makefile will generate lexical selection rules for the specified language pair.

[[Category:Lexical selection]]

Latest revision as of 19:01, 17 August 2018

Prerequisites:

as a reference on how to add these modes if they do not exist. Place the following Makefile in the folder where you want to run your training process:

CORPUS=setimes
DIR=sh-mk
DATA=/home/philip/Apertium/apertium-sh-mk/
AUTOBIL=sh-mk.autobil.bin
SCRIPTS=/home/philip/Apertium/apertium-lex-tools/scripts
MODEL=/home/philip/Apertium/corpora/language-models/mk/setimes.mk.5.blm
LEX_TOOLS=/home/philip/Apertium/apertium-lex-tools
THR=0

#all: data/$(CORPUS).$(DIR).lrx data/$(CORPUS).$(DIR).freq.lrx
all: data/$(CORPUS).$(DIR).freq.lrx.bin data/$(CORPUS).$(DIR).patterns.lrx

data/$(CORPUS).$(DIR).tagger: $(CORPUS).$(DIR).txt
	if [ ! -d data ]; then mkdir data; fi
	cat $(CORPUS).$(DIR).txt | sed 's/[^\.]$$/./g' | apertium-destxt | apertium -f none -d $(DATA) $(DIR)-tagger | apertium-pretransfer > $@
 
data/$(CORPUS).$(DIR).ambig: data/$(CORPUS).$(DIR).tagger
	cat data/$(CORPUS).$(DIR).tagger | $(LEX_TOOLS)/multitrans $(DATA)$(DIR).autobil.bin -b -t > $@

data/$(CORPUS).$(DIR).multi-trimmed: data/$(CORPUS).$(DIR).tagger
	cat data/$(CORPUS).$(DIR).tagger | $(LEX_TOOLS)/multitrans $(DATA)$(DIR).autobil.bin -m -t > $@

data/$(CORPUS).$(DIR).ranked: data/$(CORPUS).$(DIR).tagger
	cat $< | $(LEX_TOOLS)/multitrans $(DATA)$(DIR).autobil.bin -m | apertium -f none -d $(DATA) $(DIR)-multi | irstlm-ranker-frac $(MODEL) > $@

data/$(CORPUS).$(DIR).annotated: data/$(CORPUS).$(DIR).multi-trimmed data/$(CORPUS).$(DIR).ranked
	paste data/$(CORPUS).$(DIR).multi-trimmed data/$(CORPUS).$(DIR).ranked | cut -f1-4 > $@
 
data/$(CORPUS).$(DIR).freq: data/$(CORPUS).$(DIR).ambig data/$(CORPUS).$(DIR).annotated
	python3 $(SCRIPTS)/biltrans-extract-frac-freq.py  data/$(CORPUS).$(DIR).ambig data/$(CORPUS).$(DIR).annotated > $@
 
data/$(CORPUS).$(DIR).freq.lrx:  data/$(CORPUS).$(DIR).freq
	python3 $(SCRIPTS)/extract-alig-lrx.py $< > $@

data/$(CORPUS).$(DIR).freq.lrx.bin: data/$(CORPUS).$(DIR).freq.lrx
	lrx-comp $< $@

data/$(CORPUS).$(DIR).ngrams: data/$(CORPUS).$(DIR).freq data/$(CORPUS).$(DIR).ambig data/$(CORPUS).$(DIR).annotated
	python3 $(SCRIPTS)/biltrans-count-patterns-ngrams.py data/$(CORPUS).$(DIR).freq data/$(CORPUS).$(DIR).ambig data/$(CORPUS).$(DIR).annotated > $@
 
data/$(CORPUS).$(DIR).patterns: data/$(CORPUS).$(DIR).freq data/$(CORPUS).$(DIR).ngrams
	python3 $(SCRIPTS)/ngram-pruning-frac.py data/$(CORPUS).$(DIR).freq data/$(CORPUS).$(DIR).ngrams > $@  
 
data/$(CORPUS).$(DIR).patterns.lrx:  data/$(CORPUS).$(DIR).patterns
	python3 $(SCRIPTS)/ngrams-to-rules.py $< $(THR) > $@

In the same folder also place your source side corpus file. The corpus file needs to be named as "basename"."language-pair".txt.
As an illustration, in the Makefile example, the corpus file is named setimes.sh-mk.txt.

Set the Makefile variables as follows:

  • CORPUS denotes the base name of your corpus file
  • DIR stands for the language pair
  • DATA is the path to the language resources for the language pair
  • AUTOBIL is the path to binary bilingual dictionary for the language pair
  • SCRIPTS denotes the path to the lex-tools scripts
  • MODEL is the path to the target side (binary) language model used for scoring the possible translations of ambiguous words

Finally, executing the Makefile will generate lexical selection rules for the specified language pair.