Adding orthography conversion to a language module
Jump to navigation
Jump to search
One approach[edit]
This accomplishes the following goals:
- Analysis of two orthographies (by default)
- Generation in orthography of choice (defaults to transducer orthography)
- Conversion from orthography 1 to orthography 2
What's missing/broken:
- Explicit conversion of orthography 2 to orthography 1
  - Inverting the orthography 1 → orthography 2 converter will work in some simple cases. Other cases will need explicit orthography-conversion `lexd` and `twol` files from orthography 2 to orthography 1.
- Orthography-specific spellrelax
- See apertium-krc's Makefile for an example of how to do this.
A diff[edit]
- Replace `abc` with the language code
- Replace `ORTH1` with the abbreviation for orthography 1
- Replace `ORTH2` with the abbreviation for orthography 2
diff --git a/Makefile.am b/Makefile.am
index 3537fd7..86d9751 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -3,6 +3,10 @@
 ###############################################################################
 
 LANG1=abc
+SCRIPT1=ORTH1
+SCRIPT2=ORTH2
+LANG1SCRIPT1=$(LANG1)@$(SCRIPT1)
+LANG1SCRIPT2=$(LANG1)@$(SCRIPT2)
 BASENAME=apertium-$(LANG1)
 
 TARGETS_COMMON = \
@@ -14,6 +18,12 @@ TARGETS_COMMON = \
 	$(LANG1).autogen.att.gz \
 	$(LANG1).autopgen.bin \
 	$(LANG1).rlx.bin \
+	$(LANG1SCRIPT1).autogen.hfst \
+	$(LANG1SCRIPT2).autogen.hfst \
+	$(LANG1SCRIPT1).autogen.bin \
+	$(LANG1SCRIPT2).autogen.bin \
+	$(LANG1).$(SCRIPT1)-$(SCRIPT2).hfst \
+	$(LANG1).$(SCRIPT2)-$(SCRIPT1).hfst \
 	$(LANG1).zhfst
 
 # This include defines goals for install-modes, .deps/.d, autobil.prefixes and .mode files:
@@ -49,14 +59,73 @@ TARGETS_COMMON = \
 .deps/$(LANG1).LR.hfst: .deps/$(LANG1).LR.lexd.hfst .deps/$(LANG1).twol.hfst
 	hfst-compose-intersect -1 .deps/$(LANG1).LR.lexd.hfst -2 .deps/$(LANG1).twol.hfst -o $@
 
-$(LANG1).autogen.hfst: .deps/$(LANG1).RL.hfst
+# Default autogen is SCRIPT1: the plain $(LANG1) generator below is a copy of the SCRIPT1 one
+
+$(LANG1SCRIPT1).autogen.hfst: .deps/$(LANG1).RL.hfst
 	hfst-fst2fst -O $< -o $@
 
+$(LANG1).autogen.hfst: $(LANG1SCRIPT1).autogen.hfst
+	cp $< $@
+
 .deps/$(LANG1).spellrelax.hfst: $(BASENAME).$(LANG1).spellrelax .deps/.d
 	hfst-regexp2fst -S -o $@ $<
 
-$(LANG1).automorf.hfst: .deps/$(LANG1).LR.hfst .deps/$(LANG1).spellrelax.hfst
-	hfst-compose -1 $< -2 .deps/$(LANG1).spellrelax.hfst | hfst-invert | hfst-fst2fst -O -o $@
+# SCRIPT2 autogen: the RL transducer composed with the SCRIPT1-to-SCRIPT2 converter
+$(LANG1SCRIPT2).autogen.hfst: .deps/$(LANG1SCRIPT1)-$(LANG1SCRIPT2).hfst .deps/$(LANG1).RL.hfst .deps/.d
+	hfst-compose -1 $(word 2,$^) -2 $< | hfst-fst2fst -w -o $@
+
+# Base orthographic converter
+
+# SCRIPT1 automorf: LR transducer with spellrelax applied, then inverted
+.deps/$(LANG1SCRIPT1).automorf.hfst: .deps/$(LANG1).LR.hfst .deps/$(LANG1).spellrelax.hfst .deps/.d
+	hfst-compose-intersect -1 $< -2 .deps/$(LANG1).spellrelax.hfst | hfst-invert -o $@
+
+# SCRIPT2 automorf: LR transducer composed with the converter, spellrelax applied, then inverted
+.deps/$(LANG1SCRIPT2).automorf.hfst: .deps/$(LANG1SCRIPT1)-$(LANG1SCRIPT2).hfst .deps/$(LANG1).LR.hfst .deps/$(LANG1).spellrelax.hfst .deps/.d
+	hfst-compose -1 $(word 2,$^) -2 $< | hfst-compose-intersect -1 - -2 $(word 3,$^) | hfst-invert -o $@
+
+# automorf that analyses SCRIPT1 and SCRIPT2 (union of the two per-script analysers)
+$(LANG1).automorf.hfst: .deps/$(LANG1SCRIPT1).automorf.hfst .deps/$(LANG1SCRIPT2).automorf.hfst
+	hfst-invert $< -o .deps/$(LANG1SCRIPT1).REVautomorf.hfst
+	hfst-invert $(word 2,$^) -o .deps/$(LANG1SCRIPT2).REVautomorf.hfst
+	hfst-union -1 .deps/$(LANG1SCRIPT1).REVautomorf.hfst -2 .deps/$(LANG1SCRIPT2).REVautomorf.hfst | hfst-invert | hfst-minimise | hfst-fst2fst -w -o $@
+
+# SCRIPT1 to SCRIPT2 transducer
+.deps/$(LANG1SCRIPT1)-$(LANG1SCRIPT2).hfst: .deps/$(LANG1SCRIPT1)-$(LANG1SCRIPT2).lexd.hfst .deps/$(LANG1SCRIPT1)-$(LANG1SCRIPT2).twol.hfst .deps/.d
+	hfst-compose-intersect -1 $< -2 .deps/$(LANG1SCRIPT1)-$(LANG1SCRIPT2).twol.hfst -o $@
+
+# compile the first stage of the SCRIPT1-SCRIPT2 transliteration transducer
+.deps/$(LANG1SCRIPT1)-$(LANG1SCRIPT2).lexd.hfst: $(BASENAME).$(SCRIPT1)-$(SCRIPT2).lexd .deps/.d
+	lexd $< .deps/$(LANG1SCRIPT1)-$(LANG1SCRIPT2).lexd.att
+	hfst-txt2fst .deps/$(LANG1SCRIPT1)-$(LANG1SCRIPT2).lexd.att -o $@
+
+# compile the second stage of the SCRIPT1-SCRIPT2 transliteration transducer
+.deps/$(LANG1SCRIPT1)-$(LANG1SCRIPT2).twol.hfst: $(BASENAME).$(SCRIPT1)-$(SCRIPT2).twol .deps/.d
+	hfst-twolc $< -o $@
+
+# SCRIPT1 to SCRIPT2 orthographic converter
+
+$(LANG1).$(SCRIPT1)-$(SCRIPT2).hfst: .deps/$(LANG1SCRIPT1)-$(LANG1SCRIPT2).hfst .deps/.d
+	hfst-fst2fst $< -Oo $@
+
+.deps/$(LANG1SCRIPT1)-$(LANG1SCRIPT2).att: .deps/$(LANG1SCRIPT1)-$(LANG1SCRIPT2).hfst .deps/.d
+	hfst-fst2txt $< -o $@
+
+$(LANG1).$(SCRIPT1)-$(SCRIPT2).bin: .deps/$(LANG1SCRIPT1)-$(LANG1SCRIPT2).att .deps/.d
+	lt-comp -H lr $< $@
+
+# SCRIPT2 to SCRIPT1 orthographic converter (the SCRIPT1-to-SCRIPT2 transducer, inverted)
+
+$(LANG1).$(SCRIPT2)-$(SCRIPT1).hfst: .deps/$(LANG1SCRIPT1)-$(LANG1SCRIPT2).hfst .deps/.d
+	hfst-invert $< | hfst-fst2fst -Oo $@
+
+.deps/$(LANG1SCRIPT2)-$(LANG1SCRIPT1).att: .deps/$(LANG1SCRIPT1)-$(LANG1SCRIPT2).hfst .deps/.d
+	hfst-invert $< | hfst-fst2txt -o $@
+
+$(LANG1).$(SCRIPT2)-$(SCRIPT1).bin: .deps/$(LANG1SCRIPT2)-$(LANG1SCRIPT1).att .deps/.d
+	lt-comp -H lr $< $@
+
+# bin files of automorfs and autogens
 
 $(LANG1).autogen.att.gz: $(LANG1).autogen.hfst
 	hfst-fst2txt $< | gzip -9 -c -n > $@
@@ -64,10 +133,13 @@ $(LANG1).autogen.att.gz: $(LANG1).autogen.hfst
 $(LANG1).automorf.att.gz: $(LANG1).automorf.hfst
 	hfst-fst2txt $< | gzip -9 -c -n > $@
 
-$(LANG1).autogen.bin: $(LANG1).autogen.att.gz .deps/.d
+$(LANG1SCRIPT1).autogen.bin: $(LANG1).autogen.att.gz .deps/.d
 	zcat < $< > .deps/$(LANG1).autogen.att
 	lt-comp lr .deps/$(LANG1).autogen.att $@
 
+$(LANG1).autogen.bin: $(LANG1SCRIPT1).autogen.bin
+	cp $< $@
+
 $(LANG1).automorf.bin: $(LANG1).automorf.att.gz .deps/.d
 	zcat < $< > .deps/$(LANG1).automorf.att
 	lt-comp lr .deps/$(LANG1).automorf.att $@
@@ -75,6 +147,14 @@ $(LANG1).automorf.bin: $(LANG1).automorf.att.gz .deps/.d
 $(LANG1).autopgen.bin: $(BASENAME).post-$(LANG1).dix
 	lt-comp lr $< $@
 
+$(LANG1SCRIPT2).autogen.att.gz: $(LANG1SCRIPT2).autogen.hfst
+	hfst-fst2txt $< | gzip -9 -c -n > $@
+
+$(LANG1SCRIPT2).autogen.bin: $(LANG1SCRIPT2).autogen.att.gz .deps/.d
+	zcat < $< > .deps/$(LANG1SCRIPT2).autogen.att
+	lt-comp lr .deps/$(LANG1SCRIPT2).autogen.att $@
+
+
 ###############################################################################
 ## Debugging transducers (for testvoc)
 ###############################################################################
diff --git a/modes.xml b/modes.xml
index 49740ed..1e152f9 100644
--- a/modes.xml
+++ b/modes.xml
@@ -36,6 +36,38 @@
     </pipeline>
   </mode>
 
+  <mode name="abc_ORTH1-gener" install="yes">
+    <pipeline>
+      <program name="lt-proc -g">
+        <file name="abc@ORTH1.autogen.bin"/>
+      </program>
+    </pipeline>
+  </mode>
+
+  <mode name="abc_ORTH2-gener" install="yes">
+    <pipeline>
+      <program name="lt-proc -g">
+        <file name="abc@ORTH2.autogen.bin"/>
+      </program>
+    </pipeline>
+  </mode>
+
+  <mode name="abc_ORTH1-abc_ORTH2" install="yes">
+    <pipeline>
+      <program name="hfst-proc">
+        <file name="abc.ORTH1-ORTH2.hfst"/>
+      </program>
+    </pipeline>
+  </mode>
+
+  <mode name="abc_ORTH2-abc_ORTH1" install="yes">
+    <pipeline>
+      <program name="hfst-proc">
+        <file name="abc.ORTH2-ORTH1.hfst"/>
+      </program>
+    </pipeline>
+  </mode>
+
   <mode name="abc-tagger" install="yes">
     <pipeline>
       <program name="lt-proc -w">