Adding orthography conversion to a language module

From Apertium
Jump to navigation Jump to search


One approach[edit]

This accomplishes the following goals:

  • Analysis of two orthographies (by default)
  • Generation in orthography of choice (defaults to transducer orthography)
  • Conversion from orthography 1 to orthography 2

What's missing/broken:

  • Explicit conversion of orthography 2 to orthography 1
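    • Will work in some simple cases, because the reverse converter is simply the orthography 1 → orthography 2 transducer inverted. Other cases will need a dedicated orthography-conversion lexd and twol pair written for orthography 2 to orthography 1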
  • Orthography-specific spellrelax

A diff[edit]

  • Replace abc with language code
  • Replace ORTH1 with abbreviation for orthography 1
  • Replace ORTH2 with abbreviation for orthography 2
diff --git a/Makefile.am b/Makefile.am
index 3537fd7..86d9751 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -3,6 +3,10 @@
 ###############################################################################
 
 LANG1=abc
+SCRIPT1=ORTH1
+SCRIPT2=ORTH2
+LANG1SCRIPT1=$(LANG1)@$(SCRIPT1)
+LANG1SCRIPT2=$(LANG1)@$(SCRIPT2)
 BASENAME=apertium-$(LANG1)
 
 TARGETS_COMMON =			\
@@ -14,6 +18,12 @@ TARGETS_COMMON =			\
 	$(LANG1).autogen.att.gz		\
 	$(LANG1).autopgen.bin		\
 	$(LANG1).rlx.bin            \
+	$(LANG1SCRIPT1).autogen.hfst \
+	$(LANG1SCRIPT2).autogen.hfst \
+	$(LANG1SCRIPT1).autogen.bin \
+	$(LANG1SCRIPT2).autogen.bin \
+	$(LANG1).$(SCRIPT1)-$(SCRIPT2).hfst \
+	$(LANG1).$(SCRIPT2)-$(SCRIPT1).hfst \
 	$(LANG1).zhfst
 
 # This include defines goals for install-modes, .deps/.d, autobil.prefixes and .mode files:
@@ -49,14 +59,73 @@ TARGETS_COMMON =			\
 .deps/$(LANG1).LR.hfst: .deps/$(LANG1).LR.lexd.hfst .deps/$(LANG1).twol.hfst
 	hfst-compose-intersect -1 .deps/$(LANG1).LR.lexd.hfst -2 .deps/$(LANG1).twol.hfst -o $@
 
-$(LANG1).autogen.hfst: .deps/$(LANG1).RL.hfst
+# Default autogen is SCRIPT1: the RL transducer is only format-converted for SCRIPT1,
+# and $(LANG1).autogen.hfst is a plain copy of the SCRIPT1 generator.
+$(LANG1SCRIPT1).autogen.hfst: .deps/$(LANG1).RL.hfst
 	hfst-fst2fst -O $< -o $@
 
+$(LANG1).autogen.hfst: $(LANG1SCRIPT1).autogen.hfst
+	cp $< $@
+
 .deps/$(LANG1).spellrelax.hfst: $(BASENAME).$(LANG1).spellrelax .deps/.d
 	hfst-regexp2fst -S -o $@ $<
 
-$(LANG1).automorf.hfst: .deps/$(LANG1).LR.hfst .deps/$(LANG1).spellrelax.hfst
-	hfst-compose -1 $< -2 .deps/$(LANG1).spellrelax.hfst | hfst-invert | hfst-fst2fst -O -o $@
+# SCRIPT2 autogen: the RL generator composed with the SCRIPT1->SCRIPT2 converter
+$(LANG1SCRIPT2).autogen.hfst: .deps/$(LANG1SCRIPT1)-$(LANG1SCRIPT2).hfst .deps/$(LANG1).RL.hfst .deps/.d
+	hfst-compose -1 .deps/$(LANG1).RL.hfst -2 $< | hfst-fst2fst -w -o $@
+
+# Per-script analysers (both feed the combined $(LANG1).automorf.hfst rule)
+
+# SCRIPT1 automorf: LR transducer with spellrelax applied, then inverted
+.deps/$(LANG1SCRIPT1).automorf.hfst: .deps/$(LANG1).LR.hfst .deps/$(LANG1).spellrelax.hfst .deps/.d
+	hfst-compose-intersect -1 $< -2 $(word 2,$^) | hfst-invert -o $@
+
+# SCRIPT2 automorf: LR composed with the SCRIPT1->SCRIPT2 converter, spellrelax applied, then inverted
+.deps/$(LANG1SCRIPT2).automorf.hfst: .deps/$(LANG1SCRIPT1)-$(LANG1SCRIPT2).hfst .deps/$(LANG1).LR.hfst .deps/$(LANG1).spellrelax.hfst .deps/.d
+	hfst-compose -1 $(word 2,$^) -2 $< | hfst-compose-intersect -1 - -2 $(word 3,$^) | hfst-invert -o $@
+
+# automorf that analyses SCRIPT1 and SCRIPT2: direct union of the two per-script analysers (inversion commutes with union, so no re-inversion or scratch files are needed)
+$(LANG1).automorf.hfst: .deps/$(LANG1SCRIPT1).automorf.hfst .deps/$(LANG1SCRIPT2).automorf.hfst
+	hfst-union -1 $< -2 $(word 2,$^) \
+		| hfst-minimise \
+		| hfst-fst2fst -w -o $@
+
+# SCRIPT1 to SCRIPT2 transducer: the compiled lexd stage compose-intersected with the compiled twol stage
+.deps/$(LANG1SCRIPT1)-$(LANG1SCRIPT2).hfst: .deps/$(LANG1SCRIPT1)-$(LANG1SCRIPT2).lexd.hfst .deps/$(LANG1SCRIPT1)-$(LANG1SCRIPT2).twol.hfst .deps/.d
+	hfst-compose-intersect -1 $< -2 $(word 2,$^) -o $@
+
+# compile the first stage of the SCRIPT1-SCRIPT2 transliteration transducer: lexd writes a .att text file under .deps/, which hfst-txt2fst then compiles into $@
+.deps/$(LANG1SCRIPT1)-$(LANG1SCRIPT2).lexd.hfst: $(BASENAME).$(SCRIPT1)-$(SCRIPT2).lexd .deps/.d
+	lexd $< .deps/$(LANG1SCRIPT1)-$(LANG1SCRIPT2).lexd.att
+	hfst-txt2fst .deps/$(LANG1SCRIPT1)-$(LANG1SCRIPT2).lexd.att -o $@
+
+# compile the second stage (twol rules) of the SCRIPT1-SCRIPT2 transliteration transducer; depends on .deps/.d like the sibling rules that write into .deps/
+.deps/$(LANG1SCRIPT1)-$(LANG1SCRIPT2).twol.hfst: $(BASENAME).$(SCRIPT1)-$(SCRIPT2).twol .deps/.d
+	hfst-twolc $< -o $@
+
+# SCRIPT1 to SCRIPT2 orthographic converter: installable .hfst, text dump, and lt-comp .bin
+
+$(LANG1).$(SCRIPT1)-$(SCRIPT2).hfst: .deps/$(LANG1SCRIPT1)-$(LANG1SCRIPT2).hfst .deps/.d
+	hfst-fst2fst -O $< -o $@
+
+.deps/$(LANG1SCRIPT1)-$(LANG1SCRIPT2).att: .deps/$(LANG1SCRIPT1)-$(LANG1SCRIPT2).hfst .deps/.d
+	hfst-fst2txt -o $@ $<
+
+$(LANG1).$(SCRIPT1)-$(SCRIPT2).bin: .deps/$(LANG1SCRIPT1)-$(LANG1SCRIPT2).att .deps/.d
+	lt-comp -H lr $< $@
+
+# SCRIPT2 to SCRIPT1 orthographic converter: every output is built from the SCRIPT1->SCRIPT2 transducer, inverted
+
+$(LANG1).$(SCRIPT2)-$(SCRIPT1).hfst: .deps/$(LANG1SCRIPT1)-$(LANG1SCRIPT2).hfst .deps/.d
+	hfst-invert $< | hfst-fst2fst -Oo $@
+
+.deps/$(LANG1SCRIPT2)-$(LANG1SCRIPT1).att: .deps/$(LANG1SCRIPT1)-$(LANG1SCRIPT2).hfst .deps/.d
+	hfst-invert $< | hfst-fst2txt -o $@
+
+$(LANG1).$(SCRIPT2)-$(SCRIPT1).bin: .deps/$(LANG1SCRIPT2)-$(LANG1SCRIPT1).att .deps/.d
+	lt-comp -H lr $< $@
+
+# bin files of automorfs and autogens
 
 $(LANG1).autogen.att.gz: $(LANG1).autogen.hfst
 	hfst-fst2txt $< | gzip -9 -c -n > $@
@@ -64,10 +133,13 @@ $(LANG1).autogen.att.gz: $(LANG1).autogen.hfst
 $(LANG1).automorf.att.gz: $(LANG1).automorf.hfst
 	hfst-fst2txt $< | gzip -9 -c -n > $@
 
-$(LANG1).autogen.bin: $(LANG1).autogen.att.gz .deps/.d
+$(LANG1SCRIPT1).autogen.bin: $(LANG1).autogen.att.gz .deps/.d
 	zcat < $< > .deps/$(LANG1).autogen.att
 	lt-comp lr .deps/$(LANG1).autogen.att $@
 
+$(LANG1).autogen.bin: $(LANG1SCRIPT1).autogen.bin
+	cp $< $@
+
 $(LANG1).automorf.bin: $(LANG1).automorf.att.gz .deps/.d
 	zcat < $< > .deps/$(LANG1).automorf.att
 	lt-comp lr .deps/$(LANG1).automorf.att $@
@@ -75,6 +147,14 @@ $(LANG1).automorf.bin: $(LANG1).automorf.att.gz .deps/.d
 $(LANG1).autopgen.bin: $(BASENAME).post-$(LANG1).dix
 	lt-comp lr $< $@
 
+$(LANG1SCRIPT2).autogen.att.gz: $(LANG1SCRIPT2).autogen.hfst
+	hfst-fst2txt $< | gzip -9 -c -n > $@
+
+$(LANG1SCRIPT2).autogen.bin: $(LANG1SCRIPT2).autogen.att.gz .deps/.d
+	zcat < $< > .deps/$(LANG1SCRIPT2).autogen.att
+	lt-comp lr .deps/$(LANG1SCRIPT2).autogen.att $@
+
+
 ###############################################################################
 ## Debugging transducers (for testvoc)
 ###############################################################################
diff --git a/modes.xml b/modes.xml
index 49740ed..1e152f9 100644
--- a/modes.xml
+++ b/modes.xml
@@ -36,6 +36,38 @@
     </pipeline>
   </mode>
 
+  <mode name="abc_ORTH1-gener" install="yes"> <!-- generation with the ORTH1 generator binary -->
+    <pipeline>
+      <program name="lt-proc -g">
+        <file name="abc@ORTH1.autogen.bin"/>
+      </program>
+    </pipeline>
+  </mode>
+
+  <mode name="abc_ORTH2-gener" install="yes"> <!-- generation with the ORTH2 generator binary -->
+    <pipeline>
+      <program name="lt-proc -g">
+        <file name="abc@ORTH2.autogen.bin"/>
+      </program>
+    </pipeline>
+  </mode>
+
+  <mode name="abc_ORTH1-abc_ORTH2" install="yes"> <!-- ORTH1 to ORTH2 conversion: hfst-proc over the converter transducer -->
+    <pipeline>
+      <program name="hfst-proc">
+        <file name="abc.ORTH1-ORTH2.hfst"/>
+      </program>
+    </pipeline>
+  </mode>
+
+  <mode name="abc_ORTH2-abc_ORTH1" install="yes"> <!-- ORTH2 to ORTH1 conversion: hfst-proc over the reverse converter transducer -->
+    <pipeline>
+      <program name="hfst-proc">
+        <file name="abc.ORTH2-ORTH1.hfst"/>
+      </program>
+    </pipeline>
+  </mode>
+
   <mode name="abc-tagger" install="yes">
     <pipeline>
       <program name="lt-proc -w">