Adding orthography conversion to a language module
Revision as of 15:40, 12 January 2023 by Firespeaker (talk | contribs) (Created page with " == One approach == This accomplishes the following goal: * Analysis of two orthographies (by default) * Generation in orthography of choice (defaults to transducer orthogra...")
One approach
This accomplishes the following goals:
- Analysis of two orthographies (by default)
- Generation in orthography of choice (defaults to transducer orthography)
- Conversion from orthography 1 to orthography 2
What's missing/broken:
- Explicit conversion of orthography 2 to orthography 1
- Will work in some simple cases. Other cases will need explicit orthography conversion `lexd` and `twol` files from orthography 2 to orthography 1
- Orthography-specific spellrelax
- See apertium-krc's Makefile for an example of how to do this.
A diff
- Replace `abc` with language code
- Replace `ORTH1` with abbreviation for orthography 1
- Replace `ORTH2` with abbreviation for orthography 2
diff --git a/Makefile.am b/Makefile.am
index 3537fd7..86d9751 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -3,6 +3,10 @@
###############################################################################
LANG1=abc
+SCRIPT1=ORTH1
+SCRIPT2=ORTH2
+LANG1SCRIPT1=$(LANG1)@$(SCRIPT1)
+LANG1SCRIPT2=$(LANG1)@$(SCRIPT2)
BASENAME=apertium-$(LANG1)
TARGETS_COMMON = \
@@ -14,6 +18,12 @@ TARGETS_COMMON = \
$(LANG1).autogen.att.gz \
$(LANG1).autopgen.bin \
$(LANG1).rlx.bin \
+ $(LANG1SCRIPT1).autogen.hfst \
+ $(LANG1SCRIPT2).autogen.hfst \
+ $(LANG1SCRIPT1).autogen.bin \
+ $(LANG1SCRIPT2).autogen.bin \
+ $(LANG1).$(SCRIPT1)-$(SCRIPT2).hfst \
+ $(LANG1).$(SCRIPT2)-$(SCRIPT1).hfst \
$(LANG1).zhfst
# This include defines goals for install-modes, .deps/.d, autobil.prefixes and .mode files:
@@ -49,14 +59,73 @@ TARGETS_COMMON = \
.deps/$(LANG1).LR.hfst: .deps/$(LANG1).LR.lexd.hfst .deps/$(LANG1).twol.hfst
hfst-compose-intersect -1 .deps/$(LANG1).LR.lexd.hfst -2 .deps/$(LANG1).twol.hfst -o $@
-$(LANG1).autogen.hfst: .deps/$(LANG1).RL.hfst
+# SCRIPT1 autogen, converted from .deps/$(LANG1).RL.hfst with hfst-fst2fst -O.
+# Default autogen is SCRIPT1: $(LANG1).autogen.hfst is a plain copy of it.
+$(LANG1SCRIPT1).autogen.hfst: .deps/$(LANG1).RL.hfst
hfst-fst2fst -O $< -o $@
+$(LANG1).autogen.hfst: $(LANG1SCRIPT1).autogen.hfst
+ cp $< $@
+
.deps/$(LANG1).spellrelax.hfst: $(BASENAME).$(LANG1).spellrelax .deps/.d
hfst-regexp2fst -S -o $@ $<
-$(LANG1).automorf.hfst: .deps/$(LANG1).LR.hfst .deps/$(LANG1).spellrelax.hfst
- hfst-compose -1 $< -2 .deps/$(LANG1).spellrelax.hfst | hfst-invert | hfst-fst2fst -O -o $@
+# SCRIPT2 autogen: .deps/$(LANG1).RL.hfst (2nd prerequisite) composed with the SCRIPT1-to-SCRIPT2 transducer (1st prerequisite)
+$(LANG1SCRIPT2).autogen.hfst: .deps/$(LANG1SCRIPT1)-$(LANG1SCRIPT2).hfst .deps/$(LANG1).RL.hfst .deps/.d
+ hfst-compose -1 $(word 2,$^) -2 $< | hfst-fst2fst -w -o $@
+
+# Per-orthography analysers (intermediate files kept in .deps/)
+
+# SCRIPT1 automorf: LR transducer combined with spellrelax, then inverted
+.deps/$(LANG1SCRIPT1).automorf.hfst: .deps/$(LANG1).LR.hfst .deps/$(LANG1).spellrelax.hfst .deps/.d
+ hfst-compose-intersect -1 $< -2 .deps/$(LANG1).spellrelax.hfst | hfst-invert -o $@
+
+# SCRIPT2 automorf: LR transducer (2nd prerequisite) composed with the SCRIPT1-to-SCRIPT2 transducer (1st), then spellrelax (3rd), then inverted
+.deps/$(LANG1SCRIPT2).automorf.hfst: .deps/$(LANG1SCRIPT1)-$(LANG1SCRIPT2).hfst .deps/$(LANG1).LR.hfst .deps/$(LANG1).spellrelax.hfst .deps/.d
+ hfst-compose -1 $(word 2,$^) -2 $< | hfst-compose-intersect -1 - -2 $(word 3,$^) | hfst-invert -o $@
+
+# automorf that analyses SCRIPT1 and SCRIPT2: both per-script analysers are inverted into .deps/*.REVautomorf.hfst, unioned, and inverted back
+$(LANG1).automorf.hfst: .deps/$(LANG1SCRIPT1).automorf.hfst .deps/$(LANG1SCRIPT2).automorf.hfst
+ hfst-invert $< -o .deps/$(LANG1SCRIPT1).REVautomorf.hfst
+ hfst-invert $(word 2,$^) -o .deps/$(LANG1SCRIPT2).REVautomorf.hfst
+ hfst-union -1 .deps/$(LANG1SCRIPT1).REVautomorf.hfst -2 .deps/$(LANG1SCRIPT2).REVautomorf.hfst | hfst-invert | hfst-minimise | hfst-fst2fst -w -o $@
+
+# SCRIPT1 to SCRIPT2 transducer: the compiled lexd stage combined with the compiled twol stage (hfst-compose-intersect)
+.deps/$(LANG1SCRIPT1)-$(LANG1SCRIPT2).hfst: .deps/$(LANG1SCRIPT1)-$(LANG1SCRIPT2).lexd.hfst .deps/$(LANG1SCRIPT1)-$(LANG1SCRIPT2).twol.hfst .deps/.d
+ hfst-compose-intersect -1 $< -2 .deps/$(LANG1SCRIPT1)-$(LANG1SCRIPT2).twol.hfst -o $@
+
+# compile the first (lexd) stage of the SCRIPT1-SCRIPT2 transliteration transducer, going through an intermediate .lexd.att file in .deps/
+.deps/$(LANG1SCRIPT1)-$(LANG1SCRIPT2).lexd.hfst: $(BASENAME).$(SCRIPT1)-$(SCRIPT2).lexd .deps/.d
+ lexd $< .deps/$(LANG1SCRIPT1)-$(LANG1SCRIPT2).lexd.att
+ hfst-txt2fst .deps/$(LANG1SCRIPT1)-$(LANG1SCRIPT2).lexd.att -o $@
+
+# compile the second (twol) stage of the SCRIPT1-SCRIPT2 transliteration transducer; depends on .deps/.d because the output is written into .deps/
+.deps/$(LANG1SCRIPT1)-$(LANG1SCRIPT2).twol.hfst: $(BASENAME).$(SCRIPT1)-$(SCRIPT2).twol .deps/.d
+ hfst-twolc $< -o $@
+
+# SCRIPT1 to SCRIPT2 orthographic converter, built in three forms:
+# .hfst (hfst-fst2fst -O), .att text dump in .deps/, and .bin (lt-comp -H)
+$(LANG1).$(SCRIPT1)-$(SCRIPT2).hfst: .deps/$(LANG1SCRIPT1)-$(LANG1SCRIPT2).hfst .deps/.d
+ hfst-fst2fst $< -Oo $@
+
+.deps/$(LANG1SCRIPT1)-$(LANG1SCRIPT2).att: .deps/$(LANG1SCRIPT1)-$(LANG1SCRIPT2).hfst .deps/.d
+ hfst-fst2txt $< -o $@
+
+$(LANG1).$(SCRIPT1)-$(SCRIPT2).bin: .deps/$(LANG1SCRIPT1)-$(LANG1SCRIPT2).att .deps/.d
+ lt-comp -H lr $< $@
+
+# SCRIPT2 to SCRIPT1 orthographic converter: the SCRIPT1-to-SCRIPT2 transducer
+# run through hfst-invert, built as .hfst, .att (in .deps/) and .bin
+$(LANG1).$(SCRIPT2)-$(SCRIPT1).hfst: .deps/$(LANG1SCRIPT1)-$(LANG1SCRIPT2).hfst .deps/.d
+ hfst-invert $< | hfst-fst2fst -Oo $@
+
+.deps/$(LANG1SCRIPT2)-$(LANG1SCRIPT1).att: .deps/$(LANG1SCRIPT1)-$(LANG1SCRIPT2).hfst .deps/.d
+ hfst-invert $< | hfst-fst2txt -o $@
+
+$(LANG1).$(SCRIPT2)-$(SCRIPT1).bin: .deps/$(LANG1SCRIPT2)-$(LANG1SCRIPT1).att .deps/.d
+ lt-comp -H lr $< $@
+
+# bin files of automorfs and autogens
$(LANG1).autogen.att.gz: $(LANG1).autogen.hfst
hfst-fst2txt $< | gzip -9 -c -n > $@
@@ -64,10 +133,13 @@ $(LANG1).autogen.att.gz: $(LANG1).autogen.hfst
$(LANG1).automorf.att.gz: $(LANG1).automorf.hfst
hfst-fst2txt $< | gzip -9 -c -n > $@
-$(LANG1).autogen.bin: $(LANG1).autogen.att.gz .deps/.d
+$(LANG1SCRIPT1).autogen.bin: $(LANG1).autogen.att.gz .deps/.d
zcat < $< > .deps/$(LANG1).autogen.att
lt-comp lr .deps/$(LANG1).autogen.att $@
+$(LANG1).autogen.bin: $(LANG1SCRIPT1).autogen.bin
+ cp $< $@
+
$(LANG1).automorf.bin: $(LANG1).automorf.att.gz .deps/.d
zcat < $< > .deps/$(LANG1).automorf.att
lt-comp lr .deps/$(LANG1).automorf.att $@
@@ -75,6 +147,14 @@ $(LANG1).automorf.bin: $(LANG1).automorf.att.gz .deps/.d
$(LANG1).autopgen.bin: $(BASENAME).post-$(LANG1).dix
lt-comp lr $< $@
+# SCRIPT2 autogen: gzipped text dump of the .hfst, then unpacked into .deps/ and compiled with lt-comp
+$(LANG1SCRIPT2).autogen.att.gz: $(LANG1SCRIPT2).autogen.hfst
+ hfst-fst2txt $< | gzip -9 -c -n > $@
+
+$(LANG1SCRIPT2).autogen.bin: $(LANG1SCRIPT2).autogen.att.gz .deps/.d
+ zcat < $< > .deps/$(LANG1SCRIPT2).autogen.att
+ lt-comp lr .deps/$(LANG1SCRIPT2).autogen.att $@
+
###############################################################################
## Debugging transducers (for testvoc)
###############################################################################
diff --git a/modes.xml b/modes.xml
index 49740ed..1e152f9 100644
--- a/modes.xml
+++ b/modes.xml
@@ -36,6 +36,38 @@
</pipeline>
</mode>
+ <mode name="abc_ORTH1-gener" install="yes"> <!-- lt-proc -g with the ORTH1 generator (abc@ORTH1.autogen.bin) -->
+ <pipeline>
+ <program name="lt-proc -g">
+ <file name="abc@ORTH1.autogen.bin"/>
+ </program>
+ </pipeline>
+ </mode>
+
+ <mode name="abc_ORTH2-gener" install="yes"> <!-- lt-proc -g with the ORTH2 generator (abc@ORTH2.autogen.bin) -->
+ <pipeline>
+ <program name="lt-proc -g">
+ <file name="abc@ORTH2.autogen.bin"/>
+ </program>
+ </pipeline>
+ </mode>
+
+ <mode name="abc_ORTH1-abc_ORTH2" install="yes"> <!-- hfst-proc with the ORTH1-to-ORTH2 converter (abc.ORTH1-ORTH2.hfst) -->
+ <pipeline>
+ <program name="hfst-proc">
+ <file name="abc.ORTH1-ORTH2.hfst"/>
+ </program>
+ </pipeline>
+ </mode>
+
+ <mode name="abc_ORTH2-abc_ORTH1" install="yes"> <!-- hfst-proc with the ORTH2-to-ORTH1 converter (abc.ORTH2-ORTH1.hfst) -->
+ <pipeline>
+ <program name="hfst-proc">
+ <file name="abc.ORTH2-ORTH1.hfst"/>
+ </program>
+ </pipeline>
+ </mode>
+
<mode name="abc-tagger" install="yes">
<pipeline>
<program name="lt-proc -w">