Difference between revisions of "Partial hack for prefix inflection"
(New page: {{TOCD}} This is a description of a partial hack for dealing with prefix inflection. Essentially it re-uses as much as possible components that we already have, although requires one more ...) |
|||
(11 intermediate revisions by 2 users not shown) | |||
Line 1: | Line 1: | ||
{{deprecated}} |
|||
{{TOCD}} |
{{TOCD}} |
||
This is a description of a partial hack for dealing with prefix inflection. Essentially it re-uses as much as possible components that we already have, although requires one more component, a <code>pretagger</code> module. |
This is a description of a partial hack for dealing with prefix inflection (see [[prefixes and infixes]]). Essentially, it re-uses as many existing components as possible, although it requires one additional component: a <code>pretagger</code> module. |
||
The first dictionary would be written in the normal manner, the second dictionary would be auto-generated from the first by means of lt-expand output coupled with the pretagger output. |
The first dictionary would be written in the normal manner, the second dictionary would be auto-generated from the first by means of <code>lt-expand</code> output coupled with the pretagger output. |
||
Note: This is basically a hack on Mikel's suggestion #1 on the [[prefixes and infixes]] page. |
|||
The pipeline would look like this: |
|||
One of the advantages of this system is that the "normalisation" scheme (dictionary 2) can be generated from the dictionary + the pretagger and doesn't need to be hand-coded. Disadvantages are that it adds another layer, and probably misses a lot of nuances (e.g. multiwords, etc.). |
|||
==Pipeline== |
|||
<pre> |
|||
$ echo "watu mtu" | lt-proc sw.automorf.bin |
|||
^watu/m<pl>tu<n>$ ^mtu/m<sg>tu<n>$ |
|||
$ echo "watu mtu" | lt-proc sw.automorf.bin | pretagger.py |
|||
^watu/mtu<n><pl>$ ^mtu/mtu<n><sg>$ |
|||
$ echo "watu mtu" | lt-proc sw.automorf.bin | pretagger.py | apertium-tagger -g sw.prob |
|||
^mtu<n><pl>$ ^mtu<n><sg>$ |
|||
$ echo "watu mtu" | lt-proc sw.automorf.bin | pretagger.py | apertium-tagger -g sw.prob | lt-proc -g sw.automangle.bin |
|||
m<pl>tu<n> m<sg>tu<n> |
|||
</pre> |
|||
Then add the missing '^' and '$': |
|||
<pre> |
|||
$ echo "^m<pl>tu<n>$ ^m<sg>tu<n>$" | lt-proc -g sw.autogen.bin |
|||
watu mtu |
|||
</pre> |
|||
==Analysis== |
==Analysis== |
||
Line 111: | Line 134: | ||
</dictionary> |
</dictionary> |
||
</pre> |
</pre> |
||
==Pretagger== |
|||
A very rough 5 minute proof-of-concept: |
|||
<pre> |
|||
#!/usr/bin/python
# coding=utf-8
# -*- encoding: utf-8 -*-
"""Proof-of-concept pretagger for prefix inflection.

Reads analyser output such as ``^watu/m<pl>tu<n>$`` and rewrites every
analysis so that the tags interleaved with the lemma are moved to the end,
e.g. ``^watu/mtu<n><pl>$``.

Known limitations (unchanged from the original proof of concept):
``/``, ``#`` and ``+`` are treated as structural characters wherever they
occur, and no escape sequences are interpreted.
"""

import sys
import codecs
import copy  # not used below; kept because the original script imported it


def pop_tags(tags, out):
    """Return ``out`` followed by the tags in reverse order.

    ``tags`` is emptied as a side effect, exactly as in the original
    recursive version; a loop is used so the depth is not bounded by the
    interpreter's recursion limit.
    """
    while tags:
        out = out + tags.pop()
    return out


def pull_tags(tags, out):
    """Return ``out`` followed by the tags in their original order.

    ``tags`` is left unmodified.
    """
    return out + ''.join(tags)


def pretag(text):
    """Return ``text`` with the tags of each analysis moved after the lemma.

    The text is scanned one character at a time with a small state machine
    whose state (``reading``) is one of:

    * ``'pass'``  - copy characters straight to the output,
    * ``'form'``  - collecting the surface form (after ``^``),
    * ``'lemma'`` - collecting lemma characters (after ``/``),
    * ``'tag'``   - collecting one ``<...>`` tag.

    Before ``/``, ``#`` and ``$`` a pending lemma is written followed by its
    tags in reverse order of appearance; before ``+`` the tags keep their
    original order.
    """
    pieces = []
    emit = pieces.append

    form = ''
    lemma = ''
    tag = ''
    tags = []
    # Start in 'pass' so that text preceding the first '^' is copied to the
    # output rather than silently discarded.
    reading = 'pass'

    for char in text:
        if char == '^':
            emit(char)
            reading = 'form'
            continue

        if char == '/':
            reading = 'lemma'
            # A pending analysis exists only once the surface form has been
            # written.  Any non-empty lemma counts, including one-character
            # lemmas such as "a".
            if not form and lemma:
                emit(lemma + pop_tags(tags, ''))
                lemma = ''
                tags = []
            emit(form)
            emit(char)
            form = ''
            continue

        if char == '#':
            reading = 'pass'
            if not form and lemma:
                emit(lemma + pop_tags(tags, ''))
                form = ''
                lemma = ''
                tags = []
            emit(char)
            continue

        if char == '+':
            reading = 'pass'
            if not form and lemma:
                # Unlike the other separators, '+' keeps the tag order.
                emit(lemma + pull_tags(tags, ''))
                form = ''
                lemma = ''
                tags = []
            emit(char)
            continue

        # String states are compared with ==/!=; identity comparison against
        # a literal is not guaranteed to work.
        if char == '<' and reading != 'pass':
            tag = ''
            reading = 'tag'
            # Deliberately no 'continue': the 'tag' branch at the bottom of
            # the loop stores the '<' itself.

        if char == '>' and reading != 'pass':
            tag = tag + char
            tags.append(tag)
            reading = 'lemma'
            continue

        if char == '$':
            if not form and lemma:
                emit(lemma + pop_tags(tags, ''))
            tags = []
            form = ''
            lemma = ''
            emit(char)
            reading = 'pass'
            continue

        if reading == 'form':
            form = form + char
        elif reading == 'lemma':
            lemma = lemma + char
        elif reading == 'tag':
            tag = tag + char
        elif reading == 'pass':
            emit(char)

    return ''.join(pieces)


def main():
    """Filter standard input to standard output through ``pretag``."""
    data = sys.stdin.read()
    if isinstance(data, bytes):
        # Python 2: stdin yields byte strings, so decode the input and wrap
        # the output streams as the original script did.  On Python 3 the
        # streams already handle text and must not be wrapped.
        data = data.decode('utf-8')
        sys.stdout = codecs.getwriter('utf-8')(sys.stdout)
        sys.stderr = codecs.getwriter('utf-8')(sys.stderr)
    sys.stdout.write(pretag(data))


if __name__ == '__main__':
    main()
|||
</pre> |
|||
[[Category:Development]] |
|||
[[Category:Documentation in English]] |
Latest revision as of 18:47, 15 April 2012
This discussion page is deprecated as the functionality now exists.
This is a description of a partial hack for dealing with prefix inflection (see prefixes and infixes). Essentially, it re-uses as many existing components as possible, although it requires one additional component: a pretagger
module.
The first dictionary would be written in the normal manner, the second dictionary would be auto-generated from the first by means of lt-expand
output coupled with the pretagger output.
Note: This is basically a hack on Mikel's suggestion #1 on the prefixes and infixes page.
One of the advantages of this system is that the "normalisation" scheme (dictionary 2) can be generated from the dictionary + the pretagger and doesn't need to be hand-coded. Disadvantages are that it adds another layer, and probably misses a lot of nuances (e.g. multiwords, etc.).
Pipeline[edit]
$ echo "watu mtu" | lt-proc sw.automorf.bin
^watu/m<pl>tu<n>$ ^mtu/m<sg>tu<n>$

$ echo "watu mtu" | lt-proc sw.automorf.bin | pretagger.py
^watu/mtu<n><pl>$ ^mtu/mtu<n><sg>$

$ echo "watu mtu" | lt-proc sw.automorf.bin | pretagger.py | apertium-tagger -g sw.prob
^mtu<n><pl>$ ^mtu<n><sg>$

$ echo "watu mtu" | lt-proc sw.automorf.bin | pretagger.py | apertium-tagger -g sw.prob | lt-proc -g sw.automangle.bin
m<pl>tu<n> m<sg>tu<n>
Then add the missing '^' and '$':
$ echo "^m<pl>tu<n>$ ^m<sg>tu<n>$" | lt-proc -g sw.autogen.bin
watu mtu
Analysis[edit]
$ echo "watu" | lt-proc sw.automorf.bin
^watu/m<pl>tu<n>$
This would then be passed through the pretagger which would re-order the output thus:
^watu/mtu<n><pl>$
Basically just shunting all the tags to the end. This is now ready to be passed to the tagger.
Generation[edit]
Slightly more complicated. Out of final stage transfer you get:
^mtu<n><pl>$
This is then passed through the dictionary 2:
$ echo "^mtu<n><pl>$" | lt-proc -g sw.automangle.bin
m<pl>tu<n>
Note: obviously we're missing the '^' and '$' here, but that's fairly straightforward to add.
Then it is passed through dictionary 1 in generation mode:
$ echo "^m<pl>tu<n>$" | lt-proc -g sw.autogen.bin
watu
Dictionaries[edit]
Dictionary 1 (sw-1.dix)[edit]
- Compile
$ lt-comp lr sw.dix sw.automorf.bin
$ lt-comp rl sw.dix sw.autogen.bin
<dictionary> <alphabet>abcdefghijklmnopqrstuvwxyz</alphabet> <sdefs> <sdef n="n"/> <sdef n="sg"/> <sdef n="pl"/> </sdefs> <pardefs> <pardef n="C__1_2__n"> <e><p><l>m</l><r>m<s n="sg"/></r></p></e> <e><p><l>wa</l><r>m<s n="pl"/></r></p></e> </pardef> <pardef n="mtu__n"> <e><p><l/><r><s n="n"/></r></p></e> </pardef> </pardefs> <section id="main" type="standard"> <e lm="mtu"><par n="C__1_2__n"/><i>tu</i><par n="mtu__n"/></e> </section> </dictionary>
Dictionary 2 (sw-2.dix)[edit]
- Compile
$ lt-comp lr sw-2.dix sw.automangle.bin
<dictionary> <alphabet>abcdefghijklmnopqrstuvwxyz</alphabet> <sdefs> <sdef n="n"/> <sdef n="sg"/> <sdef n="pl"/> </sdefs> <section id="main" type="standard"> <e> <p> <l>mtu<s n="n"/><s n="sg"/></l> <r>m<s n="sg"/>tu<s n="n"/></r> </p> </e> <e> <p> <l>mtu<s n="n"/><s n="pl"/></l> <r>m<s n="pl"/>tu<s n="n"/></r> </p> </e> </section> </dictionary>
Pretagger[edit]
A very rough 5 minute proof-of-concept:
#!/usr/bin/python
# coding=utf-8
# -*- encoding: utf-8 -*-
"""Proof-of-concept pretagger for prefix inflection.

Reads analyser output such as ``^watu/m<pl>tu<n>$`` and rewrites every
analysis so that the tags interleaved with the lemma are moved to the end,
e.g. ``^watu/mtu<n><pl>$``.

Known limitations (unchanged from the original proof of concept):
``/``, ``#`` and ``+`` are treated as structural characters wherever they
occur, and no escape sequences are interpreted.
"""

import sys
import codecs
import copy  # not used below; kept because the original script imported it


def pop_tags(tags, out):
    """Return ``out`` followed by the tags in reverse order.

    ``tags`` is emptied as a side effect, exactly as in the original
    recursive version; a loop is used so the depth is not bounded by the
    interpreter's recursion limit.
    """
    while tags:
        out = out + tags.pop()
    return out


def pull_tags(tags, out):
    """Return ``out`` followed by the tags in their original order.

    ``tags`` is left unmodified.
    """
    return out + ''.join(tags)


def pretag(text):
    """Return ``text`` with the tags of each analysis moved after the lemma.

    The text is scanned one character at a time with a small state machine
    whose state (``reading``) is one of:

    * ``'pass'``  - copy characters straight to the output,
    * ``'form'``  - collecting the surface form (after ``^``),
    * ``'lemma'`` - collecting lemma characters (after ``/``),
    * ``'tag'``   - collecting one ``<...>`` tag.

    Before ``/``, ``#`` and ``$`` a pending lemma is written followed by its
    tags in reverse order of appearance; before ``+`` the tags keep their
    original order.
    """
    pieces = []
    emit = pieces.append

    form = ''
    lemma = ''
    tag = ''
    tags = []
    # Start in 'pass' so that text preceding the first '^' is copied to the
    # output rather than silently discarded.
    reading = 'pass'

    for char in text:
        if char == '^':
            emit(char)
            reading = 'form'
            continue

        if char == '/':
            reading = 'lemma'
            # A pending analysis exists only once the surface form has been
            # written.  Any non-empty lemma counts, including one-character
            # lemmas such as "a".
            if not form and lemma:
                emit(lemma + pop_tags(tags, ''))
                lemma = ''
                tags = []
            emit(form)
            emit(char)
            form = ''
            continue

        if char == '#':
            reading = 'pass'
            if not form and lemma:
                emit(lemma + pop_tags(tags, ''))
                form = ''
                lemma = ''
                tags = []
            emit(char)
            continue

        if char == '+':
            reading = 'pass'
            if not form and lemma:
                # Unlike the other separators, '+' keeps the tag order.
                emit(lemma + pull_tags(tags, ''))
                form = ''
                lemma = ''
                tags = []
            emit(char)
            continue

        # String states are compared with ==/!=; identity comparison against
        # a literal is not guaranteed to work.
        if char == '<' and reading != 'pass':
            tag = ''
            reading = 'tag'
            # Deliberately no 'continue': the 'tag' branch at the bottom of
            # the loop stores the '<' itself.

        if char == '>' and reading != 'pass':
            tag = tag + char
            tags.append(tag)
            reading = 'lemma'
            continue

        if char == '$':
            if not form and lemma:
                emit(lemma + pop_tags(tags, ''))
            tags = []
            form = ''
            lemma = ''
            emit(char)
            reading = 'pass'
            continue

        if reading == 'form':
            form = form + char
        elif reading == 'lemma':
            lemma = lemma + char
        elif reading == 'tag':
            tag = tag + char
        elif reading == 'pass':
            emit(char)

    return ''.join(pieces)


def main():
    """Filter standard input to standard output through ``pretag``."""
    data = sys.stdin.read()
    if isinstance(data, bytes):
        # Python 2: stdin yields byte strings, so decode the input and wrap
        # the output streams as the original script did.  On Python 3 the
        # streams already handle text and must not be wrapped.
        data = data.decode('utf-8')
        sys.stdout = codecs.getwriter('utf-8')(sys.stdout)
        sys.stderr = codecs.getwriter('utf-8')(sys.stderr)
    sys.stdout.write(pretag(data))


if __name__ == '__main__':
    main()