Difference between revisions of "Partial hack for prefix inflection"
| Line 117: | Line 117: | ||
<pre> |
<pre> |
||
#!/usr/bin/python |
#!/usr/bin/python |
||
# coding=utf-8 |
# coding=utf-8 |
||
| Line 126: | Line 127: | ||
form = ''; |
form = ''; |
||
tag = ''; |
|||
tags = []; |
|||
lemma = ''; |
lemma = ''; |
||
reading = ''; |
reading = ''; |
||
def reverse_order(tags): #{ |
def reverse_order(tags, out): #{ |
||
if len(tags) < 1: #{ |
|||
l = tags.strip('><').split('><'); |
|||
| ⚫ | |||
| ⚫ | |||
for i in reversed(l): #{ |
|||
new = new + '<' + i + '>'; |
|||
#} |
#} |
||
| ⚫ | |||
out = out + tags.pop(); |
|||
return reverse_order(tags, out); |
|||
#} |
#} |
||
| Line 150: | Line 152: | ||
#} |
#} |
||
if char == '<': #{ |
if char == '<': #{ |
||
| ⚫ | |||
reading = 'tag'; |
reading = 'tag'; |
||
#} |
#} |
||
if char == '>': #{ |
if char == '>': #{ |
||
tag = tag + char; |
|||
tags.append(tag); |
|||
reading = 'lemma'; |
reading = 'lemma'; |
||
continue; |
continue; |
||
| Line 160: | Line 164: | ||
reading = ''; |
reading = ''; |
||
continue; |
continue; |
||
#} |
|||
if char == ' ' or char == '\n': #{ |
|||
reading = 'output'; |
|||
#} |
#} |
||
| Line 169: | Line 176: | ||
#} |
#} |
||
if reading == 'tag': #{ |
if reading == 'tag': #{ |
||
tag = tag + char; |
|||
#} |
|||
if reading == 'output': #{ |
|||
| ⚫ | |||
tags = []; |
|||
tag = ''; |
|||
lemma = ''; |
|||
form = ''; |
|||
sys.stdout.write(char); |
|||
#} |
#} |
||
#} |
#} |
||
| ⚫ | |||
</pre> |
</pre> |
||
Revision as of 22:53, 20 December 2007
This is a description of a partial hack for dealing with prefix inflection (see prefixes and infixes). Essentially, it re-uses components that we already have as much as possible, although it requires one additional component: a pretagger module.
The first dictionary would be written in the normal manner; the second dictionary would be auto-generated from the first by means of lt-expand output coupled with the pretagger output.
Note: This is basically a hack on Mikel's suggestion #1 on the prefixes and infixes page.
Analysis
$ echo "watu" | lt-proc sw.automorf.bin
^watu/m<pl>tu<n>$
This would then be passed through the pretagger which would re-order the output thus:
^watu/mtu<n><pl>$
Basically just shunting all the tags to the end. This is now ready to be passed to the tagger.
Generation
Slightly more complicated. Out of final stage transfer you get:
^mtu<n><pl>$
This is then passed through the dictionary 2:
$ echo "^mtu<n><pl>$" | lt-proc -g sw.automangle.bin
m<pl>tu<n>
Note: obviously we're missing the '^' and '$' here, but that's fairly straightforward to add.
Then it is passed through dictionary 1 in generation mode:
$ echo "^m<pl>tu<n>$" | lt-proc -g sw.autogen.bin
watu
Dictionaries
Dictionary 1 (sw-1.dix)
- Compile
$ lt-comp lr sw.dix sw.automorf.bin
$ lt-comp rl sw.dix sw.autogen.bin
<dictionary>
<alphabet>abcdefghijklmnopqrstuvwxyz</alphabet>
<sdefs>
<sdef n="n"/>
<sdef n="sg"/>
<sdef n="pl"/>
</sdefs>
<pardefs>
<pardef n="C__1_2__n">
<e><p><l>m</l><r>m<s n="sg"/></r></p></e>
<e><p><l>wa</l><r>m<s n="pl"/></r></p></e>
</pardef>
<pardef n="mtu__n">
<e><p><l/><r><s n="n"/></r></p></e>
</pardef>
</pardefs>
<section id="main" type="standard">
<e lm="mtu"><par n="C__1_2__n"/><i>tu</i><par n="mtu__n"/></e>
</section>
</dictionary>
Dictionary 2 (sw-2.dix)
- Compile
$ lt-comp lr sw-2.dix sw.automangle.bin
<dictionary>
<alphabet>abcdefghijklmnopqrstuvwxyz</alphabet>
<sdefs>
<sdef n="n"/>
<sdef n="sg"/>
<sdef n="pl"/>
</sdefs>
<section id="main" type="standard">
<e>
<p>
<l>mtu<s n="n"/><s n="sg"/></l>
<r>m<s n="sg"/>tu<s n="n"/></r>
</p>
</e>
<e>
<p>
<l>mtu<s n="n"/><s n="pl"/></l>
<r>m<s n="pl"/>tu<s n="n"/></r>
</p>
</e>
</section>
</dictionary>
Pretagger
A very rough 5 minute proof-of-concept:
#!/usr/bin/python
# coding=utf-8
# -*- encoding: utf-8 -*-
import sys, codecs, copy;
# Replace stdout/stderr with codecs stream writers that encode written text
# as UTF-8.
sys.stdout = codecs.getwriter('utf-8')(sys.stdout);
sys.stderr = codecs.getwriter('utf-8')(sys.stderr);
# Module-level parser state, all initially empty.
# form:    characters read after '^' and before the first '/'.
form = '';
# tag:     the tag currently being read, from '<' up to and including '>'.
tag = '';
# tags:    completed tags, in the order they were read.
tags = [];
# lemma:   characters read after '/' that are not part of a tag.
lemma = '';
# reading: name of the part currently being read ('' means none).
reading = '';
def reverse_order(tags, out):
    """Return ``out`` followed by the strings in ``tags``, last one first.

    ``tags`` is a list of tag strings such as ``['<pl>', '<n>']`` and ``out``
    is the string the reversed tags are appended to, so
    ``reverse_order(['<pl>', '<n>'], 'mtu')`` gives ``'mtu<n><pl>'``.

    The list passed in is left untouched, and no recursion is used, so the
    number of tags is not bounded by the interpreter's recursion limit.
    An empty ``tags`` list returns ``out`` unchanged.
    """
    return out + ''.join(reversed(tags))
def _format_unit(form, analyses, lemma, tags):
    """Build one output unit: '^form/analysis[/analysis...]$'.

    ``analyses`` holds the analyses already completed for this unit; the
    analysis still in progress (``lemma`` plus ``tags``) is appended last,
    with its tags moved behind the lemma in reverse order.
    """
    finished = analyses + [lemma + reverse_order(tags, '')]
    return '^' + '/'.join([form] + finished) + '$'


def pretag(text):
    """Move the tags of every analysis in ``text`` behind its lemma.

    ``text`` is a string of units of the shape ``^form/lemma-with-tags$``.
    Within each analysis the tags are removed from the lemma and written
    after it in reverse order, e.g. ``^watu/m<pl>tu<n>$`` becomes
    ``^watu/mtu<n><pl>$``. The result is returned as a string.

    Behaviour at the edges:
    * Characters outside ``^...$`` are copied through unchanged, so
      consecutive whitespace no longer produces empty ``^/$`` units.
    * A unit is emitted when its closing '$' is read, so the last unit is
      not lost when the input has no trailing whitespace, and adjacent
      units with no whitespace between them stay separate.
    * Whitespace inside a unit is kept as part of the form or lemma.
    * Several analyses separated by '/' are reordered independently.
    * A unit that is still open at a new '^' or at end of input is emitted
      as if it had been closed.

    NOTE(review): backslash-escaped special characters are not recognised;
    confirm whether the upstream stream can contain them.
    """
    out = []
    form = ''
    lemma = ''
    tag = ''
    tags = []
    analyses = []
    # '' while outside a unit, otherwise 'form', 'lemma' or 'tag'.
    reading = ''

    for char in text:
        if char == '^':
            if reading:
                # The previous unit was never closed; do not drop it.
                out.append(_format_unit(form, analyses, lemma, tags))
            form, lemma, tag = '', '', ''
            tags, analyses = [], []
            reading = 'form'
        elif not reading:
            out.append(char)
        elif char == '$':
            out.append(_format_unit(form, analyses, lemma, tags))
            reading = ''
        elif char == '/':
            if reading != 'form':
                # A further '/' starts another analysis of the same form.
                analyses.append(lemma + reverse_order(tags, ''))
                lemma, tags = '', []
            reading = 'lemma'
        elif char == '<':
            tag = char
            reading = 'tag'
        elif char == '>' and reading == 'tag':
            tags.append(tag + char)
            tag = ''
            reading = 'lemma'
        elif reading == 'form':
            form = form + char
        elif reading == 'lemma':
            lemma = lemma + char
        else:
            tag = tag + char

    if reading:
        # End of input inside a unit: emit what was collected.
        out.append(_format_unit(form, analyses, lemma, tags))

    return ''.join(out)


def main():
    """Read all of stdin, decode it as UTF-8 and write the result to stdout."""
    sys.stdout.write(pretag(sys.stdin.read().decode('utf-8')))


if __name__ == '__main__':
    main()