Difference between revisions of "User:Techievena/GSoC 2018 Work Product Submission"

From Apertium
Jump to navigation Jump to search
(Add Initial description)
(Add work done)
Line 30: Line 30:
=== Extend lttoolbox to have the power of HFST ===
=== Extend lttoolbox to have the power of HFST ===
==== Work Done ====
==== Work Done ====
'''CODING CHALLENGE:''' https://github.com/Techievena/lexc2dix



*'''att_compiler: Support for weights to lttoolbox binary format'''<br/>Make all the tweaks necessary to have a minimal implementation of weight-based analyses in the att_compiler.

$ cat test.att
0 1 c c 4.567895
1 2 a a 0.989532
2 3 t t 2.796193
3 4 @0@ + -3.824564
4 5 @0@ n 1.824564
5 0.525487
4 5 @0@ v 2.845989
$ lt-comp lr test.att test.bin
main@standard 6 6
$ lt-print test.bin
0 1 c c 4.567895
1 2 a a 0.989532
2 3 t t 2.796193
3 4 ε + -3.824564
4 5 ε n 1.824564
4 5 ε v 2.845989
5 0.525487

*'''lt-proc: Implement option to output n-best paths'''<br/>Using the same option names as hfst-proc, we add options to lt-proc to output n-best paths using the weight values.

$ echo "cats" | lt-proc test.bin
$ echo "cats" | lt-proc -W test.bin
$ echo "cats" | lt-proc -N 1 test.bin
$ echo "cats" | lt-proc -W -N 1 test.bin

$ lt-proc -h
lt-proc: process a stream with a letter transducer
USAGE: lt-proc [ -a | -b | -c | -d | -e | -g | -n | -p | -s | -t | -v | -h -z -w ] [-W] [-N N] [-L N] [ -i icx_file ] [ -r rcx_file ] fst_file [input_file [output_file]]
-a, --analysis: morphological analysis (default behavior)
-b, --bilingual: lexical transfer
-c, --case-sensitive: use the literal case of the incoming characters
-d, --debugged-gen morph. generation with all the stuff
-e, --decompose-nouns: Try to decompound unknown words
-g, --generation: morphological generation
-i, --ignored-chars: specify file with characters to ignore
-r, --restore-chars: specify file with characters to diacritic restoration
-l, --tagged-gen: morphological generation keeping lexical forms
-m, --tagged-nm-gen: same as -l but without unknown word marks
-n, --non-marked-gen morph. generation without unknown word marks
-o, --surf-bilingual: lexical transfer with surface forms
-p, --post-generation: post-generation
-s, --sao: SAO annotation system input processing
-t, --transliteration: apply transliteration dictionary
-v, --version: version
-z, --null-flush: flush output on the null character
-w, --dictionary-case: use dictionary case instead of surface case
-C, --careful-case: use dictionary case if present, else surface
-I, --no-default-ignore: skips loading the default ignore characters
-W, --show-weights: Print final analysis weights (if any)
-N, --analyses: Output no more than N analyses (if the transducer is weighted, the N best analyses)
-L, --weight-classes: Output no more than N best weight classes (where analyses with equal weight constitute a class)
-h, --help: show this help

*'''Allow weights on entries in lttoolbox XML'''<br/>Modify the DTD and parser to allow weights on entries in lttoolbox XML.

$ cat test.dix
<?xml version="1.0" encoding="UTF-8"?>
<sdef n="n" c="Noun"/>

<sdef n="ma" c="Masculine (animate)"/>
<sdef n="mi" c="Masculine (inanimate)"/>
<sdef n="nt" c="Neuter"/>
<sdef n="f" c="Feminine"/>

<sdef n="sg" c="Singular"/>
<sdef n="du" c="Dual"/>
<sdef n="pl" c="Plural"/>

<sdef n="nom" c="Nominative"/>
<sdef n="gen" c="Genitive"/>
<sdef n="dat" c="Dative"/>
<sdef n="acc" c="Accusative"/>
<sdef n="ins" c="Instrumental"/>
<sdef n="loc" c="Locative"/>
<sdef n="voc" c="Vocative"/>
<pardef n="nan__n_ma">
<e w="1.56"><p><l></l><r><s n="n"/><s n="ma"/><s n="sg"/><s n="nom"/></r></p></e>
<e w="2.56"><p><l>a</l><r><s n="n"/><s n="ma"/><s n="sg"/><s n="gen"/></r></p></e>
<e w="3.56"><p><l>ej</l><r><s n="n"/><s n="ma"/><s n="sg"/><s n="dat"/></r></p></e>
<e w="4.56"><p><l>a</l><r><s n="n"/><s n="ma"/><s n="sg"/><s n="acc"/></r></p></e>
<e w="5.56"><p><l>om</l><r><s n="n"/><s n="ma"/><s n="sg"/><s n="ins"/></r></p></e>
<e w="6.56"><p><l>je</l><r><s n="n"/><s n="ma"/><s n="sg"/><s n="loc"/></r></p></e>
<e w="7.56"><p><l>o</l><r><s n="n"/><s n="ma"/><s n="sg"/><s n="voc"/></r></p></e>

<e w="8.56"><p><l>aj</l><r><s n="n"/><s n="ma"/><s n="du"/><s n="nom"/></r></p></e>
<e w="9.56"><p><l>ow</l><r><s n="n"/><s n="ma"/><s n="du"/><s n="gen"/></r></p></e>
<e w="10.56"><p><l>omaj</l><r><s n="n"/><s n="ma"/><s n="du"/><s n="dat"/></r></p></e>
<e w="11.56"><p><l>ow</l><r><s n="n"/><s n="ma"/><s n="du"/><s n="acc"/></r></p></e>
<e w="12.56"><p><l>omaj</l><r><s n="n"/><s n="ma"/><s n="du"/><s n="ins"/></r></p></e>
<e w="13.56"><p><l>omaj</l><r><s n="n"/><s n="ma"/><s n="du"/><s n="loc"/></r></p></e>
<e w="14.56"><p><l>aj</l><r><s n="n"/><s n="ma"/><s n="du"/><s n="voc"/></r></p></e>

<e w="15.56"><p><l>ojo</l><r><s n="n"/><s n="ma"/><s n="pl"/><s n="nom"/></r></p></e>
<e w="16.56"><p><l>ow</l><r><s n="n"/><s n="ma"/><s n="pl"/><s n="gen"/></r></p></e>
<e w="17.56"><p><l>am</l><r><s n="n"/><s n="ma"/><s n="pl"/><s n="dat"/></r></p></e>
<e w="18.56"><p><l>ow</l><r><s n="n"/><s n="ma"/><s n="pl"/><s n="acc"/></r></p></e>
<e w="19.56"><p><l>ami</l><r><s n="n"/><s n="ma"/><s n="pl"/><s n="ins"/></r></p></e>
<e w="20.56"><p><l>ach</l><r><s n="n"/><s n="ma"/><s n="pl"/><s n="loc"/></r></p></e>
<e w="21.56"><p><l>ojo</l><r><s n="n"/><s n="ma"/><s n="pl"/><s n="voc"/></r></p></e>

<section id="main" type="standard">
<e lm="nan" w="22.56"><i>nan</i><par n="nan__n_ma"/></e>

$ lt-comp lr test.dix test-mor.bin
main@standard 35 54
$ lt-print test-mor.bin
0 1 n n 0.000000
1 2 a a 0.000000
2 3 n n 22.560000
3 4 ε <n> 0.000000
3 5 a <n> 0.000000
3 6 e <n> 0.000000
3 7 o <n> 0.000000
3 8 j <n> 0.000000
4 9 ε <ma> 0.000000
5 10 ε <ma> 0.000000
5 11 j <ma> 0.000000
5 12 m <ma> 0.000000
5 13 c <ma> 0.000000
6 14 j <ma> 0.000000
7 15 ε <ma> 0.000000
7 16 j <ma> 0.000000
7 17 m <ma> 0.000000
7 18 w <ma> 0.000000
8 19 e <ma> 0.000000
9 20 ε <sg> 0.000000
10 21 ε <sg> 0.000000
11 22 ε <du> 0.000000
12 23 ε <pl> 0.000000
12 24 i <pl> 0.000000
13 25 h <pl> 0.000000
14 26 ε <sg> 0.000000
15 27 ε <sg> 0.000000
16 28 o <pl> 0.000000
17 29 ε <sg> 0.000000
17 30 a <du> 0.000000
18 31 ε <du> 0.000000
18 32 ε <pl> 0.000000
19 33 ε <sg> 0.000000
20 34 ε <nom> 1.560000
21 34 ε <gen> 2.560000
21 34 ε <acc> 4.560000
22 34 ε <nom> 8.560000
22 34 ε <voc> 14.560000
23 34 ε <dat> 17.560000
24 34 ε <ins> 19.560000
25 34 ε <loc> 20.560000
26 34 ε <dat> 3.560000
27 34 ε <voc> 7.560000
28 34 ε <nom> 15.560000
28 34 ε <voc> 21.560000
29 34 ε <ins> 5.560000
30 34 j <dat> 10.560000
30 34 j <ins> 12.560000
30 34 j <loc> 13.560000
31 34 ε <gen> 9.560000
31 34 ε <acc> 11.560000
32 34 ε <gen> 16.560000
32 34 ε <acc> 18.560000
33 34 ε <loc> 6.560000
34 0.000000
$ echo "nanow" | lt-proc -W test-mor.bin

*'''Other merged pull requests'''
**https://github.com/apertium/lttoolbox/pull/14 (Fix inconsistencies in the weighted branch)
**https://github.com/apertium/lttoolbox/pull/25 (Use default values in lttoolbox to prevent apertium-separable from failing)

==== Challenges ====
==== Challenges ====
==== Work to be done ====
==== Work to be done ====

Revision as of 05:12, 9 August 2018


Abinash Senapati

Hi, I am Abinash, a final-year undergraduate in the Department of Electronics and Electrical Communication Engineering at the Indian Institute of Technology, Kharagpur. I was a Google Summer of Code student for Apertium over the summer of 2018 and primarily worked on lttoolbox and apertium-core. My project involved extending lttoolbox with the capability to perform morphographemics and adding lexical weights to the lttoolbox transducer, in order to enable more complex translations with the transducer.

Project Title

Extend lttoolbox to have the power of HFST

GSoC Blog


Public Profiles

GitHub: Techievena
GitLab: Techievena
IRC nick: Techievena
Apertium wiki: Techievena
E-mail: abinashsena@gmail.com


Francis Tyers and Tommi Pirinen


tarball: Download
zip: Download

Link to commits and repositories I have worked on


Extend lttoolbox to have the power of HFST

Work Done

CODING CHALLENGE: https://github.com/Techievena/lexc2dix



$ cat test.att
0	1	c	c	4.567895
1	2	a	a	0.989532
2	3	t	t	2.796193
3	4	@0@	+	-3.824564
4	5	@0@	n	1.824564
5	0.525487
4	5	@0@	v	2.845989
$ lt-comp lr test.att test.bin 
main@standard 6 6
$ lt-print test.bin
0	1	c	c	4.567895	
1	2	a	a	0.989532	
2	3	t	t	2.796193	
3	4	ε	+	-3.824564	
4	5	ε	n	1.824564	
4	5	ε	v	2.845989	
5	0.525487
$ echo "cats" | lt-proc test.bin
$ echo "cats" | lt-proc -W test.bin
$ echo "cats" | lt-proc -N 1 test.bin
$ echo "cats" | lt-proc -W -N 1 test.bin
$ lt-proc -h
lt-proc: process a stream with a letter transducer
USAGE: lt-proc [ -a | -b | -c | -d | -e | -g | -n | -p | -s | -t | -v | -h -z -w ] [-W] [-N N] [-L N] [ -i icx_file ] [ -r rcx_file ] fst_file [input_file [output_file]]
  -a, --analysis:          morphological analysis (default behavior)
  -b, --bilingual:         lexical transfer
  -c, --case-sensitive:    use the literal case of the incoming characters
  -d, --debugged-gen       morph. generation with all the stuff
  -e, --decompose-nouns:   Try to decompound unknown words
  -g, --generation:        morphological generation
  -i, --ignored-chars:     specify file with characters to ignore
  -r, --restore-chars:     specify file with characters to diacritic restoration
  -l, --tagged-gen:        morphological generation keeping lexical forms
  -m, --tagged-nm-gen:     same as -l but without unknown word marks
  -n, --non-marked-gen     morph. generation without unknown word marks
  -o, --surf-bilingual:    lexical transfer with surface forms
  -p, --post-generation:   post-generation
  -s, --sao:               SAO annotation system input processing
  -t, --transliteration:   apply transliteration dictionary
  -v, --version:           version
  -z, --null-flush:        flush output on the null character 
  -w, --dictionary-case:   use dictionary case instead of surface case
  -C, --careful-case:      use dictionary case if present, else surface
  -I, --no-default-ignore: skips loading the default ignore characters
  -W, --show-weights:      Print final analysis weights (if any)
  -N, --analyses:          Output no more than N analyses (if the transducer is weighted, the N best analyses)
  -L, --weight-classes:    Output no more than N best weight classes (where analyses with equal weight constitute a class)
  -h, --help:              show this help
$ cat test.dix
<?xml version="1.0" encoding="UTF-8"?>
  <sdef n="n"     c="Noun"/>

  <sdef n="ma"    c="Masculine (animate)"/>
  <sdef n="mi"    c="Masculine (inanimate)"/>
  <sdef n="nt"    c="Neuter"/>
  <sdef n="f"     c="Feminine"/>

  <sdef n="sg"    c="Singular"/>
  <sdef n="du"    c="Dual"/>
  <sdef n="pl"    c="Plural"/>

  <sdef n="nom"   c="Nominative"/>
  <sdef n="gen"   c="Genitive"/>
  <sdef n="dat"   c="Dative"/>
  <sdef n="acc"   c="Accusative"/>
  <sdef n="ins"   c="Instrumental"/>
  <sdef n="loc"   c="Locative"/>
  <sdef n="voc"   c="Vocative"/>
  <pardef n="nan__n_ma">
    <e w="1.56"><p><l></l><r><s n="n"/><s n="ma"/><s n="sg"/><s n="nom"/></r></p></e>
    <e w="2.56"><p><l>a</l><r><s n="n"/><s n="ma"/><s n="sg"/><s n="gen"/></r></p></e>
    <e w="3.56"><p><l>ej</l><r><s n="n"/><s n="ma"/><s n="sg"/><s n="dat"/></r></p></e>
    <e w="4.56"><p><l>a</l><r><s n="n"/><s n="ma"/><s n="sg"/><s n="acc"/></r></p></e>
    <e w="5.56"><p><l>om</l><r><s n="n"/><s n="ma"/><s n="sg"/><s n="ins"/></r></p></e>
    <e w="6.56"><p><l>je</l><r><s n="n"/><s n="ma"/><s n="sg"/><s n="loc"/></r></p></e>
    <e w="7.56"><p><l>o</l><r><s n="n"/><s n="ma"/><s n="sg"/><s n="voc"/></r></p></e>

    <e w="8.56"><p><l>aj</l><r><s n="n"/><s n="ma"/><s n="du"/><s n="nom"/></r></p></e>
    <e w="9.56"><p><l>ow</l><r><s n="n"/><s n="ma"/><s n="du"/><s n="gen"/></r></p></e>
    <e w="10.56"><p><l>omaj</l><r><s n="n"/><s n="ma"/><s n="du"/><s n="dat"/></r></p></e>
    <e w="11.56"><p><l>ow</l><r><s n="n"/><s n="ma"/><s n="du"/><s n="acc"/></r></p></e>
    <e w="12.56"><p><l>omaj</l><r><s n="n"/><s n="ma"/><s n="du"/><s n="ins"/></r></p></e>
    <e w="13.56"><p><l>omaj</l><r><s n="n"/><s n="ma"/><s n="du"/><s n="loc"/></r></p></e>
    <e w="14.56"><p><l>aj</l><r><s n="n"/><s n="ma"/><s n="du"/><s n="voc"/></r></p></e>

    <e w="15.56"><p><l>ojo</l><r><s n="n"/><s n="ma"/><s n="pl"/><s n="nom"/></r></p></e>
    <e w="16.56"><p><l>ow</l><r><s n="n"/><s n="ma"/><s n="pl"/><s n="gen"/></r></p></e>
    <e w="17.56"><p><l>am</l><r><s n="n"/><s n="ma"/><s n="pl"/><s n="dat"/></r></p></e>
    <e w="18.56"><p><l>ow</l><r><s n="n"/><s n="ma"/><s n="pl"/><s n="acc"/></r></p></e>
    <e w="19.56"><p><l>ami</l><r><s n="n"/><s n="ma"/><s n="pl"/><s n="ins"/></r></p></e>
    <e w="20.56"><p><l>ach</l><r><s n="n"/><s n="ma"/><s n="pl"/><s n="loc"/></r></p></e>
    <e w="21.56"><p><l>ojo</l><r><s n="n"/><s n="ma"/><s n="pl"/><s n="voc"/></r></p></e>

  <section id="main" type="standard">
    <e lm="nan" w="22.56"><i>nan</i><par n="nan__n_ma"/></e>    

$ lt-comp lr test.dix test-mor.bin
main@standard 35 54
$ lt-print test-mor.bin 
0	1	n	n	0.000000	
1	2	a	a	0.000000	
2	3	n	n	22.560000	
3	4	ε	<n>	0.000000	
3	5	a	<n>	0.000000	
3	6	e	<n>	0.000000	
3	7	o	<n>	0.000000	
3	8	j	<n>	0.000000	
4	9	ε	<ma>	0.000000	
5	10	ε	<ma>	0.000000	
5	11	j	<ma>	0.000000	
5	12	m	<ma>	0.000000	
5	13	c	<ma>	0.000000	
6	14	j	<ma>	0.000000	
7	15	ε	<ma>	0.000000	
7	16	j	<ma>	0.000000	
7	17	m	<ma>	0.000000	
7	18	w	<ma>	0.000000	
8	19	e	<ma>	0.000000	
9	20	ε	<sg>	0.000000	
10	21	ε	<sg>	0.000000	
11	22	ε	<du>	0.000000	
12	23	ε	<pl>	0.000000	
12	24	i	<pl>	0.000000	
13	25	h	<pl>	0.000000	
14	26	ε	<sg>	0.000000	
15	27	ε	<sg>	0.000000	
16	28	o	<pl>	0.000000	
17	29	ε	<sg>	0.000000	
17	30	a	<du>	0.000000	
18	31	ε	<du>	0.000000	
18	32	ε	<pl>	0.000000	
19	33	ε	<sg>	0.000000	
20	34	ε	<nom>	1.560000	
21	34	ε	<gen>	2.560000	
21	34	ε	<acc>	4.560000	
22	34	ε	<nom>	8.560000	
22	34	ε	<voc>	14.560000	
23	34	ε	<dat>	17.560000	
24	34	ε	<ins>	19.560000	
25	34	ε	<loc>	20.560000	
26	34	ε	<dat>	3.560000	
27	34	ε	<voc>	7.560000	
28	34	ε	<nom>	15.560000	
28	34	ε	<voc>	21.560000	
29	34	ε	<ins>	5.560000	
30	34	j	<dat>	10.560000	
30	34	j	<ins>	12.560000	
30	34	j	<loc>	13.560000	
31	34	ε	<gen>	9.560000	
31	34	ε	<acc>	11.560000	
32	34	ε	<gen>	16.560000	
32	34	ε	<acc>	18.560000	
33	34	ε	<loc>	6.560000	
34	0.000000
$ echo "nanow" | lt-proc -W test-mor.bin 


Work to be done
