Difference between revisions of "Weighted transfer rules"

From Apertium
Jump to navigation Jump to search
(Created page with "== Related links == Idea description [[Weighted_transfer_rules_at_GSoC_2016|Nikita Medyankin's project at GSoC 201...")
 
Line 49: Line 49:
 
|-
 
|-
 
|}
 
|}
  +
  +
== Experiment ==
  +
The sample file new-software-sample.txt contains three selected lines with 'new software' and 'this new software' patterns, each of which triggers a pair of ambiguous rules from apertium-en-es.en-es.t1x file, namely ['adj-nom', 'adj-nom-ns'] and ['det-adj-nom', 'det-adj-nom-ns']. Speaking informally, these rules are used to transfer sequences of (adjective, noun) and (determiner, adjective, noun). The first rule in each ambiguous pair specifies that the translations of the adjective and the noun are to be swapped, which is usual for Spanish, hence these rules are specified before their '-ns' counterparts, indicating that they are the default rules. The second rule in each ambiguous pair specifies that the translations of the adjective and the noun are not to be swapped, which sometimes happens and depends on the lexical units involved.
  +
  +
The contents of the unpruned w1x file without generalizing patterns should look like the following:
  +
  +
<pre>
  +
<?xml version='1.0' encoding='UTF-8'?>
  +
<transfer-weights>
  +
<rule-group>
  +
<rule comment="REGLA: ADJ NOM no-swap-version" id="1" md5="64121bebaee1b179cfc0002db6b06fc3">
  +
<pattern weight="1.625228556310039">
  +
<pattern-item tags="adj.sint"/>
  +
<pattern-item tags="n.sg"/>
  +
</pattern>
  +
<pattern weight="1.625228556310039">
  +
<pattern-item tags="adj.sint"/>
  +
<pattern-item lemma="software" tags="n.sg"/>
  +
</pattern>
  +
<pattern weight="1.625228556310039">
  +
<pattern-item lemma="new" tags="adj.sint"/>
  +
<pattern-item tags="n.sg"/>
  +
</pattern>
  +
<pattern weight="1.625228556310039">
  +
<pattern-item lemma="new" tags="adj.sint"/>
  +
<pattern-item lemma="software" tags="n.sg"/>
  +
</pattern>
  +
</rule>
  +
<rule comment="REGLA: ADJ NOM" id="2" md5="8eed4b8aee5567fcfebc0de7698f4bdb">
  +
<pattern weight="0.3747714436899609">
  +
<pattern-item tags="adj.sint"/>
  +
<pattern-item tags="n.sg"/>
  +
</pattern>
  +
<pattern weight="0.3747714436899609">
  +
<pattern-item tags="adj.sint"/>
  +
<pattern-item lemma="software" tags="n.sg"/>
  +
</pattern>
  +
<pattern weight="0.3747714436899609">
  +
<pattern-item lemma="new" tags="adj.sint"/>
  +
<pattern-item tags="n.sg"/>
  +
</pattern>
  +
<pattern weight="0.3747714436899609">
  +
<pattern-item lemma="new" tags="adj.sint"/>
  +
<pattern-item lemma="software" tags="n.sg"/>
  +
</pattern>
  +
</rule>
  +
</rule-group>
  +
<rule-group>
  +
<rule comment="REGLA: DET ADJ NOM no-swap-version" id="3" md5="05d8b437ee595c7d0c992c5ae066a199">
  +
<pattern weight="0.9844006834162787">
  +
<pattern-item tags="det.dem.sg"/>
  +
<pattern-item tags="adj.sint"/>
  +
<pattern-item tags="n.sg"/>
  +
</pattern>
  +
<pattern weight="0.9844006834162787">
  +
<pattern-item tags="det.dem.sg"/>
  +
<pattern-item tags="adj.sint"/>
  +
<pattern-item lemma="software" tags="n.sg"/>
  +
</pattern>
  +
<pattern weight="0.9844006834162787">
  +
<pattern-item tags="det.dem.sg"/>
  +
<pattern-item lemma="new" tags="adj.sint"/>
  +
<pattern-item tags="n.sg"/>
  +
</pattern>
  +
<pattern weight="0.9844006834162787">
  +
<pattern-item tags="det.dem.sg"/>
  +
<pattern-item lemma="new" tags="adj.sint"/>
  +
<pattern-item lemma="software" tags="n.sg"/>
  +
</pattern>
  +
<pattern weight="0.9376183345269524">
  +
<pattern-item tags="det.pos.sp"/>
  +
<pattern-item tags="adj"/>
  +
<pattern-item lemma="code" tags="n.sg"/>
  +
</pattern>
  +
<pattern weight="0.9376183345269524">
  +
<pattern-item tags="det.pos.sp"/>
  +
<pattern-item tags="adj"/>
  +
<pattern-item tags="n.sg"/>
  +
</pattern>
  +
<pattern weight="0.9376183345269524">
  +
<pattern-item tags="det.pos.sp"/>
  +
<pattern-item lemma="own" tags="adj"/>
  +
<pattern-item lemma="code" tags="n.sg"/>
  +
</pattern>
  +
<pattern weight="0.9376183345269524">
  +
<pattern-item tags="det.pos.sp"/>
  +
<pattern-item lemma="own" tags="adj"/>
  +
<pattern-item tags="n.sg"/>
  +
</pattern>
  +
<pattern weight="0.9376183345269524">
  +
<pattern-item lemma="its" tags="det.pos.sp"/>
  +
<pattern-item tags="adj"/>
  +
<pattern-item lemma="code" tags="n.sg"/>
  +
</pattern>
  +
<pattern weight="0.9376183345269524">
  +
<pattern-item lemma="its" tags="det.pos.sp"/>
  +
<pattern-item tags="adj"/>
  +
<pattern-item tags="n.sg"/>
  +
</pattern>
  +
<pattern weight="0.9376183345269524">
  +
<pattern-item lemma="its" tags="det.pos.sp"/>
  +
<pattern-item lemma="own" tags="adj"/>
  +
<pattern-item lemma="code" tags="n.sg"/>
  +
</pattern>
  +
<pattern weight="0.9376183345269524">
  +
<pattern-item lemma="its" tags="det.pos.sp"/>
  +
<pattern-item lemma="own" tags="adj"/>
  +
<pattern-item tags="n.sg"/>
  +
</pattern>
  +
<pattern weight="0.9844006834162787">
  +
<pattern-item lemma="this" tags="det.dem.sg"/>
  +
<pattern-item tags="adj.sint"/>
  +
<pattern-item tags="n.sg"/>
  +
</pattern>
  +
<pattern weight="0.9844006834162787">
  +
<pattern-item lemma="this" tags="det.dem.sg"/>
  +
<pattern-item tags="adj.sint"/>
  +
<pattern-item lemma="software" tags="n.sg"/>
  +
</pattern>
  +
<pattern weight="0.9844006834162787">
  +
<pattern-item lemma="this" tags="det.dem.sg"/>
  +
<pattern-item lemma="new" tags="adj.sint"/>
  +
<pattern-item tags="n.sg"/>
  +
</pattern>
  +
<pattern weight="0.9844006834162787">
  +
<pattern-item lemma="this" tags="det.dem.sg"/>
  +
<pattern-item lemma="new" tags="adj.sint"/>
  +
<pattern-item lemma="software" tags="n.sg"/>
  +
</pattern>
  +
</rule>
  +
<rule comment="REGLA: DET ADJ NOM" id="4" md5="87fb69c4cd8792f06e0b51c6fd79f127">
  +
<pattern weight="0.0155993165837215">
  +
<pattern-item tags="det.dem.sg"/>
  +
<pattern-item tags="adj.sint"/>
  +
<pattern-item tags="n.sg"/>
  +
</pattern>
  +
<pattern weight="0.0155993165837215">
  +
<pattern-item tags="det.dem.sg"/>
  +
<pattern-item tags="adj.sint"/>
  +
<pattern-item lemma="software" tags="n.sg"/>
  +
</pattern>
  +
<pattern weight="0.0155993165837215">
  +
<pattern-item tags="det.dem.sg"/>
  +
<pattern-item lemma="new" tags="adj.sint"/>
  +
<pattern-item tags="n.sg"/>
  +
</pattern>
  +
<pattern weight="0.0155993165837215">
  +
<pattern-item tags="det.dem.sg"/>
  +
<pattern-item lemma="new" tags="adj.sint"/>
  +
<pattern-item lemma="software" tags="n.sg"/>
  +
</pattern>
  +
<pattern weight="0.06238166547304746">
  +
<pattern-item tags="det.pos.sp"/>
  +
<pattern-item tags="adj"/>
  +
<pattern-item lemma="code" tags="n.sg"/>
  +
</pattern>
  +
<pattern weight="0.06238166547304746">
  +
<pattern-item tags="det.pos.sp"/>
  +
<pattern-item tags="adj"/>
  +
<pattern-item tags="n.sg"/>
  +
</pattern>
  +
<pattern weight="0.06238166547304746">
  +
<pattern-item tags="det.pos.sp"/>
  +
<pattern-item lemma="own" tags="adj"/>
  +
<pattern-item lemma="code" tags="n.sg"/>
  +
</pattern>
  +
<pattern weight="0.06238166547304746">
  +
<pattern-item tags="det.pos.sp"/>
  +
<pattern-item lemma="own" tags="adj"/>
  +
<pattern-item tags="n.sg"/>
  +
</pattern>
  +
<pattern weight="0.06238166547304746">
  +
<pattern-item lemma="its" tags="det.pos.sp"/>
  +
<pattern-item tags="adj"/>
  +
<pattern-item lemma="code" tags="n.sg"/>
  +
</pattern>
  +
<pattern weight="0.06238166547304746">
  +
<pattern-item lemma="its" tags="det.pos.sp"/>
  +
<pattern-item tags="adj"/>
  +
<pattern-item tags="n.sg"/>
  +
</pattern>
  +
<pattern weight="0.06238166547304746">
  +
<pattern-item lemma="its" tags="det.pos.sp"/>
  +
<pattern-item lemma="own" tags="adj"/>
  +
<pattern-item lemma="code" tags="n.sg"/>
  +
</pattern>
  +
<pattern weight="0.06238166547304746">
  +
<pattern-item lemma="its" tags="det.pos.sp"/>
  +
<pattern-item lemma="own" tags="adj"/>
  +
<pattern-item tags="n.sg"/>
  +
</pattern>
  +
<pattern weight="0.0155993165837215">
  +
<pattern-item lemma="this" tags="det.dem.sg"/>
  +
<pattern-item tags="adj.sint"/>
  +
<pattern-item tags="n.sg"/>
  +
</pattern>
  +
<pattern weight="0.0155993165837215">
  +
<pattern-item lemma="this" tags="det.dem.sg"/>
  +
<pattern-item tags="adj.sint"/>
  +
<pattern-item lemma="software" tags="n.sg"/>
  +
</pattern>
  +
<pattern weight="0.0155993165837215">
  +
<pattern-item lemma="this" tags="det.dem.sg"/>
  +
<pattern-item lemma="new" tags="adj.sint"/>
  +
<pattern-item tags="n.sg"/>
  +
</pattern>
  +
<pattern weight="0.0155993165837215">
  +
<pattern-item lemma="this" tags="det.dem.sg"/>
  +
<pattern-item lemma="new" tags="adj.sint"/>
  +
<pattern-item lemma="software" tags="n.sg"/>
  +
</pattern>
  +
</rule>
  +
</rule-group>
  +
</transfer-weights>
  +
</pre>

Revision as of 14:15, 17 April 2018

Related links

Idea description

Nikita Medyankin's project at GSoC 2016

https://github.com/apertium/apertium-weights-learner/tree/629b48b306116565bc1d748c298bc28b41506f63

https://svn.code.sf.net/p/apertium/svn/branches/weighted-transfer/

Fixes

Nikita's code should work okay now. To run it, download apertium-weights-learner from https://github.com/apertium/apertium-weights-learner/tree/experimental, the English - Spanish language pair with ambiguous rules from https://github.com/apertium/apertium-en-es/tree/ambiguous-rules, and the Apertium core with the modified transfer module from https://svn.code.sf.net/p/apertium/svn/branches/weighted-transfer/apertium/.

Coverages

The number of all possible coverages was calculated 100 times for 100 random sentences for 5 language pairs.

language pair corpus mean number of coverages
English - Spanish Tatoeba 3.72
English - Spanish Europarl 194.35
Spanish - Catalan Tatoeba 2.94
Spanish - Catalan Europarl 53.04
Basque - Spanish Tatoeba 9.19
Swedish - Norwegian Europarl 488.57
Crimean Tatar - Turkish Crimean Tatar Wikipedia 3.12

Experiment

The sample file new-software-sample.txt contains three selected lines with 'new software' and 'this new software' patterns, each of which triggers a pair of ambiguous rules from apertium-en-es.en-es.t1x file, namely ['adj-nom', 'adj-nom-ns'] and ['det-adj-nom', 'det-adj-nom-ns']. Speaking informally, these rules are used to transfer sequences of (adjective, noun) and (determiner, adjective, noun). The first rule in each ambiguous pair specifies that the translations of the adjective and the noun are to be swapped, which is usual for Spanish, hence these rules are specified before their '-ns' counterparts, indicating that they are the default rules. The second rule in each ambiguous pair specifies that the translations of the adjective and the noun are not to be swapped, which sometimes happens and depends on the lexical units involved.

The contents of the unpruned w1x file without generalizing patterns should look like the following:

<?xml version='1.0' encoding='UTF-8'?>
<transfer-weights>
  <rule-group>
    <rule comment="REGLA: ADJ NOM no-swap-version" id="1" md5="64121bebaee1b179cfc0002db6b06fc3">
      <pattern weight="1.625228556310039">
        <pattern-item tags="adj.sint"/>
        <pattern-item tags="n.sg"/>
      </pattern>
      <pattern weight="1.625228556310039">
        <pattern-item tags="adj.sint"/>
        <pattern-item lemma="software" tags="n.sg"/>
      </pattern>
      <pattern weight="1.625228556310039">
        <pattern-item lemma="new" tags="adj.sint"/>
        <pattern-item tags="n.sg"/>
      </pattern>
      <pattern weight="1.625228556310039">
        <pattern-item lemma="new" tags="adj.sint"/>
        <pattern-item lemma="software" tags="n.sg"/>
      </pattern>
    </rule>
    <rule comment="REGLA: ADJ NOM" id="2" md5="8eed4b8aee5567fcfebc0de7698f4bdb">
      <pattern weight="0.3747714436899609">
        <pattern-item tags="adj.sint"/>
        <pattern-item tags="n.sg"/>
      </pattern>
      <pattern weight="0.3747714436899609">
        <pattern-item tags="adj.sint"/>
        <pattern-item lemma="software" tags="n.sg"/>
      </pattern>
      <pattern weight="0.3747714436899609">
        <pattern-item lemma="new" tags="adj.sint"/>
        <pattern-item tags="n.sg"/>
      </pattern>
      <pattern weight="0.3747714436899609">
        <pattern-item lemma="new" tags="adj.sint"/>
        <pattern-item lemma="software" tags="n.sg"/>
      </pattern>
    </rule>
  </rule-group>
  <rule-group>
    <rule comment="REGLA: DET ADJ NOM no-swap-version" id="3" md5="05d8b437ee595c7d0c992c5ae066a199">
      <pattern weight="0.9844006834162787">
        <pattern-item tags="det.dem.sg"/>
        <pattern-item tags="adj.sint"/>
        <pattern-item tags="n.sg"/>
      </pattern>
      <pattern weight="0.9844006834162787">
        <pattern-item tags="det.dem.sg"/>
        <pattern-item tags="adj.sint"/>
        <pattern-item lemma="software" tags="n.sg"/>
      </pattern>
      <pattern weight="0.9844006834162787">
        <pattern-item tags="det.dem.sg"/>
        <pattern-item lemma="new" tags="adj.sint"/>
        <pattern-item tags="n.sg"/>
      </pattern>
      <pattern weight="0.9844006834162787">
        <pattern-item tags="det.dem.sg"/>
        <pattern-item lemma="new" tags="adj.sint"/>
        <pattern-item lemma="software" tags="n.sg"/>
      </pattern>
      <pattern weight="0.9376183345269524">
        <pattern-item tags="det.pos.sp"/>
        <pattern-item tags="adj"/>
        <pattern-item lemma="code" tags="n.sg"/>
      </pattern>
      <pattern weight="0.9376183345269524">
        <pattern-item tags="det.pos.sp"/>
        <pattern-item tags="adj"/>
        <pattern-item tags="n.sg"/>
      </pattern>
      <pattern weight="0.9376183345269524">
        <pattern-item tags="det.pos.sp"/>
        <pattern-item lemma="own" tags="adj"/>
        <pattern-item lemma="code" tags="n.sg"/>
      </pattern>
      <pattern weight="0.9376183345269524">
        <pattern-item tags="det.pos.sp"/>
        <pattern-item lemma="own" tags="adj"/>
        <pattern-item tags="n.sg"/>
      </pattern>
      <pattern weight="0.9376183345269524">
        <pattern-item lemma="its" tags="det.pos.sp"/>
        <pattern-item tags="adj"/>
        <pattern-item lemma="code" tags="n.sg"/>
      </pattern>
      <pattern weight="0.9376183345269524">
        <pattern-item lemma="its" tags="det.pos.sp"/>
        <pattern-item tags="adj"/>
        <pattern-item tags="n.sg"/>
      </pattern>
      <pattern weight="0.9376183345269524">
        <pattern-item lemma="its" tags="det.pos.sp"/>
        <pattern-item lemma="own" tags="adj"/>
        <pattern-item lemma="code" tags="n.sg"/>
      </pattern>
      <pattern weight="0.9376183345269524">
        <pattern-item lemma="its" tags="det.pos.sp"/>
        <pattern-item lemma="own" tags="adj"/>
        <pattern-item tags="n.sg"/>
      </pattern>
      <pattern weight="0.9844006834162787">
        <pattern-item lemma="this" tags="det.dem.sg"/>
        <pattern-item tags="adj.sint"/>
        <pattern-item tags="n.sg"/>
      </pattern>
      <pattern weight="0.9844006834162787">
        <pattern-item lemma="this" tags="det.dem.sg"/>
        <pattern-item tags="adj.sint"/>
        <pattern-item lemma="software" tags="n.sg"/>
      </pattern>
      <pattern weight="0.9844006834162787">
        <pattern-item lemma="this" tags="det.dem.sg"/>
        <pattern-item lemma="new" tags="adj.sint"/>
        <pattern-item tags="n.sg"/>
      </pattern>
      <pattern weight="0.9844006834162787">
        <pattern-item lemma="this" tags="det.dem.sg"/>
        <pattern-item lemma="new" tags="adj.sint"/>
        <pattern-item lemma="software" tags="n.sg"/>
      </pattern>
    </rule>
    <rule comment="REGLA: DET ADJ NOM" id="4" md5="87fb69c4cd8792f06e0b51c6fd79f127">
      <pattern weight="0.0155993165837215">
        <pattern-item tags="det.dem.sg"/>
        <pattern-item tags="adj.sint"/>
        <pattern-item tags="n.sg"/>
      </pattern>
      <pattern weight="0.0155993165837215">
        <pattern-item tags="det.dem.sg"/>
        <pattern-item tags="adj.sint"/>
        <pattern-item lemma="software" tags="n.sg"/>
      </pattern>
      <pattern weight="0.0155993165837215">
        <pattern-item tags="det.dem.sg"/>
        <pattern-item lemma="new" tags="adj.sint"/>
        <pattern-item tags="n.sg"/>
      </pattern>
      <pattern weight="0.0155993165837215">
        <pattern-item tags="det.dem.sg"/>
        <pattern-item lemma="new" tags="adj.sint"/>
        <pattern-item lemma="software" tags="n.sg"/>
      </pattern>
      <pattern weight="0.06238166547304746">
        <pattern-item tags="det.pos.sp"/>
        <pattern-item tags="adj"/>
        <pattern-item lemma="code" tags="n.sg"/>
      </pattern>
      <pattern weight="0.06238166547304746">
        <pattern-item tags="det.pos.sp"/>
        <pattern-item tags="adj"/>
        <pattern-item tags="n.sg"/>
      </pattern>
      <pattern weight="0.06238166547304746">
        <pattern-item tags="det.pos.sp"/>
        <pattern-item lemma="own" tags="adj"/>
        <pattern-item lemma="code" tags="n.sg"/>
      </pattern>
      <pattern weight="0.06238166547304746">
        <pattern-item tags="det.pos.sp"/>
        <pattern-item lemma="own" tags="adj"/>
        <pattern-item tags="n.sg"/>
      </pattern>
      <pattern weight="0.06238166547304746">
        <pattern-item lemma="its" tags="det.pos.sp"/>
        <pattern-item tags="adj"/>
        <pattern-item lemma="code" tags="n.sg"/>
      </pattern>
      <pattern weight="0.06238166547304746">
        <pattern-item lemma="its" tags="det.pos.sp"/>
        <pattern-item tags="adj"/>
        <pattern-item tags="n.sg"/>
      </pattern>
      <pattern weight="0.06238166547304746">
        <pattern-item lemma="its" tags="det.pos.sp"/>
        <pattern-item lemma="own" tags="adj"/>
        <pattern-item lemma="code" tags="n.sg"/>
      </pattern>
      <pattern weight="0.06238166547304746">
        <pattern-item lemma="its" tags="det.pos.sp"/>
        <pattern-item lemma="own" tags="adj"/>
        <pattern-item tags="n.sg"/>
      </pattern>
      <pattern weight="0.0155993165837215">
        <pattern-item lemma="this" tags="det.dem.sg"/>
        <pattern-item tags="adj.sint"/>
        <pattern-item tags="n.sg"/>
      </pattern>
      <pattern weight="0.0155993165837215">
        <pattern-item lemma="this" tags="det.dem.sg"/>
        <pattern-item tags="adj.sint"/>
        <pattern-item lemma="software" tags="n.sg"/>
      </pattern>
      <pattern weight="0.0155993165837215">
        <pattern-item lemma="this" tags="det.dem.sg"/>
        <pattern-item lemma="new" tags="adj.sint"/>
        <pattern-item tags="n.sg"/>
      </pattern>
      <pattern weight="0.0155993165837215">
        <pattern-item lemma="this" tags="det.dem.sg"/>
        <pattern-item lemma="new" tags="adj.sint"/>
        <pattern-item lemma="software" tags="n.sg"/>
      </pattern>
    </rule>
  </rule-group>
</transfer-weights>