Difference between revisions of "User:Wei2912"
(→Conversion of PDF dictionary to lttoolbox format: Update dix format) |
|||
Line 21: | Line 21: | ||
A report comparing the above method, LRLM and RLLM (longest left to right matching and longest right to left matching respectively) is available at https://www.dropbox.com/sh/57wtof3gbcbsl7c/AABI-Mcw2E-c942BXxsMbEAja |
A report comparing the above method, LRLM and RLLM (longest left to right matching and longest right to left matching respectively) is available at https://www.dropbox.com/sh/57wtof3gbcbsl7c/AABI-Mcw2E-c942BXxsMbEAja |
||
== Conversion of |
== Conversion of Sakha-English dictionary to lttoolbox format == |
||
'''NOTE: This document is a draft.''' |
'''NOTE: This document is a draft.''' |
||
Line 29: | Line 29: | ||
We copy the text directly from the PDF file, as PDF to text converters are currently unable to convert the text properly (thanks to the arcane PDF format). |
We copy the text directly from the PDF file, as PDF to text converters are currently unable to convert the text properly (thanks to the arcane PDF format). |
||
Then, we |
Then, we obtain the script for converting our dictionary: |
||
<pre> |
<pre> |
||
svn co https://svn.code.sf.net/p/apertium/svn/trunk/dixscrapers/ |
|||
#!/usr/bin/python3 |
|||
cat orig.txt | dixscrapers/sakhadic2dix.py > sakhadic.xml |
|||
# -*- coding: utf-8 -*- |
|||
import fileinput |
|||
import itertools |
|||
import re |
|||
import xml.etree.cElementTree as ET |
|||
# A parenthesised "(...)" or square-bracketed "[...]" aside (non-greedy, so
# each bracket pair is matched separately).
BRACKETS_RE = re.compile(r'(\(.+?\)|\[.+?\])')

# A line made up of digits only; such lines are treated as page numbers.
PAGENUMBER_RE = re.compile(r'^\d+$')

# Separator between alternatives: ";" or "," followed by whitespace.
SPLIT_RE = re.compile(r'[;,]\s+')

# Dictionary abbreviation -> list of symbol names emitted as <s n="..."/>.
# An empty list means the abbreviation is recognised (it still marks the end
# of the headword) but contributes no symbol of its own.
#
# Not listed here:
#   "cf."   (see also)  -- has been wiped out during preprocessing
#   "conv." (converb, modifying verb) -- handled separately by Entry
ABBRVS = {
    "a.": ["adj"],
    "adv.": ["adv"],
    "arch.": [],
    "comp.": [],
    "dial.": [],
    "det.": ["det"],
    "Evk.": [],
    "exc.": ["ij"],
    "int.": ["itg"],
    "Mongo.": [],
    "n.": ["n"],
    "num.": ["det", "qnt"],
    "ono.": [],
    "pl.": ["pl"],
    "pp.": ["post"],
    "pro.": ["prn"],
    "Russ.": [],
    "v.": ["v", "TD"],
}
|||
class Entry(object):
    """One dictionary line split into headwords, symbols and translations.

    Attributes set by the constructor:

    * ``words``    -- list of headword alternatives (non-empty strings)
    * ``abbrvs``   -- list of symbol names collected from the abbreviations
    * ``meanings`` -- list of translation alternatives (non-empty strings)

    When the line yields no symbol at all, the three attributes are ``None``.
    """

    def __split(self, line):
        """Split *line* on ";" or "," followed by whitespace."""
        return SPLIT_RE.split(line)

    @staticmethod
    def _strip_infinitive(meaning):
        """Return *meaning* without a leading infinitive marker ("to ").

        Only the marker at the very start is removed, so "to" inside a word
        ("tomato") or inside the phrase ("go to school") is left intact.
        """
        if meaning.startswith("to "):
            return meaning[3:].strip()
        return meaning

    def __init__(self, line):
        """Parse a whitespace-separated dictionary *line*.

        Tokens before the first abbreviation form the headword; tokens after
        it form the translation. Abbreviations themselves (anywhere in the
        line) are mapped to symbols through ``ABBRVS``; "conv." adds the
        symbol "vaux".
        """
        words = []
        abbrvs = []
        meanings = []
        found_abbrv = False
        found_conv = False

        for tag in line.split():
            if tag in ABBRVS:
                # Recognised abbreviation (possibly one that adds no symbol).
                found_abbrv = True
                abbrvs.extend(ABBRVS[tag])
            elif tag == "conv.":
                found_abbrv = True
                found_conv = True
                abbrvs.append("vaux")
            elif found_abbrv:
                meanings.append(tag)
            else:
                words.append(tag)

        # Everything from a "cf" cross-reference onwards is not part of the
        # headword.
        if "cf" in words:
            words = words[:words.index("cf")]

        # For a converb only the last headword token is kept. Slicing (rather
        # than indexing) keeps this safe when there is no headword token.
        if found_conv:
            words = words[-1:]

        if not abbrvs:
            self.words = None
            self.abbrvs = None
            self.meanings = None
            return

        headword = strip_brackets(" ".join(words))
        translation = strip_brackets(" ".join(meanings))

        self.abbrvs = abbrvs

        # Split into alternatives and drop empty pieces, so that an empty
        # headword or translation never becomes an entry with a blank side.
        pieces = (x.strip() for x in self.__split(headword))
        self.words = [x for x in pieces if x]

        pieces = (self._strip_infinitive(x.strip())
                  for x in self.__split(translation))
        self.meanings = [x for x in pieces if x]
|||
def insert_blanks(element, line):
    """Fill *element* with the words of *line*, joined by ``<b/>`` children.

    The first word becomes the element's text; every following word becomes
    the tail of a freshly appended ``<b>`` sub-element. The element's own
    tail is cleared. A *line* without any word leaves *element* untouched.
    """
    tokens = line.split()
    if not tokens:
        return

    head, rest = tokens[0], tokens[1:]
    element.text = head
    element.tail = None
    for token in rest:
        ET.SubElement(element, 'b').tail = token
|||
def is_page_num(line):
    """Return the match object if *line* is a run of digits, else ``None``.

    Callers use the result only for its truthiness.
    """
    page_match = PAGENUMBER_RE.match(line)
    return page_match
|||
def strip_brackets(line):
    """Return *line* with every "(...)" and "[...]" aside removed.

    Substitution covers all matches of ``BRACKETS_RE``; a single ``search``
    would only find the first bracketed span and leave later, different
    spans in the text.
    """
    return BRACKETS_RE.sub("", line)
|||
def is_cyrillic(word):
    """Return True when most characters of *word* are Cyrillic.

    A character counts as Cyrillic when its code point lies in
    U+0400..U+04FF. Every other character (Latin letters, digits,
    punctuation) counts against. A tie, including the empty string,
    yields False.
    """
    cyrillic = sum(1 for ch in word if 0x0400 <= ord(ch) <= 0x04FF)
    other = len(word) - cyrillic
    return cyrillic > other
|||
def _split_on_semicolon(words):
    """Find the first semicolon in *words* that starts a new entry.

    Returns ``(head, remainder)`` as two strings, or ``None`` when no split
    applies. A token ending in ";" triggers a split when the following token
    is either

    * Cyrillic -- the remainder is a separate dictionary entry, or
    * an abbreviation-like token ending in "." -- the remainder is another
      sense of the same entry, so it is prefixed with the tokens that
      preceded the most recent token ending in ".".

    The ";" is removed from the last token of *head*. The final token of
    *words* is never examined because it has no successor.
    """
    headword = ""
    for j, word in enumerate(words[:-1]):
        next_word = words[j + 1]

        if word.endswith("."):
            headword = " ".join(words[:j])

        if not word.endswith(";"):
            continue

        if is_cyrillic(next_word):
            prefix = ""
        elif next_word.endswith("."):
            prefix = headword + " "
        else:
            continue

        head = " ".join(words[:j] + [word.replace(";", "")])
        return head, prefix + " ".join(words[j + 1:])

    return None


def preprocess(lines):
    """Clean raw dictionary lines and regroup them into one entry per line.

    Each line is stripped of bullets, "=" signs and bracketed asides, and
    "cf." is rewritten to "cf". Empty lines and page numbers are dropped.
    A line is merged into the following one when it consists of a single
    token or when the following line does not start with a Cyrillic token.
    Otherwise it is split at a semicolon that introduces a new entry or a
    new sense (see ``_split_on_semicolon``), and the remainder is queued as
    the next line.

    *lines* may be any iterable of strings; it is copied, so the caller's
    sequence is never modified. Returns a list of cleaned, non-empty lines.
    """

    def preprocess_line(line):
        if not line:
            return None

        line = line.replace("•", "")
        line = line.replace("=", "")
        line = line.replace("cf.", "cf")
        # Strip last: removing symbols can leave only whitespace behind,
        # which must count as an empty line.
        line = strip_brackets(line).strip()

        if not line or is_page_num(line):
            return None
        return line

    # Work on a private copy: the loop rewrites and inserts queued lines.
    queue = list(lines)
    new_lines = []

    i = 0
    while i < len(queue):
        line = preprocess_line(queue[i])
        i += 1  # from here on, i is the index of the following line
        if not line:
            continue

        # Tokenise every line, including the last, so the semicolon split
        # below always sees the tokens of the current line.
        words = line.split()

        # Check whether this line should be merged with the next one.
        if i < len(queue):
            next_line = preprocess_line(queue[i])
            if next_line and (len(words) == 1 or
                              not is_cyrillic(next_line.split()[0])):
                queue[i] = line + " " + next_line
                continue

        split = _split_on_semicolon(words)
        if split is not None:
            line, remainder = split
            # The remainder is handled on the next iteration.
            queue.insert(i, remainder)

        line = line.strip()
        if line:
            new_lines.append(line)

    return new_lines
|||
def main():
    """Convert dictionary text into an lttoolbox-style XML dump.

    Input lines come from ``fileinput.input()``. Every preprocessed line is
    recorded as an XML comment in the "main" section. If the line parses
    into headwords, symbols and translations, one ``<e><p><l/><r/></p></e>``
    element is added per (headword, translation) pair. The tree is written
    with ``ET.dump``.
    """
    dictionary = ET.Element("dictionary")
    section = ET.SubElement(dictionary, "section")
    section.set("id", "main")
    section.set("type", "standard")

    for line in preprocess(list(fileinput.input())):
        # Keep the source line next to the generated entries for review.
        section.append(ET.Comment(text=line))

        entry = Entry(line)
        if not (entry.words and entry.abbrvs and entry.meanings):
            continue

        for word, meaning in itertools.product(entry.words, entry.meanings):
            e = ET.SubElement(section, "e")
            p = ET.SubElement(e, 'p')

            # add word and meaning
            left = ET.SubElement(p, 'l')
            insert_blanks(left, word)
            right = ET.SubElement(p, 'r')
            insert_blanks(right, meaning)

            # add abbreviations; each side gets its own <s> element instead
            # of one element object shared by two parents
            for abbrv in entry.abbrvs:
                for side in (left, right):
                    ET.SubElement(side, 's').set('n', abbrv)

    ET.dump(dictionary)


# Run the conversion only when executed as a script, not on import.
if __name__ == "__main__":
    main()
|||
</pre> |
</pre> |
||
Line 257: | Line 39: | ||
<pre> |
<pre> |
||
$ xmllint --format --encode utf8 |
$ xmllint --format --encode utf8 sakhadic.xml > sakhadic.dix |
||
</pre> |
</pre> |
||
The `--encode utf8` option prevents `xmllint` from escaping our unicode. |
The `--encode utf8` option prevents `xmllint` from escaping our unicode. |
||
The final file format |
The final file format should look like this: |
||
<pre> |
<pre> |
||
Line 268: | Line 50: | ||
<dictionary> |
<dictionary> |
||
<section id="main" type="standard"> |
<section id="main" type="standard"> |
||
<!--аа exc. Oh! See!--> |
|||
<e> |
<e> |
||
<!--аа exc. Oh! See!--> |
|||
<p> |
<p> |
||
<l>аа<s n="ij"/></l> |
<l>аа<s n="ij"/></l> |
||
Line 275: | Line 57: | ||
</p> |
</p> |
||
</e> |
</e> |
||
... |
|||
<!--ааҕыс v. to reckon with--> |
|||
</section> |
|||
</dictionary> |
|||
<p> |
|||
<l>ааҕыс<s n="v"/><s n="TD"/></l> |
|||
<r>reckon<b/>with<s n="v"/><s n="TD"/></r> |
|||
</p> |
|||
</e> |
|||
... |
|||
</pre> |
</pre> |
Revision as of 08:56, 5 December 2014
My name is Wei En and I'm currently a GCI student. My blog is at http://wei2912.github.io.
I decided to help out at Apertium because I find the work here quite interesting and I believe Apertium will benefit many.
The following are projects related to Apertium.
Wiktionary Crawler
https://github.com/wei2912/WiktionaryCrawler is a crawler for Wiktionary which aims to extract data from pages. It was created for a GCI task which you can read about at Task ideas for Google Code-in/Scrape inflection information from Wiktionary.
The crawler crawls a starting category (usually Category:XXX language) for subcategories, then crawls these subcategories for pages. It then passes each page to language-specific parsers which turn it into the Speling format.
The current languages supported are Chinese (zh), Thai (th) and Lao (lo). You are welcome to contribute to this project.
Spaceless Segmentation
Spaceless Segmentation has been merged into Apertium under https://svn.code.sf.net/p/apertium/svn/branches/tokenisation. It serves to tokenize languages without any whitespace. More information can be found under Task ideas for Google Code-in/Tokenisation for spaceless orthographies.
The tokeniser looks for possible tokenisations in the corpus text and selects the tokenisation whose tokens appear most frequently in the corpus.
A report comparing the above method, LRLM and RLLM (longest left to right matching and longest right to left matching respectively) is available at https://www.dropbox.com/sh/57wtof3gbcbsl7c/AABI-Mcw2E-c942BXxsMbEAja
Conversion of Sakha-English dictionary to lttoolbox format
NOTE: This document is a draft.
In this example we're converting the following PDF file: http://home.uchicago.edu/straughn/sakhadic.pdf
We copy the text directly from the PDF file, as PDF to text converters are currently unable to convert the text properly (thanks to the arcane PDF format).
Then, we obtain the script for converting our dictionary:
svn co https://svn.code.sf.net/p/apertium/svn/trunk/dixscrapers/
cat orig.txt | dixscrapers/sakhadic2dix.py > sakhadic.xml
This will give us an XML dump of the dictionary, converted to the lttoolbox format. We format the XML file as shown here:
$ xmllint --format --encode utf8 sakhadic.xml > sakhadic.dix
The `--encode utf8` option prevents `xmllint` from escaping our unicode.
The final file format should look like this:
<?xml version="1.0" encoding="utf-8"?>
<dictionary>
  <section id="main" type="standard">
    <e>
      <!--аа exc. Oh! See!-->
      <p>
        <l>аа<s n="ij"/></l>
        <r>Oh!<b/>See!<s n="ij"/></r>
      </p>
    </e>
    ...
  </section>
</dictionary>