Difference between revisions of "User:Wei2912"
(→Conversion of PDF dictionary to lttoolbox format: Add final version of document if all goes well) |
|||
| Line 47: | Line 47: | ||
'a.': ['adj'], |
'a.': ['adj'], |
||
'adv.': ['adv'], |
'adv.': ['adv'], |
||
'arch.': [], |
|||
# cf. see also |
# cf. see also -- has been wiped out |
||
'comp.': [], |
|||
# conv. converb, modifying verb |
# conv. converb, modifying verb -- covered later |
||
'dial.': [], |
|||
'det.': ['det'], |
'det.': ['det'], |
||
'Evk.': [], |
|||
'exc.': ['ij'], |
'exc.': ['ij'], |
||
'int.': ['itg'], |
'int.': ['itg'], |
||
'Mongo.': [], |
|||
'n.': ['n'], |
'n.': ['n'], |
||
'num.': ['det', 'qnt'], |
'num.': ['det', 'qnt'], |
||
'ono.': [], |
|||
'pl.': ['pl'], |
'pl.': ['pl'], |
||
'pp.': ['post'], |
'pp.': ['post'], |
||
'pro.': ['prn'], |
'pro.': ['prn'], |
||
'Russ.': [], |
|||
'v.': ['v', 'TD'] |
'v.': ['v', 'TD'] |
||
} |
} |
||
| ⚫ | |||
| ⚫ | |||
| ⚫ | |||
| ⚫ | |||
| ⚫ | |||
| ⚫ | |||
| ⚫ | |||
| ⚫ | |||
| ⚫ | |||
| ⚫ | |||
| ⚫ | |||
| ⚫ | |||
| ⚫ | |||
| ⚫ | |||
| ⚫ | |||
| ⚫ | |||
| ⚫ | |||
| ⚫ | |||
class Entry(object): |
class Entry(object): |
||
| Line 144: | Line 123: | ||
self.words = [x.strip() for x in self.__split(self.words)] |
self.words = [x.strip() for x in self.__split(self.words)] |
||
self.meanings = [x.strip() for x in self.__split(self.meanings)] |
self.meanings = [x.strip() for x in self.__split(self.meanings)] |
||
| ⚫ | |||
| ⚫ | |||
| ⚫ | |||
| ⚫ | |||
| ⚫ | |||
| ⚫ | |||
| ⚫ | |||
| ⚫ | |||
| ⚫ | |||
| ⚫ | |||
| ⚫ | |||
| ⚫ | |||
| ⚫ | |||
| ⚫ | |||
| ⚫ | |||
| ⚫ | |||
| ⚫ | |||
| ⚫ | |||
def is_cyrillic(word): |
def is_cyrillic(word): |
||
| Line 157: | Line 157: | ||
def preprocess(lines): |
def preprocess(lines): |
||
def preprocess_line(line): |
|||
| ⚫ | |||
| ⚫ | |||
| ⚫ | |||
if not line: |
if not line: |
||
return None |
|||
line = line.strip() |
line = line.strip() |
||
line = line.replace("•", "") |
line = line.replace("•", "") |
||
line = line.replace("=", "") |
line = line.replace("=", "") |
||
line = line.replace("cf.", "cf") |
line = line.replace("cf.", "cf") |
||
| ⚫ | |||
if not line or is_page_num(line): |
if not line or is_page_num(line): |
||
return None |
|||
return line |
|||
| ⚫ | |||
| ⚫ | |||
line = preprocess_line(line) |
|||
if not line: |
|||
continue |
continue |
||
| Line 173: | Line 178: | ||
if i+1 < len(lines): |
if i+1 < len(lines): |
||
words = line.split() |
words = line.split() |
||
next_line = preprocess_line(lines[i+1]) |
|||
if next_line: |
|||
if (len(words) == 1 or |
|||
not is_cyrillic(next_line.split()[0])): |
|||
lines[i+1] = line + " " + next_line |
|||
continue |
|||
orig_word = "" |
orig_word = "" |
||
Revision as of 11:19, 3 December 2014
My name is Wei En and I'm currently a GCI student. My blog is at http://wei2912.github.io.
I decided to help out at Apertium because I find the work here quite interesting and I believe Apertium will benefit many.
The following are projects related to Apertium.
Wiktionary Crawler
https://github.com/wei2912/WiktionaryCrawler is a crawler for Wiktionary which aims to extract data from pages. It was created for a GCI task which you can read about at Task ideas for Google Code-in/Scrape inflection information from Wiktionary.
The crawler crawls a starting category (usually Category:XXX language) for subcategories, then crawls these subcategories for pages. It then passes each page to language-specific parsers which turn it into the Speling format.
The current languages supported are Chinese (zh), Thai (th) and Lao (lo). You are welcome to contribute to this project.
Spaceless Segmentation
Spaceless Segmentation has been merged into Apertium under https://svn.code.sf.net/p/apertium/svn/branches/tokenisation. It serves to tokenize languages without any whitespace. More information can be found under Task ideas for Google Code-in/Tokenisation for spaceless orthographies.
The tokeniser looks for possible tokenisations in the corpus text and selects the tokenisation whose tokens appear most frequently in the corpus.
A report comparing the above method, LRLM and RLLM (longest left to right matching and longest right to left matching respectively) is available at https://www.dropbox.com/sh/57wtof3gbcbsl7c/AABI-Mcw2E-c942BXxsMbEAja
Conversion of PDF dictionary to lttoolbox format
NOTE: This document is a draft.
In this example we're converting the following PDF file: http://home.uchicago.edu/straughn/sakhadic.pdf
We copy the text directly from the PDF file, as PDF to text converters are currently unable to convert the text properly (thanks to the arcane PDF format).
Then, we pipe the text to our script:
#!/usr/bin/python3
# -*- coding: utf-8 -*-
import fileinput
import itertools
import re
import xml.etree.cElementTree as ET
# Matches one parenthesised "(...)" or square-bracketed "[...]" span
# (non-greedy, so the match stops at the first closing bracket).
BRACKETS_RE = re.compile(r'(\(.+?\)|\[.+?\])')
# Matches a line consisting only of digits; used to detect page numbers.
PAGENUMBER_RE = re.compile(r'^\d+$')
# Separator between alternatives: a comma or semicolon followed by whitespace.
SPLIT_RE = re.compile(r'[;,]\s+')
# Maps each abbreviation token found in the dictionary text to the list of
# symbol names that Entry collects and main() writes out as <s n="..."/>.
# An empty list means the abbreviation is recognised (it still marks the end
# of the headword part of a line) but contributes no symbol.
ABBRVS = {
    'a.': ['adj'],
    'adv.': ['adv'],
    'arch.': [],
    # cf. see also -- has been wiped out
    'comp.': [],
    # conv. converb, modifying verb -- covered later
    'dial.': [],
    'det.': ['det'],
    'Evk.': [],
    'exc.': ['ij'],
    'int.': ['itg'],
    'Mongo.': [],
    'n.': ['n'],
    'num.': ['det', 'qnt'],
    'ono.': [],
    'pl.': ['pl'],
    'pp.': ['post'],
    'pro.': ['prn'],
    'Russ.': [],
    'v.': ['v', 'TD']
}
class Entry(object):
    """One preprocessed dictionary line split into headwords, tags, meanings.

    After construction:

    * ``words``    -- list of headword strings (left-hand side),
    * ``abbrvs``   -- list of symbol names looked up in ``ABBRVS`` (plus
      ``"vaux"`` when the line contains ``conv.``),
    * ``meanings`` -- list of translation strings (right-hand side).

    If the line yields no symbol at all, all three attributes are ``None``.
    """

    def __split(self, line):
        """Split *line* on commas/semicolons that are followed by whitespace."""
        return SPLIT_RE.split(line)

    def __init__(self, line):
        """Parse *line*, a whitespace-separated dictionary line."""
        self.words = []
        self.abbrvs = []
        self.meanings = []

        found_abbrv = False
        found_conv = False
        for tag in line.split():
            if tag in ABBRVS:  # abbreviations
                found_abbrv = True
                self.abbrvs.extend(ABBRVS[tag])
            elif tag == "conv.":
                found_abbrv = True
                found_conv = True
                self.abbrvs.append("vaux")
            elif not found_abbrv:  # entries: everything before the first tag
                self.words.append(tag)
            else:  # translation: everything after the first tag
                self.meanings.append(tag)

        # if there's "cf" among the headwords, trim it and everything after it
        if "cf" in self.words:
            self.words = self.words[:self.words.index("cf")]

        # if there's a converb, just look at the last word; a line that
        # starts with "conv." has no headword, so fall back to an empty one
        # instead of raising IndexError
        if found_conv:
            self.words = self.words[-1] if self.words else ""
        else:
            self.words = " ".join(self.words)
        self.meanings = " ".join(self.meanings)

        self.words = strip_brackets(self.words)
        self.meanings = strip_brackets(self.meanings)

        if not self.abbrvs:
            self.words = None
            self.abbrvs = None
            self.meanings = None
            return

        # split up meanings and entries, discarding empty alternatives
        self.words = [
            w for w in (x.strip() for x in self.__split(self.words)) if w
        ]
        meanings = []
        for meaning in self.__split(self.meanings):
            meaning = meaning.strip()
            # Drop only a leading infinitive marker ("to reckon with" ->
            # "reckon with"); a blanket replace("to", "") would also damage
            # words that merely contain "to", such as "stone".
            if meaning.startswith("to "):
                meaning = meaning[3:].strip()
            if meaning:
                meanings.append(meaning)
        self.meanings = meanings
def insert_blanks(element, line):
    """Fill *element* with the words of *line*, separated by ``<b>`` children.

    The first word becomes the element's text; each following word is stored
    as the tail of a freshly appended ``<b>`` sub-element.  A line with no
    words leaves *element* untouched.
    """
    tokens = line.split()
    if tokens:
        head, *rest = tokens
        element.text = head
        element.tail = None
        for token in rest:
            ET.SubElement(element, 'b').tail = token
def is_page_num(line):
    """Return the ``PAGENUMBER_RE`` match for *line*, or ``None``.

    The result is truthy exactly when *line* consists of digits only.
    """
    page_match = PAGENUMBER_RE.match(line)
    return page_match
def strip_brackets(line):
    """Return *line* with every ``(...)`` and ``[...]`` span removed.

    ``BRACKETS_RE.search`` only reports the first bracketed span, so relying
    on it left any later, different bracket groups in place; ``sub`` removes
    all of them in one pass.  Surrounding whitespace is left as it is.
    """
    return BRACKETS_RE.sub("", line)
def is_cyrillic(word):
    """Return True when Cyrillic characters outnumber all others in *word*.

    A character counts as Cyrillic when its code point lies in
    U+0400..U+04FF.  Ties (including the empty string) yield False.
    """
    cyrillic_count = sum(1 for ch in word if 0x0400 <= ord(ch) <= 0x04FF)
    other_count = len(word) - cyrillic_count
    return cyrillic_count > other_count
def preprocess(lines):
    """Clean raw dictionary text and regroup it into one entry per line.

    Each raw line is cleaned (bullets, "=" signs and bracketed spans removed,
    "cf." rewritten to "cf", page-number lines dropped).  A line is merged
    with the following one when it holds a single word or when the following
    line does not start with a Cyrillic word.  A line is split at a semicolon
    when the word after it is Cyrillic (a new entry) or ends with "." (a
    further abbreviation for the same headword).

    *lines* is not modified; the function works on a private copy.
    Returns the list of resulting non-empty lines.
    """
    def preprocess_line(line):
        """Return the cleaned line, or None if nothing usable is left."""
        if not line:
            return None

        line = line.strip()
        line = line.replace("•", "")
        line = line.replace("=", "")
        line = line.replace("cf.", "cf")
        line = strip_brackets(line)
        # Removing bullets and brackets can leave edge whitespace, or nothing
        # but whitespace; strip again so callers never see a blank line
        # (which would make `.split()[0]` below raise IndexError).
        line = line.strip()

        if not line or is_page_num(line):
            return None
        return line

    # Work on a copy: merged and split-off lines are written back into the
    # queue so that they get processed in turn.
    queue = list(lines)
    new_lines = []
    idx = 0
    while idx < len(queue):
        line = preprocess_line(queue[idx])
        idx += 1  # idx now refers to the line following the current one
        if not line:
            continue

        # Needed by both the merge check and the semicolon scan, so compute
        # it unconditionally (also for the very last line).
        words = line.split()

        # check if next line should be merged with this line
        if idx < len(queue):
            next_line = preprocess_line(queue[idx])
            if next_line and (len(words) == 1 or
                              not is_cyrillic(next_line.split()[0])):
                queue[idx] = line + " " + next_line
                continue

        orig_word = ""
        # The last word has no successor, so only look at adjacent pairs.
        for j, (word, next_word) in enumerate(zip(words, words[1:])):
            if word.endswith("."):
                # remember the headword part preceding this abbreviation
                orig_word = " ".join(words[:j])

            if word.endswith(";"):
                # if semicolon separates dictionary entries
                if is_cyrillic(next_word):
                    words[j] = word.replace(";", "")
                    line = " ".join(words[:j+1])
                    next_line = " ".join(words[j+1:])
                    queue.insert(idx, next_line)
                    break
                # if semicolon separates abbreviations
                elif next_word.endswith("."):
                    words[j] = word.replace(";", "")
                    line = " ".join(words[:j+1])
                    next_line = orig_word + " " + " ".join(words[j+1:])
                    queue.insert(idx, next_line)
                    break

        line = line.strip()
        if line:
            new_lines.append(line)

    return new_lines
def main():
    """Convert dictionary text from ``fileinput`` into an XML dump on stdout.

    Every preprocessed line is written as an XML comment inside
    ``<pardefs>``; for each headword/meaning combination of a parsed line an
    ``<e r="LR">`` element with ``<l>``/``<r>`` sides and one ``<s>`` per
    symbol is appended after it.
    """
    root = ET.Element("dictionary")
    pardefs = ET.SubElement(root, "pardefs")

    raw_lines = list(fileinput.input())
    for entry_line in preprocess(raw_lines):
        # keep the source line visible next to the entries generated from it
        pardefs.append(ET.Comment(text=entry_line))

        parsed = Entry(entry_line)
        if not parsed.words or not parsed.abbrvs or not parsed.meanings:
            continue

        pairs = itertools.product(parsed.words, parsed.meanings)
        for headword, gloss in pairs:
            pair = ET.SubElement(ET.SubElement(pardefs, "e", r="LR"), 'p')

            # add word and meaning
            left = ET.SubElement(pair, 'l')
            insert_blanks(left, headword)
            right = ET.SubElement(pair, 'r')
            insert_blanks(right, gloss)

            # add abbreviations: the same symbol goes on both sides
            for symbol in parsed.abbrvs:
                tag = ET.Element('s')
                tag.set('n', symbol)
                left.append(tag)
                right.append(tag)

    ET.dump(root)
# Script entry point: the conversion runs as soon as this file is executed.
main()
This will give us an XML dump of the dictionary, converted to the lttoolbox format. We format the XML file as shown here:
$ xmllint --format --encode utf8 file.xml > file.dix
The `--encode utf8` option prevents `xmllint` from escaping our unicode.
The final file format looks like this:
<?xml version="1.0" encoding="utf-8"?>
<dictionary>
<pardefs>
<!--аа exc. Oh! See!-->
<e r="LR">
<p>
<l>аа<s n="ij"/></l>
<r>Oh!<b/>See!<s n="ij"/></r>
</p>
</e>
<!--ааҕыс v. to reckon with-->
<e r="LR">
<p>
<l>ааҕыс<s n="v"/><s n="TD"/></l>
<r>reckon<b/>with<s n="v"/><s n="TD"/></r>
</p>
</e>
<!--аайы a. each, every-->
<e r="LR">
<p>
<l>аайы<s n="adj"/></l>
<r>each<s n="adj"/></r>
</p>
</e>
<e r="LR">
<p>
<l>аайы<s n="adj"/></l>
<r>every<s n="adj"/></r>
</p>
</e>
<!--күн аайы every day-->
<!--аак cf аах n. document, paper-->
<e r="LR">
<p>
<l>аак<s n="n"/></l>
<r>document<s n="n"/></r>
</p>
</e>
<e r="LR">
<p>
<l>аак<s n="n"/></l>
<r>paper<s n="n"/></r>
</p>
</e>
...