User:Wei2912

My name is Wei En and I'm currently a GCI student. My blog is at http://wei2912.github.io.

I decided to help out at Apertium because I find the work here quite interesting and I believe Apertium will benefit many. The following are projects related to Apertium.

Wiktionary Crawler

https://github.com/wei2912/WiktionaryCrawler is a crawler for Wiktionary which aims to extract data from pages. It was created for a GCI task, which you can read about at Task ideas for Google Code-in/Scrape inflection information from Wiktionary.

The crawler crawls a starting category (usually Category:XXX language) for subcategories, then crawls these subcategories for pages. It then passes each page to a language-specific parser which turns it into the Speling format.

The languages currently supported are Chinese (zh), Thai (th) and Lao (lo). You are welcome to contribute to this project.

Spaceless Segmentation

Spaceless Segmentation has been merged into Apertium under https://svn.code.sf.net/p/apertium/svn/branches/tokenisation. It tokenises languages written without any whitespace. More information can be found under Task ideas for Google Code-in/Tokenisation for spaceless orthographies.

The tokeniser looks for possible tokenisations of the corpus text and selects the tokenisation whose tokens appear most frequently in the corpus.

A report comparing the above method with LRLM and RLLM (longest left-to-right matching and longest right-to-left matching respectively) is available at https://www.dropbox.com/sh/57wtof3gbcbsl7c/AABI-Mcw2E-c942BXxsMbEAja

Conversion of PDF dictionary to lttoolbox format

NOTE: This document is a draft.

In this example we're converting the following PDF file: http://home.uchicago.edu/straughn/sakhadic.pdf
We copy the text directly from the PDF file, as PDF to text converters are currently unable to convert the text properly (thanks to the arcane PDF format).
All of this preprocessing is contained in the following script, to which we supply a filename:
<pre>
#!/bin/bash

cat $1 | perl -wpne 's/•//g; s/^\d+$//g; s/=//g; s/\; /\n/g; s/cf\./cf/;' > $1.new
</pre>
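Assuming the copied text was saved as sakhadic.txt (a hypothetical filename, as is the script name preprocess.sh), the script is run like this and writes its output to sakhadic.txt.new:

<pre>
$ ./preprocess.sh sakhadic.txt
</pre>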
After the preprocessing, we get the following file:

<pre>
... blank lines omitted ...
аа exc. Oh! See!
ааҕыс v. to reckon with
аайы a. each, every
күн аайы every day
...
</pre>
The blank lines weren't removed, so that you can tell when a page starts and ends, and hence coordinate the manual processing with the dictionary.
Unfortunately for us, our preprocessor replaces "; " with "\n" in order to get a list of words separated by newlines. Definitions may be separated by "; " too, or spread over onto the next line. Hence, we'll need to merge these lines back together to get the same format as the dictionary.
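As a minimal sketch of that merging idea (this is a simplification; the preprocess() function in the full script below also handles the lone-headword case and splits semicolon-separated senses):

<pre>
# Sketch: glue continuation lines onto the entry they belong to.
# An entry starts with a Cyrillic headword; a definition that spilled
# over starts with Latin text instead.
def is_cyrillic(word):
    cyr = sum(1 for c in word if 0x0400 <= ord(c) <= 0x04FF)
    return cyr > len(word) - cyr

def merge_lines(lines):
    merged = []
    for line in lines:
        words = line.split()
        if not words:
            continue
        if merged and not is_cyrillic(words[0]):
            merged[-1] += " " + line  # continuation of the previous entry
        else:
            merged.append(line)
    return merged
</pre>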
Some entries have several senses with different parts of speech. To handle this, we copy over the original word to create a new entry. This:
<pre>
албас a. cunning; n. trick, ruse
</pre>
becomes
<pre>
албас a. cunning
албас n. trick, ruse
</pre>
The good part about this is that these extra senses are also separated by "; ", and so will be placed on a new line after the preprocessing, which makes it easy to spot the lines where we need to handle this.
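Since, after preprocessing, those extra senses sit on lines beginning with an abbreviation rather than a Cyrillic headword, something like the following (again with the hypothetical filename from above) should list the lines that need the headword copied over:

<pre>
$ grep -n '^[a-z]*\.' sakhadic.txt.new
</pre>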
The final format for each entry looks similar to this:
<pre>
word1, word2 abbrv1. abbrv2. abbrv3. definition1, definition2, definition3; definition4
</pre>
Words and definitions are separated by either commas or semicolons. Abbreviations are separated by whitespace and indicated with a trailing ".".
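To make the format concrete, here is a toy decomposition of such a line (hypothetical tokens; the script's Entry class below does this against the real abbreviation table):

<pre>
import re

SPLIT_RE = re.compile(r'[;,]\s+')

line = "word1, word2 abbrv1. abbrv2. definition1, definition2; definition3"

# Tokens ending in "." are the abbreviations; the headwords come before
# them and the definitions after.
tokens = line.split()
first = next(i for i, t in enumerate(tokens) if t.endswith("."))
last = max(i for i, t in enumerate(tokens) if t.endswith("."))

words = SPLIT_RE.split(" ".join(tokens[:first]))       # ['word1', 'word2']
abbrvs = tokens[first:last + 1]                        # ['abbrv1.', 'abbrv2.']
meanings = SPLIT_RE.split(" ".join(tokens[last + 1:]))
# ['definition1', 'definition2', 'definition3']
</pre>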
We pass the filename of our dictionary file to this script:

<pre>
#!/usr/bin/python3
# -*- coding: utf-8 -*-

import fileinput
import itertools
import re
import xml.etree.cElementTree as ET

BRACKETS_RE = re.compile(r'(\(.+?\)|\[.+?\])')
PAGENUMBER_RE = re.compile(r'^\d+$')
SPLIT_RE = re.compile(r'[;,]\s+')

ABBRVS = {
    'a.': ['adj'],
    'adv.': ['adv'],
    # arch. archaic
    # cf. see also
    # comp. computer-related
    # conv. converb, modifying verb
    # dial. dialect
    'det.': ['det'],
    # Evk. Evenki
    'exc.': ['ij'],
    'int.': ['itg'],
    # Mongo. Mongolian
    'n.': ['n'],
    'num.': ['det', 'qnt'],
    # ono. onomatopoeia
    'pl.': ['pl'],
    'pp.': ['post'],
    'pro.': ['prn'],
    # Russ. Russian
    'v.': ['v', 'TD']
}

def insert_blanks(element, line):
    # Write a multi-word string into an element, joining the words
    # with <b/> (blank) elements as lttoolbox expects.
    words = line.split()
    if not words:
        return

    element.text = words[0]
    element.tail = None

    for word in words[1:]:
        blank = ET.SubElement(element, 'b')
        blank.tail = word

def is_page_num(line):
    return PAGENUMBER_RE.match(line)

def strip_brackets(line):
    # Remove any (...) or [...] groups, e.g. usage notes.
    brackets = BRACKETS_RE.search(line)
    if brackets:
        for bracket in brackets.groups():
            line = line.replace(bracket, "")
    return line

class Entry(object):
    def __split(self, line):
        return SPLIT_RE.split(line)

    def __init__(self, line):
        tags = line.split()

        self.words = []
        self.abbrvs = []
        self.meanings = []

        found_abbrv = False
        found_conv = False
        for tag in tags:
            if tag in ABBRVS.keys(): # abbreviations
                found_abbrv = True
                self.abbrvs.extend(ABBRVS[tag])
                continue
            elif tag == "conv.":
                found_abbrv = True
                found_conv = True
                self.abbrvs.append("vaux")
                continue

            if not found_abbrv: # headwords
                self.words.append(tag)
            else: # translations
                self.meanings.append(tag)

        # if "cf" appears in the headwords, trim it and everything after it
        for i, word in enumerate(self.words):
            if word == "cf":
                self.words = self.words[:i]
                break

        # if there's a converb, just look at the last word
        if found_conv:
            self.words = self.words[-1]
        else:
            self.words = " ".join(self.words)
        self.meanings = " ".join(self.meanings)

        self.words = strip_brackets(self.words)
        self.meanings = strip_brackets(self.meanings)

        # lines without any abbreviation can't be parsed as entries;
        # they survive only as comments in the output
        if not self.abbrvs:
            self.words = None
            self.abbrvs = None
            self.meanings = None
            return

        # strip the English infinitive marker "to" from meanings
        # (word-boundary match, so words like "stone" are untouched)
        self.meanings = re.sub(r'\bto\b ', '', self.meanings)

        # split up headwords and meanings
        self.words = [x.strip() for x in self.__split(self.words)]
        self.meanings = [x.strip() for x in self.__split(self.meanings)]

def is_cyrillic(word):
    # A word counts as Cyrillic if most of its characters fall in the
    # Cyrillic block (U+0400 to U+04FF).
    num_cyrillic = 0
    num_non_cyrillic = 0
    for c in word:
        if 0x0400 <= ord(c) <= 0x04FF:
            num_cyrillic += 1
        else:
            num_non_cyrillic += 1
    return num_cyrillic > num_non_cyrillic

def preprocess(lines):
    new_lines = []
    for i, line in enumerate(lines):
        line = line.strip()
        line = line.replace("•", "")
        line = line.replace("=", "")
        line = line.replace("cf.", "cf")
        if not line or is_page_num(line):
            continue

        words = line.split()

        # merge the next line into this one if this line is a lone
        # headword or the next line doesn't start with a Cyrillic word
        if i+1 < len(lines) and lines[i+1].split():
            if (len(words) == 1 or
                    not is_cyrillic(lines[i+1].split()[0])):
                lines[i+1] = line + " " + lines[i+1]
                continue

        orig_word = ""
        for j, word in enumerate(words):
            if j+1 >= len(words):
                continue
            next_word = words[j+1]

            if word.endswith("."):
                orig_word = " ".join(words[:j])

            if word.endswith(";"):
                # if the semicolon separates dictionary entries
                if is_cyrillic(next_word):
                    words[j] = word.replace(";", "")
                    line = " ".join(words[:j+1])
                    next_line = " ".join(words[j+1:])
                    lines.insert(i+1, next_line)
                    break
                # if the semicolon separates senses, copy the headword
                # over to the new entry
                elif next_word.endswith("."):
                    words[j] = word.replace(";", "")
                    line = " ".join(words[:j+1])
                    next_line = orig_word + " " + " ".join(words[j+1:])
                    lines.insert(i+1, next_line)
                    break

        line = line.strip()
        if line:
            new_lines.append(line)
    return new_lines

def main():
    dictionary = ET.Element("dictionary")
    pardefs = ET.SubElement(dictionary, "pardefs")

    lines = list(fileinput.input())
    new_lines = preprocess(lines)

    for line in new_lines:
        # keep the original line as a comment for manual checking
        comment = ET.Comment(text=line)
        pardefs.append(comment)

        entry = Entry(line)
        if not (entry.words and entry.abbrvs and entry.meanings):
            continue

        # one <e> element per (headword, meaning) pair
        for word, meaning in itertools.product(entry.words, entry.meanings):
            e = ET.SubElement(pardefs, "e")
            e.set('r', 'LR')
            p = ET.SubElement(e, 'p')

            # add word and meaning
            left = ET.SubElement(p, 'l')
            insert_blanks(left, word)
            right = ET.SubElement(p, 'r')
            insert_blanks(right, meaning)

            # add abbreviations
            for abbrv in entry.abbrvs:
                s = ET.Element('s')
                s.set('n', abbrv)
                left.append(s)
                right.append(s)

    ET.dump(dictionary)

if __name__ == '__main__':
    main()
</pre>
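The script reads the files named on the command line via fileinput and writes the XML to standard output, so a run looks like this (convert.py being a hypothetical name for the script above):

<pre>
$ python3 convert.py sakhadic.txt.new > file.xml
</pre>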
This will give us an XML dump of the dictionary, converted to the lttoolbox format. We format the XML file as shown here:
<pre>
$ xmllint --format --encode utf8 file.xml > file.dix
</pre>
The `--encode utf8` option prevents `xmllint` from escaping our unicode.
The final file format looks like this:
<pre>
<?xml version="1.0" encoding="utf-8"?>
<dictionary>
  <pardefs>
    <!--аа exc. Oh! See!-->
    <e r="LR">
      <p>
        <l>аа<s n="ij"/></l>
        <r>Oh!<b/>See!<s n="ij"/></r>
      </p>
    </e>
    <!--ааҕыс v. to reckon with-->
    <e r="LR">
      <p>
        <l>ааҕыс<s n="v"/><s n="TD"/></l>
        <r>reckon<b/>with<s n="v"/><s n="TD"/></r>
      </p>
    </e>
    <!--аайы a. each, every-->
    <e r="LR">
      <p>
        <l>аайы<s n="adj"/></l>
        <r>each<s n="adj"/></r>
      </p>
    </e>
    <e r="LR">
      <p>
        <l>аайы<s n="adj"/></l>
        <r>every<s n="adj"/></r>
      </p>
    </e>
    <!--күн аайы every day-->
    <!--аак cf аах n. document, paper-->
    <e r="LR">
      <p>
        <l>аак<s n="n"/></l>
        <r>document<s n="n"/></r>
      </p>
    </e>
    <e r="LR">
      <p>
        <l>аак<s n="n"/></l>
        <r>paper<s n="n"/></r>
      </p>
    </e>
    ...
</pre>
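Once the remaining entries have been cleaned up by hand, the resulting .dix should in principle compile with lttoolbox as usual:

<pre>
$ lt-comp lr file.dix file.bin
</pre>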