[gcompris/gcomprixogoo] Improved the wiktionary parser. Now extract the type of word like noun, verb and the group for the case of verbs.
- From: Bruno Coudoin <bcoudoin src gnome org>
- To: commits-list gnome org
- Cc:
- Subject: [gcompris/gcomprixogoo] Improved the wiktionary parser. Now extract the type of word like noun, verb and the group for the case of verbs.
- Date: Sun, 19 Sep 2010 18:11:41 +0000 (UTC)
commit dfba12e3ccd2d3d2a5ecba0b956a23913a0ff566
Author: Bruno Coudoin <bruno coudoin free fr>
Date: Sat Sep 18 00:50:29 2010 +0200
Improved the wiktionary parser. It now extracts the type of each word (noun,
verb, ...) and, in the case of verbs, the group they belong to.
tools/wiktio2xml/wiktio2xml.py | 52 +++++++++++++++++++++++++++++++++++----
1 files changed, 46 insertions(+), 6 deletions(-)
---
diff --git a/tools/wiktio2xml/wiktio2xml.py b/tools/wiktio2xml/wiktio2xml.py
index 7f34155..f5274d4 100755
--- a/tools/wiktio2xml/wiktio2xml.py
+++ b/tools/wiktio2xml/wiktio2xml.py
@@ -34,6 +34,27 @@ class WikiHandler(ContentHandler):
self.isTextElement = True
self.textContent = ""
+ self.wordTypes = {
+ "{{-nom-.*}}": "noun",
+ "{{-nom-pr.*}}": "proper noun",
+ "{{-verb.*}}": "proper noun",
+ "{-pronom-.*}": "pronoun",
+ "{-verb-.*}}": "verb",
+ "{-adj-.*}}": "adjective",
+ "{-adv-.*}}": "adverb",
+ "{-art-.*}}": "article",
+ "{-conj-.*}}": "conjunction",
+ "{-prèp-.*}}": "preposition",
+ "{-post-.*}}": "postposition"
+ }
+
+ self.wordSubTypes = {
+ "{{1ergroupe}}": "1er groupe",
+ "{{2egroupe}}": "2eme groupe",
+ "{{3egroupe}}": "3eme groupe",
+ }
+
+
def endElement(self, name):
if name == 'page':
@@ -126,7 +147,7 @@ class WikiHandler(ContentHandler):
text = re.sub(r"{{w\|([^}]+)}}", r"<i>\1</i>", text)
text = re.sub(r"{{source\|([^}]+)}}", r"- (\1)", text)
- # Remove all recognized wiki tags
+ # Remove all unrecognized wiki tags
text = re.sub(r"{{[^}]+}}", "", text)
# italic
@@ -160,6 +181,11 @@ class WikiHandler(ContentHandler):
inSynonym = False
inAntonym = False
inPron = False
+ wordType = ""
+ wordSubType = ""
+
+ # Append an end-of-text marker; it makes my life easier
+ self.textContent += "\n{{-EndOfTest-}}"
for l in self.textContent.splitlines():
@@ -169,29 +195,37 @@ class WikiHandler(ContentHandler):
if lang and lang.group(1) != None and lang.group(1) != self.locale:
return
+ for wt in self.wordTypes.keys():
+ if re.search(wt, l):
+ wordType = self.wordTypes[wt]
+
+ for wt in self.wordSubTypes.keys():
+ if re.search(wt, l):
+ wordSubType = self.wordSubTypes[wt]
+
if inDefinition:
- if l != "":
+ if not re.search(r"{{-.*-}}", l):
print self.wiki2xml(l)
else:
inDefinition = False
print self.wiki2xml("</definition>")
if inAnagram:
- if l != "":
+ if not re.search(r"{{-.*-}}", l):
print self.wiki2xml(l)
else:
inAnagram = False
print self.wiki2xml("</anagram>")
if inSynonym:
- if l != "":
+ if not re.search(r"{{-.*-}}", l):
print self.wiki2xml(l)
else:
inSynonym = False
print self.wiki2xml("</synonym>")
if inAntonym:
- if l != "":
+ if not re.search(r"{{-.*-}}", l):
print self.wiki2xml(l)
else:
inAntonym = False
@@ -210,7 +244,12 @@ class WikiHandler(ContentHandler):
if l.startswith("'''" + self.titleContent + "'''"):
inDefinition = True
- print("<definition name='" + self.titleContent + "'>")
+ print("<definition name='" + self.titleContent + "'" +
+ " type='" + wordType + "'" +
+ " subtype='" + wordSubType + "'" + ">")
+ print("<h3>" + wordType + " " + wordSubType + "</h3>")
+ wordType = ""
+ wordSubType = ""
elif l == "{{-anagr-}}":
inAnagram = True
print "<h2>Anagram</h2>"
@@ -229,6 +268,7 @@ class WikiHandler(ContentHandler):
print("<prononciation>")
+
def printHtmlHeader():
print '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">'
print '<html xmlns="http://www.w3.org/1999/xhtml" lang="fr" dir="ltr">'
[Date Prev][Date Next] [Thread Prev][Thread Next] [Thread Index] [Date Index] [Author Index]