[gcompris/gcomprixogoo] Improved the wiktionary parser. Now extract the type of word like noun, verb and the group for the case of verbs.

From: Bruno Coudoin <bcoudoin src gnome org>
To: commits-list gnome org
Cc:
Subject: [gcompris/gcomprixogoo] Improved the wiktionary parser. Now extract the type of word like noun, verb and the group for the case of verbs.
Date: Sun, 19 Sep 2010 18:11:41 +0000 (UTC)
commit dfba12e3ccd2d3d2a5ecba0b956a23913a0ff566
Author: Bruno Coudoin <bruno coudoin free fr>
Date:   Sat Sep 18 00:50:29 2010 +0200

    Improved the wiktionary parser. Now extract the type of word like noun, verb
    and the group for the case of verbs.

 tools/wiktio2xml/wiktio2xml.py |   52 +++++++++++++++++++++++++++++++++++----
 1 files changed, 46 insertions(+), 6 deletions(-)
---
diff --git a/tools/wiktio2xml/wiktio2xml.py b/tools/wiktio2xml/wiktio2xml.py
index 7f34155..f5274d4 100755
--- a/tools/wiktio2xml/wiktio2xml.py
+++ b/tools/wiktio2xml/wiktio2xml.py
@@ -34,6 +34,27 @@ class WikiHandler(ContentHandler):
             self.isTextElement = True
             self.textContent = ""
 
+        self.wordTypes = {
+            "{{-nom-.*}}": "noun",
+            "{{-nom-pr.*}}": "proper noun",
+            "{{-verb.*}}": "proper noun",
+            "{-pronom-.*}": "pronoun",
+            "{-verb-.*}}": "verb",
+            "{-adj-.*}}": "adjective",
+            "{-adv-.*}}": "adverb",
+            "{-art-.*}}": "article",
+            "{-conj-.*}}": "conjunction",
+            "{-prèp-.*}}": "preposition",
+            "{-post-.*}}": "postposition"
+            }
+
+        self.wordSubTypes = {
+            "{{1ergroupe}}": "1er groupe",
+            "{{2egroupe}}": "2eme groupe",
+            "{{3egroupe}}": "3eme groupe",
+            }
+
+
     def endElement(self, name):
 
         if name == 'page':
@@ -126,7 +147,7 @@ class WikiHandler(ContentHandler):
         text = re.sub(r"{{w\|([^}]+)}}", r"<i>\1</i>", text)
         text = re.sub(r"{{source\|([^}]+)}}", r"- (\1)", text)
 
-        # Remove all recognized wiki tags
+        # Remove all unrecognized wiki tags
         text = re.sub(r"{{[^}]+}}", "", text)
 
         # italic
@@ -160,6 +181,11 @@ class WikiHandler(ContentHandler):
         inSynonym = False
         inAntonym = False
         inPron = False
+        wordType = ""
+        wordSubType = ""
+
+        # Append an end of text marker, it makes my life easier
+        self.textContent += "\n{{-EndOfTest-}}"
 
         for l in self.textContent.splitlines():
 
@@ -169,29 +195,37 @@ class WikiHandler(ContentHandler):
             if lang and lang.group(1) != None and lang.group(1) != self.locale:
                 return
 
+            for wt in self.wordTypes.keys():
+                if re.search(wt, l):
+                    wordType = self.wordTypes[wt]
+
+            for wt in self.wordSubTypes.keys():
+                if re.search(wt, l):
+                    wordSubType = self.wordSubTypes[wt]
+
             if inDefinition:
-                if l != "":
+                if not re.search(r"{{-.*-}}", l):
                     print self.wiki2xml(l)
                 else:
                     inDefinition = False
                     print self.wiki2xml("</definition>")
 
             if inAnagram:
-                if l != "":
+                if not re.search(r"{{-.*-}}", l):
                     print self.wiki2xml(l)
                 else:
                     inAnagram = False
                     print self.wiki2xml("</anagram>")
 
             if inSynonym:
-                if l != "":
+                if not re.search(r"{{-.*-}}", l):
                     print self.wiki2xml(l)
                 else:
                     inSynonym = False
                     print self.wiki2xml("</synonym>")
 
             if inAntonym:
-                if l != "":
+                if not re.search(r"{{-.*-}}", l):
                     print self.wiki2xml(l)
                 else:
                     inAntonym = False
@@ -210,7 +244,12 @@ class WikiHandler(ContentHandler):
 
             if l.startswith("'''" + self.titleContent + "'''"):
                 inDefinition = True
-                print("<definition name='" + self.titleContent + "'>")
+                print("<definition name='" + self.titleContent + "'" +
+                      " type='" + wordType + "'" +
+                      " subtype='" + wordSubType + "'" + ">")
+                print("<h3>" + wordType + " " + wordSubType + "</h3>")
+                wordType = ""
+                wordSubType = ""
             elif l == "{{-anagr-}}":
                 inAnagram = True
                 print "<h2>Anagram</h2>"
@@ -229,6 +268,7 @@ class WikiHandler(ContentHandler):
                 print("<prononciation>")
 
 
+
 def printHtmlHeader():
     print '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">'
     print '<html xmlns="http://www.w3.org/1999/xhtml" lang="fr" dir="ltr">'
[Date Prev][Date Next] [Thread Prev][Thread Next] [Thread Index] [Date Index] [Author Index]