[gcompris/gcomprixogoo] Reworked the wiktionary parser to have a separate model. Much cleaner code but not yet as stable as before.



commit 6c0072246ad59ae412f57e72d6233574fdea1491
Author: Bruno Coudoin <bruno coudoin free fr>
Date:   Tue Sep 21 23:46:02 2010 +0200

    Reworked the wiktionary parser to have a separate model. Much cleaner code
    but not yet as stable as before.

 tools/wiktio2xml/wiktio.py     |  103 ++++++++++++++++++++++++++++++++++++++
 tools/wiktio2xml/wiktio2xml.py |  107 +++++++++++++++++++---------------------
 2 files changed, 153 insertions(+), 57 deletions(-)
---
diff --git a/tools/wiktio2xml/wiktio.py b/tools/wiktio2xml/wiktio.py
new file mode 100644
index 0000000..8a5e582
--- /dev/null
+++ b/tools/wiktio2xml/wiktio.py
@@ -0,0 +1,103 @@
+# -*- coding: utf-8 -*-
+
+#
+# Implementation of the wiktionary model
+# used by wiktio2xml.py to collect the parsed words and dump them as HTML
+
+class Definition:
+
+    def __init__ (self):
+        self.text = ""
+        self.type = ""
+        self.subType = ""
+
+    def addText(self, text):
+        self.text += text
+
+    def setType(self, type):
+        self.type = type
+
+    def setSubType(self, subType):
+        self.subType = subType
+
+    def dump2html(self):
+        print "<definition type='" + self.type \
+            + "' subType='" + self.subType + "'>" \
+            + self.text + "</definition>"
+
+class Word:
+
+    def __init__ (self, word):
+        self.word = word
+        self.definition = []
+        self.synonym = []
+        self.antonym = []
+        self.anagram = []
+        self.prononciation = []
+
+    def addDefinition(self, definition):
+        self.definition.append(definition)
+
+    def addSynonym(self, synonym):
+        if len(synonym):
+            self.synonym.append(synonym)
+
+    def addAntonym(self, antonym):
+        if len(antonym):
+            self.antonym.append(antonym)
+
+    def addAnagram(self, anagram):
+        if len(anagram):
+            self.anagram.append(anagram)
+
+    def addPronociation(self, prononciation):
+        if len(prononciation):
+            self.prononciation.append(prononciation)
+
+    def dump2htmlItem(self, title, liste):
+        if len(liste):
+            print "<h2>" + title + "</h2>"
+            print "<ul>"
+            for s in liste:
+                print "<li>" + s + "</li>"
+            print "</ul>"
+
+    def dump2html(self):
+        print "<hr></hr>"
+        print "<h1>" + self.word + "</h1>"
+        for d in self.definition:
+            d.dump2html()
+
+        self.dump2htmlItem("Synonym", self.synonym)
+        self.dump2htmlItem("Antonym", self.antonym)
+        self.dump2htmlItem("Anagram", self.anagram)
+        self.dump2htmlItem("Prononciation", self.prononciation)
+
+class Wiktio:
+
+    def __init__ (self):
+        self.words = []
+
+    def addWord(self, word):
+        self.words.append(word)
+
+    def getWords(self):
+        return self.words
+
+    def dumpHtmlHeader(self):
+        print """
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
+<html xmlns="http://www.w3.org/1999/xhtml" lang="fr" dir="ltr">
+<head>
+<title>Mini - Wiktionnaire</title>
+<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />
+"""
+    def dumpHtmlFooter(self):
+        print """
+</head>
+"""
+    def dump2html(self):
+        self.dumpHtmlHeader()
+        for w in self.words:
+            w.dump2html()
+        self.dumpHtmlFooter()
diff --git a/tools/wiktio2xml/wiktio2xml.py b/tools/wiktio2xml/wiktio2xml.py
index 30bc942..22d8813 100755
--- a/tools/wiktio2xml/wiktio2xml.py
+++ b/tools/wiktio2xml/wiktio2xml.py
@@ -6,12 +6,15 @@ from xml.sax.handler import ContentHandler
 import sys
 import re
 
+import wiktio
+
 class WikiHandler(ContentHandler):
 
-    def __init__ (self, searchWords, locale):
+    def __init__ (self, searchWords, locale, _wiktio):
 
         self.searchWords= searchWords;
         self.locale = locale
+        self.wiktio = _wiktio
 
         self.isPageElement = False
 
@@ -67,8 +70,9 @@ class WikiHandler(ContentHandler):
         if name == 'page':
             self.isPageElement= False
             if self.titleContent in self.searchWords:
-                print "<hr></hr><h1>" + self.titleContent + "</h1>"
-                self.parseText()
+                word = self.parseText()
+                if word:
+                    self.wiktio.addWord(word)
 
             self.titleContent = ""
             self.textContent = ""
@@ -106,7 +110,7 @@ class WikiHandler(ContentHandler):
     # We keep the level of indentation to close in the stack:
     # self.lilevel
     #
-    def indents2xml(self, text):
+    def indents2xml(self, text, asText):
         result = re.search(r"^[ ]*[*#:;]+[ ]*", text)
         if not result:
             close = ""
@@ -134,7 +138,10 @@ class WikiHandler(ContentHandler):
                 result += "<ol>"
                 self.lilevel.append("</ol>")
 
-        return result + "<li>" + text + "</li>"
+        if asText:
+            return text
+        else:
+            return result + "<li>" + text + "</li>"
 
     def quote2xml(self, quote, openXml, closeXml, text):
         index = 0
@@ -154,7 +161,7 @@ class WikiHandler(ContentHandler):
         return text
 
     # Replace standard Wiki tags to XML
-    def wiki2xml(self, text):
+    def wiki2xml(self, text, asText):
 
         text = re.sub(r"{{[-\)\(]}}", "", text)
         text = re.sub(r"\[\[\w+:\w+\]\]", "", text)
@@ -162,7 +169,7 @@ class WikiHandler(ContentHandler):
         if text == "":
             return ""
 
-        text = self.indents2xml(text)
+        text = self.indents2xml(text, asText)
         text = re.sub(r"{{par ext[^}]+}}", r"(Par extension)", text)
         text = re.sub(r"{{litt[^}]+}}", r"(Littéraire)", text)
         text = re.sub(r"{{figuré[^}]+}}", r"(Figuré)", text)
@@ -195,7 +202,8 @@ class WikiHandler(ContentHandler):
 
     # Wikipedia text content is interpreted and transformed in XML
     def parseText(self):
-        inDefinition = False
+        inWord = None
+        inDefinition = None
         inAnagram = False
         inSynonym = False
         inAntonym = False
@@ -214,13 +222,21 @@ class WikiHandler(ContentHandler):
 
             for filter in self.filterContent:
                 if re.search(filter, l, re.I):
-                    return
+                    return inWord
+
+            # Categories
+            # print "1>" + l
+            # if re.search(r"^\[\[Cat\Sgorie:", l, re.U):
+            #     text = re.sub(r"\[\[Cat\Sgorie:(\S\s)+\]\]", r"\1", l)
+            #     print "ICI" + text
+            #     print "<category>" + text + "</category>"
+            #     continue
 
             # Are we still in the correct language section
             # We assume the correct language is ahead
             lang = re.match(r"== {{=([a-z]+)=}} ==", l)
             if lang and lang.group(1) != None and lang.group(1) != self.locale:
-                return
+                return inWord
 
             for wt in self.wordTypes.keys():
                 if re.search(wt, l):
@@ -232,82 +248,57 @@ class WikiHandler(ContentHandler):
 
             if inDefinition:
                 if not re.search(r"{{-.*-.*}}", l):
-                    print self.wiki2xml(l)
+                    inDefinition.addText(self.wiki2xml(l, False))
                 else:
-                    inDefinition = False
-                    print self.wiki2xml("</definition>")
+                    inWord.addDefinition(inDefinition)
+                    inDefinition = None
 
             if inAnagram:
-                if not re.search(r"{{-.*-.*}}", l):
-                    print self.wiki2xml(l)
+                if not re.search(r"{{-.*-.*}}", l) and len(l) > 0:
+                    inWord.addAnagram(self.wiki2xml(l, True))
                 else:
                     inAnagram = False
-                    print self.wiki2xml("</anagram>")
 
             if inSynonym:
                 if not re.search(r"{{-.*-.*}}", l):
-                    print self.wiki2xml(l)
+                    inWord.addSynonym(self.wiki2xml(l, True))
                 else:
                     inSynonym = False
-                    print self.wiki2xml("</synonym>")
 
             if inAntonym:
                 if not re.search(r"{{-.*-.*}}", l):
-                    print self.wiki2xml(l)
+                    inWord.addAntonym(self.wiki2xml(l, True))
                 else:
                     inAntonym = False
-                    print self.wiki2xml("</antonym>")
 
             if inPron:
-                if l != "" and l.find(".ogg") != -1:
-                    # Search the .ogg file
-                    file = l.split("=")
-                    if len(file) >= 2:
-                        file = file[1].replace("}}", "")
-                        print "<a href=http://fr.wiktionary.org/wiki/Fichier:" \
-                            + file + ">" + file  + "</a>"
+                if not re.search(r"{{-.*-.*}}", l):
+                    if l.find(".ogg") != -1:
+                        # Search the .ogg file
+                        file = l.split("=")
+                        if len(file) >= 2:
+                            file = file[1].replace("}}", "")
+                            inWord.addPronociation(file)
                 else:
                     inPron = False
-                    print self.wiki2xml("</prononciation>")
 
             if l.startswith("'''" + self.titleContent + "'''"):
-                inDefinition = True
-                print "<h2>Definition " + self.titleContent + "</h2>"
-                print("<definition name='" + self.titleContent + "'" +
-                      " type='" + wordType + "'" +
-                      " subtype='" + wordSubType + "'" + ">")
-                print("<h3>" + wordType + " " + wordSubType + "</h3>")
+                inWord = wiktio.Word(self.titleContent)
+                inDefinition = wiktio.Definition()
+                inDefinition.setType(wordType)
+                inDefinition.setSubType(wordSubType)
                 wordType = ""
                 wordSubType = ""
             elif l == "{{-anagr-}}":
                 inAnagram = True
-                print "<h2>Anagram</h2>"
-                print("<anagram>")
             elif l == "{{-syn-}}":
                 inSynonym= True
-                print "<h2>Synonym</h2>"
-                print("<synonym>")
             elif l == "{{-ant-}}":
                 inAntonym = True
-                print "<h2>Antonym</h2>"
-                print("<antonym>")
             elif l == "{{-pron-}}":
                 inPron = True
-                print "<h2>Prononciation</h2>"
-                print("<prononciation>")
 
-
-
-def printHtmlHeader():
-    print '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">'
-    print '<html xmlns="http://www.w3.org/1999/xhtml" lang="fr" dir="ltr">'
-    print '<head>'
-    print '<title>accueil - Wiktionnaire</title>'
-    print '<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />'
-
-def printHtmlFooter():
-    print '</head>'
-    print '</html>'
+        return inWord
 
 def usage():
     print "wiki2xml.py <wiki file> <word list file>"
@@ -329,7 +320,9 @@ words = []
 words = [w.rstrip() for w in f.readlines()]
 f.close()
 
-printHtmlHeader()
-parse(wikiFile, WikiHandler(words, 'fr'))
-printHtmlFooter()
+_wiktio = wiktio.Wiktio()
+
+parse(wikiFile, WikiHandler(words, 'fr', _wiktio))
+
 
+_wiktio.dump2html()



[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]