[gcompris/gcomprixogoo] Improved wiktio2xml. Works better now, getting close to a working version.



commit 4ba71933cd223b070bb45a4611399ae12a2c7fe2
Author: Bruno Coudoin <bruno coudoin free fr>
Date:   Thu Sep 23 01:06:39 2010 +0200

    Improved wiktio2xml. Works better now, getting close to a working version.

 tools/wiktio2xml/wiktio.py     |   33 ++++++++++++++++++++++++++++-----
 tools/wiktio2xml/wiktio2xml.py |   40 ++++++++++++++++++++++++++--------------
 2 files changed, 54 insertions(+), 19 deletions(-)
---
diff --git a/tools/wiktio2xml/wiktio.py b/tools/wiktio2xml/wiktio.py
index 8a5e582..374850c 100644
--- a/tools/wiktio2xml/wiktio.py
+++ b/tools/wiktio2xml/wiktio.py
@@ -27,13 +27,17 @@ class Definition:
 
 class Word:
 
-    def __init__ (self, word):
-        self.word = word
+    def __init__ (self):
+        self.name = None
         self.definition = []
         self.synonym = []
         self.antonym = []
         self.anagram = []
         self.prononciation = []
+        self.category = []
+
+    def setName(self, name):
+        self.name = name
 
     def addDefinition(self, definition):
         self.definition.append(definition)
@@ -50,10 +54,14 @@ class Word:
         if len(anagram):
             self.anagram.append(anagram)
 
-    def addPronociation(self, prononciation):
+    def addPrononciation(self, prononciation):
         if len(prononciation):
             self.prononciation.append(prononciation)
 
+    def addCategory(self, category):
+        if len(category):
+            self.category.append(category)
+
     def dump2htmlItem(self, title, liste):
         if len(liste):
             print "<h2>" + title + "</h2>"
@@ -62,16 +70,27 @@ class Word:
                 print "<li>" + s + "</li>"
             print "</ul>"
 
+    def dump2htmlPrononciation(self, title, liste):
+        prefix = "http://commons.wikimedia.org/wiki/File:";
+        if len(liste):
+            print "<h2>" + title + "</h2>"
+            print "<ul>"
+            for s in liste:
+                print "<li><a href=" + prefix + s + ">" \
+                    + s + "</a></li>"
+            print "</ul>"
+
     def dump2html(self):
         print "<hr></hr>"
-        print "<h1>" + self.word + "</h1>"
+        print "<h1>" + self.name + "</h1>"
         for d in self.definition:
             d.dump2html()
 
         self.dump2htmlItem("Synonym", self.synonym)
         self.dump2htmlItem("Antonym", self.antonym)
         self.dump2htmlItem("Anagram", self.anagram)
-        self.dump2htmlItem("Prononciation", self.prononciation)
+        self.dump2htmlPrononciation("Prononciation", self.prononciation)
+        self.dump2htmlItem("Category", self.category)
 
 class Wiktio:
 
@@ -84,6 +103,9 @@ class Wiktio:
     def getWords(self):
         return self.words
 
+    def sort(self):
+        self.words.sort(key=lambda word: word.name)
+
     def dumpHtmlHeader(self):
         print """
 <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
@@ -98,6 +120,7 @@ class Wiktio:
 """
     def dump2html(self):
         self.dumpHtmlHeader()
+        self.sort()
         for w in self.words:
             w.dump2html()
         self.dumpHtmlFooter()
diff --git a/tools/wiktio2xml/wiktio2xml.py b/tools/wiktio2xml/wiktio2xml.py
index 22d8813..5dca4c6 100755
--- a/tools/wiktio2xml/wiktio2xml.py
+++ b/tools/wiktio2xml/wiktio2xml.py
@@ -64,6 +64,8 @@ class WikiHandler(ContentHandler):
                                r"homosexuel",
                                r"vagin"]
 
+        self.filterDefinitionType = [ r"{{vulg[^}]+}}",
+                                      r"{{injur[^}]+}}" ]
 
     def endElement(self, name):
 
@@ -71,7 +73,7 @@ class WikiHandler(ContentHandler):
             self.isPageElement= False
             if self.titleContent in self.searchWords:
                 word = self.parseText()
-                if word:
+                if word and word.name:
                     self.wiktio.addWord(word)
 
             self.titleContent = ""
@@ -116,7 +118,8 @@ class WikiHandler(ContentHandler):
             close = ""
             while self.lilevel:
                 close += self.lilevel.pop()
-            text = close + text
+            if not asText:
+                text = close + text
             return text
 
         indent = result.group(0).rstrip()
@@ -167,7 +170,7 @@ class WikiHandler(ContentHandler):
         text = re.sub(r"\[\[\w+:\w+\]\]", "", text)
         text = re.sub(r"{{\(\|(.*)}}", r"\1", text)
         if text == "":
-            return ""
+            return self.indents2xml(text, asText)
 
         text = self.indents2xml(text, asText)
         text = re.sub(r"{{par ext[^}]+}}", r"(Par extension)", text)
@@ -202,7 +205,7 @@ class WikiHandler(ContentHandler):
 
     # Wikipedia text content is interpreted and transformed in XML
     def parseText(self):
-        inWord = None
+        inWord = wiktio.Word()
         inDefinition = None
         inAnagram = False
         inSynonym = False
@@ -211,7 +214,7 @@ class WikiHandler(ContentHandler):
         wordType = ""
         wordSubType = ""
 
-        # Append and end of text marker, it makes my life easier
+        # Append an end of text marker, it makes my life easier
         self.textContent += "\n{{-EndOfTest-}}"
 
         # Remove html comment (multilines)
@@ -220,17 +223,24 @@ class WikiHandler(ContentHandler):
 
         for l in self.textContent.splitlines():
 
+            next = False
+
             for filter in self.filterContent:
                 if re.search(filter, l, re.I):
-                    return inWord
+                    return None
+
+            for filter in self.filterDefinitionType:
+                if re.search(filter, l, re.I):
+                    next = True
+
+            if next:
+                continue
 
             # Categories
-            # print "1>" + l
-            # if re.search(r"^\[\[Cat\Sgorie:", l, re.U):
-            #     text = re.sub(r"\[\[Cat\Sgorie:(\S\s)+\]\]", r"\1", l)
-            #     print "ICI" + text
-            #     print "<category>" + text + "</category>"
-            #     continue
+            if re.match(ur"\[\[Catégorie:", l):
+                text = re.sub(ur"\[\[Catégorie:(.*)\]\]", r"\1", l)
+                inWord.addCategory(text)
+                continue
 
             # Are we still in the correct language section
             # We assume the correct language is ahead
@@ -250,6 +260,8 @@ class WikiHandler(ContentHandler):
                 if not re.search(r"{{-.*-.*}}", l):
                     inDefinition.addText(self.wiki2xml(l, False))
                 else:
+                    # Force a <ul> close if needed
+                    inDefinition.addText(self.wiki2xml("", False))
                     inWord.addDefinition(inDefinition)
                     inDefinition = None
 
@@ -278,12 +290,12 @@ class WikiHandler(ContentHandler):
                         file = l.split("=")
                         if len(file) >= 2:
                             file = file[1].replace("}}", "")
-                            inWord.addPronociation(file)
+                            inWord.addPrononciation(file)
                 else:
                     inPron = False
 
             if l.startswith("'''" + self.titleContent + "'''"):
-                inWord = wiktio.Word(self.titleContent)
+                inWord.setName(self.titleContent)
                 inDefinition = wiktio.Definition()
                 inDefinition.setType(wordType)
                 inDefinition.setSubType(wordSubType)



[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]