[gcompris/gcomprixogoo] Added support for static site export in wiktio2xml Reworked the code to have the state common to both files.



commit 60d1ca9ac029f8282672aa43ddf02bd07bcff279
Author: Bruno Coudoin <bruno coudoin free fr>
Date:   Sun Oct 3 02:56:55 2010 +0200

    Added support for static site export in wiktio2xml
    Reworked the code to have the state common to both files.

 tools/wiktio2xml/wiktio.py     |  177 +++++++++++++++++++++++----------------
 tools/wiktio2xml/wiktio2xml.py |  113 +++++++++++++-------------
 2 files changed, 161 insertions(+), 129 deletions(-)
---
diff --git a/tools/wiktio2xml/wiktio.py b/tools/wiktio2xml/wiktio.py
index f09ba98..934bfdd 100644
--- a/tools/wiktio2xml/wiktio.py
+++ b/tools/wiktio2xml/wiktio.py
@@ -20,6 +20,8 @@
 # Implementation of the wiktionary model
 # used
 
+import os.path
+
 class Definition:
 
     def __init__ (self):
@@ -49,81 +51,72 @@ class Definition:
     def setGender(self, gender):
         self.gender = gender
 
-    def addSynonym(self, synonym):
-        if len(synonym):
-            self.synonym.append(synonym)
-
-    def addAntonym(self, antonym):
-        if len(antonym):
-            self.antonym.append(antonym)
-
-    def addAnagram(self, anagram):
-        if len(anagram):
-            self.anagram.append(anagram)
-
-    def addHyperonym(self, hyperonym):
-        if len(hyperonym):
-            self.hyperonym.append(hyperonym)
-
-    def addHyponym(self, hyponym):
-        if len(hyponym):
-            self.hyponym.append(hyponym)
-
-    def addPrononciation(self, prononciation):
-        if len(prononciation):
-            self.prononciation.append(prononciation)
-
-    def addCategory(self, category):
-        if len(category):
-            self.category.append(category)
-
-    def addImage(self, image):
-        if len(image):
-            self.image.append(image)
+    def add(self, atype, text):
+        if len(text) == 0:
+            return
 
-    def dump2htmlImage(self):
+        if atype == Wiktio.ANAGRAM:
+            self.anagram.append(text)
+        elif atype == Wiktio.SYNONYM:
+            self.synonym.append(text)
+        elif atype == Wiktio.ANTONYM:
+            self.antonym.append(text)
+        elif atype == Wiktio.HYPERONYM:
+            self.hyperonym.append(text)
+        elif atype == Wiktio.HYPONYM:
+            self.hyponym.append(text)
+        elif atype == Wiktio.PRON:
+            self.prononciation.append(text)
+        elif atype == Wiktio.IMAGE:
+            self.image.append(text)
+        elif atype == Wiktio.CATEGORY:
+            self.category.append(text)
+        else:
+            print "!!ERROR!!: Type not supported"
+
+    def dump2htmlImage(self, f):
         if self.image:
             prefix = "http://fr.wiktionary.org/wiki/Fichier:";
             for img in self.image:
-                print "<a href='" + prefix + img + "'>" + \
-                    img + '</a><br/>'
+                f.write ( "<a href='" + prefix + img + "'>" + \
+                    img + '</a><br/>' )
 
-    def dump2htmlPrononciation(self, title, liste):
+    def dump2htmlPrononciation(self, f, title, liste):
         prefix = "http://commons.wikimedia.org/wiki/File:";
         if len(liste):
-            print "<h2>" + title + "</h2>"
-            print "<ul>"
+            f.write ( "<h2>" + title + "</h2>" )
+            f.write ( "<ul>" )
             for s in liste:
-                print "<li><a href='" + prefix + s + "'>" \
-                    + s + "</a></li>"
-            print "</ul>"
+                f.write ( "<li><a href='" + prefix + s + "'>" \
+                    + s + "</a></li>" )
+            f.write ( "</ul>" )
 
-    def dump2htmlItem(self, title, liste):
+    def dump2htmlItem(self, f, title, liste):
 
         if len(liste):
-            print "<h2>" + title + "</h2>"
+            f.write ( "<h2>" + title + "</h2>" )
             for s in liste:
                 if s.find(":") >= 0:
-                    print "<br/>" + s
+                    f.write ( "<br/>" + s )
                 else:
-                    print s + ", "
+                    f.write ( s + ", " )
 
-    def dump2html(self):
+    def dump2html(self, f):
         if self.filtered or self.text == "":
             return
-        print "<h3>" + self.type + \
+        f.write ( "<h3>" + self.type + \
             " " + self.subType + \
-            " " + self.gender + "</h3>"
-        self.dump2htmlImage()
-        print self.text
-
-        self.dump2htmlItem("Synonymes", self.synonym)
-        self.dump2htmlItem("Antonymes", self.antonym)
-        self.dump2htmlItem("Anagrammes", self.anagram)
-        self.dump2htmlItem("Hyperonymes", self.hyperonym)
-        self.dump2htmlItem("Hyponymes", self.hyponym)
-        self.dump2htmlPrononciation("Prononciation", self.prononciation)
-        self.dump2htmlItem(u"Catégories", self.category)
+            " " + self.gender + "</h3>" )
+        self.dump2htmlImage(f)
+        f.write ( self.text )
+
+        self.dump2htmlItem(f, "Synonymes", self.synonym)
+        self.dump2htmlItem(f, "Antonymes", self.antonym)
+        self.dump2htmlItem(f, "Anagrammes", self.anagram)
+        self.dump2htmlItem(f, "Hyperonymes", self.hyperonym)
+        self.dump2htmlItem(f, "Hyponymes", self.hyponym)
+        self.dump2htmlPrononciation(f, "Prononciation", self.prononciation)
+        self.dump2htmlItem(f, u"Catégories", self.category)
 
 class Word:
 
@@ -137,18 +130,29 @@ class Word:
     def addDefinition(self, definition):
         self.definition.append(definition)
 
-    def dump2html(self):
-        print "<hr/>"
-        print "<h1>" + self.name + "</h1>"
+    def dump2html(self, f):
+        f.write ( "<hr/>" )
+        f.write ( "<h1>" + self.name + "</h1>" )
         if not self.definition:
-            print "<h2>ERROR: NO DEFINITION</h2>"
+            f.write ( "<h2>ERROR: NO DEFINITION</h2>" )
             return
         for d in self.definition:
-            d.dump2html()
+            d.dump2html(f)
 
 
 class Wiktio:
 
+    (DEFINITION,
+     ANAGRAM,
+     SYNONYM,
+     ANTONYM,
+     HYPERONYM,
+     HYPONYM,
+     PRON,
+     IMAGE,
+     CATEGORY,
+     SKIP) = range(0, 10)
+
     def __init__ (self):
         self.words = []
 
@@ -161,21 +165,48 @@ class Wiktio:
     def sort(self):
         self.words.sort(key=lambda word: word.name.lower())
 
-    def dumpHtmlHeader(self):
-        print """
+    def dumpHtmlHeader(self, f):
+        f.write ( """
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" lang="fr" dir="ltr">
 <head>
 <title>Mini - Wiktionnaire</title>
 <meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />
-"""
-    def dumpHtmlFooter(self):
-        print """
+""")
+
+    def dumpHtmlFooter(self, f):
+        f.write ("""
 </head>
-"""
-    def dump2html(self):
-        self.dumpHtmlHeader()
+""")
+
+    # Creates a big HTML file, useful to debug
+    def dump2html(self, file):
+        with open(file, 'w') as f:
+            self.dumpHtmlHeader(f)
+            self.sort()
+            for w in self.words:
+                w.dump2html(f)
+            self.dumpHtmlFooter(f)
+
+    # Creates a static HTML site in the given directory
+    def dump2htmlSite(self, baseDir):
+        if not os.path.isdir(baseDir):
+            print "ERROR: Directory '" + baseDir + "' does not exists."
+            return
+
+        letter = "/"
         self.sort()
-        for w in self.words:
-            w.dump2html()
-        self.dumpHtmlFooter()
+        with open(baseDir + '/index.html', 'w') as f_index:
+            self.dumpHtmlHeader(f_index)
+            for w in self.words:
+                if letter[0] != w.name[0].upper():
+                    letter = w.name[0].upper()
+                    f_index.write ( "<hr/><h1>" + letter[0] + "</h1>" )
+                f_index.write ( "<a href='" + w.name + ".html'>" + w.name + "</a> " )
+                with open(baseDir + '/' + w.name + '.html', 'w') as f:
+                    self.dumpHtmlHeader(f)
+                    w.dump2html(f)
+                    self.dumpHtmlFooter(f)
+
+            self.dumpHtmlFooter(f_index)
+
diff --git a/tools/wiktio2xml/wiktio2xml.py b/tools/wiktio2xml/wiktio2xml.py
index 8ffc8e3..3947688 100755
--- a/tools/wiktio2xml/wiktio2xml.py
+++ b/tools/wiktio2xml/wiktio2xml.py
@@ -22,17 +22,21 @@ from xml.sax.handler import ContentHandler
 import sys
 import re
 
+from optparse import OptionParser
+
 import wiktio
+from wiktio import Wiktio
 
 debug = False
 
 class WikiHandler(ContentHandler):
 
-    def __init__ (self, searchWords, locale, _wiktio):
+    def __init__ (self, searchWords, locale, _wiktio, verbose):
 
         self.searchWords= searchWords;
         self.locale = locale
         self.wiktio = _wiktio
+        self.verbose = verbose
 
         self.isPageElement = False
 
@@ -95,6 +99,8 @@ class WikiHandler(ContentHandler):
         # These definitions will always be skipped
         self.filterDefinitionType = [ ur"{{vulg[^}]*}}",
                                       ur"{{injur[^}]*}}",
+                                      ur"{{sexe[^}]*}}",
+                                      ur"{{sexua[^}]*}}",
                                       ur"coït",
                                       ur"argot"]
 
@@ -201,17 +207,12 @@ class WikiHandler(ContentHandler):
         index = 0
         while index >= 0:
             index = text.find(quote)
-            if index >= 0:
+            index2 = text.find(quote, index)
+            if index >= 0 and index2 >=0:
                 text = text.replace(quote, openXml, 1)
-            else:
-                return text
-
-            index = text.find(quote)
-            if index >= 0:
                 text = text.replace(quote, closeXml, 1)
             else:
-                # Malformed statement, should fix wiktionary
-                text += closeXml
+                return text
         return text
 
     # Replace standard Wiki tags to XML
@@ -253,18 +254,11 @@ class WikiHandler(ContentHandler):
 
     # Wikipedia text content is interpreted and transformed in XML
     def parseText(self):
+        if self.verbose:
+            print "Processing " + self.titleContent
         inWord = wiktio.Word()
 
-        (DEFINITION,
-         ANAGRAM,
-         SYNONYM,
-         ANTONYM,
-         HYPERONYM,
-         HYPONYM,
-         PRON,
-         SKIP) = range(0, 8)
-
-        state = SKIP
+        state = Wiktio.SKIP
 
         wordType = ""
         wordSubType = ""
@@ -299,41 +293,43 @@ class WikiHandler(ContentHandler):
                 inWord.setName(self.titleContent)
                 # Get rid of the word, we don't want it in the definition
                 l = re.sub(r"'''.*'''[ ]*(.*)", r"\1", l)
-                state = DEFINITION
+                # Get rid of non wiki tags
+                l = re.sub(r'}}[^}]+{{', r'}} {{', l)
+                state = Wiktio.DEFINITION
             elif l == "{{-anagr-}}":
                 definition.addText(self.wiki2xml("", False))
-                state = ANAGRAM
+                state = Wiktio.ANAGRAM
             elif l == "{{-syn-}}":
                 definition.addText(self.wiki2xml("", False))
-                state = SYNONYM
+                state = Wiktio.SYNONYM
             elif l == "{{-ant-}}":
                 definition.addText(self.wiki2xml("", False))
-                state = ANTONYM
+                state = Wiktio.ANTONYM
             elif l == "{{-hyper-}}":
                 definition.addText(self.wiki2xml("", False))
-                state = HYPERONYM
+                state = Wiktio.HYPERONYM
             elif l == "{{-hypo-}}":
                 definition.addText(self.wiki2xml("", False))
-                state = HYPONYM
+                state = Wiktio.HYPONYM
             elif l == "{{-pron-}}":
                 definition.addText(self.wiki2xml("", False))
-                state = PRON
+                state = Wiktio.PRON
             elif l == "{{-note-}}":
-                state = SKIP
+                state = Wiktio.SKIP
             elif l == "{{-apr-}}":
-                state = SKIP
+                state = Wiktio.SKIP
             elif l == "{{-drv-}}":
-                state = SKIP
+                state = Wiktio.SKIP
             elif l == "{{-exp-}}":
-                state = SKIP
+                state = Wiktio.SKIP
             elif l == "{{-trad-}}":
-                state = SKIP
+                state = Wiktio.SKIP
             elif l == "{{-voc-}}":
-                state = SKIP
+                state = Wiktio.SKIP
             elif l == "{{-voir-}}":
-                state = SKIP
+                state = Wiktio.SKIP
             elif l == u"{{-réf-}}":
-                state = SKIP
+                state = Wiktio.SKIP
             elif re.search(r"{{-.*-.*}}", l):
                 if definition.text != "":
                     if debug: print "<br/>new definition:" + l + ":"
@@ -343,7 +339,7 @@ class WikiHandler(ContentHandler):
                     filterIndent = ""
                     definition = wiktio.Definition()
                     inWord.addDefinition(definition)
-                state = SKIP
+                state = Wiktio.SKIP
 
             # Are we still in the correct language section
             # We assume the correct language is ahead
@@ -354,7 +350,7 @@ class WikiHandler(ContentHandler):
             # Image
             if definition and re.match(ur"\[\[Image:", l):
                 text = re.sub(ur"\[\[Image:([^|}\]]+).*", r"\1", l)
-                definition.addImage(text)
+                definition.add(Wiktio.IMAGE, text)
                 continue
 
             for wt in self.wordTypes.keys():
@@ -380,7 +376,7 @@ class WikiHandler(ContentHandler):
                     definition.setSubType(wordSubType)
                     break
 
-            if state == SKIP:
+            if state == Wiktio.SKIP:
                 continue
 
             for filter in self.filterContent:
@@ -429,26 +425,18 @@ class WikiHandler(ContentHandler):
             # Categories
             if re.match(ur"\[\[Catégorie:", l):
                 text = re.sub(ur"\[\[Catégorie:([^|}\]]+).*", r"\1", l)
-                definition.addCategory(text)
+                definition.add(Wiktio.CATEGORY, text)
                 continue
 
-            if state == DEFINITION:
+            if state == Wiktio.DEFINITION:
                 definition.addText(self.wiki2xml(l, False))
-            elif state == ANAGRAM:
-                if len(l) > 0:
-                    definition.addAnagram(self.wiki2xml(l, True))
-            elif state == SYNONYM:
-                definition.addSynonym(self.wiki2xml(l, True))
-            elif state == ANTONYM:
-                definition.addAntonym(self.wiki2xml(l, True))
-            elif state == HYPERONYM:
-                definition.addHyperonym(self.wiki2xml(l, True))
-            elif state == HYPONYM:
-                definition.addHyponym(self.wiki2xml(l, True))
-            elif state == PRON:
+            elif state == Wiktio.PRON:
                 file = re.subn(r".*audio=([^|}]+).*", r"\1", l)
                 if file[1] == 1:
-                    definition.addPrononciation(file[0])
+                    definition.add(state, file[0])
+            else:
+                if len(l) > 0:
+                    definition.add(state, self.wiki2xml(l, True))
 
         return inWord
 
@@ -459,7 +447,18 @@ def usage():
 reload(sys)
 sys.setdefaultencoding('utf-8')
 
-if len(sys.argv) != 3:
+parser = OptionParser()
+parser.add_option("-o", "--output", dest="output",
+                  help="write result to file or directory")
+parser.add_option("-q", "--quiet",
+                  action="store_false", dest="verbose", default=True,
+                  help="don't print status messages to stdout")
+parser.add_option("-s", "--site",
+                  action="store_false", dest="site", default=False,
+                  help="Creates a web site")
+(options, args) = parser.parse_args()
+
+if len(sys.argv) < 3:
     usage()
     sys.exit()
 
@@ -474,7 +473,9 @@ f.close()
 
 _wiktio = wiktio.Wiktio()
 
-parse(wikiFile, WikiHandler(words, 'fr', _wiktio))
-
+parse(wikiFile, WikiHandler(words, 'fr', _wiktio, options.verbose))
 
-_wiktio.dump2html()
+if options.site:
+    _wiktio.dump2htmlSite(options.output)
+else:
+    _wiktio.dump2html(options.output)



[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]