[gcompris/gcomprixogoo] Now the descriptions of the definitions are managed individually in a python tree structure.



commit 2345329f709d54ce3954eeb08ded3fee2693d623
Author: Bruno Coudoin <bruno coudoin free fr>
Date:   Sun Oct 10 21:08:39 2010 +0200

    Now the descriptions of the definitions are managed individually in a python tree structure.

 tools/wiktio2xml/wiktio.py     |   83 +++++++++++++++++++++++++++++++---
 tools/wiktio2xml/wiktio2xml.py |   97 +++++++++++++++++----------------------
 2 files changed, 119 insertions(+), 61 deletions(-)
---
diff --git a/tools/wiktio2xml/wiktio.py b/tools/wiktio2xml/wiktio.py
index 934bfdd..617673f 100644
--- a/tools/wiktio2xml/wiktio.py
+++ b/tools/wiktio2xml/wiktio.py
@@ -22,14 +22,82 @@
 
 import os.path
 
+id = 0
+# Represents the description of a definition.
+# This is recursive: a description can hold an
+# unlimited number of sub-descriptions.
+class Description:
+
+    def __init__ (self, parent, text, level, numbered = False):
+        global id
+        self.id = id
+        id += 1
+        self.parent = parent
+        self.level = level
+        self.text = text
+        self.numbered = numbered
+        self.descriptions = []
+
+    def isEmpty(self):
+        if len(self.descriptions) > 0:
+            return False
+        return True
+
+    # Return True if this node's text field, or that of one of
+    # its children (recursively), is not empty
+    def hasContent(self):
+        if len(self.text) > 0:
+            return True
+        else:
+            for d in self.descriptions:
+                if d.hasContent():
+                    return True
+        return False
+
+    # Walk up the parent chain to find the node at the given level
+    def getNodeAtLevel(self, level):
+        if level == self.level:
+            return self
+        elif level < self.level:
+            return self.parent.getNodeAtLevel(level)
+        else:
+            return None
+
+    def addDescription(self, text, level, numbered):
+        node = self.getNodeAtLevel(level - 1)
+        if node:
+            description = Description(node, text, level, numbered)
+            node.descriptions.append( description )
+            return description
+        return None
+
+    def dump2html(self, f):
+        if len(self.text) > 0:
+            f.write ( "<li>" + self.text + "</li>" )
+        if len(self.descriptions) > 0:
+            if self.level >= 0:
+                if self.numbered:
+                    f.write ( "<ul>" )
+                else:
+                    f.write ( "<ol>" )
+            for d in self.descriptions:
+                d.dump2html(f)
+            if self.level >= 0:
+                if self.numbered:
+                    f.write ( "</ul>" )
+                else:
+                    f.write ( "</ol>" )
+
+
 class Definition:
 
     def __init__ (self):
-        self.text = ""
         self.type = ""
         self.subType = ""
         self.filtered = False
         self.gender = ""
+        self.rootDescription = Description(None, "", -1)
+        self.currentDescription = self.rootDescription
         self.synonym = []
         self.antonym = []
         self.anagram = []
@@ -39,9 +107,6 @@ class Definition:
         self.category = []
         self.image = []
 
-    def addText(self, text):
-        self.text += text
-
     def setType(self, type):
         self.type = type
 
@@ -51,6 +116,12 @@ class Definition:
     def setGender(self, gender):
         self.gender = gender
 
+    # A definition may hold several descriptions; each one can
+    # have several sub-descriptions.
+    def addDescription(self, text, level, numbered):
+        self.currentDescription = \
+            self.currentDescription.addDescription(text, level, numbered)
+
     def add(self, atype, text):
         if len(text) == 0:
             return
@@ -102,13 +173,13 @@ class Definition:
                     f.write ( s + ", " )
 
     def dump2html(self, f):
-        if self.filtered or self.text == "":
+        if self.filtered or not self.rootDescription.hasContent():
             return
         f.write ( "<h3>" + self.type + \
             " " + self.subType + \
             " " + self.gender + "</h3>" )
         self.dump2htmlImage(f)
-        f.write ( self.text )
+        self.rootDescription.dump2html(f)
 
         self.dump2htmlItem(f, "Synonymes", self.synonym)
         self.dump2htmlItem(f, "Antonymes", self.antonym)
diff --git a/tools/wiktio2xml/wiktio2xml.py b/tools/wiktio2xml/wiktio2xml.py
index 3947688..1569a50 100755
--- a/tools/wiktio2xml/wiktio2xml.py
+++ b/tools/wiktio2xml/wiktio2xml.py
@@ -46,7 +46,7 @@ class WikiHandler(ContentHandler):
         self.isTextElement = False
         self.textContent = u""
 
-        self.lilevel = []
+        self.lilevel = 0
 
     def startElement(self, name, attrs):
 
@@ -164,42 +164,30 @@ class WikiHandler(ContentHandler):
     # Notes:
     # These may be nested.
     #
-    # We keep the level of indentation to close in the stack:
+    # We keep the current level of indentation in:
     # self.lilevel
     #
+    # Returns a list [text, level, numbered]
+    # numbered = True if this is a numbered list
+    #
     def indents2xml(self, text, asText):
+        numbered = False
         result = re.search(r"^[ ]*[*#:;]+[ ]*", text)
         if not result:
-            close = ""
-            while self.lilevel:
-                close += self.lilevel.pop()
-            if not asText:
-                text = close + text
-            return text
+            self.lilevel = 0
+            return [text, self.lilevel, numbered]
 
         indent = result.group(0).rstrip()
+        self.lilevel = len(indent)
         text = text[result.end():]
 
         if asText:
-            return text
-
-        result = ""
-        # Close indents if needed
-        while len(self.lilevel) > len(indent):
-            result += self.lilevel.pop()
-
-        # Open new indents
-        # Remove the current level from it
-        indent = indent[len(self.lilevel):]
-        for char in list(indent):
-            if char in "*:;":
-                result += "<ul>"
-                self.lilevel.append("</ul>")
-            elif char == "#":
-                result += "<ol>"
-                self.lilevel.append("</ol>")
-
-        return result + "<li>" + text + "</li>"
+            return [text, self.lilevel, numbered]
+
+        if indent[-1:] == "#":
+            numbered = True
+
+        return [text, self.lilevel, numbered]
 
     # Replaces '''xx''' and ''xx'' from the given text
     # with openXml xx closeXml
@@ -216,6 +204,8 @@ class WikiHandler(ContentHandler):
         return text
 
     # Replace standard Wiki tags to XML
+    # Returns a list [text, level, numbered]
+    # numbered = True if this is a numbered list
     def wiki2xml(self, text, asText):
 
         text = re.sub(r"{{[-\)\(]}}", "", text)
@@ -224,7 +214,7 @@ class WikiHandler(ContentHandler):
         if text == "":
             return self.indents2xml(text, asText)
 
-        text = self.indents2xml(text, asText)
+        [text, level, numbered] = self.indents2xml(text, asText)
         text = re.sub(ur"{{par ext[^}]*}}", ur"(Par extension)", text)
         text = re.sub(ur"{{figuré[^}]*}}", ur"(Figuré)", text)
         text = re.sub(ur"{{w\|([^}]+)}}", ur"<i>\1</i>", text)
@@ -250,7 +240,7 @@ class WikiHandler(ContentHandler):
                 text = text[:start] + text[pipe+1:]
                 text = text.replace("]]", "", 1)
 
-        return text
+        return [text, level, numbered]
 
     # Wikipedia text content is interpreted and transformed in XML
     def parseText(self):
@@ -280,7 +270,7 @@ class WikiHandler(ContentHandler):
             concat = ""
             next = False
 
-            if debug: print "<br/>l:" + l + ":"
+            if debug: print "   l:" + l + ":"
             if re.search(r"<[^>]+$", l):
                 # Wiki uses a trick to format text area by ending in uncomplete
                 # html tags. In this case, we concat this line with the next one
@@ -296,23 +286,33 @@ class WikiHandler(ContentHandler):
                 # Get rid of non wiki tags
                 l = re.sub(r'}}[^}]+{{', r'}} {{', l)
                 state = Wiktio.DEFINITION
+
+                for wt in self.genders.keys():
+                    if re.search(wt, l):
+                        gender = self.genders[wt]
+                        definition.setGender(gender)
+                        break
+
+                for wt in self.wordSubTypes.keys():
+                    if re.search(wt, l):
+                        wordSubType = self.wordSubTypes[wt]
+                        definition.setSubType(wordSubType)
+                        break
+
+                definition.addDescription("", 0, False)
+                continue
+
             elif l == "{{-anagr-}}":
-                definition.addText(self.wiki2xml("", False))
                 state = Wiktio.ANAGRAM
             elif l == "{{-syn-}}":
-                definition.addText(self.wiki2xml("", False))
                 state = Wiktio.SYNONYM
             elif l == "{{-ant-}}":
-                definition.addText(self.wiki2xml("", False))
                 state = Wiktio.ANTONYM
             elif l == "{{-hyper-}}":
-                definition.addText(self.wiki2xml("", False))
                 state = Wiktio.HYPERONYM
             elif l == "{{-hypo-}}":
-                definition.addText(self.wiki2xml("", False))
                 state = Wiktio.HYPONYM
             elif l == "{{-pron-}}":
-                definition.addText(self.wiki2xml("", False))
                 state = Wiktio.PRON
             elif l == "{{-note-}}":
                 state = Wiktio.SKIP
@@ -331,10 +331,8 @@ class WikiHandler(ContentHandler):
             elif l == u"{{-réf-}}":
                 state = Wiktio.SKIP
             elif re.search(r"{{-.*-.*}}", l):
-                if definition.text != "":
-                    if debug: print "<br/>new definition:" + l + ":"
-                    # Force a <ul> close if needed
-                    definition.addText(self.wiki2xml("", False))
+                if not definition.rootDescription.isEmpty():
+                    if debug: print "  new definition:" + l + ":"
                     # Next definition
                     filterIndent = ""
                     definition = wiktio.Definition()
@@ -364,18 +362,6 @@ class WikiHandler(ContentHandler):
                     definition.filtered = True
                     break
 
-            for wt in self.genders.keys():
-                if re.search(wt, l):
-                    gender = self.genders[wt]
-                    definition.setGender(gender)
-                    break
-
-            for wt in self.wordSubTypes.keys():
-                if re.search(wt, l):
-                    wordSubType = self.wordSubTypes[wt]
-                    definition.setSubType(wordSubType)
-                    break
-
             if state == Wiktio.SKIP:
                 continue
 
@@ -409,7 +395,7 @@ class WikiHandler(ContentHandler):
 
             # We already found a meaning for this word, we pick
             # other senses restrictively
-            if definition.text != "":
+            if not definition.rootDescription.isEmpty():
                 for filter in self.filterSecondDefinitionType:
                     if re.search(filter, l, re.I):
                         result = re.search(r"^[ ]*[*#:;]+[ ]*", l)
@@ -429,14 +415,15 @@ class WikiHandler(ContentHandler):
                 continue
 
             if state == Wiktio.DEFINITION:
-                definition.addText(self.wiki2xml(l, False))
+                [text, level, numbered] = self.wiki2xml(l, False)
+                definition.addDescription(text, level, numbered)
             elif state == Wiktio.PRON:
                 file = re.subn(r".*audio=([^|}]+).*", r"\1", l)
                 if file[1] == 1:
                     definition.add(state, file[0])
             else:
                 if len(l) > 0:
-                    definition.add(state, self.wiki2xml(l, True))
+                    definition.add(state, self.wiki2xml(l, True)[0])
 
         return inWord
 



[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]