[gcompris/gcomprixogoo] improved wiktionary parsing.



commit 9fdb76780db0c526715acfa146c2168b908f4427
Author: Bruno Coudoin <bruno coudoin free fr>
Date:   Sun Sep 26 23:28:09 2010 +0200

    improved wiktionary parsing.

 tools/wiktio2xml/fr_words.txt  |    5 +-
 tools/wiktio2xml/wiktio.py     |    9 +++-
 tools/wiktio2xml/wiktio2xml.py |   88 +++++++++++++++++++++++++++++++++------
 3 files changed, 82 insertions(+), 20 deletions(-)
---
diff --git a/tools/wiktio2xml/fr_words.txt b/tools/wiktio2xml/fr_words.txt
index 3768232..8d44af9 100644
--- a/tools/wiktio2xml/fr_words.txt
+++ b/tools/wiktio2xml/fr_words.txt
@@ -648,7 +648,7 @@ rouler
 tendre
 transporter
 voler
-abîmé
+abîmer
 ancien
 blanc
 bleu
@@ -865,7 +865,6 @@ siffler
 surveiller
 traîner
 trouver
-caché
 fou
 méchant
 gâteau
@@ -924,7 +923,7 @@ remercier
 remuer
 souhaiter
 sucer
-barbouillé
+barbouiller
 demi
 égal
 entier
diff --git a/tools/wiktio2xml/wiktio.py b/tools/wiktio2xml/wiktio.py
index 8331e9b..f09ba98 100644
--- a/tools/wiktio2xml/wiktio.py
+++ b/tools/wiktio2xml/wiktio.py
@@ -94,7 +94,7 @@ class Definition:
             print "<h2>" + title + "</h2>"
             print "<ul>"
             for s in liste:
-                print "<li><a href=" + prefix + s + ">" \
+                print "<li><a href='" + prefix + s + "'>" \
                     + s + "</a></li>"
             print "</ul>"
 
@@ -109,7 +109,7 @@ class Definition:
                     print s + ", "
 
     def dump2html(self):
-        if self.filtered:
+        if self.filtered or self.text == "":
             return
         print "<h3>" + self.type + \
             " " + self.subType + \
@@ -138,8 +138,11 @@ class Word:
         self.definition.append(definition)
 
     def dump2html(self):
-        print "<hr></hr>"
+        print "<hr/>"
         print "<h1>" + self.name + "</h1>"
+        if not self.definition:
+            print "<h2>ERROR: NO DEFINITION</h2>"
+            return
         for d in self.definition:
             d.dump2html()
 
diff --git a/tools/wiktio2xml/wiktio2xml.py b/tools/wiktio2xml/wiktio2xml.py
index c93b935..8ffc8e3 100755
--- a/tools/wiktio2xml/wiktio2xml.py
+++ b/tools/wiktio2xml/wiktio2xml.py
@@ -24,6 +24,8 @@ import re
 
 import wiktio
 
+debug = False
+
 class WikiHandler(ContentHandler):
 
     def __init__ (self, searchWords, locale, _wiktio):
@@ -90,14 +92,29 @@ class WikiHandler(ContentHandler):
                                ur"homosexuel",
                                ur"vagin"]
 
+        # These definitions will always be skipped
         self.filterDefinitionType = [ ur"{{vulg[^}]*}}",
                                       ur"{{injur[^}]*}}",
+                                      ur"coït",
+                                      ur"argot"]
+
+        # These definitions are skipped unless they are the first
+        # sense found for the word
+        self.filterSecondDefinitionType = [
                                       ur"{{dés[^}]*}}",
                                       ur"{{vx[^}]*}}",
                                       ur"{{métonymie[^}]*}}",
                                       ur"{{familier[^}]*}}",
-                                      ur"coït",
-                                      ur"argot"]
+                                      ur"{{hérald[^}]*}}",
+                                      ur"{{botan[^}]*}}",
+                                      ur"{{zool[^}]*}}",
+                                      ur"{{polit[^}]*}",
+                                      ur"{{péj[^}]*}}",
+                                      ur"{{oeno[^}]*}}",
+                                      ur"{{litt[^}]*}}",
+#                                      ur"{{par ext[^}]*}}",
+                                      ur"{{figuré[^}]*}}"
+                                      ]
 
     def endElement(self, name):
 
@@ -157,6 +174,9 @@ class WikiHandler(ContentHandler):
         indent = result.group(0).rstrip()
         text = text[result.end():]
 
+        if asText:
+            return text
+
         result = ""
         # Close indents if needed
         while len(self.lilevel) > len(indent):
@@ -173,10 +193,7 @@ class WikiHandler(ContentHandler):
                 result += "<ol>"
                 self.lilevel.append("</ol>")
 
-        if asText:
-            return text
-        else:
-            return result + "<li>" + text + "</li>"
+        return result + "<li>" + text + "</li>"
 
     # Replaces '''xx''' and ''xx'' from the given text
     # with openXml xx closeXml
@@ -208,7 +225,6 @@ class WikiHandler(ContentHandler):
 
         text = self.indents2xml(text, asText)
         text = re.sub(ur"{{par ext[^}]*}}", ur"(Par extension)", text)
-        text = re.sub(ur"{{litt[^}]*}}", ur"(Littéraire)", text)
         text = re.sub(ur"{{figuré[^}]*}}", ur"(Figuré)", text)
         text = re.sub(ur"{{w\|([^}]+)}}", ur"<i>\1</i>", text)
         text = re.sub(ur"{{source\|([^}]+)}}", ur"- (\1)", text)
@@ -238,7 +254,6 @@ class WikiHandler(ContentHandler):
     # Wikipedia text content is interpreted and transformed in XML
     def parseText(self):
         inWord = wiktio.Word()
-        definition = None
 
         (DEFINITION,
          ANAGRAM,
@@ -256,7 +271,7 @@ class WikiHandler(ContentHandler):
         filterIndent = ""
         gender = ""
 
-        # Append an end of text marker, it makes my life easier
+        # Append an end-of-text marker; it forces the end of the definition
         self.textContent += "\n{{-EndOfTest-}}"
 
         # Remove html comment (multilines)
@@ -264,11 +279,14 @@ class WikiHandler(ContentHandler):
                                   self.textContent, re.M)
 
         definition = wiktio.Definition()
+        inWord.addDefinition(definition)
         concat = ""
         for l in self.textContent.splitlines():
             l = concat + l
+            concat = ""
             next = False
 
+            if debug: print "<br/>l:" + l + ":"
             if re.search(r"<[^>]+$", l):
                 # Wiki uses a trick to format text area by ending in uncomplete
                 # html tags. In this case, we concat this line with the next one
@@ -280,27 +298,51 @@ class WikiHandler(ContentHandler):
             if l.startswith("'''" + self.titleContent + "'''"):
                 inWord.setName(self.titleContent)
                 # Get rid of the word, we don't want it in the definition
-                l = re.sub(r"'''.*'''(.*)", r"\1", l)
+                l = re.sub(r"'''.*'''[ ]*(.*)", r"\1", l)
                 state = DEFINITION
             elif l == "{{-anagr-}}":
+                definition.addText(self.wiki2xml("", False))
                 state = ANAGRAM
             elif l == "{{-syn-}}":
+                definition.addText(self.wiki2xml("", False))
                 state = SYNONYM
             elif l == "{{-ant-}}":
+                definition.addText(self.wiki2xml("", False))
                 state = ANTONYM
             elif l == "{{-hyper-}}":
+                definition.addText(self.wiki2xml("", False))
                 state = HYPERONYM
             elif l == "{{-hypo-}}":
+                definition.addText(self.wiki2xml("", False))
                 state = HYPONYM
             elif l == "{{-pron-}}":
+                definition.addText(self.wiki2xml("", False))
                 state = PRON
+            elif l == "{{-note-}}":
+                state = SKIP
+            elif l == "{{-apr-}}":
+                state = SKIP
+            elif l == "{{-drv-}}":
+                state = SKIP
+            elif l == "{{-exp-}}":
+                state = SKIP
+            elif l == "{{-trad-}}":
+                state = SKIP
+            elif l == "{{-voc-}}":
+                state = SKIP
+            elif l == "{{-voir-}}":
+                state = SKIP
+            elif l == u"{{-réf-}}":
+                state = SKIP
             elif re.search(r"{{-.*-.*}}", l):
                 if definition.text != "":
+                    if debug: print "<br/>new definition:" + l + ":"
                     # Force a <ul> close if needed
                     definition.addText(self.wiki2xml("", False))
-                    inWord.addDefinition(definition)
                     # Next definition
+                    filterIndent = ""
                     definition = wiktio.Definition()
+                    inWord.addDefinition(definition)
                 state = SKIP
 
             # Are we still in the correct language section
@@ -319,28 +361,32 @@ class WikiHandler(ContentHandler):
                 if re.search(wt, l):
                     wordType = self.wordTypes[wt]
                     definition.setType(wordType)
+                    break
 
             for wt in self.wordSkipTypes:
                 if re.search(wt, l):
                     definition.filtered = True
+                    break
 
             for wt in self.genders.keys():
                 if re.search(wt, l):
                     gender = self.genders[wt]
                     definition.setGender(gender)
+                    break
 
             for wt in self.wordSubTypes.keys():
                 if re.search(wt, l):
                     wordSubType = self.wordSubTypes[wt]
                     definition.setSubType(wordSubType)
+                    break
 
             if state == SKIP:
                 continue
 
             for filter in self.filterContent:
                 if re.search(filter, l, re.I):
-                    if definition:
-                        definition.filtered = True
+                    definition.filtered = True
+                    break
 
             if filterIndent != "":
                 # We are filtering, check this line is
@@ -355,6 +401,7 @@ class WikiHandler(ContentHandler):
                     filterIndent = ""
 
 
+            # Filter meanings having really bad words in them
             for filter in self.filterDefinitionType:
                 if re.search(filter, l, re.I):
                     result = re.search(r"^[ ]*[*#:;]+[ ]*", l)
@@ -362,12 +409,25 @@ class WikiHandler(ContentHandler):
                         # Keep the indent level for which we filter
                         filterIndent = result.group(0).rstrip()
                     next = True
+                    break
+
+            # We already found a meaning for this word, so we are
+            # more restrictive about which other senses we keep
+            if definition.text != "":
+                for filter in self.filterSecondDefinitionType:
+                    if re.search(filter, l, re.I):
+                        result = re.search(r"^[ ]*[*#:;]+[ ]*", l)
+                        if result:
+                            # Keep the indent level for which we filter
+                            filterIndent = result.group(0).rstrip()
+                        next = True
+                        break
 
             if next:
                 continue
 
             # Categories
-            if definition and re.match(ur"\[\[Catégorie:", l):
+            if re.match(ur"\[\[Catégorie:", l):
                 text = re.sub(ur"\[\[Catégorie:([^|}\]]+).*", r"\1", l)
                 definition.addCategory(text)
                 continue



[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]