[gcompris/gcomprixogoo] Reworked the wiktionary parser to have a separate model. Much cleaner code but not yet as stable as before.
- From: Bruno Coudoin <bcoudoin src gnome org>
- To: commits-list gnome org
- Cc:
- Subject: [gcompris/gcomprixogoo] Reworked the wiktionary parser to have a separate model. Much cleaner code but not yet as stable as before.
- Date: Tue, 21 Sep 2010 21:47:35 +0000 (UTC)
commit 6c0072246ad59ae412f57e72d6233574fdea1491
Author: Bruno Coudoin <bruno coudoin free fr>
Date: Tue Sep 21 23:46:02 2010 +0200
Reworked the wiktionary parser to have a separate model. Much cleaner code
but not yet as stable as before.
tools/wiktio2xml/wiktio.py | 103 ++++++++++++++++++++++++++++++++++++++
tools/wiktio2xml/wiktio2xml.py | 107 +++++++++++++++++++---------------------
2 files changed, 153 insertions(+), 57 deletions(-)
---
diff --git a/tools/wiktio2xml/wiktio.py b/tools/wiktio2xml/wiktio.py
new file mode 100644
index 0000000..8a5e582
--- /dev/null
+++ b/tools/wiktio2xml/wiktio.py
@@ -0,0 +1,103 @@
+# -*- coding: utf-8 -*-
+
+#
+# Implementation of the wiktionary model
+# used
+
+class Definition:
+
+ def __init__ (self):
+ self.text = ""
+ self.type = ""
+ self.subType = ""
+
+ def addText(self, text):
+ self.text += text
+
+ def setType(self, type):
+ self.type = type
+
+ def setSubType(self, subType):
+ self.subType = subType
+
+ def dump2html(self):
+ print "<definition type='" + self.type \
+ + "' subType='" + self.subType + "'>" \
+ + self.text + "</definition>"
+
+class Word:
+
+ def __init__ (self, word):
+ self.word = word
+ self.definition = []
+ self.synonym = []
+ self.antonym = []
+ self.anagram = []
+ self.prononciation = []
+
+ def addDefinition(self, definition):
+ self.definition.append(definition)
+
+ def addSynonym(self, synonym):
+ if len(synonym):
+ self.synonym.append(synonym)
+
+ def addAntonym(self, antonym):
+ if len(antonym):
+ self.antonym.append(antonym)
+
+ def addAnagram(self, anagram):
+ if len(anagram):
+ self.anagram.append(anagram)
+
+ def addPronociation(self, prononciation):
+ if len(prononciation):
+ self.prononciation.append(prononciation)
+
+ def dump2htmlItem(self, title, liste):
+ if len(liste):
+ print "<h2>" + title + "</h2>"
+ print "<ul>"
+ for s in liste:
+ print "<li>" + s + "</li>"
+ print "</ul>"
+
+ def dump2html(self):
+ print "<hr></hr>"
+ print "<h1>" + self.word + "</h1>"
+ for d in self.definition:
+ d.dump2html()
+
+ self.dump2htmlItem("Synonym", self.synonym)
+ self.dump2htmlItem("Antonym", self.antonym)
+ self.dump2htmlItem("Anagram", self.anagram)
+ self.dump2htmlItem("Prononciation", self.prononciation)
+
+class Wiktio:
+
+ def __init__ (self):
+ self.words = []
+
+ def addWord(self, word):
+ self.words.append(word)
+
+ def getWords(self):
+ return self.words
+
+ def dumpHtmlHeader(self):
+ print """
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
+<html xmlns="http://www.w3.org/1999/xhtml" lang="fr" dir="ltr">
+<head>
+<title>Mini - Wiktionnaire</title>
+<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />
+"""
+ def dumpHtmlFooter(self):
+ print """
+</head>
+"""
+ def dump2html(self):
+ self.dumpHtmlHeader()
+ for w in self.words:
+ w.dump2html()
+ self.dumpHtmlFooter()
diff --git a/tools/wiktio2xml/wiktio2xml.py b/tools/wiktio2xml/wiktio2xml.py
index 30bc942..22d8813 100755
--- a/tools/wiktio2xml/wiktio2xml.py
+++ b/tools/wiktio2xml/wiktio2xml.py
@@ -6,12 +6,15 @@ from xml.sax.handler import ContentHandler
import sys
import re
+import wiktio
+
class WikiHandler(ContentHandler):
- def __init__ (self, searchWords, locale):
+ def __init__ (self, searchWords, locale, _wiktio):
self.searchWords= searchWords;
self.locale = locale
+ self.wiktio = _wiktio
self.isPageElement = False
@@ -67,8 +70,9 @@ class WikiHandler(ContentHandler):
if name == 'page':
self.isPageElement= False
if self.titleContent in self.searchWords:
- print "<hr></hr><h1>" + self.titleContent + "</h1>"
- self.parseText()
+ word = self.parseText()
+ if word:
+ self.wiktio.addWord(word)
self.titleContent = ""
self.textContent = ""
@@ -106,7 +110,7 @@ class WikiHandler(ContentHandler):
# We keep the level of indentation to close in the stack:
# self.lilevel
#
- def indents2xml(self, text):
+ def indents2xml(self, text, asText):
result = re.search(r"^[ ]*[*#:;]+[ ]*", text)
if not result:
close = ""
@@ -134,7 +138,10 @@ class WikiHandler(ContentHandler):
result += "<ol>"
self.lilevel.append("</ol>")
- return result + "<li>" + text + "</li>"
+ if asText:
+ return text
+ else:
+ return result + "<li>" + text + "</li>"
def quote2xml(self, quote, openXml, closeXml, text):
index = 0
@@ -154,7 +161,7 @@ class WikiHandler(ContentHandler):
return text
# Replace standard Wiki tags to XML
- def wiki2xml(self, text):
+ def wiki2xml(self, text, asText):
text = re.sub(r"{{[-\)\(]}}", "", text)
text = re.sub(r"\[\[\w+:\w+\]\]", "", text)
@@ -162,7 +169,7 @@ class WikiHandler(ContentHandler):
if text == "":
return ""
- text = self.indents2xml(text)
+ text = self.indents2xml(text, asText)
text = re.sub(r"{{par ext[^}]+}}", r"(Par extension)", text)
text = re.sub(r"{{litt[^}]+}}", r"(Littéraire)", text)
text = re.sub(r"{{figuré[^}]+}}", r"(Figuré)", text)
@@ -195,7 +202,8 @@ class WikiHandler(ContentHandler):
# Wikipedia text content is interpreted and transformed in XML
def parseText(self):
- inDefinition = False
+ inWord = None
+ inDefinition = None
inAnagram = False
inSynonym = False
inAntonym = False
@@ -214,13 +222,21 @@ class WikiHandler(ContentHandler):
for filter in self.filterContent:
if re.search(filter, l, re.I):
- return
+ return inWord
+
+ # Categories
+ # print "1>" + l
+ # if re.search(r"^\[\[Cat\Sgorie:", l, re.U):
+ # text = re.sub(r"\[\[Cat\Sgorie:(\S\s)+\]\]", r"\1", l)
+ # print "ICI" + text
+ # print "<category>" + text + "</category>"
+ # continue
# Are we still in the correct language section
# We assume the correct language is ahead
lang = re.match(r"== {{=([a-z]+)=}} ==", l)
if lang and lang.group(1) != None and lang.group(1) != self.locale:
- return
+ return inWord
for wt in self.wordTypes.keys():
if re.search(wt, l):
@@ -232,82 +248,57 @@ class WikiHandler(ContentHandler):
if inDefinition:
if not re.search(r"{{-.*-.*}}", l):
- print self.wiki2xml(l)
+ inDefinition.addText(self.wiki2xml(l, False))
else:
- inDefinition = False
- print self.wiki2xml("</definition>")
+ inWord.addDefinition(inDefinition)
+ inDefinition = None
if inAnagram:
- if not re.search(r"{{-.*-.*}}", l):
- print self.wiki2xml(l)
+ if not re.search(r"{{-.*-.*}}", l) and len(l) > 0:
+ inWord.addAnagram(self.wiki2xml(l, True))
else:
inAnagram = False
- print self.wiki2xml("</anagram>")
if inSynonym:
if not re.search(r"{{-.*-.*}}", l):
- print self.wiki2xml(l)
+ inWord.addSynonym(self.wiki2xml(l, True))
else:
inSynonym = False
- print self.wiki2xml("</synonym>")
if inAntonym:
if not re.search(r"{{-.*-.*}}", l):
- print self.wiki2xml(l)
+ inWord.addAntonym(self.wiki2xml(l, True))
else:
inAntonym = False
- print self.wiki2xml("</antonym>")
if inPron:
- if l != "" and l.find(".ogg") != -1:
- # Search the .ogg file
- file = l.split("=")
- if len(file) >= 2:
- file = file[1].replace("}}", "")
- print "<a href=http://fr.wiktionary.org/wiki/Fichier:" \
- + file + ">" + file + "</a>"
+ if not re.search(r"{{-.*-.*}}", l):
+ if l.find(".ogg") != -1:
+ # Search the .ogg file
+ file = l.split("=")
+ if len(file) >= 2:
+ file = file[1].replace("}}", "")
+ inWord.addPronociation(file)
else:
inPron = False
- print self.wiki2xml("</prononciation>")
if l.startswith("'''" + self.titleContent + "'''"):
- inDefinition = True
- print "<h2>Definition " + self.titleContent + "</h2>"
- print("<definition name='" + self.titleContent + "'" +
- " type='" + wordType + "'" +
- " subtype='" + wordSubType + "'" + ">")
- print("<h3>" + wordType + " " + wordSubType + "</h3>")
+ inWord = wiktio.Word(self.titleContent)
+ inDefinition = wiktio.Definition()
+ inDefinition.setType(wordType)
+ inDefinition.setSubType(wordSubType)
wordType = ""
wordSubType = ""
elif l == "{{-anagr-}}":
inAnagram = True
- print "<h2>Anagram</h2>"
- print("<anagram>")
elif l == "{{-syn-}}":
inSynonym= True
- print "<h2>Synonym</h2>"
- print("<synonym>")
elif l == "{{-ant-}}":
inAntonym = True
- print "<h2>Antonym</h2>"
- print("<antonym>")
elif l == "{{-pron-}}":
inPron = True
- print "<h2>Prononciation</h2>"
- print("<prononciation>")
-
-
-def printHtmlHeader():
- print '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">'
- print '<html xmlns="http://www.w3.org/1999/xhtml" lang="fr" dir="ltr">'
- print '<head>'
- print '<title>accueil - Wiktionnaire</title>'
- print '<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />'
-
-def printHtmlFooter():
- print '</head>'
- print '</html>'
+ return inWord
def usage():
print "wiki2xml.py <wiki file> <word list file>"
@@ -329,7 +320,9 @@ words = []
words = [w.rstrip() for w in f.readlines()]
f.close()
-printHtmlHeader()
-parse(wikiFile, WikiHandler(words, 'fr'))
-printHtmlFooter()
+_wiktio = wiktio.Wiktio()
+
+parse(wikiFile, WikiHandler(words, 'fr', _wiktio))
+
+_wiktio.dump2html()
[Date Prev][Date Next] [Thread Prev][Thread Next] [Thread Index] [Date Index] [Author Index]