[gcompris/gcomprixogoo] improved wiktio2xml. works better now, getting close to a working version.
- From: Bruno Coudoin <bcoudoin src gnome org>
- To: commits-list gnome org
- Cc:
- Subject: [gcompris/gcomprixogoo] improved wiktio2xml. works better now, getting close to a working version.
- Date: Wed, 22 Sep 2010 23:08:03 +0000 (UTC)
commit 4ba71933cd223b070bb45a4611399ae12a2c7fe2
Author: Bruno Coudoin <bruno coudoin free fr>
Date: Thu Sep 23 01:06:39 2010 +0200
improved wiktio2xml. works better now, getting close to a working version.
tools/wiktio2xml/wiktio.py | 33 ++++++++++++++++++++++++++++-----
tools/wiktio2xml/wiktio2xml.py | 40 ++++++++++++++++++++++++++--------------
2 files changed, 54 insertions(+), 19 deletions(-)
---
diff --git a/tools/wiktio2xml/wiktio.py b/tools/wiktio2xml/wiktio.py
index 8a5e582..374850c 100644
--- a/tools/wiktio2xml/wiktio.py
+++ b/tools/wiktio2xml/wiktio.py
@@ -27,13 +27,17 @@ class Definition:
class Word:
- def __init__ (self, word):
- self.word = word
+ def __init__ (self):
+ self.name = None
self.definition = []
self.synonym = []
self.antonym = []
self.anagram = []
self.prononciation = []
+ self.category = []
+
+ def setName(self, name):
+ self.name = name
def addDefinition(self, definition):
self.definition.append(definition)
@@ -50,10 +54,14 @@ class Word:
if len(anagram):
self.anagram.append(anagram)
- def addPronociation(self, prononciation):
+ def addPrononciation(self, prononciation):
if len(prononciation):
self.prononciation.append(prononciation)
+ def addCategory(self, category):
+ if len(category):
+ self.category.append(category)
+
def dump2htmlItem(self, title, liste):
if len(liste):
print "<h2>" + title + "</h2>"
@@ -62,16 +70,27 @@ class Word:
print "<li>" + s + "</li>"
print "</ul>"
+ def dump2htmlPrononciation(self, title, liste):
+ prefix = "http://commons.wikimedia.org/wiki/File:"
+ if len(liste):
+ print "<h2>" + title + "</h2>"
+ print "<ul>"
+ for s in liste:
+ print "<li><a href=" + prefix + s + ">" \
+ + s + "</a></li>"
+ print "</ul>"
+
def dump2html(self):
print "<hr></hr>"
- print "<h1>" + self.word + "</h1>"
+ print "<h1>" + self.name + "</h1>"
for d in self.definition:
d.dump2html()
self.dump2htmlItem("Synonym", self.synonym)
self.dump2htmlItem("Antonym", self.antonym)
self.dump2htmlItem("Anagram", self.anagram)
- self.dump2htmlItem("Prononciation", self.prononciation)
+ self.dump2htmlPrononciation("Prononciation", self.prononciation)
+ self.dump2htmlItem("Category", self.category)
class Wiktio:
@@ -84,6 +103,9 @@ class Wiktio:
def getWords(self):
return self.words
+ def sort(self):
+ self.words.sort(key=lambda word: word.name)
+
def dumpHtmlHeader(self):
print """
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
@@ -98,6 +120,7 @@ class Wiktio:
"""
def dump2html(self):
self.dumpHtmlHeader()
+ self.sort()
for w in self.words:
w.dump2html()
self.dumpHtmlFooter()
diff --git a/tools/wiktio2xml/wiktio2xml.py b/tools/wiktio2xml/wiktio2xml.py
index 22d8813..5dca4c6 100755
--- a/tools/wiktio2xml/wiktio2xml.py
+++ b/tools/wiktio2xml/wiktio2xml.py
@@ -64,6 +64,8 @@ class WikiHandler(ContentHandler):
r"homosexuel",
r"vagin"]
+ self.filterDefinitionType = [ r"{{vulg[^}]+}}",
+ r"{{injur[^}]+}}" ]
def endElement(self, name):
@@ -71,7 +73,7 @@ class WikiHandler(ContentHandler):
self.isPageElement= False
if self.titleContent in self.searchWords:
word = self.parseText()
- if word:
+ if word and word.name:
self.wiktio.addWord(word)
self.titleContent = ""
@@ -116,7 +118,8 @@ class WikiHandler(ContentHandler):
close = ""
while self.lilevel:
close += self.lilevel.pop()
- text = close + text
+ if not asText:
+ text = close + text
return text
indent = result.group(0).rstrip()
@@ -167,7 +170,7 @@ class WikiHandler(ContentHandler):
text = re.sub(r"\[\[\w+:\w+\]\]", "", text)
text = re.sub(r"{{\(\|(.*)}}", r"\1", text)
if text == "":
- return ""
+ return self.indents2xml(text, asText)
text = self.indents2xml(text, asText)
text = re.sub(r"{{par ext[^}]+}}", r"(Par extension)", text)
@@ -202,7 +205,7 @@ class WikiHandler(ContentHandler):
# Wikipedia text content is interpreted and transformed in XML
def parseText(self):
- inWord = None
+ inWord = wiktio.Word()
inDefinition = None
inAnagram = False
inSynonym = False
@@ -211,7 +214,7 @@ class WikiHandler(ContentHandler):
wordType = ""
wordSubType = ""
- # Append and end of text marker, it makes my life easier
+ # Append an end of text marker, it makes my life easier
self.textContent += "\n{{-EndOfTest-}}"
# Remove html comment (multilines)
@@ -220,17 +223,24 @@ class WikiHandler(ContentHandler):
for l in self.textContent.splitlines():
+ next = False
+
for filter in self.filterContent:
if re.search(filter, l, re.I):
- return inWord
+ return None
+
+ for filter in self.filterDefinitionType:
+ if re.search(filter, l, re.I):
+ next = True
+
+ if next:
+ continue
# Categories
- # print "1>" + l
- # if re.search(r"^\[\[Cat\Sgorie:", l, re.U):
- # text = re.sub(r"\[\[Cat\Sgorie:(\S\s)+\]\]", r"\1", l)
- # print "ICI" + text
- # print "<category>" + text + "</category>"
- # continue
+ if re.match(ur"\[\[Catégorie:", l):
+ text = re.sub(ur"\[\[Catégorie:(.*)\]\]", r"\1", l)
+ inWord.addCategory(text)
+ continue
# Are we still in the correct language section
# We assume the correct language is ahead
@@ -250,6 +260,8 @@ class WikiHandler(ContentHandler):
if not re.search(r"{{-.*-.*}}", l):
inDefinition.addText(self.wiki2xml(l, False))
else:
+ # Force a <ul> close if needed
+ inDefinition.addText(self.wiki2xml("", False))
inWord.addDefinition(inDefinition)
inDefinition = None
@@ -278,12 +290,12 @@ class WikiHandler(ContentHandler):
file = l.split("=")
if len(file) >= 2:
file = file[1].replace("}}", "")
- inWord.addPronociation(file)
+ inWord.addPrononciation(file)
else:
inPron = False
if l.startswith("'''" + self.titleContent + "'''"):
- inWord = wiktio.Word(self.titleContent)
+ inWord.setName(self.titleContent)
inDefinition = wiktio.Definition()
inDefinition.setType(wordType)
inDefinition.setSubType(wordSubType)
[Date Prev][Date Next] [Thread Prev][Thread Next] [Thread Index] [Date Index] [Author Index]