[gcompris/gcomprixogoo] Added support for static site export in wiktio2xml. Reworked the code to have the state common to both files.
- From: Bruno Coudoin <bcoudoin src gnome org>
- To: commits-list gnome org
- Cc:
- Subject: [gcompris/gcomprixogoo] Added support for static site export in wiktio2xml. Reworked the code to have the state common to both files.
- Date: Sun, 3 Oct 2010 00:58:00 +0000 (UTC)
commit 60d1ca9ac029f8282672aa43ddf02bd07bcff279
Author: Bruno Coudoin <bruno coudoin free fr>
Date: Sun Oct 3 02:56:55 2010 +0200
Added support for static site export in wiktio2xml
Reworked the code to have the state common to both files.
tools/wiktio2xml/wiktio.py | 177 +++++++++++++++++++++++----------------
tools/wiktio2xml/wiktio2xml.py | 113 +++++++++++++-------------
2 files changed, 161 insertions(+), 129 deletions(-)
---
diff --git a/tools/wiktio2xml/wiktio.py b/tools/wiktio2xml/wiktio.py
index f09ba98..934bfdd 100644
--- a/tools/wiktio2xml/wiktio.py
+++ b/tools/wiktio2xml/wiktio.py
@@ -20,6 +20,8 @@
# Implementation of the wiktionary model
# used
+import os.path
+
class Definition:
def __init__ (self):
@@ -49,81 +51,72 @@ class Definition:
def setGender(self, gender):
self.gender = gender
- def addSynonym(self, synonym):
- if len(synonym):
- self.synonym.append(synonym)
-
- def addAntonym(self, antonym):
- if len(antonym):
- self.antonym.append(antonym)
-
- def addAnagram(self, anagram):
- if len(anagram):
- self.anagram.append(anagram)
-
- def addHyperonym(self, hyperonym):
- if len(hyperonym):
- self.hyperonym.append(hyperonym)
-
- def addHyponym(self, hyponym):
- if len(hyponym):
- self.hyponym.append(hyponym)
-
- def addPrononciation(self, prononciation):
- if len(prononciation):
- self.prononciation.append(prononciation)
-
- def addCategory(self, category):
- if len(category):
- self.category.append(category)
-
- def addImage(self, image):
- if len(image):
- self.image.append(image)
+ def add(self, atype, text):
+ if len(text) == 0:
+ return
- def dump2htmlImage(self):
+ if atype == Wiktio.ANAGRAM:
+ self.anagram.append(text)
+ elif atype == Wiktio.SYNONYM:
+ self.synonym.append(text)
+ elif atype == Wiktio.ANTONYM:
+ self.antonym.append(text)
+ elif atype == Wiktio.HYPERONYM:
+ self.hyperonym.append(text)
+ elif atype == Wiktio.HYPONYM:
+ self.hyponym.append(text)
+ elif atype == Wiktio.PRON:
+ self.prononciation.append(text)
+ elif atype == Wiktio.IMAGE:
+ self.image.append(text)
+ elif atype == Wiktio.CATEGORY:
+ self.category.append(text)
+ else:
+ print "!!ERROR!!: Type not supported"
+
+ def dump2htmlImage(self, f):
if self.image:
prefix = "http://fr.wiktionary.org/wiki/Fichier:"
for img in self.image:
- print "<a href='" + prefix + img + "'>" + \
- img + '</a><br/>'
+ f.write ( "<a href='" + prefix + img + "'>" + \
+ img + '</a><br/>' )
- def dump2htmlPrononciation(self, title, liste):
+ def dump2htmlPrononciation(self, f, title, liste):
prefix = "http://commons.wikimedia.org/wiki/File:"
if len(liste):
- print "<h2>" + title + "</h2>"
- print "<ul>"
+ f.write ( "<h2>" + title + "</h2>" )
+ f.write ( "<ul>" )
for s in liste:
- print "<li><a href='" + prefix + s + "'>" \
- + s + "</a></li>"
- print "</ul>"
+ f.write ( "<li><a href='" + prefix + s + "'>" \
+ + s + "</a></li>" )
+ f.write ( "</ul>" )
- def dump2htmlItem(self, title, liste):
+ def dump2htmlItem(self, f, title, liste):
if len(liste):
- print "<h2>" + title + "</h2>"
+ f.write ( "<h2>" + title + "</h2>" )
for s in liste:
if s.find(":") >= 0:
- print "<br/>" + s
+ f.write ( "<br/>" + s )
else:
- print s + ", "
+ f.write ( s + ", " )
- def dump2html(self):
+ def dump2html(self, f):
if self.filtered or self.text == "":
return
- print "<h3>" + self.type + \
+ f.write ( "<h3>" + self.type + \
" " + self.subType + \
- " " + self.gender + "</h3>"
- self.dump2htmlImage()
- print self.text
-
- self.dump2htmlItem("Synonymes", self.synonym)
- self.dump2htmlItem("Antonymes", self.antonym)
- self.dump2htmlItem("Anagrammes", self.anagram)
- self.dump2htmlItem("Hyperonymes", self.hyperonym)
- self.dump2htmlItem("Hyponymes", self.hyponym)
- self.dump2htmlPrononciation("Prononciation", self.prononciation)
- self.dump2htmlItem(u"Catégories", self.category)
+ " " + self.gender + "</h3>" )
+ self.dump2htmlImage(f)
+ f.write ( self.text )
+
+ self.dump2htmlItem(f, "Synonymes", self.synonym)
+ self.dump2htmlItem(f, "Antonymes", self.antonym)
+ self.dump2htmlItem(f, "Anagrammes", self.anagram)
+ self.dump2htmlItem(f, "Hyperonymes", self.hyperonym)
+ self.dump2htmlItem(f, "Hyponymes", self.hyponym)
+ self.dump2htmlPrononciation(f, "Prononciation", self.prononciation)
+ self.dump2htmlItem(f, u"Catégories", self.category)
class Word:
@@ -137,18 +130,29 @@ class Word:
def addDefinition(self, definition):
self.definition.append(definition)
- def dump2html(self):
- print "<hr/>"
- print "<h1>" + self.name + "</h1>"
+ def dump2html(self, f):
+ f.write ( "<hr/>" )
+ f.write ( "<h1>" + self.name + "</h1>" )
if not self.definition:
- print "<h2>ERROR: NO DEFINITION</h2>"
+ f.write ( "<h2>ERROR: NO DEFINITION</h2>" )
return
for d in self.definition:
- d.dump2html()
+ d.dump2html(f)
class Wiktio:
+ (DEFINITION,
+ ANAGRAM,
+ SYNONYM,
+ ANTONYM,
+ HYPERONYM,
+ HYPONYM,
+ PRON,
+ IMAGE,
+ CATEGORY,
+ SKIP) = range(0, 10)
+
def __init__ (self):
self.words = []
@@ -161,21 +165,48 @@ class Wiktio:
def sort(self):
self.words.sort(key=lambda word: word.name.lower())
- def dumpHtmlHeader(self):
- print """
+ def dumpHtmlHeader(self, f):
+ f.write ( """
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" lang="fr" dir="ltr">
<head>
<title>Mini - Wiktionnaire</title>
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />
-"""
- def dumpHtmlFooter(self):
- print """
+""")
+
+ def dumpHtmlFooter(self, f):
+ f.write ("""
</head>
-"""
- def dump2html(self):
- self.dumpHtmlHeader()
+""")
+
+ # Creates a big HTML file, useful to debug
+ def dump2html(self, file):
+ with open(file, 'w') as f:
+ self.dumpHtmlHeader(f)
+ self.sort()
+ for w in self.words:
+ w.dump2html(f)
+ self.dumpHtmlFooter(f)
+
+ # Creates a static HTML site in the given directory
+ def dump2htmlSite(self, baseDir):
+ if not os.path.isdir(baseDir):
+ print "ERROR: Directory '" + baseDir + "' does not exists."
+ return
+
+ letter = "/"
self.sort()
- for w in self.words:
- w.dump2html()
- self.dumpHtmlFooter()
+ with open(baseDir + '/index.html', 'w') as f_index:
+ self.dumpHtmlHeader(f_index)
+ for w in self.words:
+ if letter[0] != w.name[0].upper():
+ letter = w.name[0].upper()
+ f_index.write ( "<hr/><h1>" + letter[0] + "</h1>" )
+ f_index.write ( "<a href='" + w.name + ".html'>" + w.name + "</a> " )
+ with open(baseDir + '/' + w.name + '.html', 'w') as f:
+ self.dumpHtmlHeader(f)
+ w.dump2html(f)
+ self.dumpHtmlFooter(f)
+
+ self.dumpHtmlFooter(f_index)
+
diff --git a/tools/wiktio2xml/wiktio2xml.py b/tools/wiktio2xml/wiktio2xml.py
index 8ffc8e3..3947688 100755
--- a/tools/wiktio2xml/wiktio2xml.py
+++ b/tools/wiktio2xml/wiktio2xml.py
@@ -22,17 +22,21 @@ from xml.sax.handler import ContentHandler
import sys
import re
+from optparse import OptionParser
+
import wiktio
+from wiktio import Wiktio
debug = False
class WikiHandler(ContentHandler):
- def __init__ (self, searchWords, locale, _wiktio):
+ def __init__ (self, searchWords, locale, _wiktio, verbose):
self.searchWords= searchWords;
self.locale = locale
self.wiktio = _wiktio
+ self.verbose = verbose
self.isPageElement = False
@@ -95,6 +99,8 @@ class WikiHandler(ContentHandler):
# These definitions will always be skipped
self.filterDefinitionType = [ ur"{{vulg[^}]*}}",
ur"{{injur[^}]*}}",
+ ur"{{sexe[^}]*}}",
+ ur"{{sexua[^}]*}}",
ur"coït",
ur"argot"]
@@ -201,17 +207,12 @@ class WikiHandler(ContentHandler):
index = 0
while index >= 0:
index = text.find(quote)
- if index >= 0:
+ index2 = text.find(quote, index)
+ if index >= 0 and index2 >=0:
text = text.replace(quote, openXml, 1)
- else:
- return text
-
- index = text.find(quote)
- if index >= 0:
text = text.replace(quote, closeXml, 1)
else:
- # Malformed statement, should fix wiktionary
- text += closeXml
+ return text
return text
# Replace standard Wiki tags to XML
@@ -253,18 +254,11 @@ class WikiHandler(ContentHandler):
# Wikipedia text content is interpreted and transformed in XML
def parseText(self):
+ if self.verbose:
+ print "Processing " + self.titleContent
inWord = wiktio.Word()
- (DEFINITION,
- ANAGRAM,
- SYNONYM,
- ANTONYM,
- HYPERONYM,
- HYPONYM,
- PRON,
- SKIP) = range(0, 8)
-
- state = SKIP
+ state = Wiktio.SKIP
wordType = ""
wordSubType = ""
@@ -299,41 +293,43 @@ class WikiHandler(ContentHandler):
inWord.setName(self.titleContent)
# Get rid of the word, we don't want it in the definition
l = re.sub(r"'''.*'''[ ]*(.*)", r"\1", l)
- state = DEFINITION
+ # Get rid of non wiki tags
+ l = re.sub(r'}}[^}]+{{', r'}} {{', l)
+ state = Wiktio.DEFINITION
elif l == "{{-anagr-}}":
definition.addText(self.wiki2xml("", False))
- state = ANAGRAM
+ state = Wiktio.ANAGRAM
elif l == "{{-syn-}}":
definition.addText(self.wiki2xml("", False))
- state = SYNONYM
+ state = Wiktio.SYNONYM
elif l == "{{-ant-}}":
definition.addText(self.wiki2xml("", False))
- state = ANTONYM
+ state = Wiktio.ANTONYM
elif l == "{{-hyper-}}":
definition.addText(self.wiki2xml("", False))
- state = HYPERONYM
+ state = Wiktio.HYPERONYM
elif l == "{{-hypo-}}":
definition.addText(self.wiki2xml("", False))
- state = HYPONYM
+ state = Wiktio.HYPONYM
elif l == "{{-pron-}}":
definition.addText(self.wiki2xml("", False))
- state = PRON
+ state = Wiktio.PRON
elif l == "{{-note-}}":
- state = SKIP
+ state = Wiktio.SKIP
elif l == "{{-apr-}}":
- state = SKIP
+ state = Wiktio.SKIP
elif l == "{{-drv-}}":
- state = SKIP
+ state = Wiktio.SKIP
elif l == "{{-exp-}}":
- state = SKIP
+ state = Wiktio.SKIP
elif l == "{{-trad-}}":
- state = SKIP
+ state = Wiktio.SKIP
elif l == "{{-voc-}}":
- state = SKIP
+ state = Wiktio.SKIP
elif l == "{{-voir-}}":
- state = SKIP
+ state = Wiktio.SKIP
elif l == u"{{-réf-}}":
- state = SKIP
+ state = Wiktio.SKIP
elif re.search(r"{{-.*-.*}}", l):
if definition.text != "":
if debug: print "<br/>new definition:" + l + ":"
@@ -343,7 +339,7 @@ class WikiHandler(ContentHandler):
filterIndent = ""
definition = wiktio.Definition()
inWord.addDefinition(definition)
- state = SKIP
+ state = Wiktio.SKIP
# Are we still in the correct language section
# We assume the correct language is ahead
@@ -354,7 +350,7 @@ class WikiHandler(ContentHandler):
# Image
if definition and re.match(ur"\[\[Image:", l):
text = re.sub(ur"\[\[Image:([^|}\]]+).*", r"\1", l)
- definition.addImage(text)
+ definition.add(Wiktio.IMAGE, text)
continue
for wt in self.wordTypes.keys():
@@ -380,7 +376,7 @@ class WikiHandler(ContentHandler):
definition.setSubType(wordSubType)
break
- if state == SKIP:
+ if state == Wiktio.SKIP:
continue
for filter in self.filterContent:
@@ -429,26 +425,18 @@ class WikiHandler(ContentHandler):
# Categories
if re.match(ur"\[\[Catégorie:", l):
text = re.sub(ur"\[\[Catégorie:([^|}\]]+).*", r"\1", l)
- definition.addCategory(text)
+ definition.add(Wiktio.CATEGORY, text)
continue
- if state == DEFINITION:
+ if state == Wiktio.DEFINITION:
definition.addText(self.wiki2xml(l, False))
- elif state == ANAGRAM:
- if len(l) > 0:
- definition.addAnagram(self.wiki2xml(l, True))
- elif state == SYNONYM:
- definition.addSynonym(self.wiki2xml(l, True))
- elif state == ANTONYM:
- definition.addAntonym(self.wiki2xml(l, True))
- elif state == HYPERONYM:
- definition.addHyperonym(self.wiki2xml(l, True))
- elif state == HYPONYM:
- definition.addHyponym(self.wiki2xml(l, True))
- elif state == PRON:
+ elif state == Wiktio.PRON:
file = re.subn(r".*audio=([^|}]+).*", r"\1", l)
if file[1] == 1:
- definition.addPrononciation(file[0])
+ definition.add(state, file[0])
+ else:
+ if len(l) > 0:
+ definition.add(state, self.wiki2xml(l, True))
return inWord
@@ -459,7 +447,18 @@ def usage():
reload(sys)
sys.setdefaultencoding('utf-8')
-if len(sys.argv) != 3:
+parser = OptionParser()
+parser.add_option("-o", "--output", dest="output",
+ help="write result to file or directory")
+parser.add_option("-q", "--quiet",
+ action="store_false", dest="verbose", default=True,
+ help="don't print status messages to stdout")
+parser.add_option("-s", "--site",
+ action="store_false", dest="site", default=False,
+ help="Creates a web site")
+(options, args) = parser.parse_args()
+
+if len(sys.argv) < 3:
usage()
sys.exit()
@@ -474,7 +473,9 @@ f.close()
_wiktio = wiktio.Wiktio()
-parse(wikiFile, WikiHandler(words, 'fr', _wiktio))
-
+parse(wikiFile, WikiHandler(words, 'fr', _wiktio, options.verbose))
-_wiktio.dump2html()
+if options.site:
+ _wiktio.dump2htmlSite(options.output)
+else:
+ _wiktio.dump2html(options.output)
[Date Prev][Date Next] [Thread Prev][Thread Next] [Thread Index] [Date Index] [Author Index]