[gcompris/gcomprixogoo] Major rework of the parsing algo in wiktio2xml. Added support for hyponym and hyperonym. Added support for images.
- From: Bruno Coudoin <bcoudoin src gnome org>
- To: commits-list gnome org
- Cc:
- Subject: [gcompris/gcomprixogoo] Major rework of the parsing algo in wiktio2xml. Added support for hyponym and hyperonym. Added support for images.
- Date: Sat, 25 Sep 2010 00:18:30 +0000 (UTC)
commit 16f702ac931bb9f6a852214a37fed4de62d5c7fc
Author: Bruno Coudoin <bruno coudoin free fr>
Date: Sat Sep 25 02:16:47 2010 +0200
Major rework of the parsing algo in wiktio2xml. Added support for
hyponym and hyperonym. Added support for images.
tools/wiktio2xml/wiktio.py | 103 +++++++++++++++---------
tools/wiktio2xml/wiktio2xml.py | 175 +++++++++++++++++++++------------------
2 files changed, 159 insertions(+), 119 deletions(-)
---
diff --git a/tools/wiktio2xml/wiktio.py b/tools/wiktio2xml/wiktio.py
index ec99116..8331e9b 100644
--- a/tools/wiktio2xml/wiktio.py
+++ b/tools/wiktio2xml/wiktio.py
@@ -28,6 +28,14 @@ class Definition:
self.subType = ""
self.filtered = False
self.gender = ""
+ self.synonym = []
+ self.antonym = []
+ self.anagram = []
+ self.hyperonym = []
+ self.hyponym = []
+ self.prononciation = []
+ self.category = []
+ self.image = []
def addText(self, text):
self.text += text
@@ -41,31 +49,6 @@ class Definition:
def setGender(self, gender):
self.gender = gender
- def dump2html(self):
- if self.filtered:
- return
- print "<h3>" + self.type + \
- " " + self.subType + \
- " " + self.gender + "</h3>"
- print self.text
-
-class Word:
-
- def __init__ (self, name = None):
- self.name = name
- self.definition = []
- self.synonym = []
- self.antonym = []
- self.anagram = []
- self.prononciation = []
- self.category = []
-
- def setName(self, name):
- self.name = name
-
- def addDefinition(self, definition):
- self.definition.append(definition)
-
def addSynonym(self, synonym):
if len(synonym):
self.synonym.append(synonym)
@@ -78,6 +61,14 @@ class Word:
if len(anagram):
self.anagram.append(anagram)
+ def addHyperonym(self, hyperonym):
+ if len(hyperonym):
+ self.hyperonym.append(hyperonym)
+
+ def addHyponym(self, hyponym):
+ if len(hyponym):
+ self.hyponym.append(hyponym)
+
def addPrononciation(self, prononciation):
if len(prononciation):
self.prononciation.append(prononciation)
@@ -86,14 +77,16 @@ class Word:
if len(category):
self.category.append(category)
- def dump2htmlItem(self, title, liste):
- if len(liste):
- print "<h2>" + title + "</h2>"
- for s in liste:
- if s.find(":") >= 0:
- print "<br></br>" + s
- else:
- print s
+ def addImage(self, image):
+ if len(image):
+ self.image.append(image)
+
+ def dump2htmlImage(self):
+ if self.image:
+ prefix = "http://fr.wiktionary.org/wiki/Fichier:"
+ for img in self.image:
+ print "<a href='" + prefix + img + "'>" + \
+ img + '</a><br/>'
def dump2htmlPrononciation(self, title, liste):
prefix = "http://commons.wikimedia.org/wiki/File:"
@@ -105,17 +98,51 @@ class Word:
+ s + "</a></li>"
print "</ul>"
+ def dump2htmlItem(self, title, liste):
+
+ if len(liste):
+ print "<h2>" + title + "</h2>"
+ for s in liste:
+ if s.find(":") >= 0:
+ print "<br/>" + s
+ else:
+ print s + ", "
+
+ def dump2html(self):
+ if self.filtered:
+ return
+ print "<h3>" + self.type + \
+ " " + self.subType + \
+ " " + self.gender + "</h3>"
+ self.dump2htmlImage()
+ print self.text
+
+ self.dump2htmlItem("Synonymes", self.synonym)
+ self.dump2htmlItem("Antonymes", self.antonym)
+ self.dump2htmlItem("Anagrammes", self.anagram)
+ self.dump2htmlItem("Hyperonymes", self.hyperonym)
+ self.dump2htmlItem("Hyponymes", self.hyponym)
+ self.dump2htmlPrononciation("Prononciation", self.prononciation)
+ self.dump2htmlItem(u"Catégories", self.category)
+
+class Word:
+
+ def __init__ (self, name = None):
+ self.name = name
+ self.definition = []
+
+ def setName(self, name):
+ self.name = name
+
+ def addDefinition(self, definition):
+ self.definition.append(definition)
+
def dump2html(self):
print "<hr></hr>"
print "<h1>" + self.name + "</h1>"
for d in self.definition:
d.dump2html()
- self.dump2htmlItem("Synonym", self.synonym)
- self.dump2htmlItem("Antonym", self.antonym)
- self.dump2htmlItem("Anagram", self.anagram)
- self.dump2htmlPrononciation("Prononciation", self.prononciation)
- self.dump2htmlItem("Category", self.category)
class Wiktio:
diff --git a/tools/wiktio2xml/wiktio2xml.py b/tools/wiktio2xml/wiktio2xml.py
index a9bf269..90335b0 100755
--- a/tools/wiktio2xml/wiktio2xml.py
+++ b/tools/wiktio2xml/wiktio2xml.py
@@ -233,11 +233,19 @@ class WikiHandler(ContentHandler):
# Wikipedia text content is interpreted and transformed in XML
def parseText(self):
inWord = wiktio.Word()
- inDefinition = None
- inAnagram = False
- inSynonym = False
- inAntonym = False
- inPron = False
+ definition = None
+
+ (DEFINITION,
+ ANAGRAM,
+ SYNONYM,
+ ANTONYM,
+ HYPERONYM,
+ HYPONYM,
+ PRON,
+ SKIP) = range(0, 8)
+
+ state = SKIP
+
wordType = ""
wordSubType = ""
filterIndent = ""
@@ -250,22 +258,80 @@ class WikiHandler(ContentHandler):
self.textContent = re.sub(r"<!--[^>]*-->", "",
self.textContent, re.M)
+ definition = wiktio.Definition()
concat = ""
for l in self.textContent.splitlines():
l = concat + l
next = False
if re.search(r"<[^>]+$", l):
- # Wiki uses a trick to format text area by endind in uncomplete
+ # Wiki uses a trick to format text area by ending in uncomplete
# html tags. In this case, we concat this line with the next one
# before processing it
concat = l
continue
+ # Determine the section of the document we are in
+ if l.startswith("'''" + self.titleContent + "'''"):
+ inWord.setName(self.titleContent)
+ # Get rid of the word, we don't want it in the definition
+ l = re.sub(r"'''.*'''(.*)", r"\1", l)
+ state = DEFINITION
+ elif l == "{{-anagr-}}":
+ state = ANAGRAM
+ elif l == "{{-syn-}}":
+ state = SYNONYM
+ elif l == "{{-ant-}}":
+ state = ANTONYM
+ elif l == "{{-hyper-}}":
+ state = HYPERONYM
+ elif l == "{{-hypo-}}":
+ state = HYPONYM
+ elif l == "{{-pron-}}":
+ state = PRON
+ elif re.search(r"{{-.*-.*}}", l):
+ if definition.text != "":
+ # Force a <ul> close if needed
+ definition.addText(self.wiki2xml("", False))
+ inWord.addDefinition(definition)
+ # Next definition
+ definition = wiktio.Definition()
+ state = SKIP
+
+ # Are we still in the correct language section
+ # We assume the correct language is ahead
+ lang = re.match(r"== {{=([a-z]+)=}} ==", l)
+ if lang and lang.group(1) != None and lang.group(1) != self.locale:
+ return inWord
+
+ # Image
+ if definition and re.match(ur"\[\[Image:", l):
+ text = re.sub(ur"\[\[Image:([^|}\]]+).*", r"\1", l)
+ definition.addImage(text)
+ continue
+
+ for wt in self.wordTypes.keys():
+ if re.search(wt, l):
+ wordType = self.wordTypes[wt]
+ definition.setType(wordType)
+
+ for wt in self.genders.keys():
+ if re.search(wt, l):
+ gender = self.genders[wt]
+ definition.setGender(gender)
+
+ for wt in self.wordSubTypes.keys():
+ if re.search(wt, l):
+ wordSubType = self.wordSubTypes[wt]
+ definition.setSubType(wordSubType)
+
+ if state == SKIP:
+ continue
+
for filter in self.filterContent:
if re.search(filter, l, re.I):
- if inDefinition:
- inDefinition.filtered = True
+ if definition:
+ definition.filtered = True
if filterIndent != "":
# We are filtering, check this line is
@@ -292,81 +358,28 @@ class WikiHandler(ContentHandler):
continue
# Categories
- if re.match(ur"\[\[Catégorie:", l):
- text = re.sub(ur"\[\[Catégorie:(.*)\]\]", r"\1", l)
- inWord.addCategory(text)
+ if definition and re.match(ur"\[\[Catégorie:", l):
+ text = re.sub(ur"\[\[Catégorie:([^|}\]]+).*", r"\1", l)
+ definition.addCategory(text)
continue
- # Are we still in the correct language section
- # We assume the correct language is ahead
- lang = re.match(r"== {{=([a-z]+)=}} ==", l)
- if lang and lang.group(1) != None and lang.group(1) != self.locale:
- return inWord
-
- for wt in self.wordTypes.keys():
- if re.search(wt, l):
- wordType = self.wordTypes[wt]
-
- for wt in self.genders.keys():
- if re.search(wt, l):
- gender = self.genders[wt]
-
- for wt in self.wordSubTypes.keys():
- if re.search(wt, l):
- wordSubType = self.wordSubTypes[wt]
-
- if inDefinition:
- if not re.search(r"{{-.*-.*}}", l):
- inDefinition.addText(self.wiki2xml(l, False))
- else:
- # Force a <ul> close if needed
- inDefinition.addText(self.wiki2xml("", False))
- inWord.addDefinition(inDefinition)
- inDefinition = None
-
- if inAnagram:
- if not re.search(r"{{-.*-.*}}", l) and len(l) > 0:
- inWord.addAnagram(self.wiki2xml(l, True))
- else:
- inAnagram = False
-
- if inSynonym:
- if not re.search(r"{{-.*-.*}}", l):
- inWord.addSynonym(self.wiki2xml(l, True))
- else:
- inSynonym = False
-
- if inAntonym:
- if not re.search(r"{{-.*-.*}}", l):
- inWord.addAntonym(self.wiki2xml(l, True))
- else:
- inAntonym = False
-
- if inPron:
- if not re.search(r"{{-.*-.*}}", l):
- file = re.subn(r".*audio=([^|}]+).*", r"\1", l)
- if file[1] == 1:
- inWord.addPrononciation(file[0])
- else:
- inPron = False
-
- if l.startswith("'''" + self.titleContent + "'''"):
- inWord.setName(self.titleContent)
- inDefinition = wiktio.Definition()
- inDefinition.setType(wordType)
- inDefinition.setSubType(wordSubType)
- inDefinition.setGender(gender)
- wordType = ""
- wordSubType = ""
- gender = ""
- elif l == "{{-anagr-}}":
- inAnagram = True
- elif l == "{{-syn-}}":
- inSynonym= True
- elif l == "{{-ant-}}":
- inAntonym = True
- elif l == "{{-pron-}}":
- inPron = True
+ if state == DEFINITION:
+ definition.addText(self.wiki2xml(l, False))
+ elif state == ANAGRAM:
+ if len(l) > 0:
+ definition.addAnagram(self.wiki2xml(l, True))
+ elif state == SYNONYM:
+ definition.addSynonym(self.wiki2xml(l, True))
+ elif state == ANTONYM:
+ definition.addAntonym(self.wiki2xml(l, True))
+ elif state == HYPERONYM:
+ definition.addHyperonym(self.wiki2xml(l, True))
+ elif state == HYPONYM:
+ definition.addHyponym(self.wiki2xml(l, True))
+ elif state == PRON:
+ file = re.subn(r".*audio=([^|}]+).*", r"\1", l)
+ if file[1] == 1:
+ definition.addPrononciation(file[0])
return inWord
[Date Prev][Date Next] [Thread Prev][Thread Next] [Thread Index] [Date Index] [Author Index]