[library-web] get title from wiki page if link was WikiStyle
- From: Frederic Peters <fpeters src gnome org>
- To: commits-list gnome org
- Cc:
- Subject: [library-web] get title from wiki page if link was WikiStyle
- Date: Thu, 30 Jan 2014 14:33:15 +0000 (UTC)
commit 4e1e30211f64d241925fd4caed631b51e41e4a03
Author: Frédéric Péters <fpeters 0d be>
Date: Thu Jan 30 14:31:49 2014 +0000
get title from wiki page if link was WikiStyle
src/document.py | 19 ++++++++++++++++++-
src/overlay.py | 9 +++++++--
2 files changed, 25 insertions(+), 3 deletions(-)
---
diff --git a/src/document.py b/src/document.py
index c4f2c4d..fbb5920 100644
--- a/src/document.py
+++ b/src/document.py
@@ -188,6 +188,7 @@ class RemoteDocument(Document):
self.title = {}
self.href = {}
self.abstract = {}
+ self.cancelled = False
for title in overlay.findall('title'):
lang = title.attrib.get(
'{http://www.w3.org/XML/1998/namespace}lang', 'en')
@@ -241,6 +242,20 @@ class RemoteDocument(Document):
doc = parser.parse(open(filename))
doc.childNodes[-1].attributes['xmlns'] = 'http://www.w3.org/1999/xhtml'
del doc.childNodes[:-1]
+
+ if self.href[lang].endswith('?action=print') and (
+ not self.title.get(lang) or ' ' not in self.title[lang]):
+ # wiki document with a WikiTitle
+ html = ET.fromstring(doc.toxml())
+ try:
+ title = ET.ElementTree(html).find('.//{http://www.w3.org/1999/xhtml}h1').text
+ except AttributeError:
+ # no title in page, probably not good to go, remove.
+ self.cancelled = True
+ continue
+ else:
+ self.title[lang] = title
+
cmd = ['xsltproc', '--output', dst,
'--stringparam', 'libgo.originalhref', self.href[lang],
'--stringparam', 'libgo.channel', self.channel,
@@ -262,9 +277,11 @@ class RemoteDocument(Document):
# can be "watched" for changes
if self.overlay.find('local').attrib.get('nocache'):
return app.download(href, use_cache=False)
- return app.download(href)
+ return app.download(href, use_cache=True)
def create_element(self, parent, language, original_language = None):
+ if self.cancelled:
+ return
doc = Document.create_element(self, parent, language, original_language)
if not doc:
return
diff --git a/src/overlay.py b/src/overlay.py
index fea1d06..31ed5fc 100644
--- a/src/overlay.py
+++ b/src/overlay.py
@@ -123,7 +123,7 @@ class Overlay:
documents_node = self.tree.find('documents')
for extra in self.tree.findall('/documents/extrawikidocs'):
href = extra.find('href').text
- content = app.download(href + '?action=print', use_cache=False)
+ content = app.download(href + '?action=print', use_cache=True)
# parse the wiki page and get all links in content
parser = html5lib.HTMLParser()
@@ -141,6 +141,10 @@ class Overlay:
# heuristic to eliminate generated links
continue
doc_href = urlparse.urljoin(href, link.attrib.get('href'))
+ if doc_href.split('/')[-1] == 'HowDoI':
+ # blacklist HowDoI as that page name is used in multiple
+ # places and we can't differentiate at the moment.
+ continue
logging.info('adding extra document from wiki: %s' % doc_href)
title = link.text
doc_node = ET.SubElement(documents_node, 'document')
@@ -167,7 +171,8 @@ class Overlay:
for title in overlay.findall('title'):
lang = title.attrib.get(
'{http://www.w3.org/XML/1998/namespace}lang', 'en')
- document.title[lang] = title.text
+ if title.text:
+ document.title[lang] = title.text
for lang in document.languages:
if not document.title.get(lang):
document.title[lang] = document.title.get('en')
[Date Prev][Date Next] [Thread Prev][Thread Next]
[Thread Index] [Date Index] [Author Index]