[library-web] get title from wiki page if link was WikiStyle



commit 4e1e30211f64d241925fd4caed631b51e41e4a03
Author: Frédéric Péters <fpeters 0d be>
Date:   Thu Jan 30 14:31:49 2014 +0000

    get title from wiki page if link was WikiStyle

 src/document.py |   19 ++++++++++++++++++-
 src/overlay.py  |    9 +++++++--
 2 files changed, 25 insertions(+), 3 deletions(-)
---
diff --git a/src/document.py b/src/document.py
index c4f2c4d..fbb5920 100644
--- a/src/document.py
+++ b/src/document.py
@@ -188,6 +188,7 @@ class RemoteDocument(Document):
         self.title = {}
         self.href = {}
         self.abstract = {}
+        self.cancelled = False
         for title in overlay.findall('title'):
             lang = title.attrib.get(
                     '{http://www.w3.org/XML/1998/namespace}lang', 'en')
@@ -241,6 +242,20 @@ class RemoteDocument(Document):
             doc = parser.parse(open(filename))
             doc.childNodes[-1].attributes['xmlns'] = 'http://www.w3.org/1999/xhtml'
             del doc.childNodes[:-1]
+
+            if self.href[lang].endswith('?action=print') and (
+                    not self.title.get(lang) or ' ' not in self.title[lang]):
+                # wiki document with a WikiTitle
+                html = ET.fromstring(doc.toxml())
+                try:
+                    title = ET.ElementTree(html).find('.//{http://www.w3.org/1999/xhtml}h1').text
+                except AttributeError:
+                    # no <h1> title in the page, so it is probably unusable; drop it.
+                    self.cancelled = True
+                    continue
+                else:
+                    self.title[lang] = title
+
             cmd = ['xsltproc', '--output', dst,
                     '--stringparam', 'libgo.originalhref', self.href[lang],
                     '--stringparam', 'libgo.channel', self.channel,
@@ -262,9 +277,11 @@ class RemoteDocument(Document):
         # can be "watched" for changes
         if self.overlay.find('local').attrib.get('nocache'):
             return app.download(href, use_cache=False)
-        return app.download(href)
+        return app.download(href, use_cache=True)
 
     def create_element(self, parent, language, original_language = None):
+        if self.cancelled:
+            return
         doc = Document.create_element(self, parent, language, original_language)
         if not doc:
             return
diff --git a/src/overlay.py b/src/overlay.py
index fea1d06..31ed5fc 100644
--- a/src/overlay.py
+++ b/src/overlay.py
@@ -123,7 +123,7 @@ class Overlay:
         documents_node = self.tree.find('documents')
         for extra in self.tree.findall('/documents/extrawikidocs'):
             href = extra.find('href').text
-            content = app.download(href + '?action=print', use_cache=False)
+            content = app.download(href + '?action=print', use_cache=True)
 
             # parse the wiki page and get all links in content
             parser = html5lib.HTMLParser()
@@ -141,6 +141,10 @@ class Overlay:
                     # heuristic to eliminate generated links
                     continue
                 doc_href = urlparse.urljoin(href, link.attrib.get('href'))
+                if doc_href.split('/')[-1] == 'HowDoI':
+                    # blacklist HowDoI as that page name is used in multiple
+                    # places and we can't differentiate at the moment.
+                    continue
                 logging.info('adding extra document from wiki: %s' % doc_href)
                 title = link.text
                 doc_node = ET.SubElement(documents_node, 'document')
@@ -167,7 +171,8 @@ class Overlay:
             for title in overlay.findall('title'):
                 lang = title.attrib.get(
                         '{http://www.w3.org/XML/1998/namespace}lang', 'en')
-                document.title[lang] = title.text
+                if title.text:
+                    document.title[lang] = title.text
             for lang in document.languages:
                 if not document.title.get(lang):
                     document.title[lang] = document.title.get('en')


[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]