bigboard r7311 - trunk/bigboard/stocks/mail



Author: walters
Date: Wed Apr 23 23:12:49 2008
New Revision: 7311
URL: http://svn.gnome.org/viewvc/bigboard?rev=7311&view=rev

Log:
Use BeautifulSoup instead of xml.dom.minidom to parse Gmail HTML


Modified:
   trunk/bigboard/stocks/mail/MailStock.py

Modified: trunk/bigboard/stocks/mail/MailStock.py
==============================================================================
--- trunk/bigboard/stocks/mail/MailStock.py	(original)
+++ trunk/bigboard/stocks/mail/MailStock.py	Wed Apr 23 23:12:49 2008
@@ -1,7 +1,6 @@
 #!/usr/bin/python
 # -*- coding: utf-8 -*-
 import logging, re, time, urllib2
-import htmlentitydefs
 
 import gobject, gtk
 import hippo
@@ -25,65 +24,31 @@
 def replace_chr(m):
     return unichr(int(m.group(1), 16))
 UNICHR_REPLACE = re.compile(r"\\u([A-F-a-f0-9]{4})")
-
-# http://effbot.org/zone/re-sub.htm#unescape-html
-##
-# Removes HTML or XML character references and entities from a text string.
-#
-# @param text The HTML (or XML) source text.
-# @return The plain text, as a Unicode string, if necessary.
-def unescape_html_entities(text):
-    xml_entities = ["quot", "amp", "apos", "lt", "gt"]
-    def fixup(m):
-        text = m.group(0)
-        if text[:2] == "&#":
-            # character reference
-            try:
-                if text[:3] == "&#x":
-                    return unichr(int(text[3:-1], 16))
-                else:
-                    return unichr(int(text[2:-1]))
-            except ValueError:
-                pass
-        else:
-            # named entity
-            entityname = text[1:-1]
-            # Don't unescape valid XML entities
-            if entityname in xml_entities:
-                return text
-            try:
-                text = unichr(htmlentitydefs.name2codepoint[entityname])
-            except KeyError:
-                pass
-        return text # leave as is
-    return re.sub("&#?\w+;", fixup, text)
-
 def gmail_jshtml_str_parse(s, markup=False):
     # Replace \uxxxx escapes
     parsed_str = UNICHR_REPLACE.sub(replace_chr, s)
     parsed_str = unescape_html_entities(parsed_str)
-
     # At this point, we have a Python unicode string which *should* hold
-    # an XML fragment.  Convert that fragment into a document string.
+    # an HTML fragment.  Convert that fragment into a document string.
     pystr = "<html>" + parsed_str + "</html>"
-    # Parse that document string into a DOM.    
-    dom = xml.dom.minidom.parseString(pystr)
-    textContent = StringIO()
-    # Now we parse the XML, only allowing the bold tag through, and eating everything else
-    def DomToText(node):       
-        if node.nodeType == Node.TEXT_NODE:
-            textContent.write(gobject.markup_escape_text(node.data))
-        if markup and node.nodeType == Node.ELEMENT_NODE and node.nodeName == 'b':
+    # Now use BeautifulSoup to parse it
+    from BeautifulSoup import BeautifulSoup
+    soup = BeautifulSoup(pystr, convertEntities=BeautifulSoup.HTML_ENTITIES)
+    textContent = StringIO()    
+    def filterBoldOnly(node):       
+        if isinstance(node, unicode):
+            textContent.write(gobject.markup_escape_text(node))
+            return
+        if markup and node.name == 'b':
             in_bold = True
             textContent.write('<b>')
         else:
-            in_bold = False             
-        if node.hasChildNodes():
-            for child in node.childNodes:
-                DomToText(child)
+            in_bold = False
+        for child in node.childGenerator():
+            filterBoldOnly(child)
         if in_bold:
             textContent.write('</b>')
-    DomToText(dom.documentElement)
+    filterBoldOnly(soup)
     # Return the sanely filtered content
     return textContent.getvalue()
 



[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]