bigboard r7296 - trunk/bigboard/stocks/mail



Author: walters
Date: Mon Apr 14 22:55:45 2008
New Revision: 7296
URL: http://svn.gnome.org/viewvc/bigboard?rev=7296&view=rev

Log:
Unescape HTML entities before parsing as HTML.

This fixes GMail giving us e.g. &hellip;

Modified:
   trunk/bigboard/stocks/mail/MailStock.py

Modified: trunk/bigboard/stocks/mail/MailStock.py
==============================================================================
--- trunk/bigboard/stocks/mail/MailStock.py	(original)
+++ trunk/bigboard/stocks/mail/MailStock.py	Mon Apr 14 22:55:45 2008
@@ -1,3 +1,5 @@
+#!/usr/bin/python
+# -*- coding: utf-8 -*-
 import logging, re, time, urllib2
 
 import gobject, gtk
@@ -23,10 +25,38 @@
     return unichr(int(m.group(1), 16))
 UNICHR_REPLACE = re.compile(r"\\u([A-F-a-f0-9]{4})")
 
+# http://effbot.org/zone/re-sub.htm#unescape-html
+##
+# Removes HTML or XML character references and entities from a text string.
+#
+# @param text The HTML (or XML) source text.
+# @return The plain text, as a Unicode string, if necessary.
+def unescape_html_entities(text):
+    def fixup(m):
+        text = m.group(0)
+        if text[:2] == "&#":
+            # character reference
+            try:
+                if text[:3] == "&#x":
+                    return unichr(int(text[3:-1], 16))
+                else:
+                    return unichr(int(text[2:-1]))
+            except ValueError:
+                pass
+        else:
+            # named entity
+            try:
+                text = unichr(htmlentitydefs.name2codepoint[text[1:-1]])
+            except KeyError:
+                pass
+        return text # leave as is
+    return re.sub("&#?\w+;", fixup, text)
+
 def gmail_jshtml_str_parse(s, markup=False):
     # Replace \uxxxx escapes
     parsed_str = UNICHR_REPLACE.sub(replace_chr, s)
-    
+    parsed_str = unescape_html_entities(s)
+
     # At this point, we have a Python unicode string which *should* hold
     # an XML fragment.  Convert that fragment into a document string.
     pystr = "<html>" + parsed_str + "</html>"
@@ -240,6 +270,7 @@
         libbig.show_url("http://mail.google.com/mail")
 
 if __name__ == '__main__':
+    assert unescape_html_entities("foo&hellip;bar") == "foo…bar"
     # We want to keep bold tags
     assert gmail_jshtml_str_parse(r'test \u003cb\>hi\u003c/b\> moo', True) == 'test <b>hi</b> moo'
     # Strip unknown tag "A"



[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]