bigboard r7296 - trunk/bigboard/stocks/mail
- From: walters svn gnome org
- To: svn-commits-list gnome org
- Subject: bigboard r7296 - trunk/bigboard/stocks/mail
- Date: Mon, 14 Apr 2008 22:55:46 +0100 (BST)
Author: walters
Date: Mon Apr 14 22:55:45 2008
New Revision: 7296
URL: http://svn.gnome.org/viewvc/bigboard?rev=7296&view=rev
Log:
Unescape HTML entities before parsing as HTML.
This fixes GMail giving us e.g. &hellip;
Modified:
trunk/bigboard/stocks/mail/MailStock.py
Modified: trunk/bigboard/stocks/mail/MailStock.py
==============================================================================
--- trunk/bigboard/stocks/mail/MailStock.py (original)
+++ trunk/bigboard/stocks/mail/MailStock.py Mon Apr 14 22:55:45 2008
@@ -1,3 +1,5 @@
+#!/usr/bin/python
+# -*- coding: utf-8 -*-
import logging, re, time, urllib2
import gobject, gtk
@@ -23,10 +25,38 @@
return unichr(int(m.group(1), 16))
# Matches a JavaScript-style \uXXXX escape; group 1 is the four hex digits.
# The class must contain hex digits only: a stray "-" in it would let input
# such as \u12-4 match and then fail when the digits are parsed as base 16.
UNICHR_REPLACE = re.compile(r"\\u([A-Fa-f0-9]{4})")
# http://effbot.org/zone/re-sub.htm#unescape-html
##
# Removes HTML or XML character references and entities from a text string.
#
# Numeric references (decimal "&#8230;", hex "&#x2026;" / "&#X2026;") and
# named entities ("&hellip;") are replaced by the character they denote.
# Anything that cannot be resolved (unknown name, malformed digits, code
# point out of range) is left in the text unchanged.
#
# @param text The HTML (or XML) source text.
# @return The plain text, as a Unicode string, if necessary.
def unescape_html_entities(text):
    # Imported here so the function does not rely on a module-level import;
    # the module is named html.entities on Python 3.
    try:
        from htmlentitydefs import name2codepoint
    except ImportError:
        from html.entities import name2codepoint
    try:
        to_char = unichr
    except NameError:
        to_char = chr

    def fixup(m):
        entity = m.group(0)
        if entity.startswith("&#"):
            # character reference: an x/X marker selects hex, else decimal
            if entity[2:3] in ("x", "X"):
                digits, base = entity[3:-1], 16
            else:
                digits, base = entity[2:-1], 10
            try:
                return to_char(int(digits, base))
            except (ValueError, OverflowError):
                # Malformed digits raise ValueError; an absurdly large
                # number raises OverflowError.  Either way: leave as is.
                return entity
        # named entity
        try:
            return to_char(name2codepoint[entity[1:-1]])
        except KeyError:
            return entity  # unknown name, leave as is

    return re.sub(r"&#?\w+;", fixup, text)
+
def gmail_jshtml_str_parse(s, markup=False):
# Replace \uxxxx escapes
parsed_str = UNICHR_REPLACE.sub(replace_chr, s)
-
+ parsed_str = unescape_html_entities(s)
+
# At this point, we have a Python unicode string which *should* hold
# an XML fragment. Convert that fragment into a document string.
pystr = "<html>" + parsed_str + "</html>"
@@ -240,6 +270,7 @@
libbig.show_url("http://mail.google.com/mail")
if __name__ == '__main__':
+ assert unescape_html_entities("foo&hellip;bar") == "foo…bar"
# We want to keep bold tags
assert gmail_jshtml_str_parse(r'test \u003cb\>hi\u003c/b\> moo', True) == 'test <b>hi</b> moo'
# Strip unknown tag "A"
[Date Prev][Date Next] [Thread Prev][Thread Next] [Thread Index] [Date Index] [Author Index]