bigboard r7294 - trunk/bigboard/stocks/mail

From: walters svn gnome org
To: svn-commits-list gnome org
Subject: bigboard r7294 - trunk/bigboard/stocks/mail
Date: Fri, 11 Apr 2008 22:47:08 +0100 (BST)
Author: walters
Date: Fri Apr 11 22:47:08 2008
New Revision: 7294
URL: http://svn.gnome.org/viewvc/bigboard?rev=7294&view=rev

Log:
526827: Fix GMail escape characters


Modified:
   trunk/bigboard/stocks/mail/MailStock.py

Modified: trunk/bigboard/stocks/mail/MailStock.py
==============================================================================
--- trunk/bigboard/stocks/mail/MailStock.py	(original)
+++ trunk/bigboard/stocks/mail/MailStock.py	Fri Apr 11 22:47:08 2008
@@ -1,7 +1,12 @@
-import logging, re, htmlentitydefs, time, urllib2
+import logging, re, time, urllib2
 
 import gobject, gtk
 import hippo
+import xml.dom.minidom
+from xml.dom.minidom import Node
+from StringIO import StringIO
+
+import simplejson
 
 from bigboard.stock import Stock
 from bigboard.slideout import ThemedSlideout
@@ -16,47 +21,37 @@
 
 _logger = logging.getLogger('bigboard.stocks.MailStock')
 
-def remove_strange_tags(s, markup=False):
-    if "\\u003cb\\>" in s:
-        if markup == True:
-            b = "<b>"
-            e = "</b>"
-        else:
-            b = ""
-            e = ""
-        s = s.replace("\\u003cb\\>", b)
-        s = s.replace("\\u003c/b\\>", e)
-    return s
-
-_CONVERT_ENTITIES_RE = re.compile("&(?:(#[0-9]+)|(#x[0-9A-Fa-f]+)|([A-Za-z]+));")
-
-def _convert_entity(m):
-    try:
-        if m.group(1) is not None:
-            return unichr(int(m.group(1)[1:]))
-        elif m.group(2) is not None:
-            return unichr(int(m.group(2)[2:], 16))
+def replace_chr(m):
+    return unichr(int(m.group(1), 16))
+UNICHR_REPLACE = re.compile(r"\\u([A-F-a-f0-9]{4})")
+
+def gmail_jshtml_str_parse(s, markup=False):
+    s = s.replace(r'\>', '>')
+    s = s.replace(r'\<', '<')
+    parsed_str = UNICHR_REPLACE.sub(replace_chr, s)
+    # At this point, we have a Python unicode string which *should* hold
+    # an XML fragment.  Convert that fragment into a document string.
+    pystr = "<html>" + parsed_str + "</html>"
+    # Parse that document string into a DOM.    
+    dom = xml.dom.minidom.parseString(pystr)
+    textContent = StringIO()
+    # Now we parse the XML, only allowing the bold tag through, and eating everything else
+    def DomToText(node):       
+        if node.nodeType == Node.TEXT_NODE:
+            textContent.write(node.data)
+        if markup and node.nodeType == Node.ELEMENT_NODE and node.nodeName == 'b':
+            in_bold = True
+            textContent.write('<b>')
         else:
-            return unichr(htmlentitydefs.name2codepoint[m.group(3)])
-    except ValueError:
-        return m.group(0)
-    except KeyError:
-        return m.group(0)
-    except OverflowError:
-        return m.group(0)
-
-def convert_entities(s):
-    """Replace standard HTML entities and numeric character references in the string"""
-    return _CONVERT_ENTITIES_RE.sub(_convert_entity, s) 
-
-# assert convert_entities("&amp;") == "&"
-# assert convert_entities("&amp;foo&lt;") == "&foo<"
-# assert convert_entities("&#x41;") == "A"
-# assert convert_entities("&#65;") == "A"
-# assert convert_entities("&zzz_amp;") == "&zzz_amp;" # not something we parse as an entity
-# assert convert_entities("&zzzamp;") == "&zzzamp;" # unknown entity
-# assert convert_entities("&#x1000000;") == "&#x1000000;" # not a unicode character
-# assert convert_entities("&#x1000000000;") == "&#x1000000000;" # overflow
+            in_bold = False             
+        if node.hasChildNodes():
+            for child in node.childNodes:
+                DomToText(child)
+        if in_bold:
+            textContent.write('</b>')
+    DomToText(dom.documentElement)
+    # Return the sanely filtered content
+    return textContent.getvalue()
 
 class LabelSlideout(ThemedSlideout):
     __gsignals__ = {
@@ -94,7 +89,7 @@
         self.__header = Header(topborder=False)
         self.id = thread.id        
 
-        subject = remove_strange_tags(thread.subject)
+        subject = gmail_jshtml_str_parse(thread.subject)
         
         subject_box = hippo.CanvasText(classes='header', text=subject)
         self.__header.append(subject_box, hippo.PACK_EXPAND)
@@ -106,9 +101,8 @@
                 if type(value) is list:
                     s = ", ".join(value)
                 if type(value) is str:
-                    s = remove_strange_tags(value)
-                
-                s = convert_entities(s)
+                    s = gmail_jshtml_str_parse(value)
+
                 box = hippo.CanvasText(text=s, xalign=hippo.ALIGNMENT_START)
                 vbox.append(box)
         
@@ -189,7 +183,7 @@
             for thread in threads:
                 if i >= self.__display_limit: break
                 
-                subject = remove_strange_tags(thread.subject, True)
+                subject = gmail_jshtml_str_parse(thread.subject, True)
                 
                 box = PrelightingCanvasBox()
                 box.connect("button-press-event", self.create_email_slideout, thread)
@@ -246,3 +240,10 @@
     
     def __on_more_button(self):
         libbig.show_url("http://mail.google.com/mail")
+
+if __name__ == '__main__':
+    # We want to keep bold tags
+    assert gmail_jshtml_str_parse(r'test \u003cb\>hi\u003c/b\> moo', True) == 'test <b>hi</b> moo'
+    # Strip unknown tag "A"
+    assert gmail_jshtml_str_parse(r'test \u003ca\>hi\u003c/a\> moo', True) == 'test hi moo'
+    
\ No newline at end of file
[Date Prev][Date Next] [Thread Prev][Thread Next] [Thread Index] [Date Index] [Author Index]