r7152 - bigboard/trunk/bigboard/stocks/mail



Author: otaylor
Date: 2008-01-08 16:47:07 -0600 (Tue, 08 Jan 2008)
New Revision: 7152

Modified:
   bigboard/trunk/bigboard/stocks/mail/MailStock.py
Log:
MailStock.py: Rewrite convert_entities to be more efficient

Modified: bigboard/trunk/bigboard/stocks/mail/MailStock.py
===================================================================
--- bigboard/trunk/bigboard/stocks/mail/MailStock.py	2008-01-08 22:26:17 UTC (rev 7151)
+++ bigboard/trunk/bigboard/stocks/mail/MailStock.py	2008-01-08 22:47:07 UTC (rev 7152)
@@ -27,23 +27,36 @@
         s = s.replace("\\u003c/b\\>", e)
     return s
 
+_CONVERT_ENTITIES_RE = re.compile("&(?:(#[0-9]+)|(#x[0-9A-Fa-f]+)|([A-Za-z]+));")
+
+def _convert_entity(m):
+    try:
+        if m.group(1) is not None:
+            return unichr(int(m.group(1)[1:]))
+        elif m.group(2) is not None:
+            return unichr(int(m.group(2)[2:], 16))
+        else:
+            return unichr(htmlentitydefs.name2codepoint[m.group(3)])
+    except ValueError:
+        return m.group(0)
+    except KeyError:
+        return m.group(0)
+    except OverflowError:
+        return m.group(0)
+
 def convert_entities(s):
-    exp = re.compile("&[#a-zA-Z0-9]*;")
-    for match in exp.finditer(s):
-        if match is not None:
-            html_entity = match.group()
-            try:
-                if html_entity[1] == '#':
-                    entity_num = int(html_entity[2:-1])
-                    replacement_entity = unichr(entity_num)
-                else:
-                    entity_str = html_entity[1:-1]
-                    replacement_entity = unichr(htmlentitydefs.name2codepoint[entity_str])
-                s = s.replace(html_entity, replacement_entity)
-            except KeyError:
-                pass
-    return s
+    """Replace standard HTML entities and numeric character references in the string"""
+    return _CONVERT_ENTITIES_RE.sub(_convert_entity, s) 
 
+# assert convert_entities("&") == "&"
+# assert convert_entities("&amp;foo&lt;") == "&foo<"
+# assert convert_entities("&#x41;") == "A"
+# assert convert_entities("&#65;") == "A"
+# assert convert_entities("&zzz_amp;") == "&zzz_amp;" # not something we parse as an entity
+# assert convert_entities("&zzzamp;") == "&zzzamp;" # unknown entity
+# assert convert_entities("&#x1000000;") == "&#x1000000;" # not a unicode character
+# assert convert_entities("&#x1000000000;") == "&#x1000000000;" # overflow
+
 class LabelSlideout(ThemedSlideout):
     __gsignals__ = {
                     'changed' : (gobject.SIGNAL_RUN_LAST, gobject.TYPE_NONE, (gobject.TYPE_STRING, )),



[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]