[glib] Optimized the overlong sequence check in g_utf8_get_char_extended()



commit c3bc0f4f8fc5125a732f383671ee5bff939423d2
Author: Mikhail Zabaluev <mikhail zabaluev gmail com>
Date:   Thu May 27 14:00:12 2010 -0400

    Optimized the overlong sequence check in g_utf8_get_char_extended()
    
    Rather make it branch to get the due sequence length for the resulting
    character code, we can as well get the minimum code value in the initial
    branching.

 glib/gutf8.c |   26 +++++++++++++++++---------
 1 files changed, 17 insertions(+), 9 deletions(-)
---
diff --git a/glib/gutf8.c b/glib/gutf8.c
index 27b1e4c..d0badbb 100644
--- a/glib/gutf8.c
+++ b/glib/gutf8.c
@@ -704,20 +704,23 @@ g_utf8_strrchr (const char *p,
 
 
 /* Like g_utf8_get_char, but take a maximum length
- * and return (gunichar)-2 on incomplete trailing character
+ * and return (gunichar)-2 on incomplete trailing character;
+ * also check for malformed or overlong sequences
+ * and return (gunichar)-1 in this case.
  */
 static inline gunichar
 g_utf8_get_char_extended (const  gchar *p,
-			  gssize max_len)  
+			  gssize max_len)
 {
   guint i, len;
+  gunichar min_code;
   gunichar wc = (guchar) *p;
 
   if (wc < 0x80)
     {
       return wc;
     }
-  else if (wc < 0xc0)
+  else if (G_UNLIKELY (wc < 0xc0))
     {
       return (gunichar)-1;
     }
@@ -725,33 +728,38 @@ g_utf8_get_char_extended (const  gchar *p,
     {
       len = 2;
       wc &= 0x1f;
+      min_code = 1 << 7;
     }
   else if (wc < 0xf0)
     {
       len = 3;
       wc &= 0x0f;
+      min_code = 1 << 11;
     }
   else if (wc < 0xf8)
     {
       len = 4;
       wc &= 0x07;
+      min_code = 1 << 16;
     }
   else if (wc < 0xfc)
     {
       len = 5;
       wc &= 0x03;
+      min_code = 1 << 21;
     }
   else if (wc < 0xfe)
     {
       len = 6;
       wc &= 0x01;
+      min_code = 1 << 26;
     }
   else
     {
       return (gunichar)-1;
     }
-  
-  if (max_len >= 0 && len > max_len)
+
+  if (G_UNLIKELY (max_len >= 0 && len > max_len))
     {
       for (i = 1; i < max_len; i++)
 	{
@@ -764,8 +772,8 @@ g_utf8_get_char_extended (const  gchar *p,
   for (i = 1; i < len; ++i)
     {
       gunichar ch = ((guchar *)p)[i];
-      
-      if ((ch & 0xc0) != 0x80)
+
+      if (G_UNLIKELY ((ch & 0xc0) != 0x80))
 	{
 	  if (ch)
 	    return (gunichar)-1;
@@ -777,9 +785,9 @@ g_utf8_get_char_extended (const  gchar *p,
       wc |= (ch & 0x3f);
     }
 
-  if (UTF8_LENGTH(wc) != len)
+  if (G_UNLIKELY (wc < min_code))
     return (gunichar)-1;
-  
+
   return wc;
 }
 



[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]