r3954 - in trunk/birnet: . tests



Author: timj
Date: 2006-10-10 17:25:27 -0400 (Tue, 10 Oct 2006)
New Revision: 3954

Modified:
   trunk/birnet/ChangeLog
   trunk/birnet/birnetutf8.cc
   trunk/birnet/birnetutf8.hh
   trunk/birnet/tests/strings.cc
Log:
Tue Oct 10 23:23:11 2006  Tim Janik  <timj gtk org>                                                                                                           
                                                                                                                                                              
        * birnetutf8.hh, birnetutf8.cc: added Unichar::isvalid().                                                                                             
        changed argument order of utf8_find_prev() to match that of
        g_utf8_find_prev_char(), adapted test case. added utf8_validate(),                                                                                    
        wrapping g_utf8_validate().                                                                                                                           
                                                                                                                                                              
        * tests/strings.cc: adapt to utf8_find_prev(). added Unichar::isvalid()                                                                               
        test and utf8_validate() test.                                                                                                                        
                                                                                                                                                              



Modified: trunk/birnet/ChangeLog
===================================================================
--- trunk/birnet/ChangeLog	2006-10-09 23:35:14 UTC (rev 3953)
+++ trunk/birnet/ChangeLog	2006-10-10 21:25:27 UTC (rev 3954)
@@ -1,3 +1,13 @@
+Tue Oct 10 23:23:11 2006  Tim Janik  <timj gtk org>
+
+	* birnetutf8.hh, birnetutf8.cc: added Unichar::isvalid().
+	changed argument order of utf8_find_prev() to match that of 
+	g_utf8_find_prev_char(), adapted test case. added utf8_validate(),
+	wrapping g_utf8_validate().
+
+	* tests/strings.cc: adapt to utf8_find_prev(). added Unichar::isvalid()
+	test and utf8_validate() test.
+
 Tue Oct 10 01:34:05 2006  Tim Janik  <timj gtk org>
 
 	* birnetcdefs.h: provide extra prototype for extern inline functions

Modified: trunk/birnet/birnetutf8.cc
===================================================================
--- trunk/birnet/birnetutf8.cc	2006-10-09 23:35:14 UTC (rev 3953)
+++ trunk/birnet/birnetutf8.cc	2006-10-10 21:25:27 UTC (rev 3954)
@@ -311,4 +311,17 @@
   return l;
 }
 
+bool
+utf8_validate (const String   &strng,
+               int            *bound)
+{
+  const char *c = &strng[0];
+  size_t l = strng.size();
+  const gchar *end = NULL;
+  gboolean gb = g_utf8_validate (c, l, &end);
+  if (bound)
+    *bound = !gb ? end - c : -1;
+  return gb != false;
+}
+
 } // Birnet

Modified: trunk/birnet/birnetutf8.hh
===================================================================
--- trunk/birnet/birnetutf8.hh	2006-10-09 23:35:14 UTC (rev 3953)
+++ trunk/birnet/birnetutf8.hh	2006-10-10 21:25:27 UTC (rev 3954)
@@ -24,26 +24,27 @@
 namespace Birnet {
 
 namespace Unichar {
-bool    isalnum      (unichar uc) BIRNET_CONST;
-bool    isalpha      (unichar uc) BIRNET_CONST;
-bool    iscntrl      (unichar uc) BIRNET_CONST;
-bool    isdigit      (unichar uc) BIRNET_CONST;
-int     digit_value  (unichar uc) BIRNET_CONST;
-bool    isgraph      (unichar uc) BIRNET_CONST;
-bool    islower      (unichar uc) BIRNET_CONST;
-unichar tolower      (unichar uc) BIRNET_CONST;
-bool    isprint      (unichar uc) BIRNET_CONST;
-bool    ispunct      (unichar uc) BIRNET_CONST;
-bool    isspace      (unichar uc) BIRNET_CONST;
-bool    isupper      (unichar uc) BIRNET_CONST;
-unichar toupper      (unichar uc) BIRNET_CONST;
-bool    isxdigit     (unichar uc) BIRNET_CONST;
-int     xdigit_value (unichar uc) BIRNET_CONST;
-bool    istitle      (unichar uc) BIRNET_CONST;
-unichar totitle      (unichar uc) BIRNET_CONST;
-bool    isdefined    (unichar uc) BIRNET_CONST;
-bool    iswide       (unichar uc) BIRNET_CONST;
-bool    iswide_cjk   (unichar uc) BIRNET_CONST;
+inline bool isvalid      (unichar uc) BIRNET_CONST;
+bool        isalnum      (unichar uc) BIRNET_CONST;
+bool        isalpha      (unichar uc) BIRNET_CONST;
+bool        iscntrl      (unichar uc) BIRNET_CONST;
+bool        isdigit      (unichar uc) BIRNET_CONST;
+int         digit_value  (unichar uc) BIRNET_CONST;
+bool        isgraph      (unichar uc) BIRNET_CONST;
+bool        islower      (unichar uc) BIRNET_CONST;
+unichar     tolower      (unichar uc) BIRNET_CONST;
+bool        isprint      (unichar uc) BIRNET_CONST;
+bool        ispunct      (unichar uc) BIRNET_CONST;
+bool        isspace      (unichar uc) BIRNET_CONST;
+bool        isupper      (unichar uc) BIRNET_CONST;
+unichar     toupper      (unichar uc) BIRNET_CONST;
+bool        isxdigit     (unichar uc) BIRNET_CONST;
+int         xdigit_value (unichar uc) BIRNET_CONST;
+bool        istitle      (unichar uc) BIRNET_CONST;
+unichar     totitle      (unichar uc) BIRNET_CONST;
+bool        isdefined    (unichar uc) BIRNET_CONST;
+bool        iswide       (unichar uc) BIRNET_CONST;
+bool        iswide_cjk   (unichar uc) BIRNET_CONST;
 typedef enum {
   CONTROL,              FORMAT,                 UNASSIGNED,
   PRIVATE_USE,          SURROGATE,              LOWERCASE_LETTER,
@@ -82,17 +83,35 @@
 inline char*          utf8_prev         (char           *c);
 inline const char*    utf8_find_next    (const char     *c,
                                          const char     *bound = NULL);
-inline char*          utf8_find_next    (char           *c,
+inline char*          utf8_find_next    (char           *current,
                                          const char     *bound = NULL);
-inline const char*    utf8_find_prev    (const char     *c,
-                                         const char     *start = NULL);
-inline char*          utf8_find_prev    (char           *c,
-                                         const char     *start = NULL);
+inline const char*    utf8_find_prev    (const char     *start,
+                                         const char     *current);
+inline char*          utf8_find_prev    (const char     *start,
+                                         char           *currrent);
 unichar               utf8_to_unichar   (const char     *str);
 int                   utf8_from_unichar (unichar         uc,
                                          char            str[8]);
+bool                  utf8_validate     (const String   &string,
+                                         int            *bound = NULL);
 
 /* --- implementation bits --- */
+namespace Unichar {
+inline bool
+isvalid (unichar uc)
+{
+  if (BIRNET_UNLIKELY (uc > 0xfdcf && uc < 0xfdf0))
+    return false;
+  if (BIRNET_UNLIKELY ((uc & 0xfffe) == 0xfffe))
+    return false;
+  if (BIRNET_UNLIKELY (uc > 0x10ffff))
+    return false;
+  if (BIRNET_UNLIKELY ((uc & 0xfffff800) == 0xd800))
+    return false;
+  return true;
+}
+} // Unichar
+
 extern const int8 utf8_skip_table[256];
 
 inline const char*
@@ -144,20 +163,20 @@
 }
 
 inline const char*
-utf8_find_prev (const char *c,
-                const char *start)
+utf8_find_prev (const char     *start,
+                const char     *current)
 {
   do
-    c--;
-  while (c >= start && (*c & 0xc0) == 0x80);
-  return !start || c >= start ? c : NULL;
+    current--;
+  while (current >= start && (*current & 0xc0) == 0x80);
+  return current >= start ? current : NULL;
 }
 
 inline char*
-utf8_find_prev (char       *c,
-                const char *start)
+utf8_find_prev (const char     *start,
+                char           *current)
 {
-  return const_cast<char*> (utf8_find_prev (const_cast<const char*> (c), start));
+  return const_cast<char*> (utf8_find_prev (start, const_cast<const char*> (current)));
 }
 
 

Modified: trunk/birnet/tests/strings.cc
===================================================================
--- trunk/birnet/tests/strings.cc	2006-10-09 23:35:14 UTC (rev 3953)
+++ trunk/birnet/tests/strings.cc	2006-10-10 21:25:27 UTC (rev 3954)
@@ -61,7 +61,7 @@
       TCHECK (pn == cur + 1);
       gn = g_utf8_find_next_char (cur, NULL);
       TCHECK (pn == gn);
-      pp = utf8_find_prev (pn, cbuffer);
+      pp = utf8_find_prev (cbuffer, pn);
       TCHECK (pp == cur);
       /* random unichar */
       cur = pn;
@@ -69,7 +69,7 @@
       TCHECK (pn == cur + l);
       gn = g_utf8_find_next_char (cur, NULL);
       TCHECK (pn == gn);
-      pp = utf8_find_prev (pn, cbuffer);
+      pp = utf8_find_prev (cbuffer, pn);
       TCHECK (pp == cur);
       /* y */
       cur = pn;
@@ -77,7 +77,7 @@
       TCHECK (pn == cur + 1);
       gn = g_utf8_find_next_char (cur, NULL);
       TCHECK (pn == gn);
-      pp = utf8_find_prev (pn, cbuffer);
+      pp = utf8_find_prev (cbuffer, pn);
       TCHECK (pp == cur);
       /* 7 (last) */
       cur = pn;
@@ -85,7 +85,7 @@
       TCHECK (pn == cur + 1);
       gn = g_utf8_find_next_char (cur, NULL);
       TCHECK (pn == gn);
-      pp = utf8_find_prev (pn, cbuffer);
+      pp = utf8_find_prev (cbuffer, pn);
       TCHECK (pp == cur);
       /* last with bounds */
       pn = utf8_find_next (cur, cur + strlen (cur));
@@ -95,6 +95,20 @@
       /* first with bounds */
       pp = utf8_find_prev (cbuffer, cbuffer);
       TCHECK (pp == NULL);
+
+      /* validate valid UTF-8 */
+      bool bb = utf8_validate (cbuffer);
+      bool gb = g_utf8_validate (cbuffer, -1, NULL);
+      TCHECK (bb == gb);
+      /* validate invalid UTF-8 */
+      cbuffer[rand() % (l + 3)] = rand();
+      const char *gp;
+      int indx;
+      bb = utf8_validate (cbuffer, &indx);
+      gb = g_utf8_validate (cbuffer, -1, &gp);
+      TCHECK (bb == gb);
+      if (!bb)
+        TCHECK (cbuffer + indx == gp);
     }
   TDONE();
 }
@@ -114,6 +128,9 @@
       if (i % 20000 == 0)
         TOK();
 
+      bb = Unichar::isvalid (uc);
+      gb = g_unichar_validate (uc);
+      TCHECK (bb == gb);
       bb = Unichar::isalnum (uc);
       gb = g_unichar_isalnum (uc);
       TCHECK (bb == gb);




[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]