r3950 - in trunk/birnet: . tests



Author: timj
Date: 2006-10-09 17:08:01 -0400 (Mon, 09 Oct 2006)
New Revision: 3950

Added:
   trunk/birnet/birnetutf8.cc
   trunk/birnet/birnetutf8.hh
   trunk/birnet/tests/strings.cc
Modified:
   trunk/birnet/ChangeLog
   trunk/birnet/Makefile.am
   trunk/birnet/birnet.hh
   trunk/birnet/tests/Makefile.am
Log:
Mon Oct  9 23:00:39 2006  Tim Janik  <timj gtk org>

        * birnetutf8.hh, birnetutf8.cc: wrap GLib isalnum() and friends to 
        classify unichar characters. provide inline functions to increment
        and decrement positions in UTF-8 strings.

        * tests/strings.cc: new test program to test birnetutf8.hh functions.




Modified: trunk/birnet/ChangeLog
===================================================================
--- trunk/birnet/ChangeLog	2006-10-09 21:05:43 UTC (rev 3949)
+++ trunk/birnet/ChangeLog	2006-10-09 21:08:01 UTC (rev 3950)
@@ -1,3 +1,11 @@
+Mon Oct  9 23:00:39 2006  Tim Janik  <timj gtk org>
+
+	* birnetutf8.hh, birnetutf8.cc: wrap GLib isalnum() and friends to 
+	classify unichar characters. provide inline functions to increment
+	and decrement positions in UTF-8 strings.
+
+	* tests/strings.cc: new test program to test birnetutf8.hh functions.
+
 Sun Oct  8 16:20:18 2006  Tim Janik  <timj gtk org>
 
 	* Makefile.am: ship birnetcdefs.h.

Modified: trunk/birnet/Makefile.am
===================================================================
--- trunk/birnet/Makefile.am	2006-10-09 21:05:43 UTC (rev 3949)
+++ trunk/birnet/Makefile.am	2006-10-09 21:08:01 UTC (rev 3950)
@@ -17,6 +17,7 @@
 	birnetsignal.hh                 \
 	birnettests.h			\
 	birnetthread.hh			\
+	birnetutf8.hh			\
 	birnetutils.hh			\
 )
 birnet_sources = $(strip 		\
@@ -25,6 +26,7 @@
 	birnetmsg.cc			\
 	birnetsignal.cc                 \
 	birnetthread.cc			\
+	birnetutf8.cc			\
 	birnetutils.cc			\
 )
 birnet_private_headers = $(strip 	\

Modified: trunk/birnet/birnet.hh
===================================================================
--- trunk/birnet/birnet.hh	2006-10-09 21:05:43 UTC (rev 3949)
+++ trunk/birnet/birnet.hh	2006-10-09 21:08:01 UTC (rev 3950)
@@ -24,6 +24,7 @@
 
 #include <birnet/birnetmsg.hh>
 
+#include <birnet/birnetutf8.hh>
 #include <birnet/birnetutils.hh>
 #include <birnet/birnetsignal.hh>
 #include <birnet/birnetthread.hh>

Added: trunk/birnet/birnetutf8.cc
===================================================================
--- trunk/birnet/birnetutf8.cc	2006-10-09 21:05:43 UTC (rev 3949)
+++ trunk/birnet/birnetutf8.cc	2006-10-09 21:08:01 UTC (rev 3950)
@@ -0,0 +1,313 @@
+/* BirnetUtf8 - UTF-8 utilities
+ * Copyright (C) 2006 Tim Janik
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.	 See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General
+ * Public License along with this library; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place, Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+#include "birnetutf8.hh"
+
+namespace Birnet {
+namespace Unichar {
+
+/* --- unichar ctype.h equivalents --- */
+bool
+isalnum (unichar uc)
+{
+  return g_unichar_isalnum (uc);
+}
+
+bool
+isalpha (unichar uc)
+{
+  return g_unichar_isalpha (uc);
+}
+
+bool
+iscntrl (unichar uc)
+{
+  return g_unichar_iscntrl (uc);
+}
+
+bool
+isdigit (unichar uc)
+{
+  return g_unichar_isdigit (uc);
+}
+
+int
+digit_value (unichar uc)
+{
+  return g_unichar_digit_value (uc);
+}
+
+bool
+isgraph (unichar uc)
+{
+  return g_unichar_isgraph (uc);
+}
+
+bool
+islower (unichar uc)
+{
+  return g_unichar_islower (uc);
+}
+
+unichar
+tolower (unichar uc)
+{
+  return g_unichar_tolower (uc);
+}
+
+bool
+isprint (unichar uc)
+{
+  return g_unichar_isprint (uc);
+}
+
+bool
+ispunct (unichar uc)
+{
+  return g_unichar_ispunct (uc);
+}
+
+bool
+isspace (unichar uc)
+{
+  return g_unichar_isspace (uc);
+}
+
+bool
+isupper (unichar uc)
+{
+  return g_unichar_isupper (uc);
+}
+
+unichar
+toupper (unichar uc)
+{
+  return g_unichar_toupper (uc);
+}
+
+bool
+isxdigit (unichar uc)
+{
+  return g_unichar_isxdigit (uc);
+}
+
+int
+xdigit_value (unichar uc)
+{
+  return g_unichar_xdigit_value (uc);
+}
+
+bool
+istitle (unichar uc)
+{
+  return g_unichar_istitle (uc);
+}
+
+unichar
+totitle (unichar uc)
+{
+  return g_unichar_totitle (uc);
+}
+
+bool
+isdefined (unichar uc)
+{
+  return g_unichar_isdefined (uc);
+}
+
+bool
+iswide (unichar uc)
+{
+  return g_unichar_iswide (uc);
+}
+
+bool
+iswide_cjk (unichar uc) /* wide-in-CJK-context test, forwarded to GLib where the function exists */
+{
+#if GLIB_CHECK_VERSION (2, 12, 0) /* g_unichar_iswide_cjk() is documented as new in GLib 2.12 */
+  return g_unichar_iswide_cjk (uc);
+#else
+  return false; /* older GLib: nothing to forward to, report "not wide" */
+#endif
+}
+
+Type
+get_type (unichar uc)
+{
+  return Type (g_unichar_type (uc));
+}
+
+BreakType
+get_break (unichar uc)
+{
+  return BreakType (g_unichar_break_type (uc));
+}
+
+/* --- ensure castable Birnet::Unichar::Type --- */
+BIRNET_STATIC_ASSERT (Unichar::CONTROL == (int) G_UNICODE_CONTROL);
+BIRNET_STATIC_ASSERT (Unichar::FORMAT == (int) G_UNICODE_FORMAT);
+BIRNET_STATIC_ASSERT (Unichar::UNASSIGNED == (int) G_UNICODE_UNASSIGNED);
+BIRNET_STATIC_ASSERT (Unichar::PRIVATE_USE == (int) G_UNICODE_PRIVATE_USE);
+BIRNET_STATIC_ASSERT (Unichar::SURROGATE == (int) G_UNICODE_SURROGATE);
+BIRNET_STATIC_ASSERT (Unichar::LOWERCASE_LETTER == (int) G_UNICODE_LOWERCASE_LETTER);
+BIRNET_STATIC_ASSERT (Unichar::MODIFIER_LETTER == (int) G_UNICODE_MODIFIER_LETTER);
+BIRNET_STATIC_ASSERT (Unichar::OTHER_LETTER == (int) G_UNICODE_OTHER_LETTER);
+BIRNET_STATIC_ASSERT (Unichar::TITLECASE_LETTER == (int) G_UNICODE_TITLECASE_LETTER);
+BIRNET_STATIC_ASSERT (Unichar::UPPERCASE_LETTER == (int) G_UNICODE_UPPERCASE_LETTER);
+BIRNET_STATIC_ASSERT (Unichar::COMBINING_MARK == (int) G_UNICODE_COMBINING_MARK);
+BIRNET_STATIC_ASSERT (Unichar::ENCLOSING_MARK == (int) G_UNICODE_ENCLOSING_MARK);
+BIRNET_STATIC_ASSERT (Unichar::NON_SPACING_MARK == (int) G_UNICODE_NON_SPACING_MARK);
+BIRNET_STATIC_ASSERT (Unichar::DECIMAL_NUMBER == (int) G_UNICODE_DECIMAL_NUMBER);
+BIRNET_STATIC_ASSERT (Unichar::LETTER_NUMBER == (int) G_UNICODE_LETTER_NUMBER);
+BIRNET_STATIC_ASSERT (Unichar::OTHER_NUMBER == (int) G_UNICODE_OTHER_NUMBER);
+BIRNET_STATIC_ASSERT (Unichar::CONNECT_PUNCTUATION == (int) G_UNICODE_CONNECT_PUNCTUATION);
+BIRNET_STATIC_ASSERT (Unichar::DASH_PUNCTUATION == (int) G_UNICODE_DASH_PUNCTUATION);
+BIRNET_STATIC_ASSERT (Unichar::CLOSE_PUNCTUATION == (int) G_UNICODE_CLOSE_PUNCTUATION);
+BIRNET_STATIC_ASSERT (Unichar::FINAL_PUNCTUATION == (int) G_UNICODE_FINAL_PUNCTUATION);
+BIRNET_STATIC_ASSERT (Unichar::INITIAL_PUNCTUATION == (int) G_UNICODE_INITIAL_PUNCTUATION);
+BIRNET_STATIC_ASSERT (Unichar::OTHER_PUNCTUATION == (int) G_UNICODE_OTHER_PUNCTUATION);
+BIRNET_STATIC_ASSERT (Unichar::OPEN_PUNCTUATION == (int) G_UNICODE_OPEN_PUNCTUATION);
+BIRNET_STATIC_ASSERT (Unichar::CURRENCY_SYMBOL == (int) G_UNICODE_CURRENCY_SYMBOL);
+BIRNET_STATIC_ASSERT (Unichar::MODIFIER_SYMBOL == (int) G_UNICODE_MODIFIER_SYMBOL);
+BIRNET_STATIC_ASSERT (Unichar::MATH_SYMBOL == (int) G_UNICODE_MATH_SYMBOL);
+BIRNET_STATIC_ASSERT (Unichar::OTHER_SYMBOL == (int) G_UNICODE_OTHER_SYMBOL);
+BIRNET_STATIC_ASSERT (Unichar::LINE_SEPARATOR == (int) G_UNICODE_LINE_SEPARATOR);
+BIRNET_STATIC_ASSERT (Unichar::PARAGRAPH_SEPARATOR == (int) G_UNICODE_PARAGRAPH_SEPARATOR);
+BIRNET_STATIC_ASSERT (Unichar::SPACE_SEPARATOR == (int) G_UNICODE_SPACE_SEPARATOR);
+
+/* --- ensure castable Birnet::Unichar::BreakType --- */
+BIRNET_STATIC_ASSERT (Unichar::BREAK_MANDATORY == (int) G_UNICODE_BREAK_MANDATORY);
+BIRNET_STATIC_ASSERT (Unichar::BREAK_CARRIAGE_RETURN == (int) G_UNICODE_BREAK_CARRIAGE_RETURN);
+BIRNET_STATIC_ASSERT (Unichar::BREAK_LINE_FEED == (int) G_UNICODE_BREAK_LINE_FEED);
+BIRNET_STATIC_ASSERT (Unichar::BREAK_COMBINING_MARK == (int) G_UNICODE_BREAK_COMBINING_MARK);
+BIRNET_STATIC_ASSERT (Unichar::BREAK_SURROGATE == (int) G_UNICODE_BREAK_SURROGATE);
+BIRNET_STATIC_ASSERT (Unichar::BREAK_ZERO_WIDTH_SPACE == (int) G_UNICODE_BREAK_ZERO_WIDTH_SPACE);
+BIRNET_STATIC_ASSERT (Unichar::BREAK_INSEPARABLE == (int) G_UNICODE_BREAK_INSEPARABLE);
+BIRNET_STATIC_ASSERT (Unichar::BREAK_NON_BREAKING_GLUE == (int) G_UNICODE_BREAK_NON_BREAKING_GLUE);
+BIRNET_STATIC_ASSERT (Unichar::BREAK_CONTINGENT == (int) G_UNICODE_BREAK_CONTINGENT);
+BIRNET_STATIC_ASSERT (Unichar::BREAK_SPACE == (int) G_UNICODE_BREAK_SPACE);
+BIRNET_STATIC_ASSERT (Unichar::BREAK_AFTER == (int) G_UNICODE_BREAK_AFTER);
+BIRNET_STATIC_ASSERT (Unichar::BREAK_BEFORE == (int) G_UNICODE_BREAK_BEFORE);
+BIRNET_STATIC_ASSERT (Unichar::BREAK_BEFORE_AND_AFTER == (int) G_UNICODE_BREAK_BEFORE_AND_AFTER);
+BIRNET_STATIC_ASSERT (Unichar::BREAK_HYPHEN == (int) G_UNICODE_BREAK_HYPHEN);
+BIRNET_STATIC_ASSERT (Unichar::BREAK_NON_STARTER == (int) G_UNICODE_BREAK_NON_STARTER);
+BIRNET_STATIC_ASSERT (Unichar::BREAK_OPEN_PUNCTUATION == (int) G_UNICODE_BREAK_OPEN_PUNCTUATION);
+BIRNET_STATIC_ASSERT (Unichar::BREAK_CLOSE_PUNCTUATION == (int) G_UNICODE_BREAK_CLOSE_PUNCTUATION);
+BIRNET_STATIC_ASSERT (Unichar::BREAK_QUOTATION == (int) G_UNICODE_BREAK_QUOTATION);
+BIRNET_STATIC_ASSERT (Unichar::BREAK_EXCLAMATION == (int) G_UNICODE_BREAK_EXCLAMATION);
+BIRNET_STATIC_ASSERT (Unichar::BREAK_IDEOGRAPHIC == (int) G_UNICODE_BREAK_IDEOGRAPHIC);
+BIRNET_STATIC_ASSERT (Unichar::BREAK_NUMERIC == (int) G_UNICODE_BREAK_NUMERIC);
+BIRNET_STATIC_ASSERT (Unichar::BREAK_INFIX_SEPARATOR == (int) G_UNICODE_BREAK_INFIX_SEPARATOR);
+BIRNET_STATIC_ASSERT (Unichar::BREAK_SYMBOL == (int) G_UNICODE_BREAK_SYMBOL);
+BIRNET_STATIC_ASSERT (Unichar::BREAK_ALPHABETIC == (int) G_UNICODE_BREAK_ALPHABETIC);
+BIRNET_STATIC_ASSERT (Unichar::BREAK_PREFIX == (int) G_UNICODE_BREAK_PREFIX);
+BIRNET_STATIC_ASSERT (Unichar::BREAK_POSTFIX == (int) G_UNICODE_BREAK_POSTFIX);
+BIRNET_STATIC_ASSERT (Unichar::BREAK_COMPLEX_CONTEXT == (int) G_UNICODE_BREAK_COMPLEX_CONTEXT);
+BIRNET_STATIC_ASSERT (Unichar::BREAK_AMBIGUOUS == (int) G_UNICODE_BREAK_AMBIGUOUS);
+BIRNET_STATIC_ASSERT (Unichar::BREAK_UNKNOWN == (int) G_UNICODE_BREAK_UNKNOWN);
+BIRNET_STATIC_ASSERT (Unichar::BREAK_NEXT_LINE == (int) G_UNICODE_BREAK_NEXT_LINE);
+BIRNET_STATIC_ASSERT (Unichar::BREAK_WORD_JOINER == (int) G_UNICODE_BREAK_WORD_JOINER);
+#if GLIB_CHECK_VERSION (2, 10, 0)
+BIRNET_STATIC_ASSERT (Unichar::BREAK_HANGUL_L_JAMO == (int) G_UNICODE_BREAK_HANGUL_L_JAMO);
+BIRNET_STATIC_ASSERT (Unichar::BREAK_HANGUL_V_JAMO == (int) G_UNICODE_BREAK_HANGUL_V_JAMO);
+BIRNET_STATIC_ASSERT (Unichar::BREAK_HANGUL_T_JAMO == (int) G_UNICODE_BREAK_HANGUL_T_JAMO);
+BIRNET_STATIC_ASSERT (Unichar::BREAK_HANGUL_LV_SYLLABLE == (int) G_UNICODE_BREAK_HANGUL_LV_SYLLABLE);
+BIRNET_STATIC_ASSERT (Unichar::BREAK_HANGUL_LVT_SYLLABLE == (int) G_UNICODE_BREAK_HANGUL_LVT_SYLLABLE);
+#endif
+} // Unichar
+
+/* --- UTF-8 movement --- */
+const int8 utf8_skip_table[256] = {
+  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+  2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
+  3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,5,5,5,5,6,6,1,1
+};
+
+static inline const int8
+utf8_char_length (const uint8 c)
+{
+  return c < 0xfe ? utf8_skip_table[c] : -1; /* 0xfe and 0xff yield -1; every other byte yields its utf8_skip_table entry */
+}
+static inline const uint8
+utf8_length_bits (const uint8 l)
+{
+  const uint length_bits[] = { 0x00, 0x00, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc, }; /* indexed by sequence length 0..6 */
+  return l <= 6 ? length_bits[l] : 0; /* lengths beyond the table yield 0 */
+}
+
+static inline const uint8
+utf8_char_mask (const uint8 c)
+{
+  const uint8 char_masks[8] = { 0x00, 0x7f, 0x1f, 0x0f, 0x07, 0x03, 0x01, 0x00 }; /* indexed by the utf8_skip_table length of c */
+  return char_masks[utf8_skip_table[c]]; /* mask selecting the low bits of c for that length */
+}
+
+static inline uint8
+utf8_length_from_unichar (unichar uc)
+{
+  uint8 l = 1; /* start at one byte; each threshold reached below adds one more */
+  l += uc >= 0x00000080; /* 2 */
+  l += uc >= 0x00000800; /* 3 */
+  l += uc >= 0x00010000; /* 4 */
+  l += uc >= 0x00200000; /* 5 */
+  l += uc >= 0x04000000; /* 6 */
+  return l; /* result is in 1..6 */
+}
+
+unichar
+utf8_to_unichar (const char *str) /* decode the sequence at str; 0xffffffff signals malformed input */
+{
+  const int clen = utf8_char_length (*str); /* signed on purpose: the -1 for 0xfe/0xff must survive for the check below */
+  const uint8 mask = utf8_char_mask (*str);
+  if (clen < 1)
+    return 0xffffffff; /* byte cannot start a sequence */
+  unichar uc = *str & mask; /* payload bits of the first byte */
+  for (int i = 1; i < clen; i++)
+    {
+      const uint8 c = str[i];
+      if ((c & 0xc0) != 0x80)
+        return 0xffffffff; /* not a 10xxxxxx continuation byte (a terminating NUL lands here too) */
+      uc = (uc << 6) + (c & 0x3f); /* append six payload bits */
+    }
+  return uc;
+}
+
+int
+utf8_from_unichar (unichar uc,
+                   char    str[8])
+{
+  const int l = utf8_length_from_unichar (uc);
+  if (!str)
+    return l; /* NULL buffer: only report the encoded length */
+  uint i = l;
+  str[i] = 0; /* NUL-terminate right after the encoded bytes */
+  while (--i) /* fill bytes l-1 down to 1 */
+    {
+      str[i] = (uc & 0x3f) | 0x80; /* low six bits tagged as 10xxxxxx */
+      uc >>= 6;
+    }
+  str[i] = uc | utf8_length_bits (l); /* i == 0 */
+  return l;
+}
+
+} // Birnet

Added: trunk/birnet/birnetutf8.hh
===================================================================
--- trunk/birnet/birnetutf8.hh	2006-10-09 21:05:43 UTC (rev 3949)
+++ trunk/birnet/birnetutf8.hh	2006-10-09 21:08:01 UTC (rev 3950)
@@ -0,0 +1,167 @@
+/* BirnetUtf8 - UTF-8 utilities
+ * Copyright (C) 2006 Tim Janik
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General
+ * Public License along with this library; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place, Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+#ifndef __BIRNET_UTF8_HH__
+#define __BIRNET_UTF8_HH__
+
+#include <birnet/birnetutils.hh>
+
+namespace Birnet {
+
+namespace Unichar {
+bool    isalnum      (unichar uc) BIRNET_CONST;
+bool    isalpha      (unichar uc) BIRNET_CONST;
+bool    iscntrl      (unichar uc) BIRNET_CONST;
+bool    isdigit      (unichar uc) BIRNET_CONST;
+int     digit_value  (unichar uc) BIRNET_CONST;
+bool    isgraph      (unichar uc) BIRNET_CONST;
+bool    islower      (unichar uc) BIRNET_CONST;
+unichar tolower      (unichar uc) BIRNET_CONST;
+bool    isprint      (unichar uc) BIRNET_CONST;
+bool    ispunct      (unichar uc) BIRNET_CONST;
+bool    isspace      (unichar uc) BIRNET_CONST;
+bool    isupper      (unichar uc) BIRNET_CONST;
+unichar toupper      (unichar uc) BIRNET_CONST;
+bool    isxdigit     (unichar uc) BIRNET_CONST;
+int     xdigit_value (unichar uc) BIRNET_CONST;
+bool    istitle      (unichar uc) BIRNET_CONST;
+unichar totitle      (unichar uc) BIRNET_CONST;
+bool    isdefined    (unichar uc) BIRNET_CONST;
+bool    iswide       (unichar uc) BIRNET_CONST;
+bool    iswide_cjk   (unichar uc) BIRNET_CONST;
+typedef enum {
+  CONTROL,              FORMAT,                 UNASSIGNED,
+  PRIVATE_USE,          SURROGATE,              LOWERCASE_LETTER,
+  MODIFIER_LETTER,      OTHER_LETTER,           TITLECASE_LETTER,
+  UPPERCASE_LETTER,     COMBINING_MARK,         ENCLOSING_MARK,
+  NON_SPACING_MARK,     DECIMAL_NUMBER,         LETTER_NUMBER,
+  OTHER_NUMBER,         CONNECT_PUNCTUATION,    DASH_PUNCTUATION,
+  CLOSE_PUNCTUATION,    FINAL_PUNCTUATION,      INITIAL_PUNCTUATION,
+  OTHER_PUNCTUATION,    OPEN_PUNCTUATION,       CURRENCY_SYMBOL,
+  MODIFIER_SYMBOL,      MATH_SYMBOL,            OTHER_SYMBOL,
+  LINE_SEPARATOR,       PARAGRAPH_SEPARATOR,    SPACE_SEPARATOR
+} Type;
+Type    get_type     (unichar uc) BIRNET_CONST;
+typedef enum {
+  BREAK_MANDATORY,        BREAK_CARRIAGE_RETURN,    BREAK_LINE_FEED,
+  BREAK_COMBINING_MARK,   BREAK_SURROGATE,          BREAK_ZERO_WIDTH_SPACE,
+  BREAK_INSEPARABLE,      BREAK_NON_BREAKING_GLUE,  BREAK_CONTINGENT,
+  BREAK_SPACE,            BREAK_AFTER,              BREAK_BEFORE,
+  BREAK_BEFORE_AND_AFTER, BREAK_HYPHEN,             BREAK_NON_STARTER,
+  BREAK_OPEN_PUNCTUATION, BREAK_CLOSE_PUNCTUATION,  BREAK_QUOTATION,
+  BREAK_EXCLAMATION,      BREAK_IDEOGRAPHIC,        BREAK_NUMERIC,
+  BREAK_INFIX_SEPARATOR,  BREAK_SYMBOL,             BREAK_ALPHABETIC,
+  BREAK_PREFIX,           BREAK_POSTFIX,            BREAK_COMPLEX_CONTEXT,
+  BREAK_AMBIGUOUS,        BREAK_UNKNOWN,            BREAK_NEXT_LINE,
+  BREAK_WORD_JOINER,      BREAK_HANGUL_L_JAMO,      BREAK_HANGUL_V_JAMO,
+  BREAK_HANGUL_T_JAMO,    BREAK_HANGUL_LV_SYLLABLE, BREAK_HANGUL_LVT_SYLLABLE
+} BreakType;
+BreakType get_break  (unichar uc) BIRNET_CONST;
+
+} // Unichar
+
+/* --- UTF-8 movement --- */
+inline const char*    utf8_next         (const char     *c);
+inline char*          utf8_next         (char           *c);
+inline const char*    utf8_prev         (const char     *c);
+inline char*          utf8_prev         (char           *c);
+inline const char*    utf8_find_next    (const char     *c,
+                                         const char     *bound = NULL);
+inline char*          utf8_find_next    (char           *c,
+                                         const char     *bound = NULL);
+inline const char*    utf8_find_prev    (const char     *c,
+                                         const char     *start = NULL);
+inline char*          utf8_find_prev    (char           *c,
+                                         const char     *start = NULL);
+unichar               utf8_to_unichar   (const char     *str);
+int                   utf8_from_unichar (unichar         uc,
+                                         char            str[8]);
+
+/* --- implementation bits --- */
+extern const int8 utf8_skip_table[256];
+
+inline const char*
+utf8_next (const char *c)
+{
+  return c + utf8_skip_table[(uint8) *c]; /* advance by the table entry for the byte at c; the uint8 cast keeps the index in 0..255 */
+}
+
+inline char*
+utf8_next (char *c)
+{
+  return c + utf8_skip_table[(uint8) *c];
+}
+
+inline const char*
+utf8_prev (const char *c)
+{
+  do
+    c--;
+  while ((*c & 0xc0) == 0x80); /* keep stepping back over 10xxxxxx bytes; nothing here checks for the start of the buffer */
+  return c;
+}
+
+inline char*
+utf8_prev (char *c)
+{
+  do
+    c--;
+  while ((*c & 0xc0) == 0x80);
+  return c;
+}
+
+inline const char*
+utf8_find_next (const char *c,
+                const char *bound)
+{
+  if (*c) /* a NUL byte at c is not stepped over */
+    do
+      c++;
+    while ((!bound || c < bound) && (*c & 0xc0) == 0x80); /* skip 10xxxxxx bytes without reading at or past bound */
+  return !bound || c < bound ? c : NULL; /* NULL once a non-NULL bound has been reached */
+}
+
+inline char*
+utf8_find_next (char       *c,
+                const char *bound)
+{
+  return const_cast<char*> (utf8_find_next (const_cast<const char*> (c), bound));
+}
+
+inline const char*
+utf8_find_prev (const char *c,
+                const char *start)
+{
+  do
+    c--;
+  while ((!start || c >= start) && (*c & 0xc0) == 0x80); /* NULL start means unbounded; never read before a non-NULL start */
+  return !start || c >= start ? c : NULL; /* NULL when the scan moved before start */
+}
+
+inline char*
+utf8_find_prev (char       *c,
+                const char *start)
+{
+  return const_cast<char*> (utf8_find_prev (const_cast<const char*> (c), start));
+}
+
+
+} // Birnet
+
+#endif /* __BIRNET_UTF8_HH__ */
+/* vim:set ts=8 sts=2 sw=2: */

Modified: trunk/birnet/tests/Makefile.am
===================================================================
--- trunk/birnet/tests/Makefile.am	2006-10-09 21:05:43 UTC (rev 3949)
+++ trunk/birnet/tests/Makefile.am	2006-10-09 21:08:01 UTC (rev 3950)
@@ -7,16 +7,25 @@
 INCLUDES       += -I$(top_srcdir) -I$(top_builddir) -I. $(BIRNET_CFLAGS)
 DEFS	       += -DG_LOG_DOMAIN='"$(basename $(@F))"' -DPARANOID -DG_DISABLE_CONST_RETURNS
 
-TESTS 		 = infotest signal sorting datalist threads
+TESTS 		 = 
 noinst_PROGRAMS  = $(TESTS)
 progs_ldadd 	 = $(top_builddir)/birnet/libbirnet.o $(BIRNET_LIBS) -lm
+
+TESTS		+= infotest
 infotest_SOURCES = infotest.cc
 infotest_LDADD	 = $(progs_ldadd)
+TESTS		+= strings
+strings_SOURCES  = strings.cc
+strings_LDADD	 = $(progs_ldadd)
+TESTS		+= threads
 threads_SOURCES	 = threads.cc
 threads_LDADD	 = $(progs_ldadd)
+TESTS		+= signal
 signal_SOURCES	 = signal.cc
 signal_LDADD	 = $(progs_ldadd)
+TESTS		+= sorting
 sorting_SOURCES  = sorting.cc
 sorting_LDADD	 = $(progs_ldadd)
+TESTS		+= datalist
 datalist_SOURCES = datalist.cc
 datalist_LDADD	 = $(progs_ldadd)

Added: trunk/birnet/tests/strings.cc
===================================================================
--- trunk/birnet/tests/strings.cc	2006-10-09 21:05:43 UTC (rev 3949)
+++ trunk/birnet/tests/strings.cc	2006-10-09 21:08:01 UTC (rev 3950)
@@ -0,0 +1,211 @@
+/* Birnet
+ * Copyright (C) 2006 Tim Janik
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General
+ * Public License along with this library; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place, Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+//#define TEST_VERBOSE
+#include <birnet/birnettests.h>
+using namespace Birnet;
+
+namespace {
+using namespace Birnet;
+
+static void
+random_tf8_and_unichar_test (void)
+{
+  TSTART ("utf8<->unichar");
+  const uint count = 1000000;
+  for (uint i = 0; i < count; i++)
+    {
+      if (i % 20000 == 0)
+        TOK();
+      unichar nc, uc = rand() % (0x100 << (i % 24));
+      if (!uc)
+        continue;
+      char buffer[8], gstr[10] = { 0, };
+      int l;
+
+      l = utf8_from_unichar (uc, buffer);
+      TCHECK (l > 0);
+      TCHECK (l <= 7);
+      TCHECK (buffer[l] == 0);
+      TCHECK (l == g_unichar_to_utf8 (uc, gstr));
+      TCHECK (strcmp (gstr, buffer) == 0);
+      nc = utf8_to_unichar (buffer);
+      TCHECK (nc == g_utf8_get_char (buffer));
+      TCHECK (uc == nc);
+      TCHECK (l == utf8_from_unichar (uc, NULL));
+      char *p1 = utf8_next (buffer);
+      TCHECK (p1 == buffer + l);
+      char *p2 = utf8_prev (p1);
+      TCHECK (p2 == buffer);
+
+      char cbuffer[1024];
+      snprintf (cbuffer, 1024, "x%sy7", buffer);
+      char *cur = cbuffer, *pn, *gn, *pp;
+      /* x */
+      pn = utf8_find_next (cur);
+      TCHECK (pn == cur + 1);
+      gn = g_utf8_find_next_char (cur, NULL);
+      TCHECK (pn == gn);
+      pp = utf8_find_prev (pn, cbuffer);
+      TCHECK (pp == cur);
+      /* random unichar */
+      cur = pn;
+      pn = utf8_find_next (cur);
+      TCHECK (pn == cur + l);
+      gn = g_utf8_find_next_char (cur, NULL);
+      TCHECK (pn == gn);
+      pp = utf8_find_prev (pn, cbuffer);
+      TCHECK (pp == cur);
+      /* y */
+      cur = pn;
+      pn = utf8_find_next (cur);
+      TCHECK (pn == cur + 1);
+      gn = g_utf8_find_next_char (cur, NULL);
+      TCHECK (pn == gn);
+      pp = utf8_find_prev (pn, cbuffer);
+      TCHECK (pp == cur);
+      /* 7 (last) */
+      cur = pn;
+      pn = utf8_find_next (cur);
+      TCHECK (pn == cur + 1);
+      gn = g_utf8_find_next_char (cur, NULL);
+      TCHECK (pn == gn);
+      pp = utf8_find_prev (pn, cbuffer);
+      TCHECK (pp == cur);
+      /* last with bounds */
+      pn = utf8_find_next (cur, cur + strlen (cur));
+      TCHECK (pn == NULL);
+      gn = g_utf8_find_next_char (cur, cur + strlen (cur));
+      TCHECK (pn == gn);
+      /* first with bounds */
+      pp = utf8_find_prev (cbuffer, cbuffer);
+      TCHECK (pp == NULL);
+    }
+  TDONE();
+}
+
+static void
+random_unichar_test (void) /* compares every Unichar:: wrapper against its g_unichar_*() counterpart on random input */
+{
+  TSTART ("unichar classification");
+  const uint count = 1000000;
+  for (uint i = 0; i < count; i++)
+    {
+      unichar uc = rand() % (0x100 << (i % 24));
+      unichar bc, gc;
+      gboolean gb;
+      bool bb;
+      int bv, gv;
+      if (i % 20000 == 0)
+        TOK();
+
+      bb = Unichar::isalnum (uc);
+      gb = g_unichar_isalnum (uc);
+      TCHECK (bb == gb);
+      bb = Unichar::isalpha (uc);
+      gb = g_unichar_isalpha (uc);
+      TCHECK (bb == gb);
+      bb = Unichar::iscntrl (uc);
+      gb = g_unichar_iscntrl (uc);
+      TCHECK (bb == gb);
+      bb = Unichar::isdigit (uc);
+      gb = g_unichar_isdigit (uc);
+      TCHECK (bb == gb);
+      bv = Unichar::digit_value (uc);
+      gv = g_unichar_digit_value (uc);
+      TCHECK (bv == gv);
+      bv = Unichar::digit_value ('0' + uc % 10);
+      gv = g_unichar_digit_value ('0' + uc % 10);
+      TCHECK (bv == gv);
+      bb = Unichar::isgraph (uc);
+      gb = g_unichar_isgraph (uc);
+      TCHECK (bb == gb); /* check the isgraph results, not the stale digit values */
+      bb = Unichar::islower (uc);
+      gb = g_unichar_islower (uc);
+      TCHECK (bb == gb);
+      bc = Unichar::tolower (uc);
+      gc = g_unichar_tolower (uc);
+      TCHECK (bc == gc);
+      bb = Unichar::isprint (uc);
+      gb = g_unichar_isprint (uc);
+      TCHECK (bb == gb);
+      bb = Unichar::ispunct (uc);
+      gb = g_unichar_ispunct (uc);
+      TCHECK (bb == gb);
+      bb = Unichar::isspace (uc);
+      gb = g_unichar_isspace (uc);
+      TCHECK (bb == gb);
+      bb = Unichar::isupper (uc);
+      gb = g_unichar_isupper (uc);
+      TCHECK (bb == gb);
+      bc = Unichar::toupper (uc);
+      gc = g_unichar_toupper (uc);
+      TCHECK (bc == gc);
+      bb = Unichar::isxdigit (uc);
+      gb = g_unichar_isxdigit (uc);
+      TCHECK (bb == gb);
+      bv = Unichar::xdigit_value (uc);
+      gv = g_unichar_xdigit_value (uc);
+      TCHECK (bv == gv);
+      bv = Unichar::xdigit_value ('0' + uc % 10);
+      gv = g_unichar_xdigit_value ('0' + uc % 10);
+      TCHECK (bv == gv);
+      bv = Unichar::xdigit_value ('a' + uc % 6);
+      gv = g_unichar_xdigit_value ('a' + uc % 6);
+      TCHECK (bv == gv);
+      bv = Unichar::xdigit_value ('A' + uc % 6);
+      gv = g_unichar_xdigit_value ('A' + uc % 6);
+      TCHECK (bv == gv);
+      bb = Unichar::istitle (uc);
+      gb = g_unichar_istitle (uc);
+      TCHECK (bb == gb);
+      bc = Unichar::totitle (uc);
+      gc = g_unichar_totitle (uc);
+      TCHECK (bc == gc);
+      bb = Unichar::isdefined (uc);
+      gb = g_unichar_isdefined (uc);
+      TCHECK (bb == gb);
+      bb = Unichar::iswide (uc);
+      gb = g_unichar_iswide (uc);
+      TCHECK (bb == gb);
+#if GLIB_CHECK_VERSION (2, 12, 0) /* g_unichar_iswide_cjk() is documented as new in GLib 2.12 */
+      bb = Unichar::iswide_cjk (uc);
+      gb = g_unichar_iswide_cjk (uc);
+      TCHECK (bb == gb);
+#endif
+      TCHECK (Unichar::get_type (uc) == (int) g_unichar_type (uc));
+      TCHECK (Unichar::get_break (uc) == (int) g_unichar_break_type (uc));
+    }
+  TDONE();
+}
+
+} // Anon
+
+int
+main (int   argc,
+      char *argv[])
+{
+  birnet_init_test (&argc, &argv);
+
+  random_unichar_test();
+  random_tf8_and_unichar_test();
+  
+  return 0;
+}
+
+/* vim:set ts=8 sts=2 sw=2: */




[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]