r3950 - in trunk/birnet: . tests
- From: timj svn gnome org
- To: svn-commits-list gnome org
- Subject: r3950 - in trunk/birnet: . tests
- Date: Mon, 9 Oct 2006 17:08:23 -0400 (EDT)
Author: timj
Date: 2006-10-09 17:08:01 -0400 (Mon, 09 Oct 2006)
New Revision: 3950
Added:
trunk/birnet/birnetutf8.cc
trunk/birnet/birnetutf8.hh
trunk/birnet/tests/strings.cc
Modified:
trunk/birnet/ChangeLog
trunk/birnet/Makefile.am
trunk/birnet/birnet.hh
trunk/birnet/tests/Makefile.am
Log:
Mon Oct 9 23:00:39 2006 Tim Janik <timj gtk org>
* birnetutf8.hh, birnetutf8.cc: wrap GLib isalnum() and friends to
classify unichar characters. provide inline functions to increment
and decrement positions in UTF-8 strings.
* tests/strings.cc: new test program to test birnetutf8.hh functions.
Modified: trunk/birnet/ChangeLog
===================================================================
--- trunk/birnet/ChangeLog 2006-10-09 21:05:43 UTC (rev 3949)
+++ trunk/birnet/ChangeLog 2006-10-09 21:08:01 UTC (rev 3950)
@@ -1,3 +1,11 @@
+Mon Oct 9 23:00:39 2006 Tim Janik <timj gtk org>
+
+ * birnetutf8.hh, birnetutf8.cc: wrap GLib isalnum() and friends to
+ classify unichar characters. provide inline functions to increment
+ and decrement positions in UTF-8 strings.
+
+ * tests/strings.cc: new test program to test birnetutf8.hh functions.
+
Sun Oct 8 16:20:18 2006 Tim Janik <timj gtk org>
* Makefile.am: ship birnetcdefs.h.
Modified: trunk/birnet/Makefile.am
===================================================================
--- trunk/birnet/Makefile.am 2006-10-09 21:05:43 UTC (rev 3949)
+++ trunk/birnet/Makefile.am 2006-10-09 21:08:01 UTC (rev 3950)
@@ -17,6 +17,7 @@
birnetsignal.hh \
birnettests.h \
birnetthread.hh \
+ birnetutf8.hh \
birnetutils.hh \
)
birnet_sources = $(strip \
@@ -25,6 +26,7 @@
birnetmsg.cc \
birnetsignal.cc \
birnetthread.cc \
+ birnetutf8.cc \
birnetutils.cc \
)
birnet_private_headers = $(strip \
Modified: trunk/birnet/birnet.hh
===================================================================
--- trunk/birnet/birnet.hh 2006-10-09 21:05:43 UTC (rev 3949)
+++ trunk/birnet/birnet.hh 2006-10-09 21:08:01 UTC (rev 3950)
@@ -24,6 +24,7 @@
#include <birnet/birnetmsg.hh>
+#include <birnet/birnetutf8.hh>
#include <birnet/birnetutils.hh>
#include <birnet/birnetsignal.hh>
#include <birnet/birnetthread.hh>
Added: trunk/birnet/birnetutf8.cc
===================================================================
--- trunk/birnet/birnetutf8.cc 2006-10-09 21:05:43 UTC (rev 3949)
+++ trunk/birnet/birnetutf8.cc 2006-10-09 21:08:01 UTC (rev 3950)
@@ -0,0 +1,313 @@
+/* BirnetUtf8 - UTF-8 utilities
+ * Copyright (C) 2006 Tim Janik
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General
+ * Public License along with this library; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place, Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+#include "birnetutf8.hh"
+
+namespace Birnet {
+namespace Unichar {
+
+/* --- unichar ctype.h equivalents --- */
+/* Alphanumeric test for @uc, answered by g_unichar_isalnum(). */
+bool
+isalnum (unichar uc)
+{
+  return g_unichar_isalnum (uc) != 0;
+}
+
+/* Alphabetic test for @uc, answered by g_unichar_isalpha(). */
+bool
+isalpha (unichar uc)
+{
+  return g_unichar_isalpha (uc) != 0;
+}
+
+/* Control character test for @uc, answered by g_unichar_iscntrl(). */
+bool
+iscntrl (unichar uc)
+{
+  return g_unichar_iscntrl (uc) != 0;
+}
+
+/* Digit test for @uc, answered by g_unichar_isdigit(). */
+bool
+isdigit (unichar uc)
+{
+  return g_unichar_isdigit (uc) != 0;
+}
+
+/* Value of @uc as reported by g_unichar_digit_value(). */
+int
+digit_value (unichar uc)
+{
+  const int value = g_unichar_digit_value (uc);
+  return value;
+}
+
+/* Graphic character test for @uc, answered by g_unichar_isgraph(). */
+bool
+isgraph (unichar uc)
+{
+  return g_unichar_isgraph (uc) != 0;
+}
+
+/* Lowercase test for @uc, answered by g_unichar_islower(). */
+bool
+islower (unichar uc)
+{
+  return g_unichar_islower (uc) != 0;
+}
+
+/* Result of g_unichar_tolower() for @uc. */
+unichar
+tolower (unichar uc)
+{
+  const unichar lowered = g_unichar_tolower (uc);
+  return lowered;
+}
+
+/* Printable character test for @uc, answered by g_unichar_isprint(). */
+bool
+isprint (unichar uc)
+{
+  return g_unichar_isprint (uc) != 0;
+}
+
+/* Punctuation test for @uc, answered by g_unichar_ispunct(). */
+bool
+ispunct (unichar uc)
+{
+  return g_unichar_ispunct (uc) != 0;
+}
+
+/* Whitespace test for @uc, answered by g_unichar_isspace(). */
+bool
+isspace (unichar uc)
+{
+  return g_unichar_isspace (uc) != 0;
+}
+
+/* Uppercase test for @uc, answered by g_unichar_isupper(). */
+bool
+isupper (unichar uc)
+{
+  return g_unichar_isupper (uc) != 0;
+}
+
+/* Result of g_unichar_toupper() for @uc. */
+unichar
+toupper (unichar uc)
+{
+  const unichar raised = g_unichar_toupper (uc);
+  return raised;
+}
+
+/* Hexadecimal digit test for @uc, answered by g_unichar_isxdigit(). */
+bool
+isxdigit (unichar uc)
+{
+  return g_unichar_isxdigit (uc) != 0;
+}
+
+/* Value of @uc as reported by g_unichar_xdigit_value(). */
+int
+xdigit_value (unichar uc)
+{
+  const int value = g_unichar_xdigit_value (uc);
+  return value;
+}
+
+/* Titlecase test for @uc, answered by g_unichar_istitle(). */
+bool
+istitle (unichar uc)
+{
+  return g_unichar_istitle (uc) != 0;
+}
+
+/* Result of g_unichar_totitle() for @uc. */
+unichar
+totitle (unichar uc)
+{
+  const unichar titled = g_unichar_totitle (uc);
+  return titled;
+}
+
+/* Test for @uc answered by g_unichar_isdefined(). */
+bool
+isdefined (unichar uc)
+{
+  return g_unichar_isdefined (uc) != 0;
+}
+
+/* Wide character test for @uc, answered by g_unichar_iswide(). */
+bool
+iswide (unichar uc)
+{
+  return g_unichar_iswide (uc) != 0;
+}
+
+/* Wide character test for @uc, answered by g_unichar_iswide_cjk(); always
+ * false when the GLIB_CHECK_VERSION (2, 10, 0) check fails at build time.
+ */
+bool
+iswide_cjk (unichar uc)
+{
+#if !GLIB_CHECK_VERSION (2, 10, 0)
+  return false;
+#else
+  return g_unichar_iswide_cjk (uc) != 0;
+#endif
+}
+
+/* Convert the g_unichar_type() result for @uc into a Unichar::Type. */
+Type
+get_type (unichar uc)
+{
+  return static_cast<Type> (g_unichar_type (uc));
+}
+
+/* Convert the g_unichar_break_type() result for @uc into a Unichar::BreakType. */
+BreakType
+get_break (unichar uc)
+{
+  return static_cast<BreakType> (g_unichar_break_type (uc));
+}
+
+/* --- ensure castable Birnet::Unichar::Type ---
+ * get_type() turns the g_unichar_type() result into a Unichar::Type with a
+ * plain cast, so every Unichar::Type enumerator is asserted here (via
+ * BIRNET_STATIC_ASSERT) to have the same value as its G_UNICODE_ counterpart.
+ */
+BIRNET_STATIC_ASSERT (Unichar::CONTROL == (int) G_UNICODE_CONTROL);
+BIRNET_STATIC_ASSERT (Unichar::FORMAT == (int) G_UNICODE_FORMAT);
+BIRNET_STATIC_ASSERT (Unichar::UNASSIGNED == (int) G_UNICODE_UNASSIGNED);
+BIRNET_STATIC_ASSERT (Unichar::PRIVATE_USE == (int) G_UNICODE_PRIVATE_USE);
+BIRNET_STATIC_ASSERT (Unichar::SURROGATE == (int) G_UNICODE_SURROGATE);
+BIRNET_STATIC_ASSERT (Unichar::LOWERCASE_LETTER == (int) G_UNICODE_LOWERCASE_LETTER);
+BIRNET_STATIC_ASSERT (Unichar::MODIFIER_LETTER == (int) G_UNICODE_MODIFIER_LETTER);
+BIRNET_STATIC_ASSERT (Unichar::OTHER_LETTER == (int) G_UNICODE_OTHER_LETTER);
+BIRNET_STATIC_ASSERT (Unichar::TITLECASE_LETTER == (int) G_UNICODE_TITLECASE_LETTER);
+BIRNET_STATIC_ASSERT (Unichar::UPPERCASE_LETTER == (int) G_UNICODE_UPPERCASE_LETTER);
+BIRNET_STATIC_ASSERT (Unichar::COMBINING_MARK == (int) G_UNICODE_COMBINING_MARK);
+BIRNET_STATIC_ASSERT (Unichar::ENCLOSING_MARK == (int) G_UNICODE_ENCLOSING_MARK);
+BIRNET_STATIC_ASSERT (Unichar::NON_SPACING_MARK == (int) G_UNICODE_NON_SPACING_MARK);
+BIRNET_STATIC_ASSERT (Unichar::DECIMAL_NUMBER == (int) G_UNICODE_DECIMAL_NUMBER);
+BIRNET_STATIC_ASSERT (Unichar::LETTER_NUMBER == (int) G_UNICODE_LETTER_NUMBER);
+BIRNET_STATIC_ASSERT (Unichar::OTHER_NUMBER == (int) G_UNICODE_OTHER_NUMBER);
+BIRNET_STATIC_ASSERT (Unichar::CONNECT_PUNCTUATION == (int) G_UNICODE_CONNECT_PUNCTUATION);
+BIRNET_STATIC_ASSERT (Unichar::DASH_PUNCTUATION == (int) G_UNICODE_DASH_PUNCTUATION);
+BIRNET_STATIC_ASSERT (Unichar::CLOSE_PUNCTUATION == (int) G_UNICODE_CLOSE_PUNCTUATION);
+BIRNET_STATIC_ASSERT (Unichar::FINAL_PUNCTUATION == (int) G_UNICODE_FINAL_PUNCTUATION);
+BIRNET_STATIC_ASSERT (Unichar::INITIAL_PUNCTUATION == (int) G_UNICODE_INITIAL_PUNCTUATION);
+BIRNET_STATIC_ASSERT (Unichar::OTHER_PUNCTUATION == (int) G_UNICODE_OTHER_PUNCTUATION);
+BIRNET_STATIC_ASSERT (Unichar::OPEN_PUNCTUATION == (int) G_UNICODE_OPEN_PUNCTUATION);
+BIRNET_STATIC_ASSERT (Unichar::CURRENCY_SYMBOL == (int) G_UNICODE_CURRENCY_SYMBOL);
+BIRNET_STATIC_ASSERT (Unichar::MODIFIER_SYMBOL == (int) G_UNICODE_MODIFIER_SYMBOL);
+BIRNET_STATIC_ASSERT (Unichar::MATH_SYMBOL == (int) G_UNICODE_MATH_SYMBOL);
+BIRNET_STATIC_ASSERT (Unichar::OTHER_SYMBOL == (int) G_UNICODE_OTHER_SYMBOL);
+BIRNET_STATIC_ASSERT (Unichar::LINE_SEPARATOR == (int) G_UNICODE_LINE_SEPARATOR);
+BIRNET_STATIC_ASSERT (Unichar::PARAGRAPH_SEPARATOR == (int) G_UNICODE_PARAGRAPH_SEPARATOR);
+BIRNET_STATIC_ASSERT (Unichar::SPACE_SEPARATOR == (int) G_UNICODE_SPACE_SEPARATOR);
+
+/* --- ensure castable Birnet::Unichar::BreakType ---
+ * get_break() turns the g_unichar_break_type() result into a
+ * Unichar::BreakType with a plain cast, so every enumerator is asserted here
+ * to have the same value as its G_UNICODE_BREAK_ counterpart. The HANGUL
+ * enumerators are only compared when GLIB_CHECK_VERSION (2, 10, 0) holds.
+ */
+BIRNET_STATIC_ASSERT (Unichar::BREAK_MANDATORY == (int) G_UNICODE_BREAK_MANDATORY);
+BIRNET_STATIC_ASSERT (Unichar::BREAK_CARRIAGE_RETURN == (int) G_UNICODE_BREAK_CARRIAGE_RETURN);
+BIRNET_STATIC_ASSERT (Unichar::BREAK_LINE_FEED == (int) G_UNICODE_BREAK_LINE_FEED);
+BIRNET_STATIC_ASSERT (Unichar::BREAK_COMBINING_MARK == (int) G_UNICODE_BREAK_COMBINING_MARK);
+BIRNET_STATIC_ASSERT (Unichar::BREAK_SURROGATE == (int) G_UNICODE_BREAK_SURROGATE);
+BIRNET_STATIC_ASSERT (Unichar::BREAK_ZERO_WIDTH_SPACE == (int) G_UNICODE_BREAK_ZERO_WIDTH_SPACE);
+BIRNET_STATIC_ASSERT (Unichar::BREAK_INSEPARABLE == (int) G_UNICODE_BREAK_INSEPARABLE);
+BIRNET_STATIC_ASSERT (Unichar::BREAK_NON_BREAKING_GLUE == (int) G_UNICODE_BREAK_NON_BREAKING_GLUE);
+BIRNET_STATIC_ASSERT (Unichar::BREAK_CONTINGENT == (int) G_UNICODE_BREAK_CONTINGENT);
+BIRNET_STATIC_ASSERT (Unichar::BREAK_SPACE == (int) G_UNICODE_BREAK_SPACE);
+BIRNET_STATIC_ASSERT (Unichar::BREAK_AFTER == (int) G_UNICODE_BREAK_AFTER);
+BIRNET_STATIC_ASSERT (Unichar::BREAK_BEFORE == (int) G_UNICODE_BREAK_BEFORE);
+BIRNET_STATIC_ASSERT (Unichar::BREAK_BEFORE_AND_AFTER == (int) G_UNICODE_BREAK_BEFORE_AND_AFTER);
+BIRNET_STATIC_ASSERT (Unichar::BREAK_HYPHEN == (int) G_UNICODE_BREAK_HYPHEN);
+BIRNET_STATIC_ASSERT (Unichar::BREAK_NON_STARTER == (int) G_UNICODE_BREAK_NON_STARTER);
+BIRNET_STATIC_ASSERT (Unichar::BREAK_OPEN_PUNCTUATION == (int) G_UNICODE_BREAK_OPEN_PUNCTUATION);
+BIRNET_STATIC_ASSERT (Unichar::BREAK_CLOSE_PUNCTUATION == (int) G_UNICODE_BREAK_CLOSE_PUNCTUATION);
+BIRNET_STATIC_ASSERT (Unichar::BREAK_QUOTATION == (int) G_UNICODE_BREAK_QUOTATION);
+BIRNET_STATIC_ASSERT (Unichar::BREAK_EXCLAMATION == (int) G_UNICODE_BREAK_EXCLAMATION);
+BIRNET_STATIC_ASSERT (Unichar::BREAK_IDEOGRAPHIC == (int) G_UNICODE_BREAK_IDEOGRAPHIC);
+BIRNET_STATIC_ASSERT (Unichar::BREAK_NUMERIC == (int) G_UNICODE_BREAK_NUMERIC);
+BIRNET_STATIC_ASSERT (Unichar::BREAK_INFIX_SEPARATOR == (int) G_UNICODE_BREAK_INFIX_SEPARATOR);
+BIRNET_STATIC_ASSERT (Unichar::BREAK_SYMBOL == (int) G_UNICODE_BREAK_SYMBOL);
+BIRNET_STATIC_ASSERT (Unichar::BREAK_ALPHABETIC == (int) G_UNICODE_BREAK_ALPHABETIC);
+BIRNET_STATIC_ASSERT (Unichar::BREAK_PREFIX == (int) G_UNICODE_BREAK_PREFIX);
+BIRNET_STATIC_ASSERT (Unichar::BREAK_POSTFIX == (int) G_UNICODE_BREAK_POSTFIX);
+BIRNET_STATIC_ASSERT (Unichar::BREAK_COMPLEX_CONTEXT == (int) G_UNICODE_BREAK_COMPLEX_CONTEXT);
+BIRNET_STATIC_ASSERT (Unichar::BREAK_AMBIGUOUS == (int) G_UNICODE_BREAK_AMBIGUOUS);
+BIRNET_STATIC_ASSERT (Unichar::BREAK_UNKNOWN == (int) G_UNICODE_BREAK_UNKNOWN);
+BIRNET_STATIC_ASSERT (Unichar::BREAK_NEXT_LINE == (int) G_UNICODE_BREAK_NEXT_LINE);
+BIRNET_STATIC_ASSERT (Unichar::BREAK_WORD_JOINER == (int) G_UNICODE_BREAK_WORD_JOINER);
+#if GLIB_CHECK_VERSION (2, 10, 0)
+BIRNET_STATIC_ASSERT (Unichar::BREAK_HANGUL_L_JAMO == (int) G_UNICODE_BREAK_HANGUL_L_JAMO);
+BIRNET_STATIC_ASSERT (Unichar::BREAK_HANGUL_V_JAMO == (int) G_UNICODE_BREAK_HANGUL_V_JAMO);
+BIRNET_STATIC_ASSERT (Unichar::BREAK_HANGUL_T_JAMO == (int) G_UNICODE_BREAK_HANGUL_T_JAMO);
+BIRNET_STATIC_ASSERT (Unichar::BREAK_HANGUL_LV_SYLLABLE == (int) G_UNICODE_BREAK_HANGUL_LV_SYLLABLE);
+BIRNET_STATIC_ASSERT (Unichar::BREAK_HANGUL_LVT_SYLLABLE == (int) G_UNICODE_BREAK_HANGUL_LVT_SYLLABLE);
+#endif
+} // Unichar
+
+/* --- UTF-8 movement ---
+ * utf8_skip_table[] is indexed by a byte value and yields the number of
+ * bytes to advance from that byte: 1 for 0x00-0xbf, 2 for 0xc0-0xdf, 3 for
+ * 0xe0-0xef, 4 for 0xf0-0xf7, 5 for 0xf8-0xfb, 6 for 0xfc-0xfd and 1 for
+ * 0xfe/0xff. Continuation bytes (0x80-0xbf) and 0xfe/0xff therefore advance
+ * by a single byte; utf8_char_length() reports 0xfe/0xff as -1 instead.
+ */
+const int8 utf8_skip_table[256] = {
+ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+ 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
+ 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,5,5,5,5,6,6,1,1
+};
+
+/* Sequence length announced by byte @c according to utf8_skip_table[],
+ * or -1 for the bytes 0xfe and 0xff.
+ */
+static inline const int8
+utf8_char_length (const uint8 c)
+{
+  if (c >= 0xfe)
+    return -1;
+  return utf8_skip_table[c];
+}
+/* Marker bits to OR into the first byte of a sequence that is @l bytes
+ * long; 0 for l < 2 and for l > 6.
+ */
+static inline const uint8
+utf8_length_bits (const uint8 l)
+{
+  if (l > 6)
+    return 0;
+  const uint length_bits[] = { 0x00, 0x00, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc, };
+  return length_bits[l];
+}
+
+/* Mask for the value bits of first byte @c, selected by the length that
+ * utf8_skip_table[] lists for @c.
+ */
+static inline const uint8
+utf8_char_mask (const uint8 c)
+{
+  const uint8 char_masks[8] = { 0x00, 0x7f, 0x1f, 0x0f, 0x07, 0x03, 0x01, 0x00 };
+  const int8 skip = utf8_skip_table[c];
+  return char_masks[skip];
+}
+
+/* Number of bytes (1..6) used to encode @uc. */
+static inline uint8
+utf8_length_from_unichar (unichar uc)
+{
+  if (uc < 0x00000080)
+    return 1;
+  if (uc < 0x00000800)
+    return 2;
+  if (uc < 0x00010000)
+    return 3;
+  if (uc < 0x00200000)
+    return 4;
+  if (uc < 0x04000000)
+    return 5;
+  return 6;
+}
+
+/* Decode the UTF-8 sequence starting at @str into a unichar.
+ * Returns 0xffffffff if the first byte cannot start a sequence (0xfe, 0xff
+ * or a continuation byte) or if one of the following bytes is not a
+ * continuation byte (10xxxxxx).
+ */
+unichar
+utf8_to_unichar (const char *str)
+{
+  const uint8 lead = *str;
+  /* keep the length signed: utf8_char_length() signals invalid bytes with -1,
+   * which an unsigned variable would turn into a huge length
+   */
+  const int clen = utf8_char_length (lead);
+  if (clen < 1 || (lead & 0xc0) == 0x80)
+    return 0xffffffff;
+  unichar uc = lead & utf8_char_mask (lead);
+  for (int i = 1; i < clen; i++)
+    {
+      const uint8 c = str[i];
+      if ((c & 0xc0) != 0x80)
+        return 0xffffffff;
+      uc = (uc << 6) + (c & 0x3f);
+    }
+  return uc;
+}
+
+/* Encode @uc as UTF-8 into @str and terminate it with a 0 byte.
+ * Returns the number of encoded bytes (excluding the terminator); with a
+ * NULL @str only the length is computed.
+ */
+int
+utf8_from_unichar (unichar uc,
+                   char str[8])
+{
+  const int l = utf8_length_from_unichar (uc);
+  if (!str)
+    return l;
+  str[l] = 0;
+  /* fill continuation bytes back to front, 6 bits each */
+  for (int pos = l - 1; pos > 0; pos--)
+    {
+      str[pos] = (uc & 0x3f) | 0x80;
+      uc >>= 6;
+    }
+  /* remaining bits plus the length marker form the first byte */
+  str[0] = uc | utf8_length_bits (l);
+  return l;
+}
+
+} // Birnet
Added: trunk/birnet/birnetutf8.hh
===================================================================
--- trunk/birnet/birnetutf8.hh 2006-10-09 21:05:43 UTC (rev 3949)
+++ trunk/birnet/birnetutf8.hh 2006-10-09 21:08:01 UTC (rev 3950)
@@ -0,0 +1,167 @@
+/* BirnetUtf8 - UTF-8 utilities
+ * Copyright (C) 2006 Tim Janik
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General
+ * Public License along with this library; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place, Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+#ifndef __BIRNET_UTF8_HH__
+#define __BIRNET_UTF8_HH__
+
+#include <birnet/birnetutils.hh>
+
+namespace Birnet {
+
+namespace Unichar { /* Classification and case conversion of unichar values;
+ * birnetutf8.cc implements each function by forwarding to the corresponding
+ * g_unichar_ function of GLib. */
+bool isalnum (unichar uc) BIRNET_CONST;
+bool isalpha (unichar uc) BIRNET_CONST;
+bool iscntrl (unichar uc) BIRNET_CONST;
+bool isdigit (unichar uc) BIRNET_CONST;
+int digit_value (unichar uc) BIRNET_CONST;
+bool isgraph (unichar uc) BIRNET_CONST;
+bool islower (unichar uc) BIRNET_CONST;
+unichar tolower (unichar uc) BIRNET_CONST;
+bool isprint (unichar uc) BIRNET_CONST;
+bool ispunct (unichar uc) BIRNET_CONST;
+bool isspace (unichar uc) BIRNET_CONST;
+bool isupper (unichar uc) BIRNET_CONST;
+unichar toupper (unichar uc) BIRNET_CONST;
+bool isxdigit (unichar uc) BIRNET_CONST;
+int xdigit_value (unichar uc) BIRNET_CONST;
+bool istitle (unichar uc) BIRNET_CONST;
+unichar totitle (unichar uc) BIRNET_CONST;
+bool isdefined (unichar uc) BIRNET_CONST;
+bool iswide (unichar uc) BIRNET_CONST;
+bool iswide_cjk (unichar uc) BIRNET_CONST; /* false if GLib lacks g_unichar_iswide_cjk() */
+typedef enum {
+ CONTROL, FORMAT, UNASSIGNED,
+ PRIVATE_USE, SURROGATE, LOWERCASE_LETTER,
+ MODIFIER_LETTER, OTHER_LETTER, TITLECASE_LETTER,
+ UPPERCASE_LETTER, COMBINING_MARK, ENCLOSING_MARK,
+ NON_SPACING_MARK, DECIMAL_NUMBER, LETTER_NUMBER,
+ OTHER_NUMBER, CONNECT_PUNCTUATION, DASH_PUNCTUATION,
+ CLOSE_PUNCTUATION, FINAL_PUNCTUATION, INITIAL_PUNCTUATION,
+ OTHER_PUNCTUATION, OPEN_PUNCTUATION, CURRENCY_SYMBOL,
+ MODIFIER_SYMBOL, MATH_SYMBOL, OTHER_SYMBOL,
+ LINE_SEPARATOR, PARAGRAPH_SEPARATOR, SPACE_SEPARATOR
+} Type; /* order must match the G_UNICODE_ type values, birnetutf8.cc asserts this */
+Type get_type (unichar uc) BIRNET_CONST;
+typedef enum {
+ BREAK_MANDATORY, BREAK_CARRIAGE_RETURN, BREAK_LINE_FEED,
+ BREAK_COMBINING_MARK, BREAK_SURROGATE, BREAK_ZERO_WIDTH_SPACE,
+ BREAK_INSEPARABLE, BREAK_NON_BREAKING_GLUE, BREAK_CONTINGENT,
+ BREAK_SPACE, BREAK_AFTER, BREAK_BEFORE,
+ BREAK_BEFORE_AND_AFTER, BREAK_HYPHEN, BREAK_NON_STARTER,
+ BREAK_OPEN_PUNCTUATION, BREAK_CLOSE_PUNCTUATION, BREAK_QUOTATION,
+ BREAK_EXCLAMATION, BREAK_IDEOGRAPHIC, BREAK_NUMERIC,
+ BREAK_INFIX_SEPARATOR, BREAK_SYMBOL, BREAK_ALPHABETIC,
+ BREAK_PREFIX, BREAK_POSTFIX, BREAK_COMPLEX_CONTEXT,
+ BREAK_AMBIGUOUS, BREAK_UNKNOWN, BREAK_NEXT_LINE,
+ BREAK_WORD_JOINER, BREAK_HANGUL_L_JAMO, BREAK_HANGUL_V_JAMO,
+ BREAK_HANGUL_T_JAMO, BREAK_HANGUL_LV_SYLLABLE, BREAK_HANGUL_LVT_SYLLABLE
+} BreakType; /* order must match the G_UNICODE_BREAK_ values, birnetutf8.cc asserts this */
+BreakType get_break (unichar uc) BIRNET_CONST;
+
+} // Unichar
+
+/* --- UTF-8 movement ---
+ * utf8_next():         advance c by the utf8_skip_table[] entry of the byte at c.
+ * utf8_prev():         step back at least one byte and then over all
+ *                      continuation bytes (10xxxxxx); there is no lower bound.
+ * utf8_find_next():    if *c is not 0, step forward at least one byte and then
+ *                      over continuation bytes; yields NULL if bound is given
+ *                      and the result is not below bound.
+ * utf8_find_prev():    step back like utf8_prev(); yields NULL if start is
+ *                      given and the result lies before start.
+ * utf8_to_unichar():   decode the sequence at str, 0xffffffff signals a
+ *                      sequence that failed to decode.
+ * utf8_from_unichar(): store the 0-terminated encoding of uc in str (if not
+ *                      NULL) and return the encoded length in bytes.
+ */
+inline const char* utf8_next (const char *c);
+inline char* utf8_next (char *c);
+inline const char* utf8_prev (const char *c);
+inline char* utf8_prev (char *c);
+inline const char* utf8_find_next (const char *c,
+ const char *bound = NULL);
+inline char* utf8_find_next (char *c,
+ const char *bound = NULL);
+inline const char* utf8_find_prev (const char *c,
+ const char *start = NULL);
+inline char* utf8_find_prev (char *c,
+ const char *start = NULL);
+unichar utf8_to_unichar (const char *str);
+int utf8_from_unichar (unichar uc,
+ char str[8]);
+
+/* --- implementation bits ---
+ * utf8_skip_table[] is defined in birnetutf8.cc; the inline utf8_next()
+ * variants index it with the byte at the current position cast to uint8.
+ */
+extern const int8 utf8_skip_table[256];
+
+/* Advance @c by the number of bytes utf8_skip_table[] lists for the byte at @c. */
+inline const char*
+utf8_next (const char *c)
+{
+  const uint8 lead = *c;
+  return c + utf8_skip_table[lead];
+}
+
+/* Non-const variant, forwards to utf8_next (const char*). */
+inline char*
+utf8_next (char *c)
+{
+  return const_cast<char*> (utf8_next (const_cast<const char*> (c)));
+}
+
+/* Step back from @c by at least one byte, then over all continuation bytes
+ * (10xxxxxx). No lower bound is checked.
+ */
+inline const char*
+utf8_prev (const char *c)
+{
+  c--;
+  while ((*c & 0xc0) == 0x80)
+    c--;
+  return c;
+}
+
+/* Non-const variant, forwards to utf8_prev (const char*). */
+inline char*
+utf8_prev (char *c)
+{
+  return const_cast<char*> (utf8_prev (const_cast<const char*> (c)));
+}
+
+/* Find the next position after @c that is not a continuation byte.
+ * If *c is 0, @c is not advanced. With a @bound, scanning stops there and
+ * NULL is returned unless the result lies below @bound.
+ */
+inline const char*
+utf8_find_next (const char *c,
+                const char *bound)
+{
+  if (!bound)
+    {
+      /* unbounded: rely on the 0 terminator to stop the scan */
+      if (*c)
+        for (c++; (*c & 0xc0) == 0x80; c++)
+          ;
+      return c;
+    }
+  if (*c)
+    for (c++; c < bound && (*c & 0xc0) == 0x80; c++)
+      ;
+  return c < bound ? c : NULL;
+}
+
+/* Non-const variant, forwards to utf8_find_next (const char*, const char*). */
+inline char*
+utf8_find_next (char *c,
+                const char *bound)
+{
+  const char *const next = utf8_find_next (const_cast<const char*> (c), bound);
+  return const_cast<char*> (next);
+}
+
+/* Find the closest position before @c that is not a continuation byte
+ * (10xxxxxx), stepping back at least one byte.
+ * If @start is given, the scan does not read before it and NULL is returned
+ * when no such position exists at or after @start. Without @start the scan
+ * is unbounded.
+ */
+inline const char*
+utf8_find_prev (const char *c,
+                const char *start)
+{
+  do
+    c--;
+  /* only compare against start if there is one, as the return statement does */
+  while ((!start || c >= start) && (*c & 0xc0) == 0x80);
+  return !start || c >= start ? c : NULL;
+}
+
+/* Non-const variant, forwards to utf8_find_prev (const char*, const char*). */
+inline char*
+utf8_find_prev (char *c,
+                const char *start)
+{
+  const char *const prev = utf8_find_prev (const_cast<const char*> (c), start);
+  return const_cast<char*> (prev);
+}
+
+
+} // Birnet
+
+#endif /* __BIRNET_UTF8_HH__ */
+/* vim:set ts=8 sts=2 sw=2: */
Modified: trunk/birnet/tests/Makefile.am
===================================================================
--- trunk/birnet/tests/Makefile.am 2006-10-09 21:05:43 UTC (rev 3949)
+++ trunk/birnet/tests/Makefile.am 2006-10-09 21:08:01 UTC (rev 3950)
@@ -7,16 +7,25 @@
INCLUDES += -I$(top_srcdir) -I$(top_builddir) -I. $(BIRNET_CFLAGS)
DEFS += -DG_LOG_DOMAIN='"$(basename $(@F))"' -DPARANOID -DG_DISABLE_CONST_RETURNS
-TESTS = infotest signal sorting datalist threads
+TESTS =
noinst_PROGRAMS = $(TESTS)
progs_ldadd = $(top_builddir)/birnet/libbirnet.o $(BIRNET_LIBS) -lm
+
+TESTS += infotest
infotest_SOURCES = infotest.cc
infotest_LDADD = $(progs_ldadd)
+TESTS += strings
+strings_SOURCES = strings.cc
+strings_LDADD = $(progs_ldadd)
+TESTS += threads
threads_SOURCES = threads.cc
threads_LDADD = $(progs_ldadd)
+TESTS += signal
signal_SOURCES = signal.cc
signal_LDADD = $(progs_ldadd)
+TESTS += sorting
sorting_SOURCES = sorting.cc
sorting_LDADD = $(progs_ldadd)
+TESTS += datalist
datalist_SOURCES = datalist.cc
datalist_LDADD = $(progs_ldadd)
Added: trunk/birnet/tests/strings.cc
===================================================================
--- trunk/birnet/tests/strings.cc 2006-10-09 21:05:43 UTC (rev 3949)
+++ trunk/birnet/tests/strings.cc 2006-10-09 21:08:01 UTC (rev 3950)
@@ -0,0 +1,211 @@
+/* Birnet
+ * Copyright (C) 2006 Tim Janik
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General
+ * Public License along with this library; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place, Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+//#define TEST_VERBOSE
+#include <birnet/birnettests.h>
+using namespace Birnet;
+
+namespace {
+using namespace Birnet;
+
+/* Round-trip random unichar values through utf8_from_unichar() and
+ * utf8_to_unichar(), compare the results against GLib and exercise the
+ * UTF-8 movement functions on a string that embeds the encoded character.
+ */
+static void
+random_tf8_and_unichar_test (void)
+{
+  TSTART ("utf8<->unichar");
+  const uint count = 1000000;
+  for (uint i = 0; i < count; i++)
+    {
+      if (i % 20000 == 0)
+        TOK();
+      /* unsigned literal: 0x100 << 23 does not fit into a signed int */
+      unichar nc, uc = rand() % (0x100u << (i % 24));
+      if (!uc)
+        continue;
+      char buffer[8], gstr[10] = { 0, };
+      int l;
+
+      l = utf8_from_unichar (uc, buffer);
+      TCHECK (l > 0);
+      TCHECK (l <= 7);
+      TCHECK (buffer[l] == 0);
+      TCHECK (l == g_unichar_to_utf8 (uc, gstr));
+      TCHECK (strcmp (gstr, buffer) == 0);
+      nc = utf8_to_unichar (buffer);
+      TCHECK (nc == g_utf8_get_char (buffer));
+      TCHECK (uc == nc);
+      TCHECK (l == utf8_from_unichar (uc, NULL));
+      char *p1 = utf8_next (buffer);
+      TCHECK (p1 == buffer + l);
+      char *p2 = utf8_prev (p1);
+      TCHECK (p2 == buffer);
+
+      char cbuffer[1024];
+      snprintf (cbuffer, 1024, "x%sy7", buffer);
+      /* byte lengths of the characters in cbuffer: x, random unichar, y, 7 */
+      const int char_lengths[4] = { 1, l, 1, 1 };
+      char *cur = cbuffer, *pn = cbuffer, *gn, *pp;
+      for (uint j = 0; j < 4; j++)
+        {
+          cur = pn;
+          pn = utf8_find_next (cur);
+          TCHECK (pn == cur + char_lengths[j]);
+          gn = g_utf8_find_next_char (cur, NULL);
+          TCHECK (pn == gn);
+          pp = utf8_find_prev (pn, cbuffer);
+          TCHECK (pp == cur);
+        }
+      /* last with bounds, cur still points at the final character */
+      pn = utf8_find_next (cur, cur + strlen (cur));
+      TCHECK (pn == NULL);
+      gn = g_utf8_find_next_char (cur, cur + strlen (cur));
+      TCHECK (pn == gn);
+      /* first with bounds */
+      pp = utf8_find_prev (cbuffer, cbuffer);
+      TCHECK (pp == NULL);
+    }
+  TDONE();
+}
+
+/* Compare every Birnet::Unichar function against the GLib function it
+ * wraps, using random unichar values.
+ */
+static void
+random_unichar_test (void)
+{
+  TSTART ("unichar classification");
+  const uint count = 1000000;
+  for (uint i = 0; i < count; i++)
+    {
+      /* unsigned literal: 0x100 << 23 does not fit into a signed int */
+      unichar uc = rand() % (0x100u << (i % 24));
+      unichar bc, gc;
+      gboolean gb;
+      bool bb;
+      int bv, gv;
+      if (i % 20000 == 0)
+        TOK();
+
+      bb = Unichar::isalnum (uc);
+      gb = g_unichar_isalnum (uc);
+      TCHECK (bb == gb);
+      bb = Unichar::isalpha (uc);
+      gb = g_unichar_isalpha (uc);
+      TCHECK (bb == gb);
+      bb = Unichar::iscntrl (uc);
+      gb = g_unichar_iscntrl (uc);
+      TCHECK (bb == gb);
+      bb = Unichar::isdigit (uc);
+      gb = g_unichar_isdigit (uc);
+      TCHECK (bb == gb);
+      bv = Unichar::digit_value (uc);
+      gv = g_unichar_digit_value (uc);
+      TCHECK (bv == gv);
+      bv = Unichar::digit_value ('0' + uc % 10);
+      gv = g_unichar_digit_value ('0' + uc % 10);
+      TCHECK (bv == gv);
+      bb = Unichar::isgraph (uc);
+      gb = g_unichar_isgraph (uc);
+      /* compare the isgraph() results, not the stale digit values */
+      TCHECK (bb == gb);
+      bb = Unichar::islower (uc);
+      gb = g_unichar_islower (uc);
+      TCHECK (bb == gb);
+      bc = Unichar::tolower (uc);
+      gc = g_unichar_tolower (uc);
+      TCHECK (bc == gc);
+      bb = Unichar::isprint (uc);
+      gb = g_unichar_isprint (uc);
+      TCHECK (bb == gb);
+      bb = Unichar::ispunct (uc);
+      gb = g_unichar_ispunct (uc);
+      TCHECK (bb == gb);
+      bb = Unichar::isspace (uc);
+      gb = g_unichar_isspace (uc);
+      TCHECK (bb == gb);
+      bb = Unichar::isupper (uc);
+      gb = g_unichar_isupper (uc);
+      TCHECK (bb == gb);
+      bc = Unichar::toupper (uc);
+      gc = g_unichar_toupper (uc);
+      TCHECK (bc == gc);
+      bb = Unichar::isxdigit (uc);
+      gb = g_unichar_isxdigit (uc);
+      TCHECK (bb == gb);
+      bv = Unichar::xdigit_value (uc);
+      gv = g_unichar_xdigit_value (uc);
+      TCHECK (bv == gv);
+      bv = Unichar::xdigit_value ('0' + uc % 10);
+      gv = g_unichar_xdigit_value ('0' + uc % 10);
+      TCHECK (bv == gv);
+      bv = Unichar::xdigit_value ('a' + uc % 6);
+      gv = g_unichar_xdigit_value ('a' + uc % 6);
+      TCHECK (bv == gv);
+      bv = Unichar::xdigit_value ('A' + uc % 6);
+      gv = g_unichar_xdigit_value ('A' + uc % 6);
+      TCHECK (bv == gv);
+      bb = Unichar::istitle (uc);
+      gb = g_unichar_istitle (uc);
+      TCHECK (bb == gb);
+      bc = Unichar::totitle (uc);
+      gc = g_unichar_totitle (uc);
+      TCHECK (bc == gc);
+      bb = Unichar::isdefined (uc);
+      gb = g_unichar_isdefined (uc);
+      TCHECK (bb == gb);
+      bb = Unichar::iswide (uc);
+      gb = g_unichar_iswide (uc);
+      TCHECK (bb == gb);
+#if GLIB_CHECK_VERSION (2, 10, 0)
+      bb = Unichar::iswide_cjk (uc);
+      gb = g_unichar_iswide_cjk (uc);
+      TCHECK (bb == gb);
+#endif
+      TCHECK (Unichar::get_type (uc) == (int) g_unichar_type (uc));
+      TCHECK (Unichar::get_break (uc) == (int) g_unichar_break_type (uc));
+    }
+  TDONE();
+}
+
+} // Anon
+
+int
+main (int argc,
+ char *argv[])
+{
+ birnet_init_test (&argc, &argv); /* NOTE(review): not defined in this file, presumably provided via <birnet/birnettests.h> - confirm */
+
+ random_unichar_test(); /* both tests draw from rand(), so their order fixes the values each one sees */
+ random_tf8_and_unichar_test();
+
+ return 0;
+}
+
+/* vim:set ts=8 sts=2 sw=2: */
[Date Prev][Date Next] [Thread Prev][Thread Next] [Thread Index] [Date Index] [Author Index]