[librep] UTF-8 Support [Wang Diancheng]

From: Christopher Bratusek <chrisb src gnome org>
To: svn-commits-list gnome org
Subject: [librep] UTF-8 Support [Wang Diancheng]
Date: Sun, 31 May 2009 06:19:28 -0400 (EDT)
commit 66a27161520048eb85398a9d8aded088a8426e6a
Author: chrisb <zanghar freenet de>
Date:   Sun May 31 12:19:07 2009 +0200

    UTF-8 Support [Wang Diancheng]
---
 ChangeLog       |    3 +
 man/news.texi   |    2 +
 src/Makefile.in |   10 ++-
 src/utf8.c      |  299 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 311 insertions(+), 3 deletions(-)

diff --git a/ChangeLog b/ChangeLog
index bea02c0..47c2edc 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,6 @@
+2009-05-30  Christopher Bratusek <zanghar freenet de>
+	* src/utf8.c: added UTF-8 Support! [Wang Diancheng]
+
 2009-05-06  Christopher Bratusek <zanghar freenet de>
 	* Makefile.in: remove tar target [Ritz]
 
diff --git a/man/news.texi b/man/news.texi
index 7d9ba51..75f136d 100644
--- a/man/news.texi
+++ b/man/news.texi
@@ -7,6 +7,8 @@
 
 @itemize @bullet
 
+@item Added UTF-8 Support! [Wang Diancheng]
+
 @item Remove scheme and unscheme modules
 @end itemize
 
diff --git a/src/Makefile.in b/src/Makefile.in
index ea3d9e0..8359ef4 100644
--- a/src/Makefile.in
+++ b/src/Makefile.in
@@ -37,13 +37,14 @@ SDBM_OBJS = $(SDBM_SRCS:.c=.o)
 SDBM_LOBJS = $(SDBM_SRCS:.c=.lo)
 
 DL_SRCS = repsdbm.c timers.c gettext.c readline.c tables.c repgdbm.c \
-	  record-profile.c safemach.c sockets.c md5.c ffi.c
+	  record-profile.c safemach.c sockets.c md5.c ffi.c utf8.c
 DL_OBJS = sdbm.la timers.la gettext.la readline.la tables.la gdbm.la \
-	  record-profile.la safe-interpreter.la sockets.la md5.la ffi.la
+	  record-profile.la safe-interpreter.la sockets.la md5.la ffi.la \
+	  utf8.la
 DL_DSTS = rep/io/db/sdbm.la rep/io/timers.la rep/i18n/gettext.la \
 	  rep/io/readline.la rep/data/tables.la rep/io/db/gdbm.la \
 	  rep/lang/record-profile.la rep/vm/safe-interpreter.la \
-	  rep/io/sockets.la rep/util/md5.la rep/ffi.la
+	  rep/io/sockets.la rep/util/md5.la rep/ffi.la rep/util/utf8.la
 DL_DIRS = rep rep/io rep/io/db rep/i18n rep/data rep/lang rep/vm rep/util
 
 REP_SRCS = rep.c
@@ -155,6 +156,9 @@ sockets.la : sockets.lo
 md5.la : md5.lo rep-md5.lo
 	$(rep_DL_LD) $(LDFLAGS) -o $@ $^
 
+utf8.la : utf8.lo
+	$(rep_DL_LD) $(LDFLAGS) -o $@ $^
+
 ffi.la : ffi.lo
 	$(rep_DL_LD) $(LDFLAGS) -o $@ $^ $(FFI_LIBS)
 
diff --git a/src/utf8.c b/src/utf8.c
new file mode 100644
index 0000000..60ee159
--- /dev/null
+++ b/src/utf8.c
@@ -0,0 +1,299 @@
+/* Some code in this file comes from glib:
+ * utf8.c - Operations on UTF-8 strings
+ *
+ * Copyright (C) 2009 Wang Diancheng.
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.	 See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
+
+#define _GNU_SOURCE
+
+#include <stdlib.h>
+#include <string.h>
+
+#include <config.h>
+#include "repint.h"
+/* UTF8_COMPUTE: classify lead byte Char; sets Len to 1-6 and Mask to the first byte's payload bits, or Len to -1 (Mask untouched) if no pattern matches. */
+#define UTF8_COMPUTE(Char, Mask, Len)					      \
+  if (Char < 128)							      \
+    {									      \
+      Len = 1;								      \
+      Mask = 0x7f;							      \
+    }									      \
+  else if ((Char & 0xe0) == 0xc0)					      \
+    {									      \
+      Len = 2;								      \
+      Mask = 0x1f;							      \
+    }									      \
+  else if ((Char & 0xf0) == 0xe0)					      \
+    {									      \
+      Len = 3;								      \
+      Mask = 0x0f;							      \
+    }									      \
+  else if ((Char & 0xf8) == 0xf0)					      \
+    {									      \
+      Len = 4;								      \
+      Mask = 0x07;							      \
+    }									      \
+  else if ((Char & 0xfc) == 0xf8)					      \
+    {									      \
+      Len = 5;								      \
+      Mask = 0x03;							      \
+    }									      \
+  else if ((Char & 0xfe) == 0xfc)					      \
+    {									      \
+      Len = 6;								      \
+      Mask = 0x01;							      \
+    }									      \
+  else									      \
+    Len = -1;
+/* UTF8_LENGTH: number of bytes (1-6) selected for the code point Char by the thresholds below. */
+#define UTF8_LENGTH(Char)              \
+  ((Char) < 0x80 ? 1 :                 \
+   ((Char) < 0x800 ? 2 :               \
+    ((Char) < 0x10000 ? 3 :            \
+     ((Char) < 0x200000 ? 4 :          \
+      ((Char) < 0x4000000 ? 5 : 6)))))
+/* UTF8_GET: decode the Len bytes at Chars into Result, applying Mask to the first byte and using Count as
+ * the loop index; Result becomes -1 as soon as a following byte is not of the form 10xxxxxx. */
+#define UTF8_GET(Result, Chars, Count, Mask, Len)			      \
+  (Result) = (Chars)[0] & (Mask);					      \
+  for ((Count) = 1; (Count) < (Len); ++(Count))				      \
+    {									      \
+      if (((Chars)[(Count)] & 0xc0) != 0x80)				      \
+	{								      \
+	  (Result) = -1;						      \
+	  break;							      \
+	}								      \
+      (Result) <<= 6;							      \
+      (Result) |= ((Chars)[(Count)] & 0x3f);				      \
+    }
+/* UNICODE_VALID: true if Char < 0x110000, lies outside 0xD800-0xDFFF and 0xFDD0-0xFDEF, and its low 16 bits are not 0xFFFE or 0xFFFF. */
+#define UNICODE_VALID(Char)                   \
+    ((Char) < 0x110000 &&                     \
+     (((Char) & 0xFFFFF800) != 0xD800) &&     \
+     ((Char) < 0xFDD0 || (Char) > 0xFDEF) &&  \
+     ((Char) & 0xFFFE) != 0xFFFE)
+/* Bytes to advance, indexed by the value of a first byte: 1 for 0x00-0xBF and 0xFE-0xFF,
+ * 2 for 0xC0-0xDF, 3 for 0xE0-0xEF, 4 for 0xF0-0xF7, 5 for 0xF8-0xFB, 6 for 0xFC-0xFD. */
+static const char utf8_skip_data[256] = {
+  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+  2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
+  3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,5,5,5,5,6,6,1,1
+};
+/* utf8_next_char(p): p advanced by the skip-table entry for the byte at p; the bytes stepped over are not inspected. */
+const char * const utf8_skip = utf8_skip_data;
+#define utf8_next_char(p) (char *)((p) + utf8_skip[*(const unsigned char *)(p)])
+/* Prototypes for the three helpers defined later in this file. */
+long
+utf8_strlen (const char *p,
+	     int       max);
+long    
+utf8_pointer_to_offset (const char *str,
+			const char *pos);
+char *
+utf8_offset_to_pointer  (const char *str,
+			 long        offset);
+/**
+ * utf8_strlen:
+ * @p: pointer to the start of a UTF-8 encoded string, or %NULL.
+ * @max: the maximum number of bytes to examine. If @max is less
+ *       than 0, the string is scanned up to its terminating nul.
+ *       If @max is 0 or @p is %NULL, @p is not examined.
+ *
+ * Counts characters by stepping with utf8_next_char(); continuation
+ * bytes are not checked, so @p is assumed to hold valid UTF-8.
+ *
+ * Return value: the length in characters; 0 if @p is %NULL or @max is 0
+ **/
+long
+utf8_strlen (const char *p,
+               int       max)
+{
+  long len = 0;
+  const char *start = p;	/* kept so bytes consumed can be compared against max */
+
+  if(p == NULL || max == 0)
+       return 0;
+  
+  if (max < 0)
+    {
+      while (*p)
+        {
+          p = utf8_next_char (p);
+          ++len;
+        }
+    }
+  else
+    {
+      if (max == 0 || !*p)	/* max == 0 cannot occur here; that case returned above */
+        return 0;
+      
+      p = utf8_next_char (p);          
+
+      while (p - start < max && *p)
+        {
+          ++len;
+          p = utf8_next_char (p);          
+        }
+
+      /* only do the last len increment if we got a complete
+       * char (don't count partial chars)
+       */
+      if (p - start <= max)
+        ++len;
+    }
+
+  return len;
+}
+
+
+/**
+ * utf8_pointer_to_offset:
+ * @str: a UTF-8 encoded string
+ * @pos: a pointer to a position within @str
+ * 
+ * Converts a pointer to a position within a string into an integer
+ * character offset.
+ *
+ * This function allows @pos to be before @str, and returns
+ * a negative offset in this case.
+ * 
+ * Return value: the resulting character offset
+ **/
+long    
+utf8_pointer_to_offset (const char *str,
+			  const char *pos)
+{
+  const char *s = str;
+  long offset = 0;    
+
+  if (pos < str) 
+    offset = - utf8_pointer_to_offset (pos, str);	/* count from pos up to str, then negate */
+  else
+    while (s < pos)
+      {
+	s = utf8_next_char (s);
+	offset++;
+      }
+  
+  return offset;
+}
+
+/**
+ * utf8_offset_to_pointer:
+ * @str: a UTF-8 encoded string
+ * @offset: a character offset within @str
+ * 
+ * Converts from an integer character offset to a pointer to a position
+ * within the string.
+ * 
+ * A negative @offset may be passed to step backwards from @str.
+ * It is usually worth stepping backwards from the end instead of
+ * forwards if @offset is in the last fourth of the string, since
+ * moving forward is about 3 times faster than moving backward.
+ * 
+ * Return value: the resulting pointer
+ **/
+char *
+utf8_offset_to_pointer  (const char *str,
+			   long        offset)    
+{
+  const char *s = str;
+
+  if (offset > 0) 
+    while (offset--)
+      s = utf8_next_char (s);
+  else
+    {
+      const char *s1;
+
+      /* This nice technique for fast backwards stepping 
+       * through a UTF-8 string was dubbed "stutter stepping" 
+       * by its inventor, Larry Ewing.
+       */
+      while (offset)
+	{
+	  s1 = s;	/* remember where this round started */
+	  s += offset;	/* offset is negative here: jump back that many bytes */
+	  while ((*s & 0xc0) == 0x80)	/* back up over 10xxxxxx bytes to the start of a character */
+	    s--;
+
+	  offset += utf8_pointer_to_offset (s, s1);	/* add the characters actually stepped over; loop until 0 */
+	}
+    }
+
+  return (char *)s;
+}
+
+DEFUN("utf8-string-length", Futf8_string_length, Sutf8_string_length, (repv string), rep_Subr1) /*
+::doc:rep.util.utf8#length::
+utf8-string-length STRING
+
+Returns the number of characters in the UTF-8 encoded STRING.
+::end:: */
+{
+     rep_DECLARE1(string, rep_STRINGP);
+     return rep_MAKE_INT(utf8_strlen (rep_STR(string),-1));
+}
+
+DEFUN("utf8-substring", Futf8_substring, Sutf8_substring, (repv string, repv start, repv end), rep_Subr3) /*
+::doc:rep.util.utf8#utf8-substring::
+utf8-substring STRING START [END]
+
+Returns the portion of STRING (a UTF-8 encoded string) starting at
+character number START and ending at the character before END (or the
+end of the string if END is not given).  All indices start at zero.
+::end:: */
+{
+    int utf8len, slen;
+    char *pstart;
+    char *pend;
+    rep_DECLARE1(string, rep_STRINGP);
+    rep_DECLARE2(start, rep_INTP);
+    rep_DECLARE3_OPT(end, rep_INTP);
+    utf8len = utf8_strlen(rep_STR(string), -1);	/* length in characters, not bytes */
+    if(rep_INT(start) > utf8len || rep_INT(start) < 0)
+        return(rep_signal_arg_error(start, 2));
+    pstart = utf8_offset_to_pointer(rep_STR(string), rep_INT(start));
+    if(rep_INTP(end))
+    {
+        if((rep_INT(end) > utf8len) || (rep_INT(end) < rep_INT(start)))
+            return(rep_signal_arg_error(end, 3));
+	pend = utf8_offset_to_pointer(rep_STR(string),rep_INT(end));
+        return(rep_string_dupn(pstart, pend - pstart));	/* byte span between the two character positions */
+    }
+    else
+    {
+        slen = rep_STRING_LEN(string);
+        return(rep_string_dupn(pstart, slen - (pstart-rep_STR(string))));	/* every byte from pstart onward */
+    }
+}
+/* Adds the utf8-substring and utf8-string-length subrs between rep_push_structure ("rep.util.utf8") and rep_pop_structure, returning the latter's result. */
+repv
+rep_dl_init (void)
+{
+    repv tem = rep_push_structure ("rep.util.utf8");
+    rep_ADD_SUBR(Sutf8_substring);
+    rep_ADD_SUBR(Sutf8_string_length);
+    return rep_pop_structure (tem);
+}
[Date Prev][Date Next] [Thread Prev][Thread Next] [Thread Index] [Date Index] [Author Index]