[librep] several utf8 cleanups/utf8 doc updates



commit 73a7d93a1a23401e10d4c120b48b70be6f0d1efb
Author: chrisb <zanghar freenet de>
Date:   Tue Jun 30 18:57:26 2009 +0200

    several utf8 cleanups/utf8 doc updates

 ChangeLog      |    7 ++++
 man/lang.texi  |   39 ++++++++++++++++++--
 man/news.texi  |    2 +
 src/lispcmds.c |    2 +-
 src/utf8.c     |  108 ++++++++++++-------------------------------------------
 5 files changed, 70 insertions(+), 88 deletions(-)
---
diff --git a/ChangeLog b/ChangeLog
index 4317754..b68ac52 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,10 @@
+2009-06-30  Christopher Bratusek <zanghar freenet de>
+	* man/lang.texi: updated for rep.util.utf8 [Teika Kazura]
+
+	* src/lispcmds.c: fixed a typo [Teika Kazura]
+
+	* src/utf8.c: improved copyright notice, removed unused macros [Teika Kazura]
+
 2009-06-19  Christopher Bratusek <zanghar freenet de>
 	* src/unix_processes
 	* src/sdbm.c
diff --git a/man/lang.texi b/man/lang.texi
index 077b18a..1ae5368 100644
--- a/man/lang.texi
+++ b/man/lang.texi
@@ -55,6 +55,7 @@ Miscellaneous features.
 * User Information::            The name of the user
 * Environment Variables::       Reading and writing the environment
 * String Functions::            Misc string manipulation
+* utf-8::                       utf-8 functions
 * Sleeping::                    Waiting for a period of time
 * Beeping::                     Making a ding! sound
 * Messages::                    Writing to the console
@@ -1192,6 +1193,8 @@ escape sequences see @ref{Strings}.
     @result{} 127
 @end lisp
 
+The functions below make sense for ASCII characters only.
+
 @defun alpha-char-p character
 This function returns true when @var{character} is one of the
 alphabetic characters.
@@ -2014,12 +2017,16 @@ they are all @code{()}.
 @node Strings, Array Functions, Vectors, Sequences
 @subsection Strings
 
-A string is a vector of characters (@pxref{Characters}), they are
+A string is a vector of characters (@pxref{Characters}). It is
 generally used for storing and manipulating pieces of text.
 @code{librep} puts no restrictions on the values which may be stored in
 a string---specifically, the null character (@samp{^@@}) may be
 stored with no problems.
 
+In librep, a string is a sequence of raw bytes, and no encoding is
+assumed. Some special functions for utf-8 encoded strings are provided.
+@xref{utf-8}.
+
 The read syntax of a string is a double quote character, followed by the
 contents of the string, the object is terminated by a second double quote
 character. For example, @code{"abc"} is the read syntax of the string
@@ -2129,6 +2136,9 @@ all characters until the end of the string are copied.
 (substring "xyzfoobar" 3)
     @result{} "foobar"
 @end lisp
+
+For utf-8 encoded strings, use @code{utf8-substring} instead
+(@pxref{utf-8}).
 @end defun
 
 @defun string= string1 string2
@@ -2241,6 +2251,9 @@ This function returns the length (an integer) of the sequence @var{sequence}.
 (length [x y])
     @result{} 2
 @end lisp
+
+For utf-8 encoded strings, use @code{utf8-string-length}
+(@pxref{utf-8}).
 @end defun
 
 @defun copy-sequence sequence
@@ -8903,7 +8916,7 @@ See also @ref{Process Objects} for the description of the
 @code{process-environment} variable.
 
 
-@node String Functions, Sleeping, Environment Variables, The language
+@node String Functions, utf-8, Environment Variables, The language
 @section String Functions
 @cindex String functions
 
@@ -9009,7 +9022,27 @@ Return a string containing a printed representation of the number
 otherwise print it in base 10.
 @end defun
 
-@node Sleeping, Beeping, String Functions, The language
+@node utf-8, Sleeping, String Functions, The language
+@section utf-8
+@cindex utf-8
+
+Some functions for utf-8 strings are available.
+
+They assume that the string is encoded in utf-8. Otherwise, the
+behavior is undefined.
+
+@defun utf8-string-length string
+Returns the number of characters in the utf-8 encoded @var{string}.
+@end defun
+
+@defun utf8-substring string start @t{#!optional} end
+Returns the portion of @var{string}, encoded in utf-8, starting at
+character number @var{start} and ending at the character before
+@var{end} (or the end of the string if @var{end} is not given). All
+indices start at zero.
+@end defun
+
+@node Sleeping, Beeping, utf-8, The language
 @section Sleeping
 @cindex Sleeping
 
diff --git a/man/news.texi b/man/news.texi
index 75f136d..7d77a21 100644
--- a/man/news.texi
+++ b/man/news.texi
@@ -10,6 +10,8 @@
 @item Added UTF-8 Support! [Wang Diancheng]
 
 @item Remove scheme and unscheme modules
+
+@item Going on with code-cleanup
 @end itemize
 
 @heading 0.17.4
diff --git a/src/lispcmds.c b/src/lispcmds.c
index 12a5e01..ca362c5 100644
--- a/src/lispcmds.c
+++ b/src/lispcmds.c
@@ -947,7 +947,7 @@ DEFUN("substring", Fsubstring, Ssubstring, (repv string, repv start, repv end),
 substring STRING START [END]
 
 Returns the portion of STRING starting at character number START and ending
-at the character before END (or the end of the string is END is not given).
+at the character before END (or the end of the string if END is not given).
 All indices start at zero.
 ::end:: */
 {
diff --git a/src/utf8.c b/src/utf8.c
index 60ee159..0982269 100644
--- a/src/utf8.c
+++ b/src/utf8.c
@@ -1,95 +1,35 @@
-/* This file some code come from glib:
- * utf8.c - Operations on UTF-8 strings
+/* utf8.c - Operations on UTF-8 strings
+ * Some code in this file is borrowed from glib-2.x/glib/gutf8.c
  *
+ * Copyright (C) 1999 Tom Tromey
+ * Copyright (C) 2000 Red Hat, Inc.
  * Copyright (C) 2009 Wang Diancheng.
  *
- * This library is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
+ * This file is part of librep.
  *
- * This library is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.	 See the GNU
- * Lesser General Public License for more details.
+ * librep is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2, or (at your option)
+ * any later version.
+ *
+ * librep is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with librep; see the file COPYING.     If not, write to
+ * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
  *
- * You should have received a copy of the GNU Lesser General Public
- * License along with this library; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 02111-1307, USA.
  */
 
+// More functions for utf-8 are available in glib-x.y.z/glib/gutf8.c.
 
 #define _GNU_SOURCE
 
-#include <stdlib.h>
-#include <string.h>
-
 #include <config.h>
 #include "repint.h"
 
-#define UTF8_COMPUTE(Char, Mask, Len)					      \
-  if (Char < 128)							      \
-    {									      \
-      Len = 1;								      \
-      Mask = 0x7f;							      \
-    }									      \
-  else if ((Char & 0xe0) == 0xc0)					      \
-    {									      \
-      Len = 2;								      \
-      Mask = 0x1f;							      \
-    }									      \
-  else if ((Char & 0xf0) == 0xe0)					      \
-    {									      \
-      Len = 3;								      \
-      Mask = 0x0f;							      \
-    }									      \
-  else if ((Char & 0xf8) == 0xf0)					      \
-    {									      \
-      Len = 4;								      \
-      Mask = 0x07;							      \
-    }									      \
-  else if ((Char & 0xfc) == 0xf8)					      \
-    {									      \
-      Len = 5;								      \
-      Mask = 0x03;							      \
-    }									      \
-  else if ((Char & 0xfe) == 0xfc)					      \
-    {									      \
-      Len = 6;								      \
-      Mask = 0x01;							      \
-    }									      \
-  else									      \
-    Len = -1;
-
-#define UTF8_LENGTH(Char)              \
-  ((Char) < 0x80 ? 1 :                 \
-   ((Char) < 0x800 ? 2 :               \
-    ((Char) < 0x10000 ? 3 :            \
-     ((Char) < 0x200000 ? 4 :          \
-      ((Char) < 0x4000000 ? 5 : 6)))))
-   
-
-#define UTF8_GET(Result, Chars, Count, Mask, Len)			      \
-  (Result) = (Chars)[0] & (Mask);					      \
-  for ((Count) = 1; (Count) < (Len); ++(Count))				      \
-    {									      \
-      if (((Chars)[(Count)] & 0xc0) != 0x80)				      \
-	{								      \
-	  (Result) = -1;						      \
-	  break;							      \
-	}								      \
-      (Result) <<= 6;							      \
-      (Result) |= ((Chars)[(Count)] & 0x3f);				      \
-    }
-
-#define UNICODE_VALID(Char)                   \
-    ((Char) < 0x110000 &&                     \
-     (((Char) & 0xFFFFF800) != 0xD800) &&     \
-     ((Char) < 0xFDD0 || (Char) > 0xFDEF) &&  \
-     ((Char) & 0xFFFE) != 0xFFFE)
-   
-     
 static const char utf8_skip_data[256] = {
   1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
   1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
@@ -246,10 +186,10 @@ utf8_offset_to_pointer  (const char *str,
 }
 
 DEFUN("utf8-string-length", Futf8_string_length, Sutf8_string_length, (repv string), rep_Subr1) /*
-::doc:rep.util.utf8#length::
-length SEQUENCE
+::doc:rep.util.utf8#utf8-string-length::
+utf8-string-length STRING
 
-Returns the number of characters UTF-8 encoded STRING.
+Returns the number of characters in utf-8 encoded STRING.
 ::end:: */
 {
      rep_DECLARE1(string, rep_STRINGP);
@@ -260,9 +200,9 @@ DEFUN("utf8-substring", Futf8_substring, Sutf8_substring, (repv string, repv sta
 ::doc:rep.util.utf8#utf8-substring::
 utf8-substring STRING START [END]
 
-Returns the portion of STRING(a UTF-8 encoded string) starting at
+Returns the portion of STRING, encoded in utf-8, starting at
 character number START and ending at the character before END (or the
-end of the string is END is not given).  All indices start at zero.
+end of the string if END is not given). All indices start at zero.
 ::end:: */
 {
     int utf8len, slen;



[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]