[librep] several utf8 cleanups/utf8 doc updates
- From: Christopher Bratusek <chrisb src gnome org>
- To: svn-commits-list gnome org
- Subject: [librep] several utf8 cleanups/utf8 doc updates
- Date: Tue, 30 Jun 2009 16:54:05 +0000 (UTC)
commit 73a7d93a1a23401e10d4c120b48b70be6f0d1efb
Author: chrisb <zanghar freenet de>
Date: Tue Jun 30 18:57:26 2009 +0200
several utf8 cleanups/utf8 doc updates
ChangeLog | 7 ++++
man/lang.texi | 39 ++++++++++++++++++--
man/news.texi | 2 +
src/lispcmds.c | 2 +-
src/utf8.c | 108 ++++++++++++-------------------------------------------
5 files changed, 70 insertions(+), 88 deletions(-)
---
diff --git a/ChangeLog b/ChangeLog
index 4317754..b68ac52 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,10 @@
+2009-06-30 Christopher Bratusek <zanghar freenet de>
+ * man/lang.texi: updated for rep.util.utf8 [Teika Kazura]
+
+ * src/lispcmds.c: fixed a typo [Teika Kazura]
+
+ * src/utf8.c: improved copyright notice, removed unused macros [Teika Kazura]
+
2009-06-19 Christopher Bratusek <zanghar freenet de>
* src/unix_processes
* src/sdbm.c
diff --git a/man/lang.texi b/man/lang.texi
index 077b18a..1ae5368 100644
--- a/man/lang.texi
+++ b/man/lang.texi
@@ -55,6 +55,7 @@ Miscellaneous features.
* User Information:: The name of the user
* Environment Variables:: Reading and writing the environment
* String Functions:: Misc string manipulation
+* utf-8:: utf-8 functions
* Sleeping:: Waiting for a period of time
* Beeping:: Making a ding! sound
* Messages:: Writing to the console
@@ -1192,6 +1193,8 @@ escape sequences see @ref{Strings}.
@result{} 127
@end lisp
+The functions below make sense for ASCII characters only.
+
@defun alpha-char-p character
This function returns true when @var{character} is one of the
alphabetic characters.
@@ -2014,12 +2017,16 @@ they are all @code{()}.
@node Strings, Array Functions, Vectors, Sequences
@subsection Strings
-A string is a vector of characters (@pxref{Characters}), they are
+A string is a vector of characters (@pxref{Characters}). It is
generally used for storing and manipulating pieces of text.
@code{librep} puts no restrictions on the values which may be stored in
a string---specifically, the null character (@samp{^@@}) may be
stored with no problems.
+In librep, a string is a sequence of raw bytes, and no encoding is
+assumed. Some special functions for utf-8 encoded strings are provided.
+@xref{utf-8}.
+
The read syntax of a string is a double quote character, followed by the
contents of the string, the object is terminated by a second double quote
character. For example, @code{"abc"} is the read syntax of the string
@@ -2129,6 +2136,9 @@ all characters until the end of the string are copied.
(substring "xyzfoobar" 3)
@result{} "foobar"
@end lisp
+
+For utf-8 encoded strings, use @code{utf8-substring} instead
+(@pxref{utf-8}).
@end defun
@defun string= string1 string2
@@ -2241,6 +2251,9 @@ This function returns the length (an integer) of the sequence @var{sequence}.
(length [x y])
@result{} 2
@end lisp
+
+For utf-8 encoded strings, use @code{utf8-string-length}
+(@pxref{utf-8}).
@end defun
@defun copy-sequence sequence
@@ -8903,7 +8916,7 @@ See also @ref{Process Objects} for the description of the
@code{process-environment} variable.
-@node String Functions, Sleeping, Environment Variables, The language
+@node String Functions, utf-8, Environment Variables, The language
@section String Functions
@cindex String functions
@@ -9009,7 +9022,27 @@ Return a string containing a printed representation of the number
otherwise print it in base 10.
@end defun
-@node Sleeping, Beeping, String Functions, The language
+@node utf-8, Sleeping, String Functions, The language
+@section utf-8
+@cindex utf-8
+
+Some functions for utf-8 strings are available.
+
+They assume that the string is encoded in utf-8. Otherwise, the
+behavior is not defined.
+
+@defun utf8-string-length string
+Returns the number of characters in the utf-8 encoded @var{string}.
+@end defun
+
+@defun utf8-substring string start @t{#!optional} end
+Returns the portion of @var{string}, encoded in utf-8, starting at
+character number @var{start} and ending at the character before
+@var{end} (or the end of the string if @var{end} is not given). All
+indices start at zero.
+@end defun
+
+@node Sleeping, Beeping, utf-8, The language
@section Sleeping
@cindex Sleeping
diff --git a/man/news.texi b/man/news.texi
index 75f136d..7d77a21 100644
--- a/man/news.texi
+++ b/man/news.texi
@@ -10,6 +10,8 @@
@item Added UTF-8 Support! [Wang Diancheng]
@item Remove scheme and unscheme modules
+
+@item Going on with code-cleanup
@end itemize
@heading 0.17.4
diff --git a/src/lispcmds.c b/src/lispcmds.c
index 12a5e01..ca362c5 100644
--- a/src/lispcmds.c
+++ b/src/lispcmds.c
@@ -947,7 +947,7 @@ DEFUN("substring", Fsubstring, Ssubstring, (repv string, repv start, repv end),
substring STRING START [END]
Returns the portion of STRING starting at character number START and ending
-at the character before END (or the end of the string is END is not given).
+at the character before END (or the end of the string if END is not given).
All indices start at zero.
::end:: */
{
diff --git a/src/utf8.c b/src/utf8.c
index 60ee159..0982269 100644
--- a/src/utf8.c
+++ b/src/utf8.c
@@ -1,95 +1,35 @@
-/* This file some code come from glib:
- * utf8.c - Operations on UTF-8 strings
+/* utf8.c - Operations on UTF-8 strings
+ * Some code in this file is borrowed from glib-2.x/glib/gutf8.c
*
+ * Copyright (C) 1999 Tom Tromey
+ * Copyright (C) 2000 Red Hat, Inc.
* Copyright (C) 2009 Wang Diancheng.
*
- * This library is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
+ * This file is part of librep.
*
- * This library is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * Lesser General Public License for more details.
+ * librep is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2, or (at your option)
+ * any later version.
+ *
+ * librep is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with librep; see the file COPYING. If not, write to
+ * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
*
- * You should have received a copy of the GNU Lesser General Public
- * License along with this library; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 02111-1307, USA.
*/
+// More functions for utf-8 are available in glib-x.y.z/glib/gutf8.c.
#define _GNU_SOURCE
-#include <stdlib.h>
-#include <string.h>
-
#include <config.h>
#include "repint.h"
-#define UTF8_COMPUTE(Char, Mask, Len) \
- if (Char < 128) \
- { \
- Len = 1; \
- Mask = 0x7f; \
- } \
- else if ((Char & 0xe0) == 0xc0) \
- { \
- Len = 2; \
- Mask = 0x1f; \
- } \
- else if ((Char & 0xf0) == 0xe0) \
- { \
- Len = 3; \
- Mask = 0x0f; \
- } \
- else if ((Char & 0xf8) == 0xf0) \
- { \
- Len = 4; \
- Mask = 0x07; \
- } \
- else if ((Char & 0xfc) == 0xf8) \
- { \
- Len = 5; \
- Mask = 0x03; \
- } \
- else if ((Char & 0xfe) == 0xfc) \
- { \
- Len = 6; \
- Mask = 0x01; \
- } \
- else \
- Len = -1;
-
-#define UTF8_LENGTH(Char) \
- ((Char) < 0x80 ? 1 : \
- ((Char) < 0x800 ? 2 : \
- ((Char) < 0x10000 ? 3 : \
- ((Char) < 0x200000 ? 4 : \
- ((Char) < 0x4000000 ? 5 : 6)))))
-
-
-#define UTF8_GET(Result, Chars, Count, Mask, Len) \
- (Result) = (Chars)[0] & (Mask); \
- for ((Count) = 1; (Count) < (Len); ++(Count)) \
- { \
- if (((Chars)[(Count)] & 0xc0) != 0x80) \
- { \
- (Result) = -1; \
- break; \
- } \
- (Result) <<= 6; \
- (Result) |= ((Chars)[(Count)] & 0x3f); \
- }
-
-#define UNICODE_VALID(Char) \
- ((Char) < 0x110000 && \
- (((Char) & 0xFFFFF800) != 0xD800) && \
- ((Char) < 0xFDD0 || (Char) > 0xFDEF) && \
- ((Char) & 0xFFFE) != 0xFFFE)
-
-
static const char utf8_skip_data[256] = {
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
@@ -246,10 +186,10 @@ utf8_offset_to_pointer (const char *str,
}
DEFUN("utf8-string-length", Futf8_string_length, Sutf8_string_length, (repv string), rep_Subr1) /*
-::doc:rep.util.utf8#length::
-length SEQUENCE
+::doc:rep.util.utf8#utf8-string-length::
+utf8-string-length STRING
-Returns the number of characters UTF-8 encoded STRING.
+Returns the number of characters in utf-8 encoded STRING.
::end:: */
{
rep_DECLARE1(string, rep_STRINGP);
@@ -260,9 +200,9 @@ DEFUN("utf8-substring", Futf8_substring, Sutf8_substring, (repv string, repv sta
::doc:rep.util.utf8#utf8-substring::
utf8-substring STRING START [END]
-Returns the portion of STRING(a UTF-8 encoded string) starting at
+Returns the portion of STRING, encoded in utf-8, starting at
character number START and ending at the character before END (or the
-end of the string is END is not given). All indices start at zero.
+end of the string if END is not given). All indices start at zero.
::end:: */
{
int utf8len, slen;
[
Date Prev][
Date Next] [
Thread Prev][
Thread Next]
[
Thread Index]
[
Date Index]
[
Author Index]