Support for extended character sets



Hi all,

I have written some code to unify the handling of single-byte character sets and extended character sets. With this code, it should be easier to enable extended character set support not only for UTF-8, but also for any other character set. In particular, this code tries to reduce the number of #ifdefs needed to support both kinds of character sets. It only makes use of features that appeared in the ANSI C89 standard library, so it should work nearly everywhere.

This is my first try, so don't expect it to be perfect.

Another big question is how to interpret filenames. Consider a setup where filenames on the root filesystem are encoded in ISO 8859-1, but a Samba filesystem mounted somewhere uses UTF-8 encoding. I would like mc to be able to handle this, but I know it will take some time and a lot of thought to implement it properly.

Roland
Index: configure.ac
===================================================================
RCS file: /cvsroot/mc/mc/configure.ac,v
retrieving revision 1.20
diff -u -p -r1.20 configure.ac
--- configure.ac	30 Jan 2005 05:06:13 -0000	1.20
+++ configure.ac	10 Mar 2005 18:54:20 -0000
@@ -84,6 +84,15 @@ dnl Only list browsers here that can be 
 AC_CHECK_PROGS(X11_WWW, [gnome-moz-remote mozilla konqueror opera netscape])
 
 dnl
+dnl Extended Character Sets
+dnl
+AC_ARG_ENABLE([extcharset],
+	AS_HELP_STRING([--enable-extcharset], [Enable extended character sets]))
+if test x"$enable_extcharset" = x"yes"; then
+  AC_DEFINE([EXTCHARSET_ENABLED], 1, [Enable extended character sets?])
+fi
+
+dnl
 dnl Ovverriding mmap support.  This has to be before AC_FUNC_MMAP is used.
 dnl We use only part of the functionality of mmap, so on AIX,
 dnl it's possible to use mmap, even if it doesn't pass the autoconf test.
@@ -153,7 +162,7 @@ AC_CHECK_HEADERS([unistd.h string.h memo
 	stdlib.h termios.h utime.h fcntl.h pwd.h sys/statfs.h sys/time.h \
 	sys/timeb.h sys/select.h sys/ioctl.h stropts.h arpa/inet.h \
 	security/pam_misc.h sys/socket.h sys/sysmacros.h sys/types.h \
-	sys/mkdev.h])
+	sys/mkdev.h wchar.h wctype.h])
 
 AC_HEADER_TIME
 AC_HEADER_SYS_WAIT
Index: src/Makefile.am
===================================================================
RCS file: /cvsroot/mc/mc/src/Makefile.am,v
retrieving revision 1.43
diff -u -p -r1.43 Makefile.am
--- src/Makefile.am	10 Mar 2005 09:44:36 -0000	1.43
+++ src/Makefile.am	10 Mar 2005 18:54:21 -0000
@@ -12,9 +12,12 @@ else
 AM_CPPFLAGS = -DDATADIR=\""$(pkgdatadir)/"\" -DLOCALEDIR=\""$(localedir)"\"
 endif
 
-noinst_PROGRAMS = man2hlp
+noinst_PROGRAMS = man2hlp ecssup-test
 man2hlp_LDADD = $(GLIB_LIBS)
 
+ecssup_test_LDADD =	$(GLIB_LIBS)
+ecssup_test_SOURCES =	ecssup.c ecssup-test.c
+
 mcmfmt_SOURCES = mfmt.c
 
 cons_saver_SOURCES = cons.saver.c
@@ -60,6 +63,7 @@ SRCS =	achown.c achown.h background.c ba
 	tree.c tree.h treestore.c treestore.h tty.h user.c user.h	\
 	util.c util.h utilunix.c view.c view.h vfsdummy.h widget.c	\
 	widget.h win.c win.h wtools.c wtools.h unixcompat.h		\
+	ecssup.h ecssup.c
 
 if CHARSET
 mc_SOURCES = $(SRCS) $(CHARSET_SRC)
--- /dev/null	Thu Mar 10 19:55:33 2005
+++ src/ecssup-test.c	Thu Mar 10 19:03:14 2005
@@ -0,0 +1,193 @@
+#include <config.h>
+
+#undef NDEBUG
+#include <assert.h>
+#include <stdio.h>
+
+#include "global.h"
+#include "ecssup.h"
+
+static gboolean
+change_locale(const char *loc)
+{
+	const char *ident;
+
+	ident = setlocale(LC_CTYPE, loc);
+	if (!ident) {
+		(void) printf("Skipping %s locale\n", loc);
+		return FALSE;
+	} else {
+		(void) printf("Testing %s locale \"%s\"\n", loc, ident);
+		return TRUE;
+	}
+}
+
+static void
+test_locale_C(void)
+{
+	if (!change_locale("C")) return;
+
+	assert(ecs_strlen(ECS_STR("foo")) == 3);
+	assert(ecs_strlen(ECS_STR("Zuckert\374te")) == 10);
+}
+
+static void
+test_locale_en_US_UTF_8(void)	/* converts one word in both directions under a UTF-8 locale */
+{
+	const char     *teststr_mb  = "Zuckert\303\274te";	/* UTF-8: U+00FC is the byte pair C3 BC */
+	const ecs_char *teststr_ecs = ECS_STR("Zuckert\374te");
+	const char     *teststr_c   = "Zuckert\374te";	/* lone byte 0xFC: not valid UTF-8 */
+	ecs_char       *ecs;
+	char           *mbs;
+	gboolean        valid;
+
+	if (!change_locale("en_US.UTF-8")) return;
+
+	valid = ecs_mbstr_to_str(&ecs, teststr_c);
+	assert(!valid);
+
+	valid = ecs_mbstr_to_str(&ecs, teststr_mb);
+	assert(valid);
+	assert(ecs_strlen(ecs) == 10);
+	g_free(ecs);
+
+	valid = ecs_str_to_mbstr(&mbs, teststr_ecs);
+	assert(valid);
+	assert(strlen(mbs) == 11);
+	g_free(mbs);
+}
+
+/* ecs_strcpy */
+/* ecs_strncpy */
+/* ecs_strcat */
+/* ecs_strncat */
+
+static void
+test_ecs_strcmp(void)
+{
+	/* This test assumes ASCII encoding */
+
+	(void) puts("Testing ecs_strcmp ...");
+	assert(ecs_strcmp(ECS_STR("foo"), ECS_STR("bar")) > 0);
+	assert(ecs_strcmp(ECS_STR("bar"), ECS_STR("foo")) < 0);
+	assert(ecs_strcmp(ECS_STR(""), ECS_STR("")) == 0);
+	assert(ecs_strcmp(ECS_STR("f"), ECS_STR("")) > 0);
+	assert(ecs_strcmp(ECS_STR(""), ECS_STR("f")) < 0);
+}
+
+/* ecs_strcoll */
+/* ecs_strncmp */
+/* ecs_strxfrm */
+
+static void
+test_ecs_strchr(void)
+{
+	const ecs_char foo[] = ECS_STR("foo");
+
+	(void) puts("Testing ecs_strchr ...");
+	assert(ecs_strchr(foo, ECS_CHAR('f')) == foo);
+	assert(ecs_strchr(foo, ECS_CHAR('o')) == foo + 1);
+	assert(ecs_strchr(foo, ECS_CHAR('\0')) == foo + 3);
+	assert(ecs_strchr(foo, ECS_CHAR('b')) == NULL);
+}
+
+static void
+test_ecs_strcspn(void)
+{
+	const ecs_char test[] = ECS_STR("test string0123");
+
+	(void) puts("Testing ecs_strcspn ...");
+	assert(ecs_strcspn(test, ECS_STR("t")) == 0);
+	assert(ecs_strcspn(test, ECS_STR("e")) == 1);
+	assert(ecs_strcspn(test, ECS_STR("te")) == 0);
+	assert(ecs_strcspn(test, ECS_STR("et")) == 0);
+	assert(ecs_strcspn(test, ECS_STR("")) == 15);
+	assert(ecs_strcspn(test, ECS_STR("XXX")) == 15);
+}
+
+/* ecs_strpbrk */
+
+static void
+test_ecs_strrchr(void)
+{
+	const ecs_char foo[] = ECS_STR("foo");
+
+	(void) puts("Testing ecs_strrchr ...");
+	assert(ecs_strrchr(foo, ECS_CHAR('f')) == foo);
+	assert(ecs_strrchr(foo, ECS_CHAR('o')) == foo + 2);
+	assert(ecs_strrchr(foo, ECS_CHAR('\0')) == foo + 3);
+	assert(ecs_strrchr(foo, ECS_CHAR('b')) == NULL);
+}
+
+/* extern ecs_char *ecs_strstr(const ecs_char *, const ecs_char *); */
+
+/* ecs_strtok */
+
+static void
+test_ecs_strlen(void)
+{
+	(void) puts("Testing ecs_strlen ...");
+	assert(ecs_strlen(ECS_STR("")) == 0);
+	assert(ecs_strlen(ECS_STR("foo")) == 3);
+	assert(ecs_strlen(ECS_STR("\1\2\3\4\5")) == 5);
+}
+
+/* extern ecs_char *ecs_xstrdup(const ecs_char *); */
+
+static void
+test_ecs_strlcpy(void)
+{
+	ecs_char dest[20];	/* sizes passed below count elements, not bytes */
+
+	(void) puts("Testing ecs_strlcpy ...");
+	assert(ecs_strlcpy(dest, sizeof(dest) / sizeof(dest[0]), ECS_STR("")) == 0);
+	assert(dest[0] == ECS_CHAR('\0'));
+	assert(ecs_strlcpy(dest, sizeof(dest) / sizeof(dest[0]), ECS_STR("onetwothree")) == 11);
+	assert(dest[11] == ECS_CHAR('\0'));
+	assert(ecs_strcmp(dest, ECS_STR("onetwothree")) == 0);
+	assert(ecs_strlcpy(dest, 5, ECS_STR("onetwothree")) == 11);
+	assert(dest[4] == ECS_CHAR('\0'));
+	assert(ecs_strcmp(dest, ECS_STR("onet")) == 0);
+}
+
+static void
+test_ecs_strlcat(void)
+{
+	ecs_char dest[20];
+
+	(void) puts("Testing ecs_strlcat ...");
+	dest[0] = ECS_CHAR('\0');
+	assert(ecs_strlcat(dest, 0, ECS_STR("foo")) == 3);
+	assert(dest[0] == ECS_CHAR('\0'));
+	assert(ecs_strlcat(dest, 1, ECS_STR("foo")) == 3);
+	assert(dest[0] == ECS_CHAR('\0'));
+	assert(ecs_strlcat(dest, 2, ECS_STR("foo")) == 3);
+	assert(dest[0] == ECS_CHAR('f'));
+	assert(dest[1] == ECS_CHAR('\0'));
+	dest[1] = ECS_CHAR('X');
+	assert(ecs_strlcat(dest, 1, ECS_STR("bar")) == 3);
+	assert(dest[0] == ECS_CHAR('f'));
+	assert(dest[1] == ECS_CHAR('X'));
+}
+
+/* extern size_t ecs_strlcat(ecs_char *, size_t, const ecs_char *); */
+
+/* extern void ecs_strbox(const ecs_char *, size_t *ret_width,
+	size_t *ret_height); */
+
+int main(void)
+{
+#ifdef EXTCHARSET_ENABLED
+	test_locale_C();
+	test_locale_en_US_UTF_8();
+#endif
+	test_ecs_strcmp();
+	test_ecs_strchr();
+	test_ecs_strrchr();
+	test_ecs_strlen();
+	test_ecs_strlcpy();
+	test_ecs_strlcat();
+	test_ecs_strcspn();
+	(void) puts("All tests passed.");
+	return 0;
+}
--- /dev/null	Thu Mar 10 19:55:33 2005
+++ src/ecssup.c	Thu Mar 10 19:49:27 2005
@@ -0,0 +1,276 @@
+#include <config.h>
+
+#include <assert.h>
+#include <ctype.h>
+
+#include "global.h"
+#include "ecssup.h"
+
+/*
+ * String type conversion
+ */
+
+extern gboolean ecs_mbstr_to_str(ecs_char **ret_str, const char *s)
+{
+#ifdef EXTCHARSET_ENABLED
+	size_t maxlen, len;
+	ecs_char *str;
+
+	maxlen = strlen(s);	/* buffer gets one wide character per input byte */
+
+	str = g_new(ecs_char, maxlen + 1);
+	len = mbstowcs(str, s, maxlen + 1);
+	if (len == (size_t) -1) {	/* mbstowcs rejected the input */
+		g_free(str);
+		return FALSE;	/* *ret_str is not written on this path */
+	}
+
+	assert(len <= maxlen);
+	*ret_str = g_renew(ecs_char, str, len + 1);	/* resize to converted length + terminator */
+	return TRUE;
+#else
+	*ret_str = g_strdup(s);	/* without EXTCHARSET_ENABLED the string is only duplicated */
+	return TRUE;
+#endif
+}
+
+extern gboolean ecs_str_to_mbstr(char **ret_str, const ecs_char *s)
+{
+#ifdef EXTCHARSET_ENABLED
+	size_t maxlen, len;
+	char *str;
+
+	maxlen = ecs_strlen(s) * MB_CUR_MAX;	/* upper bound: MB_CUR_MAX bytes per character */
+
+	str = g_new(char, maxlen + 1);
+	len = wcstombs(str, s, maxlen + 1);
+	if (len == (size_t) -1) {	/* wcstombs rejected the input */
+		g_free(str);
+		return FALSE;	/* *ret_str is not written on this path */
+	}
+
+	assert(len <= maxlen);
+	*ret_str = g_renew(char, str, len + 1);	/* resize to converted length + terminator */
+	return TRUE;
+#else
+	*ret_str = g_strdup(s);	/* without EXTCHARSET_ENABLED the string is only duplicated */
+	return TRUE;
+#endif
+}
+
+/*
+ * Character classification; the narrow <ctype.h> functions never see a negative value
+ */
+
+#ifdef EXTCHARSET_ENABLED
+# ifdef HAVE_WCTYPE_H
+#  include <wctype.h>
+#  define ECS_CTYPE(wf, cf, c) \
+	(wf(c))
+# else
+#  define ECS_CTYPE(wf, cf, c) /* unsigned compare also rejects negative wchar_t */ \
+	(((unsigned long) (c) > (unsigned long) (UCHAR_MAX)) ? FALSE : (cf((int) (c))))
+# endif
+#else
+# define ECS_CTYPE(wf, cf, c) /* plain char may be signed; <ctype.h> needs unsigned char */ \
+	(cf((unsigned char) (c)))
+#endif
+
+extern gboolean ecs_isalnum(ecs_char c)
+{
+	return ECS_CTYPE(iswalnum, isalnum, c);
+}
+
+extern gboolean ecs_isalpha(ecs_char c)
+{
+	return ECS_CTYPE(iswalpha, isalpha, c);
+}
+
+extern gboolean ecs_isdigit(ecs_char c)
+{
+	return ECS_CTYPE(iswdigit, isdigit, c);
+}
+
+extern gboolean ecs_ispunct(ecs_char c)
+{
+	return ECS_CTYPE(iswpunct, ispunct, c);
+}
+
+extern gboolean ecs_isspace(ecs_char c)
+{
+	return ECS_CTYPE(iswspace, isspace, c);
+}
+
+#undef ECS_CTYPE
+
+/*
+ * ISO C90 <string.h> functions
+ */
+
+/* missing: ecs_strcpy */
+/* missing: ecs_strncpy */
+/* missing: ecs_strcat */
+/* missing: ecs_strncat */
+
+int
+ecs_strcmp(const ecs_char *a, const ecs_char *b)
+{
+	unsigned long lhs, rhs;
+
+	/* Walk both strings in lockstep until they differ or both end. */
+	for (; *a == *b; a++, b++) {
+		if (*a == ECS_CHAR('\0'))
+			return 0;
+	}
+	lhs = (unsigned long) *a;	/* first differing pair decides the order */
+	rhs = (unsigned long) *b;
+	return (lhs > rhs) - (lhs < rhs);
+}
+
+/* missing: ecs_strcoll */
+/* missing: ecs_strncmp */
+/* missing: ecs_strxfrm */
+
+ecs_char *
+ecs_strchr(const ecs_char *s, ecs_char c)
+{
+	/* Testing for c first lets a search for '\0' find the terminator. */
+	while (*s != c) {
+		if (*s == ECS_CHAR('\0'))
+			return NULL;
+		s++;
+	}
+	return (ecs_char *) s;
+}
+
+size_t
+ecs_strcspn(const ecs_char *haystack, const ecs_char *needles)
+{
+	const ecs_char *h, *n;
+
+	for (h = haystack; *h != ECS_CHAR('\0'); h++) {
+		for (n = needles; *n != ECS_CHAR('\0') && *n != *h; n++)
+			continue;
+		if (*n != ECS_CHAR('\0'))
+			break;	/* *h occurs in needles */
+	}
+	return (size_t) (h - haystack);
+}
+
+/* missing: ecs_strpbrk */
+
+ecs_char *
+ecs_strrchr(const ecs_char *s, ecs_char c)
+{
+	const ecs_char *last = NULL;
+
+	/* The terminator itself is examined, so c == '\0' matches it. */
+	do {
+		if (*s == c)
+			last = s;
+	} while (*s++ != ECS_CHAR('\0'));
+
+	return (ecs_char *) last;
+}
+
+size_t
+ecs_strspn(const ecs_char *s, const ecs_char *chars)
+{
+	size_t n = 0;
+
+	/* Count the leading characters of s that all occur in chars. */
+	while (s[n] != ECS_CHAR('\0') && ecs_strchr(chars, s[n]) != NULL)
+		n++;
+
+	return n;
+}
+
+ecs_char *
+ecs_strstr(const ecs_char *s, const ecs_char *sub)
+{
+	size_t i, j;
+
+	for (i = 0;; i++) {	/* the terminator is a start position too, so "" is found in "" */
+		for (j = 0; sub[j] != ECS_CHAR('\0'); j++) {
+			if (s[i + j] != sub[j])
+				goto next_i;
+		}
+		return (ecs_char *) s + i;
+	next_i:
+		if (s[i] == ECS_CHAR('\0'))
+			return NULL;	/* no start position left */
+	}
+}
+
+/* missing: ecs_strtok */
+
+size_t
+ecs_strlen(const ecs_char *s)
+{
+	const ecs_char *end = s;
+
+	while (*end != ECS_CHAR('\0'))
+		end++;
+	return (size_t) (end - s);	/* characters before the terminator */
+}
+
+/*
+ * Other functions
+ */
+
+ecs_char *ecs_xstrdup(const ecs_char *s)
+{
+	/* Number of elements to copy, terminator included. */
+	const size_t count = ecs_strlen(s) + 1;
+	ecs_char *copy;
+
+	copy = g_new(ecs_char, count);
+	memcpy(copy, s, count * sizeof(*copy));
+	return copy;
+}
+
+size_t
+ecs_strlcpy(ecs_char *dst, size_t dstsize, const ecs_char *src)
+{
+	size_t n = 0;
+
+	if (dstsize != 0) {
+		/* Copy at most dstsize - 1 characters, then terminate. */
+		for (; n + 1 < dstsize && src[n] != ECS_CHAR('\0'); n++)
+			dst[n] = src[n];
+		dst[n] = ECS_CHAR('\0');
+	}
+	/* Keep measuring src: the result is its full length. */
+	while (src[n] != ECS_CHAR('\0'))
+		n++;
+	return n;
+}
+
+size_t
+ecs_strlcat(ecs_char *dst, size_t dstsize, const ecs_char *src)
+{
+	size_t di;
+
+	for (di = 0; di < dstsize && dst[di] != ECS_CHAR('\0'); di++)
+		continue;	/* di: index of dst's terminator, or dstsize if none was found */
+	return ecs_strlcpy(dst + di, dstsize - di, src);	/* NOTE(review): result is the length of src only; BSD strlcat would add di -- confirm intent */
+}
+
+void
+ecs_strbox(const ecs_char *s, size_t *ret_width, size_t *ret_height)
+{
+	size_t newlines = 0, widest = 0, run = 0;
+
+	/* run counts the characters seen since the last newline. */
+	for (; *s != ECS_CHAR('\0'); s++) {
+		if (*s == ECS_CHAR('\n')) {
+			newlines++;
+			run = 0;
+			continue;
+		}
+		if (++run > widest)
+			widest = run;
+	}
+	*ret_width  = widest;	/* longest run between newlines */
+	*ret_height = newlines;	/* number of '\n' characters */
+}
--- /dev/null	Thu Mar 10 19:55:33 2005
+++ src/ecssup.h	Thu Mar 10 19:07:32 2005
@@ -0,0 +1,74 @@
+#ifndef MC_ECSSUP_H
+#define MC_ECSSUP_H
+
+/*
+ * This header provides string processing functions for extended
+ * character sets (ECS), as well as for the traditional one-to-one
+ * byte-to-character encoding.
+ */
+
+#ifdef EXTCHARSET_ENABLED
+# include <stdlib.h>
+typedef wchar_t ecs_char;
+#define ECS_CHAR(c)		(L##c)
+#define ECS_STR(s)		(L##s)
+#else
+typedef char ecs_char;
+#define ECS_CHAR(c)		(c)
+#define ECS_STR(s)		(s)
+#endif
+
+/*
+ * String conversion functions between the wide character encoding and
+ * the multibyte encoding. The returned strings should be freed using
+ * g_free after use. The return value is TRUE if the string is valid
+ * and has been converted, FALSE otherwise.
+ */
+
+extern gboolean ecs_mbstr_to_str(ecs_char **ret_str, const char *);
+extern gboolean ecs_str_to_mbstr(char **ret_str, const ecs_char *);
+
+/*
+ * Replacements for the ISO C90 <ctype.h> functions.
+ */
+
+extern gboolean ecs_isalnum(ecs_char);
+extern gboolean ecs_isalpha(ecs_char);
+extern gboolean ecs_isdigit(ecs_char);
+extern gboolean ecs_ispunct(ecs_char);
+extern gboolean ecs_isspace(ecs_char);
+
+/*
+ * Replacements for the ISO C90 <string.h> functions.
+ */
+
+/* missing: ecs_strcpy */
+/* missing: ecs_strncpy */
+/* missing: ecs_strcat */
+/* missing: ecs_strncat */
+extern int ecs_strcmp(const ecs_char *, const ecs_char *);
+/* missing: ecs_strcoll */
+/* missing: ecs_strncmp */
+/* missing: ecs_strxfrm */
+extern ecs_char *ecs_strchr(const ecs_char *, ecs_char);
+extern size_t ecs_strcspn(const ecs_char *, const ecs_char *);
+/* missing: ecs_strpbrk */
+extern ecs_char *ecs_strrchr(const ecs_char *, ecs_char);
+extern size_t ecs_strspn(const ecs_char *, const ecs_char *);
+extern ecs_char *ecs_strstr(const ecs_char *, const ecs_char *);
+/* missing: ecs_strtok */
+extern size_t ecs_strlen(const ecs_char *);
+
+/*
+ * Other string functions.
+ */
+
+extern ecs_char *ecs_xstrdup(const ecs_char *);
+
+extern size_t ecs_strlcpy(ecs_char *, size_t, const ecs_char *);
+extern size_t ecs_strlcat(ecs_char *, size_t, const ecs_char *);
+
+extern void ecs_strbox(const ecs_char *, size_t *ret_width,
+	size_t *ret_height);
+
+#endif


[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]