Re: [evolution-patches] fix for bug #24026 (try harder not to sed in UTF-8)

From: Not Zed <notzed ximian com>
To: Jeffrey Stedfast <fejj ximian com>
Cc: evolution-patches ximian com
Subject: Re: [evolution-patches] fix for bug #24026 (try harder not to sed in UTF-8)
Date: Fri, 23 Jul 2004 11:44:28 +0800

Shouldn't we just do something similar to the decoding code? I.e., if we can't use 7bit, just try the locale charset; if that doesn't work, then try this, or perhaps just drop back to UTF-8.

On Thu, 2004-07-22 at 13:42 -0400, Jeffrey Stedfast wrote:

http://bugzilla.ximian.com/show_bug.cgi?id=24026

I don't expect this to be accepted based on NotZed's comments in the
bug, but jp asked me to send in a patch so here it is.

btw, I think at least the charset_mask_best() changes should go in even
if the other changes don't.

Jeff

Plain text document attachment (24026.patch)

Index: ChangeLog
===================================================================
RCS file: /cvs/gnome/evolution/camel/ChangeLog,v
retrieving revision 1.2223
diff -u -r1.2223 ChangeLog
--- ChangeLog	21 Jul 2004 15:55:00 -0000	1.2223
+++ ChangeLog	22 Jul 2004 17:29:34 -0000
@@ -1,3 +1,14 @@
+2004-07-22  Jeffrey Stedfast  <fejj novell com>
+
+	* camel-charset-map.c (main): Add some multibyte charsets to the
+	table.
+	(camel_charset_best_mask): Changed the logic slightly to only
+	match certain charsets if the locale matches (Macedonians don't
+	want to use koi8-r for example). This logic also prevents the use
+	of a multibyte charset for mixed languages (such as DanW's feared
+	Greek and Japanese example) as long as the locale isn't Japanese
+	(in which case they probably want Japanese anyway).
+
 2004-07-19  Jeffrey Stedfast  <fejj novell com>
 
 	* providers/imap/camel-imap-store.c (get_subscribed_folders): Free
Index: camel-charset-map.c
===================================================================
RCS file: /cvs/gnome/evolution/camel/camel-charset-map.c,v
retrieving revision 1.38
diff -u -r1.38 camel-charset-map.c
--- camel-charset-map.c	9 Jul 2003 19:05:12 -0000	1.38
+++ camel-charset-map.c	22 Jul 2004 17:29:34 -0000
@@ -49,11 +49,13 @@
 
 #ifdef BUILD_MAP
 #include <iconv.h>
+#include <errno.h>
 #include <glib.h>
 
 static struct {
-	char *name;
-	unsigned int bit;	/* assigned bit */
+	char *name;        /* charset name */
+	int multibyte;     /* charset type */
+	unsigned int bit;  /* assigned bit */
 } tables[] = {
 	/* These are the 8bit character sets (other than iso-8859-1,
 	 * which is special-cased) which are supported by both other
@@ -61,20 +63,34 @@
 	 * they're listed in is the order they'll be tried in, so put
 	 * the more-popular ones first.
 	 */
-	{ "iso-8859-2", 0 },	/* Central/Eastern European */
-	{ "iso-8859-4", 0 },	/* Baltic */
-	{ "koi8-r", 0 },	/* Russian */
-	{ "koi8-u", 0 },	/* Ukranian */
-	{ "iso-8859-5", 0 },	/* Least-popular Russian encoding */
-	{ "iso-8859-7", 0 },	/* Greek */
-	{ "iso-8859-8", 0 },    /* Hebrew; Visual */
-	{ "iso-8859-9", 0 },	/* Turkish */
-	{ "iso-8859-13", 0 },	/* Baltic again */
-	{ "iso-8859-15", 0 },	/* New-and-improved iso-8859-1, but most
-				 * programs that support this support UTF8
-				 */
-	{ "windows-1251", 0 },	/* Russian */
-	{ 0, 0 }
+	{ "iso-8859-2",   0, 0 },  /* Central/Eastern European */
+	{ "iso-8859-4",   0, 0 },  /* Baltic */
+	{ "koi8-r",       0, 0 },  /* Russian */
+	{ "koi8-u",       0, 0 },  /* Ukrainian */
+	{ "iso-8859-5",   0, 0 },  /* Least-popular Russian encoding */
+	{ "iso-8859-7",   0, 0 },  /* Greek */
+	{ "iso-8859-8",   0, 0 },  /* Hebrew; Visual */
+	{ "iso-8859-9",   0, 0 },  /* Turkish */
+	{ "iso-8859-13",  0, 0 },  /* Baltic again */
+	{ "iso-8859-15",  0, 0 },  /* New-and-improved iso-8859-1, but most
+				    * programs that support this support UTF8
+				    */
+	{ "windows-1251", 0, 0 },  /* Russian */
+	
+	/* These are the multibyte character sets which are commonly
+	 * supported by other mail clients. Note: order for multibyte
+	 * charsets does not affect priority unlike the 8bit charsets
+	 * listed above.
+	 */
+	{ "iso-2022-jp",  1, 0 },  /* Japanese designed for use over the Net */
+	{ "Shift-JIS",    1, 0 },  /* Japanese as used by Windows and MacOS systems */
+	{ "euc-jp",       1, 0 },  /* Japanese traditionally used on Unix systems */
+	{ "euc-kr",       1, 0 },  /* Korean */
+	{ "iso-2022-kr",  1, 0 },  /* Korean (less popular than euc-kr) */
+	{ "gb2312",       1, 0 },  /* Simplified Chinese */
+	{ "Big5",         1, 0 },  /* Traditional Chinese */
+	{ "euc-tw",       1, 0 },
+	{ NULL,           0, 0 }
 };
 
 unsigned int encoding_map[256 * 256];
@@ -85,115 +101,181 @@
 #define UCS "UCS-4LE"
 #endif
 
-int main (void)
+int main (int argc, char **argv)
 {
-	int i, j;
-	int max, min;
-	int bit = 0x01;
-	int k;
+	GHashTable *table_hash;
+	size_t inleft, outleft;
+	char *inbuf, *outbuf;
+	guint32 out[128], c;
+	unsigned int bit = 0x01;
+	char in[128];
+	int i, j, k;
 	int bytes;
 	iconv_t cd;
-	char in[128];
-	guint32 out[128];
-	char *inptr, *outptr;
-	size_t inlen, outlen;
-
+	
 	/* dont count the terminator */
-	bytes = ((sizeof(tables)/sizeof(tables[0]))+7-1)/8;
-
+	bytes = (G_N_ELEMENTS (tables) + 7 - 1) / 8;
+	
 	for (i = 0; i < 128; i++)
 		in[i] = i + 128;
-
-	for (j = 0; tables[j].name; j++) {
+	
+	for (j = 0; tables[j].name && !tables[j].multibyte; j++) {
 		cd = iconv_open (UCS, tables[j].name);
-		inptr = in;
-		outptr = (char *)(out);
-		inlen = sizeof (in);
-		outlen = sizeof (out);
-		while (iconv (cd, &inptr, &inlen, &outptr, &outlen) == -1) {
+		inbuf = in;
+		outbuf = (char *)(out);
+		inleft = sizeof (in);
+		outleft = sizeof (out);
+		while (iconv (cd, &inbuf, &inleft, &outbuf, &outleft) == -1) {
 			if (errno == EILSEQ) {
-				inptr++;
-				inlen--;
+				inbuf++;
+				inleft--;
 			} else {
-				printf ("%s\n", strerror (errno));
+				fprintf (stderr, "iconv (%s->UCS4, ..., %d, ..., %d): %s",
+					 tables[j].name, inleft, outleft,
+					 strerror (errno));
 				exit (1);
 			}
 		}
 		iconv_close (cd);
-
-		for (i = 0; i < 128 - outlen / 4; i++) {
+		
+		for (i = 0; i < 128 - outleft / 4; i++) {
 			encoding_map[i] |= bit;
 			encoding_map[out[i]] |= bit;
 		}
-
+		
 		tables[j].bit = bit;
 		bit <<= 1;
 	}
-
-	printf("/* This file is automatically generated: DO NOT EDIT */\n\n");
-
-	for (i=0;i<256;i++) {
+	
+	/* Multibyte tables */
+	for ( ; tables[j].name && tables[j].multibyte; j++) {
+		cd = iconv_open (tables[j].name, UCS);
+		if (cd == (iconv_t) -1)
+			continue;
+		
+		for (c = 128, i = 0; c < 65535 && i < 65535; c++) {
+			inbuf = (char *) &c;
+			inleft = sizeof (c);
+			outbuf = in;
+			outleft = sizeof (in);
+			
+			if (iconv (cd, &inbuf, &inleft, &outbuf, &outleft) != (size_t) -1) {
+				/* this is a legal character in charset tables[j].name */
+				iconv (cd, NULL, NULL, &outbuf, &outleft);
+				encoding_map[i++] |= bit;
+				encoding_map[c] |= bit;
+			} else {
+				/* reset the iconv descriptor */
+				iconv (cd, NULL, NULL, NULL, NULL);
+			}
+		}
+		
+		iconv_close (cd);
+		
+		tables[j].bit = bit;
+		bit <<= 1;
+	}
+	
+	printf ("/* This file is automatically generated: DO NOT EDIT */\n\n");
+	
+	/* FIXME: we can condense better than what my quick hack does,
+	   but it'd be more work and I'm not sure if it's worth it or
+	   not. Currently I'm just making it so that tables that
+	   contain all of the same values will only ever be
+	   one-of-a-kind by making duplicates into macro aliases for
+	   the original */
+	
+	table_hash = g_hash_table_new (g_int_hash, g_int_equal);
+	
+	for (i = 0; i < 256; i++) {
 		/* first, do we need this block? */
-		for (k=0;k<bytes;k++) {
-			for (j=0;j<256;j++) {
-				if ((encoding_map[i*256 + j] & (0xff << (k*8))) != 0)
-					break;
+		for (k = 0; k < bytes; k++) {
+			int first = encoding_map[i * 256] & (0xff << (k * 8));
+			int same = TRUE;
+			int dump = FALSE;
+			
+			for (j = 0; j < 256; j++) {
+				same = same && (encoding_map[i * 256 + j] & (0xff << (k * 8))) == first;
+				if ((encoding_map[i * 256 + j] & (0xff << (k * 8))) != 0)
+					dump = TRUE;
 			}
-			if (j < 256) {
+			
+			if (dump) {
+				if (same) {
+					/* this table is aliasable */
+					char *table_name;
+					
+					if ((table_name = g_hash_table_lookup (table_hash, &first))) {
+						/* we've already written out a table with the exact same
+						   values so we can just alias it with a macro. */
+						printf ("#define m%02x%x %s\n\n", i, k, table_name);
+						continue;
+					} else {
+						table_name = g_strdup_printf ("m%02x%x", i, k);
+						g_hash_table_insert (table_hash, &first, table_name);
+					}
+				}
+				
 				/* yes, dump it */
-				printf("static unsigned char m%02x%x[256] = {\n\t", i, k);
-				for (j=0;j<256;j++) {
-					printf("0x%02x, ", (encoding_map[i*256+j] >> (k*8)) & 0xff );
-					if (((j+1)&7) == 0 && j<255)
-						printf("\n\t");
+				printf ("static unsigned char m%02x%x[256] = {\n\t", i, k);
+				for (j = 0; j < 256; j++) {
+					printf ("0x%02x, ", (encoding_map[i * 256 + j] >> (k * 8)) & 0xff);
+					if (((j + 1) & 7) == 0 && j < 255)
+						printf ("\n\t");
 				}
-				printf("\n};\n\n");
+				printf ("\n};\n\n");
 			}
 		}
 	}
-
-	printf("struct {\n");
-	for (k=0;k<bytes;k++) {
-		printf("\tunsigned char *bits%d;\n", k);
+	
+	printf ("struct {\n");
+	for (k = 0; k < bytes; k++) {
+		printf ("\tunsigned char *bits%d;\n", k);
 	}
-	printf("} camel_charmap[256] = {\n\t");
-	for (i=0;i<256;i++) {
+	
+	printf ("} camel_charmap[256] = {\n\t");
+	for (i = 0; i < 256; i++) {
 		/* first, do we need this block? */
-		printf("{ ");
-		for (k=0;k<bytes;k++) {
-			for (j=0;j<256;j++) {
-				if ((encoding_map[i*256 + j] & (0xff << (k*8))) != 0)
+		printf ("{ ");
+		for (k = 0; k < bytes; k++) {
+			for (j = 0; j < 256; j++) {
+				if ((encoding_map[i * 256 + j] & (0xff << (k * 8))) != 0)
 					break;
 			}
+			
 			if (j < 256) {
-				printf("m%02x%x, ", i, k);
+				printf ("m%02x%x, ", i, k);
 			} else {
-				printf("0, ");
+				printf ("0, ");
 			}
 		}
-		printf("}, ");
-		if (((i+1)&7) == 0 && i<255)
-			printf("\n\t");
+		
+		printf ("}, ");
+		if (((i + 1) & 7) == 0 && i < 255)
+			printf ("\n\t");
 	}
-	printf("\n};\n\n");
-
-	printf("struct {\n\tconst char *name;\n\tunsigned int bit;\n} camel_charinfo[] = {\n");
-	for (j=0;tables[j].name;j++) {
-		printf("\t{ \"%s\", 0x%04x },\n", tables[j].name, tables[j].bit);
+	printf ("\n};\n\n");
+	
+	printf ("struct {\n\tconst char *name;\n\tunsigned int bit;\n} camel_charinfo[] = {\n");
+	for (j = 0; tables[j].name; j++) {
+		printf ("\t{ \"%s\", 0x%04x },\n", tables[j].name, tables[j].bit);
 	}
-	printf("};\n\n");
-
+	printf ("};\n\n");
+	
 	printf("#define charset_mask(x) \\\n");
-	for (k=0;k<bytes;k++) {
-		if (k!=0)
-			printf("\t| ");
+	for (k = 0; k < bytes; k++) {
+		if (k != 0)
+			printf ("\t| ");
 		else
-			printf("\t");
-		printf("(camel_charmap[(x)>>8].bits%d?camel_charmap[(x)>>8].bits%d[(x)&0xff]<<%d:0)", k, k, k*8);
-		if (k<bytes-1)
-			printf("\t\\\n");
+			printf ("\t");
+		
+		printf ("(camel_charmap[(x) >> 8].bits%d ? camel_charmap[(x) >> 8].bits%d[(x) & 0xff] << %d : 0)",
+			k, k, k * 8);
+		
+		if (k < bytes - 1)
+			printf ("\t\\\n");
 	}
-	printf("\n\n");
+	printf ("\n\n");
 	
 	return 0;
 }
@@ -211,6 +293,8 @@
 #include <langinfo.h>
 #endif
 
+#include <gal/util/e-iconv.h>
+
 void
 camel_charset_init (CamelCharset *c)
 {
@@ -261,12 +345,19 @@
 static const char *
 camel_charset_best_mask(unsigned int mask)
 {
+	const char *locale_lang, *lang;
 	int i;
-
-	for (i=0;i<sizeof(camel_charinfo)/sizeof(camel_charinfo[0]);i++) {
-		if (camel_charinfo[i].bit & mask)
-			return camel_charinfo[i].name;
+	
+	locale_lang = e_iconv_locale_language ();
+	for (i = 0; i < G_N_ELEMENTS (camel_charinfo); i++) {
+		if (camel_charinfo[i].bit & mask) {
+			lang = e_iconv_charset_language (camel_charinfo[i].name);
+			
+			if (!lang || (locale_lang && !strncmp (locale_lang, lang, 2)))
+				return camel_charinfo[i].name;
+		}
 	}
+	
 	return "UTF-8";
 }

Michael Zucchi <notzed ximian com>
"born to die, live to work, it's all downhill from here"
Novell's Evolution and Free Software Developer

Follow-Ups:
- Re: [evolution-patches] fix for bug #24026 (try harder not to sed in UTF-8)
  - From: Jeffrey Stedfast

References:
- [evolution-patches] fix for bug #24026 (try harder not to sed in UTF-8)
  - From: Jeffrey Stedfast

[Date Prev][Date Next] [Thread Prev][Thread Next] [Thread Index] [Date Index] [Author Index]