[evolution-patches] fix for bug #24026 (try harder not to sed in UTF-8)

From: Jeffrey Stedfast <fejj ximian com>
To: evolution-patches ximian com
Subject: [evolution-patches] fix for bug #24026 (try harder not to sed in UTF-8)
Date: Thu, 22 Jul 2004 13:42:52 -0400

http://bugzilla.ximian.com/show_bug.cgi?id=24026

I don't expect this to be accepted absed on NotZed's comments in the
bug, but jp asked me to send in a patch so here it is.

btw, I think at least the charset_mask_best() changes should go in even
if the other changes don't.

Jeff

-- 
Jeffrey Stedfast
Evolution Hacker - Novell, Inc.
fejj ximian com  - www.novell.com

Index: ChangeLog
===================================================================
RCS file: /cvs/gnome/evolution/camel/ChangeLog,v
retrieving revision 1.2223
diff -u -r1.2223 ChangeLog
--- ChangeLog	21 Jul 2004 15:55:00 -0000	1.2223
+++ ChangeLog	22 Jul 2004 17:29:34 -0000
@@ -1,3 +1,14 @@
+2004-07-22  Jeffrey Stedfast  <fejj novell com>
+
+	* camel-charset-map.c (main): Add some multibyte charsets to the
+	table.
+	(camel_charset_best_mask): Changed the logic slightly to only
+	match certain charsets if the locale matches (Macedonians don't
+	want to use koi8-r for example). This logic also prevents the use
+	of a multibyte charset for mixed languages (such as DanW's feared
+	Greek and Japanese example) as long as the locale isn't Japanese
+	(in which case they probably want Japanese anyway).
+
 2004-07-19  Jeffrey Stedfast  <fejj novell com>
 
 	* providers/imap/camel-imap-store.c (get_subscribed_folders): Free
Index: camel-charset-map.c
===================================================================
RCS file: /cvs/gnome/evolution/camel/camel-charset-map.c,v
retrieving revision 1.38
diff -u -r1.38 camel-charset-map.c
--- camel-charset-map.c	9 Jul 2003 19:05:12 -0000	1.38
+++ camel-charset-map.c	22 Jul 2004 17:29:34 -0000
@@ -49,11 +49,13 @@
 
 #ifdef BUILD_MAP
 #include <iconv.h>
+#include <errno.h>
 #include <glib.h>
 
 static struct {
-	char *name;
-	unsigned int bit;	/* assigned bit */
+	char *name;        /* charset name */
+	int multibyte;     /* charset type */
+	unsigned int bit;  /* assigned bit */
 } tables[] = {
 	/* These are the 8bit character sets (other than iso-8859-1,
 	 * which is special-cased) which are supported by both other
@@ -61,20 +63,34 @@
 	 * they're listed in is the order they'll be tried in, so put
 	 * the more-popular ones first.
 	 */
-	{ "iso-8859-2", 0 },	/* Central/Eastern European */
-	{ "iso-8859-4", 0 },	/* Baltic */
-	{ "koi8-r", 0 },	/* Russian */
-	{ "koi8-u", 0 },	/* Ukranian */
-	{ "iso-8859-5", 0 },	/* Least-popular Russian encoding */
-	{ "iso-8859-7", 0 },	/* Greek */
-	{ "iso-8859-8", 0 },    /* Hebrew; Visual */
-	{ "iso-8859-9", 0 },	/* Turkish */
-	{ "iso-8859-13", 0 },	/* Baltic again */
-	{ "iso-8859-15", 0 },	/* New-and-improved iso-8859-1, but most
-				 * programs that support this support UTF8
-				 */
-	{ "windows-1251", 0 },	/* Russian */
-	{ 0, 0 }
+	{ "iso-8859-2",   0, 0 },  /* Central/Eastern European */
+	{ "iso-8859-4",   0, 0 },  /* Baltic */
+	{ "koi8-r",       0, 0 },  /* Russian */
+	{ "koi8-u",       0, 0 },  /* Ukranian */
+	{ "iso-8859-5",   0, 0 },  /* Least-popular Russian encoding */
+	{ "iso-8859-7",   0, 0 },  /* Greek */
+	{ "iso-8859-8",   0, 0 },  /* Hebrew; Visual */
+	{ "iso-8859-9",   0, 0 },  /* Turkish */
+	{ "iso-8859-13",  0, 0 },  /* Baltic again */
+	{ "iso-8859-15",  0, 0 },  /* New-and-improved iso-8859-1, but most
+				    * programs that support this support UTF8
+				    */
+	{ "windows-1251", 0, 0 },  /* Russian */
+	
+	/* These are the multibyte character sets which are commonly
+	 * supported by other mail clients. Note: order for multibyte
+	 * charsets does not affect priority unlike the 8bit charsets
+	 * listed above.
+	 */
+	{ "iso-2022-jp",  1, 0 },  /* Japanese designed for use over the Net */
+	{ "Shift-JIS",    1, 0 },  /* Japanese as used by Windows and MacOS systems */
+	{ "euc-jp",       1, 0 },  /* Japanese traditionally used on Unix systems */
+	{ "euc-kr",       1, 0 },  /* Korean */
+	{ "iso-2022-kr",  1, 0 },  /* Korean (less popular than euc-kr) */
+	{ "gb2312",       1, 0 },  /* Simplified Chinese */
+	{ "Big5",         1, 0 },  /* Traditional Chinese */
+	{ "euc-tw",       1, 0 },
+	{ NULL,           0, 0 }
 };
 
 unsigned int encoding_map[256 * 256];
@@ -85,115 +101,181 @@
 #define UCS "UCS-4LE"
 #endif
 
-int main (void)
+int main (int argc, char **argv)
 {
-	int i, j;
-	int max, min;
-	int bit = 0x01;
-	int k;
+	GHashTable *table_hash;
+	size_t inleft, outleft;
+	char *inbuf, *outbuf;
+	guint32 out[128], c;
+	unsigned int bit = 0x01;
+	char in[128];
+	int i, j, k;
 	int bytes;
 	iconv_t cd;
-	char in[128];
-	guint32 out[128];
-	char *inptr, *outptr;
-	size_t inlen, outlen;
-
+	
 	/* dont count the terminator */
-	bytes = ((sizeof(tables)/sizeof(tables[0]))+7-1)/8;
-
+	bytes = (G_N_ELEMENTS (tables) + 7 - 1) / 8;
+	
 	for (i = 0; i < 128; i++)
 		in[i] = i + 128;
-
-	for (j = 0; tables[j].name; j++) {
+	
+	for (j = 0; tables[j].name && !tables[j].multibyte; j++) {
 		cd = iconv_open (UCS, tables[j].name);
-		inptr = in;
-		outptr = (char *)(out);
-		inlen = sizeof (in);
-		outlen = sizeof (out);
-		while (iconv (cd, &inptr, &inlen, &outptr, &outlen) == -1) {
+		inbuf = in;
+		outbuf = (char *)(out);
+		inleft = sizeof (in);
+		outleft = sizeof (out);
+		while (iconv (cd, &inbuf, &inleft, &outbuf, &outleft) == -1) {
 			if (errno == EILSEQ) {
-				inptr++;
-				inlen--;
+				inbuf++;
+				inleft--;
 			} else {
-				printf ("%s\n", strerror (errno));
+				fprintf (stderr, "iconv (%s->UCS4, ..., %d, ..., %d): %s",
+					 tables[j].name, inleft, outleft,
+					 strerror (errno));
 				exit (1);
 			}
 		}
 		iconv_close (cd);
-
-		for (i = 0; i < 128 - outlen / 4; i++) {
+		
+		for (i = 0; i < 128 - outleft / 4; i++) {
 			encoding_map[i] |= bit;
 			encoding_map[out[i]] |= bit;
 		}
-
+		
 		tables[j].bit = bit;
 		bit <<= 1;
 	}
-
-	printf("/* This file is automatically generated: DO NOT EDIT */\n\n");
-
-	for (i=0;i<256;i++) {
+	
+	/* Mutibyte tables */
+	for ( ; tables[j].name && tables[j].multibyte; j++) {
+		cd = iconv_open (tables[j].name, UCS);
+		if (cd == (iconv_t) -1)
+			continue;
+		
+		for (c = 128, i = 0; c < 65535 && i < 65535; c++) {
+			inbuf = (char *) &c;
+			inleft = sizeof (c);
+			outbuf = in;
+			outleft = sizeof (in);
+			
+			if (iconv (cd, &inbuf, &inleft, &outbuf, &outleft) != (size_t) -1) {
+				/* this is a legal character in charset table[j].name */
+				iconv (cd, NULL, NULL, &outbuf, &outleft);
+				encoding_map[i++] |= bit;
+				encoding_map[c] |= bit;
+			} else {
+				/* reset the iconv descriptor */
+				iconv (cd, NULL, NULL, NULL, NULL);
+			}
+		}
+		
+		iconv_close (cd);
+		
+		tables[j].bit = bit;
+		bit <<= 1;
+	}
+	
+	printf ("/* This file is automatically generated: DO NOT EDIT */\n\n");
+	
+	/* FIXME: we can condense better than what my quick hack does,
+	   but it'd be more work and I'm not sure if it's worth it or
+	   not. Currently I'm just making it so that tables that
+	   contain all of the same values will only ever be
+	   one-of-a-kind by making duplicates into macro aliases for
+	   the original */
+	
+	table_hash = g_hash_table_new (g_int_hash, g_int_equal);
+	
+	for (i = 0; i < 256; i++) {
 		/* first, do we need this block? */
-		for (k=0;k<bytes;k++) {
-			for (j=0;j<256;j++) {
-				if ((encoding_map[i*256 + j] & (0xff << (k*8))) != 0)
-					break;
+		for (k = 0; k < bytes; k++) {
+			int first = encoding_map[i * 256] & (0xff << (k * 8));
+			int same = TRUE;
+			int dump = FALSE;
+			
+			for (j = 0; j < 256; j++) {
+				same = same && (encoding_map[i * 256 + j] & (0xff << (k * 8))) == first;
+				if ((encoding_map[i * 256 + j] & (0xff << (k * 8))) != 0)
+					dump = TRUE;
 			}
-			if (j < 256) {
+			
+			if (dump) {
+				if (same) {
+					/* this table is aliasable */
+					char *table_name;
+					
+					if ((table_name = g_hash_table_lookup (table_hash, &first))) {
+						/* we've already written out a table with the exact same
+						   values so we can just alias it with a macro. */
+						printf ("#define m%02x%x %s\n\n", i, k, table_name);
+						continue;
+					} else {
+						table_name = g_strdup_printf ("m%02x%x", i, k);
+						g_hash_table_insert (table_hash, &first, table_name);
+					}
+				}
+				
 				/* yes, dump it */
-				printf("static unsigned char m%02x%x[256] = {\n\t", i, k);
-				for (j=0;j<256;j++) {
-					printf("0x%02x, ", (encoding_map[i*256+j] >> (k*8)) & 0xff );
-					if (((j+1)&7) == 0 && j<255)
-						printf("\n\t");
+				printf ("static unsigned char m%02x%x[256] = {\n\t", i, k);
+				for (j = 0; j < 256; j++) {
+					printf ("0x%02x, ", (encoding_map[i * 256 + j] >> (k * 8)) & 0xff);
+					if (((j + 1) & 7) == 0 && j < 255)
+						printf ("\n\t");
 				}
-				printf("\n};\n\n");
+				printf ("\n};\n\n");
 			}
 		}
 	}
-
-	printf("struct {\n");
-	for (k=0;k<bytes;k++) {
-		printf("\tunsigned char *bits%d;\n", k);
+	
+	printf ("struct {\n");
+	for (k = 0; k < bytes; k++) {
+		printf ("\tunsigned char *bits%d;\n", k);
 	}
-	printf("} camel_charmap[256] = {\n\t");
-	for (i=0;i<256;i++) {
+	
+	printf ("} camel_charmap[256] = {\n\t");
+	for (i = 0; i < 256; i++) {
 		/* first, do we need this block? */
-		printf("{ ");
-		for (k=0;k<bytes;k++) {
-			for (j=0;j<256;j++) {
-				if ((encoding_map[i*256 + j] & (0xff << (k*8))) != 0)
+		printf ("{ ");
+		for (k = 0; k < bytes; k++) {
+			for (j = 0; j < 256; j++) {
+				if ((encoding_map[i * 256 + j] & (0xff << (k * 8))) != 0)
 					break;
 			}
+			
 			if (j < 256) {
-				printf("m%02x%x, ", i, k);
+				printf ("m%02x%x, ", i, k);
 			} else {
-				printf("0, ");
+				printf ("0, ");
 			}
 		}
-		printf("}, ");
-		if (((i+1)&7) == 0 && i<255)
-			printf("\n\t");
+		
+		printf ("}, ");
+		if (((i + 1) & 7) == 0 && i < 255)
+			printf ("\n\t");
 	}
-	printf("\n};\n\n");
-
-	printf("struct {\n\tconst char *name;\n\tunsigned int bit;\n} camel_charinfo[] = {\n");
-	for (j=0;tables[j].name;j++) {
-		printf("\t{ \"%s\", 0x%04x },\n", tables[j].name, tables[j].bit);
+	printf ("\n};\n\n");
+	
+	printf ("struct {\n\tconst char *name;\n\tunsigned int bit;\n} camel_charinfo[] = {\n");
+	for (j = 0; tables[j].name; j++) {
+		printf ("\t{ \"%s\", 0x%04x },\n", tables[j].name, tables[j].bit);
 	}
-	printf("};\n\n");
-
+	printf ("};\n\n");
+	
 	printf("#define charset_mask(x) \\\n");
-	for (k=0;k<bytes;k++) {
-		if (k!=0)
-			printf("\t| ");
+	for (k = 0; k < bytes; k++) {
+		if (k != 0)
+			printf ("\t| ");
 		else
-			printf("\t");
-		printf("(camel_charmap[(x)>>8].bits%d?camel_charmap[(x)>>8].bits%d[(x)&0xff]<<%d:0)", k, k, k*8);
-		if (k<bytes-1)
-			printf("\t\\\n");
+			printf ("\t");
+		
+		printf ("(camel_charmap[(x) >> 8].bits%d ? camel_charmap[(x) >> 8].bits%d[(x) & 0xff] << %d : 0)",
+			k, k, k * 8);
+		
+		if (k < bytes - 1)
+			printf ("\t\\\n");
 	}
-	printf("\n\n");
+	printf ("\n\n");
 	
 	return 0;
 }
@@ -211,6 +293,8 @@
 #include <langinfo.h>
 #endif
 
+#include <gal/util/e-iconv.h>
+
 void
 camel_charset_init (CamelCharset *c)
 {
@@ -261,12 +345,19 @@
 static const char *
 camel_charset_best_mask(unsigned int mask)
 {
+	const char *locale_lang, *lang;
 	int i;
-
-	for (i=0;i<sizeof(camel_charinfo)/sizeof(camel_charinfo[0]);i++) {
-		if (camel_charinfo[i].bit & mask)
-			return camel_charinfo[i].name;
+	
+	locale_lang = e_iconv_locale_language ();
+	for (i = 0; i < G_N_ELEMENTS (camel_charinfo); i++) {
+		if (camel_charinfo[i].bit & mask) {
+			lang = e_iconv_charset_language (camel_charinfo[i].name);
+			
+			if (!lang || (locale_lang && !strncmp (locale_lang, lang, 2)))
+				return camel_charinfo[i].name;
+		}
 	}
+	
 	return "UTF-8";
 }

Attachment: smime.p7s
Description: S/MIME cryptographic signature

Follow-Ups:
- Re: [evolution-patches] fix for bug #24026 (try harder not to sed in UTF-8)
  - From: Not Zed

[Date Prev][Date Next] [Thread Prev][Thread Next] [Thread Index] [Date Index] [Author Index]