Re: [evolution-patches] patch for 24026 try harder to not send headers in UTF-8

From: Jeffrey Stedfast <fejj ximian com>
To: Suresh Chandrasekharan <Suresh Chandrasekharan Eng Sun COM>
Cc: evolution-patches ximian com
Subject: Re: [evolution-patches] patch for 24026 try harder to not send headers in UTF-8
Date: Thu, 16 Oct 2003 13:40:27 -0400

rejected.

as I told you in email, I had a semi-working fix for this, but it's not
ready for prime-time.

there are quite a few problems with this patch.

1. the use of gconf is a no-no. first, we are trying to split camel off
into its own self-contained library. the only reason camel depends on gal
is for e-iconv. second, we DO NOT under any circumstances want to increase
the number of higher-level libs that camel depends on, especially for so
little gain. Thirdly, camel is multithreaded while gconf is not
thread-safe. This is a Bad Thing (tm).

2. camel should not use evo-mail specific user data anyway (such as
gconf keys or any other data that the mailer front-end saves
independently of camel)

3. and most importantly... 

-       return "UTF-8";
+       return (camel_get_default_charset());

that is just plain wrong and likely to cause data corruption. Do you
understand why the code returns UTF-8 there? It returns UTF-8 there
because that is the only charset that is guaranteed to be able to
contain all of the characters in the string without data loss.

returning an arbitrary charset without having actually tested to see
that the data could fit into that charset is asking for problems.

the attached patch is better but also not acceptable for inclusion in
1.4

Jeff

On Thu, 2003-10-16 at 12:40, Suresh Chandrasekharan wrote:
> Hi Jeff,
> 
> 	Call me a sinner, I have created this camel patch for getting the 
> charset the user sets in Composer Preferences, this will be applied only if all 
> other detection mechanisms fail.
> 	In other words camel which was a low level library is now currently 
> married to gconf, which is not that bad, as it is already dependent upon gal-2.0.
> Please check this to see whether this is a gross violation of some camel 
> fundamentals.
> 
> Thanks & Regards,
> Suresh
-- 
Jeffrey Stedfast
Evolution Hacker - Ximian, Inc.
fejj ximian com  - www.ximian.com

--- camel-charset-map.c	2003-02-25 14:43:22.000000000 -0500
+++ charset-map.c	2003-10-16 13:31:47.000000000 -0400
@@ -53,8 +53,9 @@
 #include <glib.h>
 
 static struct {
-	char *name;
-	unsigned int bit;	/* assigned bit */
+	char *name;        /* charset name */
+	int multibyte;     /* charset type */
+	unsigned int bit;  /* assigned bit */
 } tables[] = {
 	/* These are the 8bit character sets (other than iso-8859-1,
 	 * which is special-cased) which are supported by both other
@@ -62,20 +63,34 @@
 	 * they're listed in is the order they'll be tried in, so put
 	 * the more-popular ones first.
 	 */
-	{ "iso-8859-2", 0 },	/* Central/Eastern European */
-	{ "iso-8859-4", 0 },	/* Baltic */
-	{ "koi8-r", 0 },	/* Russian */
-	{ "koi8-u", 0 },	/* Ukranian */
-	{ "iso-8859-5", 0 },	/* Least-popular Russian encoding */
-	{ "iso-8859-7", 0 },	/* Greek */
-	{ "iso-8859-8", 0 },    /* Hebrew; Visual */
-	{ "iso-8859-9", 0 },	/* Turkish */
-	{ "iso-8859-13", 0 },	/* Baltic again */
-	{ "iso-8859-15", 0 },	/* New-and-improved iso-8859-1, but most
-				 * programs that support this support UTF8
-				 */
-	{ "windows-1251", 0 },	/* Russian */
-	{ 0, 0 }
+	{ "iso-8859-2",   0, 0 },  /* Central/Eastern European */
+	{ "iso-8859-4",   0, 0 },  /* Baltic */
+	{ "koi8-r",       0, 0 },  /* Russian */
+	{ "koi8-u",       0, 0 },  /* Ukranian */
+	{ "iso-8859-5",   0, 0 },  /* Least-popular Russian encoding */
+	{ "iso-8859-7",   0, 0 },  /* Greek */
+	{ "iso-8859-8",   0, 0 },  /* Hebrew; Visual */
+	{ "iso-8859-9",   0, 0 },  /* Turkish */
+	{ "iso-8859-13",  0, 0 },  /* Baltic again */
+	{ "iso-8859-15",  0, 0 },  /* New-and-improved iso-8859-1, but most
+				    * programs that support this support UTF8
+				    */
+	{ "windows-1251", 0, 0 },  /* Russian */
+	
+	/* These are the multibyte character sets which are commonly
+	 * supported by other mail clients. Note: order for multibyte
+	 * charsets does not affect priority unlike the 8bit charsets
+	 * listed above.
+	 */
+	{ "iso-2022-jp",  1, 0 },  /* Japanese designed for use over the Net */
+	{ "Shift-JIS",    1, 0 },  /* Japanese as used by Windows and MacOS systems */
+	{ "euc-jp",       1, 0 },  /* Japanese traditionally used on Unix systems */
+	{ "euc-kr",       1, 0 },  /* Korean */
+	{ "iso-2022-kr",  1, 0 },  /* Korean (less popular than euc-kr) */
+	{ "gb2312",       1, 0 },  /* Simplified Chinese */
+	{ "Big5",         1, 0 },  /* Traditional Chinese */
+	{ "euc-tw",       1, 0 },
+	{ NULL, 0, 0}
 };
 
 unsigned int encoding_map[256 * 256];
@@ -88,113 +103,169 @@
 
 int main (void)
 {
-	int i, j;
+	GHashTable *table_hash;
+	size_t inleft, outleft;
+	char *inbuf, *outbuf;
+	guint32 out[128], c;
+	unsigned int bit = 0x01;
+	int i, j, k;
 	int max, min;
-	int bit = 0x01;
-	int k;
 	int bytes;
 	iconv_t cd;
 	char in[128];
-	guint32 out[128];
-	char *inptr, *outptr;
-	size_t inlen, outlen;
-
+	
 	/* dont count the terminator */
-	bytes = ((sizeof(tables)/sizeof(tables[0]))+7-1)/8;
-
+	bytes = ((sizeof (tables) / sizeof (tables[0])) + 7 - 1) / 8;
+	
 	for (i = 0; i < 128; i++)
 		in[i] = i + 128;
-
-	for (j = 0; tables[j].name; j++) {
+	
+	for (j = 0; tables[j].name && !tables[j].multibyte; j++) {
 		cd = iconv_open (UCS, tables[j].name);
-		inptr = in;
-		outptr = (char *)(out);
-		inlen = sizeof (in);
-		outlen = sizeof (out);
-		while (iconv (cd, &inptr, &inlen, &outptr, &outlen) == -1) {
+		inbuf = in;
+		outbuf = (char *) out;
+		inleft = sizeof (in);
+		outleft = sizeof (out);
+		while (iconv (cd, &inbuf, &inleft, &outbuf, &outleft) == -1) {
 			if (errno == EILSEQ) {
-				inptr++;
-				inlen--;
+				inbuf++;
+				inleft--;
 			} else {
 				printf ("%s\n", strerror (errno));
 				exit (1);
 			}
 		}
 		iconv_close (cd);
-
-		for (i = 0; i < 128 - outlen / 4; i++) {
+		
+		for (i = 0; i < 128 - outleft / 4; i++) {
 			encoding_map[i] |= bit;
 			encoding_map[out[i]] |= bit;
 		}
-
+		
 		tables[j].bit = bit;
 		bit <<= 1;
 	}
-
-	printf("/* This file is automatically generated: DO NOT EDIT */\n\n");
-
-	for (i=0;i<256;i++) {
+	
+	for ( ; tables[j].name && tables[j].multibyte; j++) {
+		cd = iconv_open (tables[j].name, UCS);
+		
+		for (c = 128, i = 0; c < 65535 && i < 65535; c++) {
+			inbuf = (char *) &c;
+			inleft = sizeof (c);
+			outbuf = in;
+			outleft = sizeof (in);
+			
+			if (iconv (cd, &inbuf, &inleft, &outbuf, &outleft) != (size_t) -1) {
+				/* this is a legal character in charset table[j].name */
+				iconv (cd, NULL, NULL, &outbuf, &outleft);
+				encoding_map[i++] |= bit;
+				encoding_map[c] |= bit;
+			} else {
+				/* reset the iconv descriptor */
+				iconv (cd, NULL, NULL, NULL, NULL);
+			}
+		}
+		
+		iconv_close (cd);
+		
+		tables[j].bit = bit;
+		bit <<= 1;
+	}
+	
+	printf ("/* This file is automatically generated: DO NOT EDIT */\n\n");
+	
+	/* FIXME: we can condense better than what my quick hack does,
+	   but it'd be more work and I'm not sure if it's worth it or
+	   not. Currently I'm just making it so that tables that
+	   contain all of the same values will only ever be
+	   one-of-a-kind by making duplicates into macro aliases for
+	   the original */
+	
+	table_hash = g_hash_table_new (g_int_hash, g_int_equal);
+	
+	for (i = 0; i < 256; i++) {
 		/* first, do we need this block? */
-		for (k=0;k<bytes;k++) {
-			for (j=0;j<256;j++) {
-				if ((encoding_map[i*256 + j] & (0xff << (k*8))) != 0)
-					break;
+		for (k = 0; k < bytes; k++) {
+			int first = encoding_map[i * 256] & (0xff << (k * 8));
+			int same = TRUE;
+			int dump = FALSE;
+			
+			for (j = 0; j < 256; j++) {
+				same = same && (encoding_map[i * 256 + j] & (0xff << (k * 8))) == first;
+				if ((encoding_map[i * 256 + j] & (0xff << (k * 8))) != 0)
+					dump = TRUE;
 			}
-			if (j < 256) {
+			
+			if (dump) {
+				if (same) {
+					/* this table is aliasable */
+					char *table_name;
+					
+					if ((table_name = g_hash_table_lookup (table_hash, &first))) {
+						/* we've already written out a table with the exact same
+						   values so we can just alias it with a macro. */
+						printf ("#define m%02x%x %s\n\n", i, k, table_name);
+						continue;
+					} else {
+						table_name = g_strdup_printf ("m%02x%x", i, k);
+						g_hash_table_insert (table_hash, &first, table_name);
+					}
+				}
+				
 				/* yes, dump it */
-				printf("static unsigned char m%02x%x[256] = {\n\t", i, k);
-				for (j=0;j<256;j++) {
-					printf("0x%02x, ", (encoding_map[i*256+j] >> (k*8)) & 0xff );
-					if (((j+1)&7) == 0 && j<255)
-						printf("\n\t");
+				printf ("static unsigned char m%02x%x[256] = {\n\t", i, k);
+				for (j = 0; j < 256; j++) {
+					printf ("0x%02x, ", (encoding_map[i * 256 + j] >> (k * 8)) & 0xff);
+					if (((j + 1) & 7) == 0 && j < 255)
+						printf ("\n\t");
 				}
-				printf("\n};\n\n");
+				printf ("\n};\n\n");
 			}
 		}
 	}
-
-	printf("struct {\n");
-	for (k=0;k<bytes;k++) {
-		printf("\tunsigned char *bits%d;\n", k);
+	
+	printf ("struct {\n");
+	for (k = 0; k < bytes; k++) {
+		printf ("\tunsigned char *bits%d;\n", k);
 	}
-	printf("} camel_charmap[256] = {\n\t");
-	for (i=0;i<256;i++) {
+	printf ("} camel_charmap[256] = {\n\t");
+	for (i = 0; i < 256; i++) {
 		/* first, do we need this block? */
-		printf("{ ");
-		for (k=0;k<bytes;k++) {
-			for (j=0;j<256;j++) {
+		printf ("{ ");
+		for (k = 0; k < bytes; k++) {
+			for (j = 0; j < 256; j++) {
 				if ((encoding_map[i*256 + j] & (0xff << (k*8))) != 0)
 					break;
 			}
 			if (j < 256) {
-				printf("m%02x%x, ", i, k);
+				printf ("m%02x%x, ", i, k);
 			} else {
-				printf("0, ");
+				printf ("0, ");
 			}
 		}
-		printf("}, ");
-		if (((i+1)&7) == 0 && i<255)
-			printf("\n\t");
+		printf ("}, ");
+		if (((i+1) & 7) == 0 && i < 255)
+			printf ("\n\t");
 	}
-	printf("\n};\n\n");
-
-	printf("struct {\n\tconst char *name;\n\tunsigned int bit;\n} camel_charinfo[] = {\n");
-	for (j=0;tables[j].name;j++) {
-		printf("\t{ \"%s\", 0x%04x },\n", tables[j].name, tables[j].bit);
+	printf ("\n};\n\n");
+	
+	printf ("struct {\n\tconst char *name;\n\tint multibyte;\n\tunsigned int bit;\n} camel_charinfo[] = {\n");
+	for (j = 0; tables[j].name; j++) {
+		printf ("\t{ \"%s\", %d, 0x%04x },\n", tables[j].name, tables[j].multibyte, tables[j].bit);
 	}
-	printf("};\n\n");
-
-	printf("#define charset_mask(x) \\\n");
-	for (k=0;k<bytes;k++) {
-		if (k!=0)
-			printf("\t| ");
+	printf ("};\n\n");
+	
+	printf ("#define charset_mask(x) \\\n");
+	for (k = 0; k < bytes; k++) {
+		if (k != 0)
+			printf ("\t| ");
 		else
-			printf("\t");
-		printf("(camel_charmap[(x)>>8].bits%d?camel_charmap[(x)>>8].bits%d[(x)&0xff]<<%d:0)", k, k, k*8);
-		if (k<bytes-1)
-			printf("\t\\\n");
+			printf ("\t");
+		printf ("(camel_charmap[(x)>>8].bits%d?camel_charmap[(x)>>8].bits%d[(x)&0xff]<<%d:0)", k, k, k*8);
+		if (k < bytes - 1)
+			printf ("\t\\\n");
 	}
-	printf("\n\n");
+	printf ("\n\n");
 	
 	return 0;
 }

Follow-Ups:
- Re: [evolution-patches] patch for 24026 try harder to not send headers in UTF-8
  - From: Dan Winship

References:
- [evolution-patches] patch for 24026 try harder to not send headers in UTF-8
  - From: Suresh Chandrasekharan

[Date Prev][Date Next] [Thread Prev][Thread Next] [Thread Index] [Date Index] [Author Index]