http://bugzilla.ximian.com/show_bug.cgi?id=24026 I don't expect this to be accepted based on NotZed's comments in the bug, but jp asked me to send in a patch so here it is. btw, I think at least the camel_charset_best_mask() changes should go in even if the other changes don't. Jeff -- Jeffrey Stedfast Evolution Hacker - Novell, Inc. fejj ximian com - www.novell.com
Index: ChangeLog =================================================================== RCS file: /cvs/gnome/evolution/camel/ChangeLog,v retrieving revision 1.2223 diff -u -r1.2223 ChangeLog --- ChangeLog 21 Jul 2004 15:55:00 -0000 1.2223 +++ ChangeLog 22 Jul 2004 17:29:34 -0000 @@ -1,3 +1,14 @@ +2004-07-22 Jeffrey Stedfast <fejj novell com> + + * camel-charset-map.c (main): Add some multibyte charsets to the + table. + (camel_charset_best_mask): Changed the logic slightly to only + match certain charsets if the locale matches (Macedonians don't + want to use koi8-r for example). This logic also prevents the use + of a multibyte charset for mixed languages (such as DanW's feared + Greek and Japanese example) as long as the locale isn't Japanese + (in which case they probably want Japanese anyway). + 2004-07-19 Jeffrey Stedfast <fejj novell com> * providers/imap/camel-imap-store.c (get_subscribed_folders): Free Index: camel-charset-map.c =================================================================== RCS file: /cvs/gnome/evolution/camel/camel-charset-map.c,v retrieving revision 1.38 diff -u -r1.38 camel-charset-map.c --- camel-charset-map.c 9 Jul 2003 19:05:12 -0000 1.38 +++ camel-charset-map.c 22 Jul 2004 17:29:34 -0000 @@ -49,11 +49,13 @@ #ifdef BUILD_MAP #include <iconv.h> +#include <errno.h> #include <glib.h> static struct { - char *name; - unsigned int bit; /* assigned bit */ + char *name; /* charset name */ + int multibyte; /* charset type */ + unsigned int bit; /* assigned bit */ } tables[] = { /* These are the 8bit character sets (other than iso-8859-1, * which is special-cased) which are supported by both other @@ -61,20 +63,34 @@ * they're listed in is the order they'll be tried in, so put * the more-popular ones first. 
*/ - { "iso-8859-2", 0 }, /* Central/Eastern European */ - { "iso-8859-4", 0 }, /* Baltic */ - { "koi8-r", 0 }, /* Russian */ - { "koi8-u", 0 }, /* Ukranian */ - { "iso-8859-5", 0 }, /* Least-popular Russian encoding */ - { "iso-8859-7", 0 }, /* Greek */ - { "iso-8859-8", 0 }, /* Hebrew; Visual */ - { "iso-8859-9", 0 }, /* Turkish */ - { "iso-8859-13", 0 }, /* Baltic again */ - { "iso-8859-15", 0 }, /* New-and-improved iso-8859-1, but most - * programs that support this support UTF8 - */ - { "windows-1251", 0 }, /* Russian */ - { 0, 0 } + { "iso-8859-2", 0, 0 }, /* Central/Eastern European */ + { "iso-8859-4", 0, 0 }, /* Baltic */ + { "koi8-r", 0, 0 }, /* Russian */ + { "koi8-u", 0, 0 }, /* Ukranian */ + { "iso-8859-5", 0, 0 }, /* Least-popular Russian encoding */ + { "iso-8859-7", 0, 0 }, /* Greek */ + { "iso-8859-8", 0, 0 }, /* Hebrew; Visual */ + { "iso-8859-9", 0, 0 }, /* Turkish */ + { "iso-8859-13", 0, 0 }, /* Baltic again */ + { "iso-8859-15", 0, 0 }, /* New-and-improved iso-8859-1, but most + * programs that support this support UTF8 + */ + { "windows-1251", 0, 0 }, /* Russian */ + + /* These are the multibyte character sets which are commonly + * supported by other mail clients. Note: order for multibyte + * charsets does not affect priority unlike the 8bit charsets + * listed above. 
+ */ + { "iso-2022-jp", 1, 0 }, /* Japanese designed for use over the Net */ + { "Shift-JIS", 1, 0 }, /* Japanese as used by Windows and MacOS systems */ + { "euc-jp", 1, 0 }, /* Japanese traditionally used on Unix systems */ + { "euc-kr", 1, 0 }, /* Korean */ + { "iso-2022-kr", 1, 0 }, /* Korean (less popular than euc-kr) */ + { "gb2312", 1, 0 }, /* Simplified Chinese */ + { "Big5", 1, 0 }, /* Traditional Chinese */ + { "euc-tw", 1, 0 }, + { NULL, 0, 0 } }; unsigned int encoding_map[256 * 256]; @@ -85,115 +101,181 @@ #define UCS "UCS-4LE" #endif -int main (void) +int main (int argc, char **argv) { - int i, j; - int max, min; - int bit = 0x01; - int k; + GHashTable *table_hash; + size_t inleft, outleft; + char *inbuf, *outbuf; + guint32 out[128], c; + unsigned int bit = 0x01; + char in[128]; + int i, j, k; int bytes; iconv_t cd; - char in[128]; - guint32 out[128]; - char *inptr, *outptr; - size_t inlen, outlen; - + /* dont count the terminator */ - bytes = ((sizeof(tables)/sizeof(tables[0]))+7-1)/8; - + bytes = (G_N_ELEMENTS (tables) + 7 - 1) / 8; + for (i = 0; i < 128; i++) in[i] = i + 128; - - for (j = 0; tables[j].name; j++) { + + for (j = 0; tables[j].name && !tables[j].multibyte; j++) { cd = iconv_open (UCS, tables[j].name); - inptr = in; - outptr = (char *)(out); - inlen = sizeof (in); - outlen = sizeof (out); - while (iconv (cd, &inptr, &inlen, &outptr, &outlen) == -1) { + inbuf = in; + outbuf = (char *)(out); + inleft = sizeof (in); + outleft = sizeof (out); + while (iconv (cd, &inbuf, &inleft, &outbuf, &outleft) == -1) { if (errno == EILSEQ) { - inptr++; - inlen--; + inbuf++; + inleft--; } else { - printf ("%s\n", strerror (errno)); + fprintf (stderr, "iconv (%s->UCS4, ..., %d, ..., %d): %s", + tables[j].name, inleft, outleft, + strerror (errno)); exit (1); } } iconv_close (cd); - - for (i = 0; i < 128 - outlen / 4; i++) { + + for (i = 0; i < 128 - outleft / 4; i++) { encoding_map[i] |= bit; encoding_map[out[i]] |= bit; } - + tables[j].bit = bit; bit <<= 
1; } - - printf("/* This file is automatically generated: DO NOT EDIT */\n\n"); - - for (i=0;i<256;i++) { + + /* Mutibyte tables */ + for ( ; tables[j].name && tables[j].multibyte; j++) { + cd = iconv_open (tables[j].name, UCS); + if (cd == (iconv_t) -1) + continue; + + for (c = 128, i = 0; c < 65535 && i < 65535; c++) { + inbuf = (char *) &c; + inleft = sizeof (c); + outbuf = in; + outleft = sizeof (in); + + if (iconv (cd, &inbuf, &inleft, &outbuf, &outleft) != (size_t) -1) { + /* this is a legal character in charset table[j].name */ + iconv (cd, NULL, NULL, &outbuf, &outleft); + encoding_map[i++] |= bit; + encoding_map[c] |= bit; + } else { + /* reset the iconv descriptor */ + iconv (cd, NULL, NULL, NULL, NULL); + } + } + + iconv_close (cd); + + tables[j].bit = bit; + bit <<= 1; + } + + printf ("/* This file is automatically generated: DO NOT EDIT */\n\n"); + + /* FIXME: we can condense better than what my quick hack does, + but it'd be more work and I'm not sure if it's worth it or + not. Currently I'm just making it so that tables that + contain all of the same values will only ever be + one-of-a-kind by making duplicates into macro aliases for + the original */ + + table_hash = g_hash_table_new (g_int_hash, g_int_equal); + + for (i = 0; i < 256; i++) { /* first, do we need this block? 
*/ - for (k=0;k<bytes;k++) { - for (j=0;j<256;j++) { - if ((encoding_map[i*256 + j] & (0xff << (k*8))) != 0) - break; + for (k = 0; k < bytes; k++) { + int first = encoding_map[i * 256] & (0xff << (k * 8)); + int same = TRUE; + int dump = FALSE; + + for (j = 0; j < 256; j++) { + same = same && (encoding_map[i * 256 + j] & (0xff << (k * 8))) == first; + if ((encoding_map[i * 256 + j] & (0xff << (k * 8))) != 0) + dump = TRUE; } - if (j < 256) { + + if (dump) { + if (same) { + /* this table is aliasable */ + char *table_name; + + if ((table_name = g_hash_table_lookup (table_hash, &first))) { + /* we've already written out a table with the exact same + values so we can just alias it with a macro. */ + printf ("#define m%02x%x %s\n\n", i, k, table_name); + continue; + } else { + table_name = g_strdup_printf ("m%02x%x", i, k); + g_hash_table_insert (table_hash, &first, table_name); + } + } + /* yes, dump it */ - printf("static unsigned char m%02x%x[256] = {\n\t", i, k); - for (j=0;j<256;j++) { - printf("0x%02x, ", (encoding_map[i*256+j] >> (k*8)) & 0xff ); - if (((j+1)&7) == 0 && j<255) - printf("\n\t"); + printf ("static unsigned char m%02x%x[256] = {\n\t", i, k); + for (j = 0; j < 256; j++) { + printf ("0x%02x, ", (encoding_map[i * 256 + j] >> (k * 8)) & 0xff); + if (((j + 1) & 7) == 0 && j < 255) + printf ("\n\t"); } - printf("\n};\n\n"); + printf ("\n};\n\n"); } } } - - printf("struct {\n"); - for (k=0;k<bytes;k++) { - printf("\tunsigned char *bits%d;\n", k); + + printf ("struct {\n"); + for (k = 0; k < bytes; k++) { + printf ("\tunsigned char *bits%d;\n", k); } - printf("} camel_charmap[256] = {\n\t"); - for (i=0;i<256;i++) { + + printf ("} camel_charmap[256] = {\n\t"); + for (i = 0; i < 256; i++) { /* first, do we need this block? 
*/ - printf("{ "); - for (k=0;k<bytes;k++) { - for (j=0;j<256;j++) { - if ((encoding_map[i*256 + j] & (0xff << (k*8))) != 0) + printf ("{ "); + for (k = 0; k < bytes; k++) { + for (j = 0; j < 256; j++) { + if ((encoding_map[i * 256 + j] & (0xff << (k * 8))) != 0) break; } + if (j < 256) { - printf("m%02x%x, ", i, k); + printf ("m%02x%x, ", i, k); } else { - printf("0, "); + printf ("0, "); } } - printf("}, "); - if (((i+1)&7) == 0 && i<255) - printf("\n\t"); + + printf ("}, "); + if (((i + 1) & 7) == 0 && i < 255) + printf ("\n\t"); } - printf("\n};\n\n"); - - printf("struct {\n\tconst char *name;\n\tunsigned int bit;\n} camel_charinfo[] = {\n"); - for (j=0;tables[j].name;j++) { - printf("\t{ \"%s\", 0x%04x },\n", tables[j].name, tables[j].bit); + printf ("\n};\n\n"); + + printf ("struct {\n\tconst char *name;\n\tunsigned int bit;\n} camel_charinfo[] = {\n"); + for (j = 0; tables[j].name; j++) { + printf ("\t{ \"%s\", 0x%04x },\n", tables[j].name, tables[j].bit); } - printf("};\n\n"); - + printf ("};\n\n"); + printf("#define charset_mask(x) \\\n"); - for (k=0;k<bytes;k++) { - if (k!=0) - printf("\t| "); + for (k = 0; k < bytes; k++) { + if (k != 0) + printf ("\t| "); else - printf("\t"); - printf("(camel_charmap[(x)>>8].bits%d?camel_charmap[(x)>>8].bits%d[(x)&0xff]<<%d:0)", k, k, k*8); - if (k<bytes-1) - printf("\t\\\n"); + printf ("\t"); + + printf ("(camel_charmap[(x) >> 8].bits%d ? 
camel_charmap[(x) >> 8].bits%d[(x) & 0xff] << %d : 0)", + k, k, k * 8); + + if (k < bytes - 1) + printf ("\t\\\n"); } - printf("\n\n"); + printf ("\n\n"); return 0; } @@ -211,6 +293,8 @@ #include <langinfo.h> #endif +#include <gal/util/e-iconv.h> + void camel_charset_init (CamelCharset *c) { @@ -261,12 +345,19 @@ static const char * camel_charset_best_mask(unsigned int mask) { + const char *locale_lang, *lang; int i; - - for (i=0;i<sizeof(camel_charinfo)/sizeof(camel_charinfo[0]);i++) { - if (camel_charinfo[i].bit & mask) - return camel_charinfo[i].name; + + locale_lang = e_iconv_locale_language (); + for (i = 0; i < G_N_ELEMENTS (camel_charinfo); i++) { + if (camel_charinfo[i].bit & mask) { + lang = e_iconv_charset_language (camel_charinfo[i].name); + + if (!lang || (locale_lang && !strncmp (locale_lang, lang, 2))) + return camel_charinfo[i].name; + } } + return "UTF-8"; }
Attachment:
smime.p7s
Description: S/MIME cryptographic signature