[Evolution-hackers] improved rfc2047 decode patch



This patch is a port of my GMime rfc2047 decoder, which is even more
liberal in what it accepts than Thunderbird's decoder. It is what I
will be committing to svn.

closing bugs:

#302991
#315513
#502178

Jeff

Index: camel-mime-utils.c
===================================================================
--- camel-mime-utils.c	(revision 8315)
+++ camel-mime-utils.c	(working copy)
@@ -821,116 +821,321 @@
 	*in = inptr;
 }
 
-/* decode rfc 2047 encoded string segment */
 static char *
-rfc2047_decode_word(const char *in, size_t len)
+camel_iconv_strndup (iconv_t cd, const char *string, size_t n)
 {
-	const char *inptr = in+2;
-	const char *inend = in+len-2;
+	size_t inleft, outleft, converted = 0;
+	char *out, *outbuf;
 	const char *inbuf;
-	const char *charset;
-	char *encname, *p;
-	int tmplen;
-	size_t ret;
-	char *decword = NULL;
-	char *decoded = NULL;
-	char *outbase = NULL;
-	char *outbuf;
-	size_t inlen, outlen;
-	gboolean retried = FALSE;
-	iconv_t ic;
-
-	d(printf("rfc2047: decoding '%.*s'\n", len, in));
-
-	/* quick check to see if this could possibly be a real encoded word */
-	if (len < 8 || !(in[0] == '=' && in[1] == '?' && in[len-1] == '=' && in[len-2] == '?')) {
-		d(printf("invalid\n"));
-		return NULL;
-	}
-
-	/* skip past the charset to the encoding type */
-	inptr = memchr (inptr, '?', inend-inptr);
-	if (inptr != NULL && inptr < inend + 2 && inptr[2] == '?') {
-		d(printf("found ?, encoding is '%c'\n", inptr[0]));
-		inptr++;
-		tmplen = inend-inptr-2;
-		decword = g_alloca (tmplen); /* this will always be more-than-enough room */
-		switch(toupper(inptr[0])) {
-		case 'Q':
-			inlen = quoted_decode((const unsigned char *) inptr+2, tmplen, (unsigned char *) decword);
-			break;
-		case 'B': {
-			int state = 0;
-			unsigned int save = 0;
-
-			inlen = camel_base64_decode_step((unsigned char *) inptr+2, tmplen, (unsigned char *) decword, &state, &save);
-			/* if state != 0 then error? */
-			break;
+	size_t outlen;
+	int errnosav;
+	
+	if (cd == (iconv_t) -1)
+		return g_strndup (string, n);
+	
+	outlen = n * 2 + 16;
+	out = g_malloc (outlen + 4);
+	
+	inbuf = string;
+	inleft = n;
+	
+	do {
+		errno = 0;
+		outbuf = out + converted;
+		outleft = outlen - converted;
+		
+		converted = iconv (cd, (char **) &inbuf, &inleft, &outbuf, &outleft);
+		if (converted == (size_t) -1) {
+			if (errno != E2BIG && errno != EINVAL)
+				goto fail;
 		}
-		default:
-			/* uhhh, unknown encoding type - probably an invalid encoded word string */
-			return NULL;
+		
+		/*
+		 * E2BIG   There is not sufficient room at *outbuf.
+		 *
+		 * We just need to grow our outbuffer and try again.
+		 */
+		
+		converted = outbuf - out;
+		if (errno == E2BIG) {
+			outlen += inleft * 2 + 16;
+			out = g_realloc (out, outlen + 4);
+			outbuf = out + converted;
 		}
-		d(printf("The encoded length = %d\n", inlen));
-		if (inlen > 0) {
-			/* yuck, all this snot is to setup iconv! */
-			tmplen = inptr - in - 3;
-			encname = g_alloca (tmplen + 1);
-			memcpy (encname, in + 2, tmplen);
-			encname[tmplen] = '\0';
+	} while (errno == E2BIG && inleft > 0);
+	
+	/*
+	 * EINVAL  An incomplete multibyte sequence has been encountered
+	 *         in the input.
+	 *
+	 * We'll just have to ignore it...
+	 */
+	
+	/* flush the iconv conversion */
+	iconv (cd, NULL, NULL, &outbuf, &outleft);
+	
+	/* Note: not all charsets can be nul-terminated with a single
+           nul byte. UCS2, for example, needs 2 nul bytes and UCS4
+           needs 4. I hope that 4 nul bytes is enough to terminate all
+           multibyte charsets? */
+	
+	/* nul-terminate the string */
+	memset (outbuf, 0, 4);
+	
+	/* reset the cd */
+	iconv (cd, NULL, NULL, NULL, NULL);
+	
+	return out;
+	
+ fail:
+	
+	errnosav = errno;
+	
+	w(g_warning ("camel_iconv_strndup: %s at byte %lu", strerror (errno), n - inleft));
+	
+	g_free (out);
+	
+	/* reset the cd */
+	iconv (cd, NULL, NULL, NULL, NULL);
+	
+	errno = errnosav;
+	
+	return NULL;
+}
 
-			/* rfc2231 updates rfc2047 encoded words...
-			 * The ABNF given in RFC 2047 for encoded-words is:
-			 *   encoded-word := "=?" charset "?" encoding "?" encoded-text "?="
-			 * This specification changes this ABNF to:
-			 *   encoded-word := "=?" charset ["*" language] "?" encoding "?" encoded-text "?="
-			 */
+#define is_ascii(c) isascii ((int) ((unsigned char) (c)))
 
-			/* trim off the 'language' part if it's there... */
-			p = strchr (encname, '*');
-			if (p)
-				*p = '\0';
-
-			charset = e_iconv_charset_name (encname);
-
-			inbuf = decword;
-
-			outlen = inlen * 6 + 16;
-			outbase = g_alloca (outlen);
-			outbuf = outbase;
-
-		retry:
-			ic = e_iconv_open ("UTF-8", charset);
-			if (ic != (iconv_t) -1) {
-				ret = e_iconv (ic, &inbuf, &inlen, &outbuf, &outlen);
-				if (ret != (size_t) -1) {
-					e_iconv (ic, NULL, 0, &outbuf, &outlen);
-					*outbuf = 0;
-					decoded = g_strdup (outbase);
+static char *
+decode_8bit (const char *text, size_t len, const char *default_charset)
+{
+	const char *charsets[4] = { "UTF-8", NULL, NULL, NULL };
+	size_t inleft, outleft, outlen, rc, min, n;
+	const char *locale_charset, *best;
+	char *out, *outbuf;
+	const char *inbuf;
+	iconv_t cd;
+	int i = 1;
+	/* candidate charsets, tried in order: UTF-8, default_charset, then the locale charset */
+	if (default_charset && g_ascii_strcasecmp (default_charset, "UTF-8") != 0)
+		charsets[i++] = default_charset;
+	
+	locale_charset = e_iconv_locale_charset ();
+	if (locale_charset && g_ascii_strcasecmp (locale_charset, "UTF-8") != 0)
+		charsets[i++] = locale_charset;
+	
+	min = len;
+	best = charsets[0];
+	
+	outlen = (len * 2) + 16; 
+	out = g_malloc (outlen + 1);
+	
+	for (i = 0; charsets[i]; i++) {
+		if ((cd = e_iconv_open ("UTF-8", charsets[i])) == (iconv_t) -1)
+			continue;
+		
+		outleft = outlen;
+		outbuf = out;
+		inleft = len;
+		inbuf = text;
+		n = 0;
+		
+		do {
+			rc = iconv (cd, (char **) &inbuf, &inleft, &outbuf, &outleft);
+			if (rc == (size_t) -1) {
+				if (errno == EINVAL) {
+					/* incomplete sequence at the end of the input buffer */
+					n += inleft;
+					break;
 				}
-				e_iconv_close (ic);
-			} else {
-				w(g_warning ("Cannot decode charset, header display may be corrupt: %s: %s",
-					     charset, strerror (errno)));
-
-				if (!retried) {
-					charset = e_iconv_locale_charset ();
-					if (!charset)
-						charset = "iso-8859-1";
-
-					retried = TRUE;
-					goto retry;
+				
+				if (errno == E2BIG) {
+					outlen += (inleft * 2) + 16;
+					rc = (size_t) (outbuf - out);
+					out = g_realloc (out, outlen + 1);
+					outleft = outlen - rc;
+					outbuf = out + rc;
+				} else {
+					inleft--;
+					inbuf++;
+					n++;
 				}
-
-				/* we return the encoded word here because we've got to return valid utf8 */
-				decoded = g_strndup (in, inlen);
 			}
+		} while (inleft > 0);
+		
+		rc = iconv (cd, NULL, NULL, &outbuf, &outleft);
+		*outbuf = '\0';
+		
+		e_iconv_close (cd);
+		
+		if (rc != (size_t) -1 && n == 0)
+			return out;
+		
+		if (n < min) {
+			best = charsets[i];
+			min = n;
 		}
 	}
+	
+	/* if we get here, then none of the charsets fit the 8bit text flawlessly...
+	 * try to find the one that fit the best and use that to convert what we can,
+	 * replacing any byte we can't convert with a '?' */
+	
+	if ((cd = e_iconv_open ("UTF-8", best)) == (iconv_t) -1) {
+		/* this shouldn't happen... but if we are here, then
+		 * it did...  the only thing we can do at this point
+		 * is replace the 8bit garbage and pray */
+		register const char *inptr = text;
+		const char *inend = inptr + len;
+		
+		outbuf = out;
+		/* 8bit bytes become '?'; inptr must advance on both paths or this loop never ends */
+		while (inptr < inend) {
+			if (is_ascii (*inptr))
+				*outbuf++ = *inptr;
+			else
+				*outbuf++ = '?';
+			inptr++;
+		}
+		*outbuf = '\0';
+		
+		return out;
+	}
+	
+	outleft = outlen;
+	outbuf = out;
+	inleft = len;
+	inbuf = text;
+	
+	do {
+		rc = iconv (cd, (char **) &inbuf, &inleft, &outbuf, &outleft);
+		if (rc == (size_t) -1) {
+			if (errno == EINVAL) {
+				/* incomplete sequence at the end of the input buffer */
+				break;
+			}
+			
+			if (errno == E2BIG) {
+				rc = outbuf - out;
+				outlen += inleft * 2 + 16;
+				out = g_realloc (out, outlen + 1);
+				outleft = outlen - rc;
+				outbuf = out + rc;
+			} else {
+				*outbuf++ = '?';
+				outleft--;
+				inleft--;
+				inbuf++;
+			}
+		}
+	} while (inleft > 0);
+	
+	iconv (cd, NULL, NULL, &outbuf, &outleft);
+	*outbuf = '\0';
+	
+	e_iconv_close (cd);
+	
+	return out;
+}
 
-	d(printf("decoded '%s'\n", decoded));
+#define is_rfc2047_encoded_word(atom, len) (len >= 7 && !strncmp (atom, "=?", 2) && !strncmp (atom + len - 2, "?=", 2))
 
-	return decoded;
+/* decode an rfc2047 encoded-word token; returns a newly allocated string, or NULL if the token is malformed */
+static char *
+rfc2047_decode_word (const char *in, size_t inlen, const char *default_charset)
+{
+	const unsigned char *instart = (const unsigned char *) in;
+	const register unsigned char *inptr = instart + 2;
+	const unsigned char *inend = instart + inlen - 2;
+	unsigned char *decoded;
+	const char *charset;
+	char *charenc, *p;
+	guint32 save = 0;
+	ssize_t declen;
+	int state = 0;
+	size_t len;
+	iconv_t cd;
+	char *buf;
+	/* reject anything not shaped like "=?...?=" (not every caller pre-validates), then skip
+	 * over the charset; the "?<encoding>?" part must end at or before the closing "?=" */
+	if (!is_rfc2047_encoded_word (in, inlen) || !(inptr = memchr (inptr, '?', inend - inptr)) || inptr + 3 > inend || inptr[2] != '?')
+		return NULL;
+	
+	inptr++;
+	
+	switch (*inptr) {
+	case 'B':
+	case 'b':
+		inptr += 2;
+		decoded = g_alloca (inend - inptr);
+		declen = camel_base64_decode_step ((unsigned char *) inptr, inend - inptr, decoded, &state, &save);
+		break;
+	case 'Q':
+	case 'q':
+		inptr += 2;
+		decoded = g_alloca (inend - inptr);
+		declen = quoted_decode (inptr, inend - inptr, decoded);
+		
+		if (declen == -1) {
+			d(fprintf (stderr, "encountered broken 'Q' encoding\n"));
+			return NULL;
+		}
+		break;
+	default:
+		d(fprintf (stderr, "unknown encoding\n"));
+		return NULL;
+	}
+	
+	len = (inptr - 3) - (instart + 2);
+	charenc = g_alloca (len + 1);
+	memcpy (charenc, in + 2, len);
+	charenc[len] = '\0';
+	charset = charenc;
+	
+	/* rfc2231 updates rfc2047 encoded words...
+	 * The ABNF given in RFC 2047 for encoded-words is:
+	 *   encoded-word := "=?" charset "?" encoding "?" encoded-text "?="
+	 * This specification changes this ABNF to:
+	 *   encoded-word := "=?" charset ["*" language] "?" encoding "?" encoded-text "?="
+	 */
+	
+	/* trim off the 'language' part if it's there... */
+	if ((p = strchr (charset, '*')))
+		*p = '\0';
+	
+	/* slight optimization? */
+	if (!g_ascii_strcasecmp (charset, "UTF-8")) {
+		p = (char *) decoded;
+		len = declen;
+		
+		while (!g_utf8_validate (p, len, (const char **) &p)) {
+			len = declen - (p - (char *) decoded);
+			*p = '?';
+		}
+		
+		return g_strndup ((char *) decoded, declen);
+	}
+	
+	if (charset[0])
+		charset = e_iconv_charset_name (charset);
+	
+	if (!charset[0] || (cd = e_iconv_open ("UTF-8", charset)) == (iconv_t) -1) {
+		w(g_warning ("Cannot convert from %s to UTF-8, header display may "
+			     "be corrupt: %s", charset[0] ? charset : "unspecified charset",
+			     g_strerror (errno)));
+		
+		return decode_8bit ((char *) decoded, declen, default_charset);
+	}
+	
+	buf = camel_iconv_strndup (cd, (char *) decoded, declen);
+	e_iconv_close (cd);
+	
+	if (buf != NULL)
+		return buf;
+	
+	w(g_warning ("Failed to convert \"%.*s\" to UTF-8, display may be "
+		     "corrupt: %s", (int) declen, decoded, g_strerror (errno)));
+	
+	return decode_8bit ((char *) decoded, declen, default_charset);
 }
 
 /* ok, a lot of mailers are BROKEN, and send iso-latin1 encoded
@@ -988,7 +1193,7 @@
 }
 
 static GString *
-append_quoted_pair (GString *str, const char *in, gssize inlen)
+append_quoted_pair (GString *str, const char *in, size_t inlen)
 {
 	register const char *inptr = in;
 	const char *inend = in + inlen;
@@ -1007,67 +1212,117 @@
 
 /* decodes a simple text, rfc822 + rfc2047 */
 static char *
-header_decode_text (const char *in, size_t inlen, int ctext, const char *default_charset)
+header_decode_text (const char *in, int ctext, const char *default_charset)
 {
+	register const char *inptr = in;
+	gboolean encoded = FALSE;
+	const char *lwsp, *text;
+	size_t nlwsp, n;
+	gboolean ascii;
+	char *decoded;
 	GString *out;
-	const char *inptr, *inend, *start, *chunk, *locale_charset;
-	GString *(* append) (GString *, const char *, gssize);
-	char *dword = NULL;
-	guint32 mask;
-
-	locale_charset = e_iconv_locale_charset ();
-
-	if (ctext) {
-		mask = (CAMEL_MIME_IS_SPECIAL | CAMEL_MIME_IS_SPACE | CAMEL_MIME_IS_CTRL);
-		append = append_quoted_pair;
-	} else {
-		mask = (CAMEL_MIME_IS_LWSP);
-		append = g_string_append_len;
-	}
-
-	out = g_string_new ("");
-	inptr = in;
-	inend = inptr + inlen;
-	chunk = NULL;
-
-	while (inptr < inend) {
-		start = inptr;
-		while (inptr < inend && camel_mime_is_type (*inptr, mask))
+	
+	if (in == NULL)
+		return g_strdup ("");
+	
+	out = g_string_sized_new (strlen (in) + 1);
+	
+	while (*inptr != '\0') {
+		lwsp = inptr;
+		while (camel_mime_is_lwsp (*inptr))
 			inptr++;
-
-		if (inptr == inend) {
-			append (out, start, inptr - start);
+		
+		nlwsp = (size_t) (inptr - lwsp);
+		
+		if (*inptr != '\0') {
+			text = inptr;
+			ascii = TRUE;
+			
+			if (!strncmp (inptr, "=?", 2)) {
+				inptr += 2;
+				
+				/* skip past the charset (if one is even declared, sigh) */
+				while (*inptr && *inptr != '?') {
+					ascii = ascii && is_ascii (*inptr);
+					inptr++;
+				}
+				
+				/* sanity check encoding type; strchr() also matches the terminating nul, so test for it first */
+				if (inptr[0] != '?' || inptr[1] == '\0' || !strchr ("BbQq", inptr[1]) || inptr[2] != '?')
+					goto non_rfc2047;
+				
+				inptr += 3;
+				
+				/* find the end of the rfc2047 encoded word token */
+				while (*inptr && strncmp (inptr, "?=", 2) != 0) {
+					ascii = ascii && is_ascii (*inptr);
+					inptr++;
+				}
+				
+				if (!strncmp (inptr, "?=", 2))
+					inptr += 2;
+			} else {
+			non_rfc2047:
+				/* stop if we encounter a possible rfc2047 encoded
+				 * token even if it's inside another word, sigh. */
+				while (*inptr && !camel_mime_is_lwsp (*inptr) &&
+				       strncmp (inptr, "=?", 2) != 0) {
+					ascii = ascii && is_ascii (*inptr);
+					inptr++;
+				}
+			}
+			
+			n = (size_t) (inptr - text);
+			if (is_rfc2047_encoded_word (text, n)) {
+				if ((decoded = rfc2047_decode_word (text, n, default_charset))) {
+					/* rfc2047 states that you must ignore all
+					 * whitespace between encoded words */
+					if (!encoded)
+						g_string_append_len (out, lwsp, nlwsp);
+					
+					g_string_append (out, decoded);
+					g_free (decoded);
+					
+					encoded = TRUE;
+				} else {
+					/* append lwsp and invalid rfc2047 encoded-word token */
+					g_string_append_len (out, lwsp, nlwsp + n);
+					encoded = FALSE;
+				}
+			} else {
+				/* append lwsp */
+				g_string_append_len (out, lwsp, nlwsp);
+				
+				/* append word token */
+				if (!ascii) {
+					/* *sigh* I hate broken mailers... */
+					decoded = decode_8bit (text, n, default_charset);
+					n = strlen (decoded);
+					text = decoded;
+				} else {
+					decoded = NULL;
+				}
+				
+				if (!ctext)
+					g_string_append_len (out, text, n);
+				else
+					append_quoted_pair (out, text, n);
+				
+				g_free (decoded);
+				
+				encoded = FALSE;
+			}
+		} else {
+			/* appending trailing lwsp */
+			g_string_append_len (out, lwsp, nlwsp);
 			break;
-		} else if (dword == NULL) {
-			append (out, start, inptr - start);
-		} else {
-			chunk = start;
 		}
-
-		start = inptr;
-		while (inptr < inend && !camel_mime_is_type (*inptr, mask))
-			inptr++;
-
-		dword = rfc2047_decode_word(start, inptr-start);
-		if (dword) {
-			g_string_append(out, dword);
-			g_free(dword);
-		} else {
-			if (!chunk)
-				chunk = start;
-
-			if ((default_charset == NULL || !append_8bit (out, chunk, inptr-chunk, default_charset))
-			    && (locale_charset == NULL || !append_8bit(out, chunk, inptr-chunk, locale_charset)))
-				append_latin1(out, chunk, inptr-chunk);
-		}
-
-		chunk = NULL;
 	}
-
-	dword = out->str;
+	
+	decoded = out->str;
 	g_string_free (out, FALSE);
-
-	return dword;
+	
+	return decoded;
 }
 
 
@@ -1086,7 +1341,8 @@
 {
 	if (in == NULL)
 		return NULL;
-	return header_decode_text (in, strlen (in), FALSE, default_charset);
+	
+	return header_decode_text (in, FALSE, default_charset);
 }
 
 
@@ -1106,7 +1362,8 @@
 {
 	if (in == NULL)
 		return NULL;
-	return header_decode_text (in, strlen (in), TRUE, default_charset);
+	
+	return header_decode_text (in, TRUE, default_charset);
 }
 
 /* how long a sequence of pre-encoded words should be less than, to attempt to
@@ -2342,8 +2599,7 @@
 			g_free(text);
 
 			/* or maybe that we've added up a bunch of broken bits to make an encoded word */
-			text = rfc2047_decode_word(name->str, name->len);
-			if (text) {
+			if ((text = rfc2047_decode_word (name->str, name->len, charset))) {
 				g_string_truncate(name, 0);
 				g_string_append(name, text);
 				g_free(text);
@@ -2901,7 +3157,7 @@
 	node->next = NULL;
 	node->name = name;
 	if (strncmp(value, "=?", 2) == 0
-	    && (node->value = header_decode_text(value, strlen(value), FALSE, NULL))) {
+	    && (node->value = header_decode_text(value, FALSE, NULL))) {
 		g_free(value);
 	} else if (g_ascii_strcasecmp (name, "boundary") != 0 && !g_utf8_validate(value, -1, NULL)) {
 		const char *charset = e_iconv_locale_charset();
Index: camel-charset-map.c
===================================================================
--- camel-charset-map.c	(revision 8315)
+++ camel-charset-map.c	(working copy)
@@ -52,8 +52,9 @@
 #include <glib.h>
 
 static struct {
-	char *name;
-	unsigned int bit;	/* assigned bit */
+	char *name;        /* charset name */
+	int multibyte;     /* charset type */
+	unsigned int bit;  /* assigned bit */
 } tables[] = {
 	/* These are the 8bit character sets (other than iso-8859-1,
 	 * which is special-cased) which are supported by both other
@@ -61,20 +62,35 @@
 	 * they're listed in is the order they'll be tried in, so put
 	 * the more-popular ones first.
 	 */
-	{ "iso-8859-2", 0 },	/* Central/Eastern European */
-	{ "iso-8859-4", 0 },	/* Baltic */
-	{ "koi8-r", 0 },	/* Russian */
-	{ "koi8-u", 0 },	/* Ukranian */
-	{ "iso-8859-5", 0 },	/* Least-popular Russian encoding */
-	{ "iso-8859-7", 0 },	/* Greek */
-	{ "iso-8859-8", 0 },    /* Hebrew; Visual */
-	{ "iso-8859-9", 0 },	/* Turkish */
-	{ "iso-8859-13", 0 },	/* Baltic again */
-	{ "iso-8859-15", 0 },	/* New-and-improved iso-8859-1, but most
-				 * programs that support this support UTF8
-				 */
-	{ "windows-1251", 0 },	/* Russian */
-	{ 0, 0 }
+	{ "iso-8859-2",   0, 0 },  /* Central/Eastern European */
+	{ "iso-8859-4",   0, 0 },  /* Baltic */
+	{ "koi8-r",       0, 0 },  /* Russian */
+	{ "koi8-u",       0, 0 },  /* Ukrainian */
+	{ "iso-8859-5",   0, 0 },  /* Least-popular Russian encoding */
+	{ "iso-8859-6",   0, 0 },  /* Arabic */
+	{ "iso-8859-7",   0, 0 },  /* Greek */
+	{ "iso-8859-8",   0, 0 },  /* Hebrew; Visual */
+	{ "iso-8859-9",   0, 0 },  /* Turkish */
+	{ "iso-8859-13",  0, 0 },  /* Baltic again */
+	{ "iso-8859-15",  0, 0 },  /* New-and-improved iso-8859-1, but most
+				    * programs that support this support UTF8
+				    */
+	{ "windows-1251", 0, 0 },  /* Russian */
+	
+	/* These are the multibyte character sets which are commonly
+	 * supported by other mail clients. Note: order for multibyte
+	 * charsets does not affect priority unlike the 8bit charsets
+	 * listed above.
+	 */
+	{ "iso-2022-jp",  1, 0 },  /* Japanese designed for use over the Net */
+	{ "Shift-JIS",    1, 0 },  /* Japanese as used by Windows and MacOS systems */
+	{ "euc-jp",       1, 0 },  /* Japanese traditionally used on Unix systems */
+	{ "euc-kr",       1, 0 },  /* Korean */
+	{ "iso-2022-kr",  1, 0 },  /* Korean (less popular than euc-kr) */
+	{ "gb2312",       1, 0 },  /* Simplified Chinese */
+	{ "Big5",         1, 0 },  /* Traditional Chinese */
+	{ "euc-tw",       1, 0 },
+	{ NULL,           0, 0 }
 };
 
 unsigned int encoding_map[256 * 256];
@@ -85,118 +101,196 @@
 #define UCS "UCS-4LE"
 #endif
 
-int main (void)
+static guint
+block_hash (gconstpointer v)
 {
-	int i, j;
-	int max, min;
-	int bit = 0x01;
-	int k;
+	const signed char *p = v;
+	guint32 h = *p++;
+	int i;
+	/* byte 0 seeded h above, so fold in only bytes 1..255 of the 256-byte block */
+	for (i = 1; i < 256; i++)
+		h = (h << 5) - h + *p++;
+	
+	return h;
+}
+
+static int
+block_equal (gconstpointer v1, gconstpointer v2)
+{
+	return !memcmp (v1, v2, 256);
+}
+
+int main (int argc, char **argv)
+{
+	unsigned char *block = NULL;
+	unsigned int bit = 0x01;
+	GHashTable *table_hash;
+	size_t inleft, outleft;
+	char *inbuf, *outbuf;
+	guint32 out[128], c;
+	char in[128];
+	int i, j, k;
 	int bytes;
 	iconv_t cd;
-	char in[128];
-	guint32 out[128];
-	char *inptr, *outptr;
-	size_t inlen, outlen;
-
+	
 	/* dont count the terminator */
-	bytes = ((sizeof(tables)/sizeof(tables[0]))+7-1)/8;
-
+	bytes = ((sizeof (tables) / sizeof (tables[0])) + 7 - 1) / 8;
+	g_assert (bytes <= 4);
+	
 	for (i = 0; i < 128; i++)
 		in[i] = i + 128;
-
-	for (j = 0; tables[j].name; j++) {
+	
+	for (j = 0; tables[j].name && !tables[j].multibyte; j++) {
 		cd = iconv_open (UCS, tables[j].name);
-		if (cd == (iconv_t)-1)
-			exit (1);
-		inptr = in;
-		outptr = (char *)(out);
-		inlen = sizeof (in);
-		outlen = sizeof (out);
-		while (iconv (cd, &inptr, &inlen, &outptr, &outlen) == -1) {
+		inbuf = in;
+		inleft = sizeof (in);
+		outbuf = (char *) out;
+		outleft = sizeof (out);
+		while (iconv (cd, &inbuf, &inleft, &outbuf, &outleft) == -1) {
 			if (errno == EILSEQ) {
-				inptr++;
-				inlen--;
+				inbuf++;
+				inleft--;
 			} else {
-				printf ("%s\n", strerror (errno));
+				g_warning ("iconv (%s->UCS4, ..., %d, ..., %d): %s",
+					   tables[j].name, inleft, outleft,
+					   g_strerror (errno));
 				exit (1);
 			}
 		}
 		iconv_close (cd);
-
-		for (i = 0; i < 128 - outlen / 4; i++) {
+		
+		for (i = 0; i < 128 - outleft / 4; i++) {
 			encoding_map[i] |= bit;
 			encoding_map[out[i]] |= bit;
 		}
-
+		
 		tables[j].bit = bit;
 		bit <<= 1;
 	}
-
-	printf("/* This file is automatically generated: DO NOT EDIT */\n\n");
-
-	for (i=0;i<256;i++) {
-		/* first, do we need this block? */
-		for (k=0;k<bytes;k++) {
-			for (j=0;j<256;j++) {
-				if ((encoding_map[i*256 + j] & (0xff << (k*8))) != 0)
-					break;
+	
+	/* Multibyte tables */
+	for ( ; tables[j].name && tables[j].multibyte; j++) {
+		cd = iconv_open (tables[j].name, UCS);
+		if (cd == (iconv_t) -1)
+			continue;
+		
+		for (c = 128, i = 0; c < 65535 && i < 65535; c++) {
+			inbuf = (char *) &c;
+			inleft = sizeof (c);
+			outbuf = in;
+			outleft = sizeof (in);
+			
+			if (iconv (cd, &inbuf, &inleft, &outbuf, &outleft) != (size_t) -1) {
+				/* this is a legal character in charset table[j].name */
+				iconv (cd, NULL, NULL, &outbuf, &outleft);
+				encoding_map[i++] |= bit;
+				encoding_map[c] |= bit;
+			} else {
+				/* reset the iconv descriptor */
+				iconv (cd, NULL, NULL, NULL, NULL);
 			}
-			if (j < 256) {
-				/* yes, dump it */
-				printf("static const unsigned char m%02x%x[256] = {\n\t", i, k);
-				for (j=0;j<256;j++) {
-					printf("0x%02x, ", (encoding_map[i*256+j] >> (k*8)) & 0xff );
-					if (((j+1)&7) == 0 && j<255)
-						printf("\n\t");
+		}
+		
+		iconv_close (cd);
+		
+		tables[j].bit = bit;
+		bit <<= 1;
+	}
+	
+	printf ("/* This file is automatically generated: DO NOT EDIT */\n\n");
+	
+	table_hash = g_hash_table_new_full (block_hash, block_equal, g_free, g_free);
+	
+	for (i = 0; i < 256; i++) {
+		for (k = 0; k < bytes; k++) {
+			char name[32], *alias;
+			int has_bits = FALSE;
+			
+			if (!block) {
+				/* we reuse malloc'd blocks that are not added to the
+				 * hash table to avoid unnecessary malloc/free's */
+				block = g_malloc (256);
+			}
+			
+			for (j = 0; j < 256; j++) {
+				if ((block[j] = (encoding_map[i * 256 + j] >> (k * 8)) & 0xff))
+					has_bits = TRUE;
+			}
+			
+			if (!has_bits)
+				continue;
+			
+			sprintf (name, "m%02x%x", i, k);
+			
+			if ((alias = g_hash_table_lookup (table_hash, block))) {
+				/* this block is identical to an earlier block, just alias it */
+				printf ("#define %s %s\n\n", name, alias);
+			} else {
+				/* unique block, dump it */
+				g_hash_table_insert (table_hash, block, g_strdup (name));
+				
+				printf ("static unsigned char %s[256] = {\n\t", name);
+				for (j = 0; j < 256; j++) {
+					printf ("0x%02x, ", block[j]);
+					if (((j + 1) & 7) == 0 && j < 255)
+						printf ("\n\t");
 				}
-				printf("\n};\n\n");
+				printf ("\n};\n\n");
+				
+				/* force the next loop to malloc a new block */
+				block = NULL;
 			}
 		}
 	}
-
-	printf("static const struct {\n");
-	for (k=0;k<bytes;k++) {
-		printf("\tconst unsigned char *bits%d;\n", k);
-	}
-	printf("} camel_charmap[256] = {\n\t");
-	for (i=0;i<256;i++) {
-		/* first, do we need this block? */
-		printf("{ ");
-		for (k=0;k<bytes;k++) {
-			for (j=0;j<256;j++) {
-				if ((encoding_map[i*256 + j] & (0xff << (k*8))) != 0)
+	
+	g_hash_table_destroy (table_hash);
+	g_free (block);
+	
+	printf ("struct {\n");
+	for (k = 0; k < bytes; k++)
+		printf ("\tunsigned char *bits%d;\n", k);
+	
+	printf ("} camel_charmap[256] = {\n\t");
+	for (i = 0; i < 256; i++) {
+		printf ("{ ");
+		for (k = 0; k < bytes; k++) {
+			for (j = 0; j < 256; j++) {
+				if ((encoding_map[i * 256 + j] & (0xff << (k * 8))) != 0)
 					break;
 			}
-			if (j < 256) {
-				printf("m%02x%x, ", i, k);
-			} else {
-				printf("NULL, ");
-			}
+			
+			if (j < 256)
+				printf ("m%02x%x, ", i, k);
+			else
+				printf ("NULL, ");
 		}
-		printf("}, ");
-		if (((i+1)&7) == 0 && i<255)
-			printf("\n\t");
+		
+		printf ("}, ");
+		if (((i + 1) & 3) == 0 && i < 255)
+			printf ("\n\t");
 	}
-	printf("\n};\n\n");
-
-	printf("static const struct {\n\tconst char *name;\n\tunsigned int bit;\n} camel_charinfo[] = {\n");
-	for (j=0;tables[j].name;j++) {
-		printf("\t{ \"%s\", 0x%04x },\n", tables[j].name, tables[j].bit);
-	}
-	printf("};\n\n");
-
-	printf("#define charset_mask(x) \\\n");
-	for (k=0;k<bytes;k++) {
-		if (k!=0)
-			printf("\t| ");
+	printf ("\n};\n\n");
+	
+	printf ("struct {\n\tconst char *name;\n\tunsigned int bit;\n} camel_charinfo[] = {\n");
+	for (j = 0; tables[j].name; j++)
+		printf ("\t{ \"%s\", 0x%08x },\n", tables[j].name, tables[j].bit);
+	printf ("};\n\n");
+	
+	printf ("#define charset_mask(x) \\\n");
+	for (k = 0; k < bytes; k++) {
+		if (k != 0)
+			printf ("\t| ");
 		else
-			printf("\t");
-		printf("(camel_charmap[(x)>>8].bits%d?camel_charmap[(x)>>8].bits%d[(x)&0xff]<<%d:0)", k, k, k*8);
-		if (k<bytes-1)
-			printf("\t\\\n");
+			printf ("\t");
+		
+		printf ("(camel_charmap[(x) >> 8].bits%d ? camel_charmap[(x) >> 8].bits%d[(x) & 0xff] << %d : 0)",
+			k, k, k * 8);
+		
+		if (k < bytes - 1)
+			printf ("\t\\\n");
 	}
-	printf("\n\n");
-
+	printf ("\n\n");
+	
 	return 0;
 }
 


[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]