Patch: Better multipart handling + HTML-to-text conversion on reply

From: Toralf Lund <toralf kscanners com>
To: Balsa Mailing List <balsa-list gnome org>
Subject: Patch: Better multipart handling + HTML-to-text conversion on reply
Date: Tue, 13 Nov 2001 10:27:43 +0100

A rerun of one of my favourite updates here. The attached patch 
will:

1. Provide a better part selection routine.
2. Implement part selection when quoting a message.
3. Introduce an HTML-to-text conversion step for replies to messages that 
are purely HTML.

-- 
Toralf Lund <toralf@kscanners.com>  +47 66 85 51 22
Kongsberg Scanners AS               +47 66 85 51 00 (switchboard)
http://www.kscanners.no/~toralf     +47 66 85 51 01 (fax)

Index: libbalsa/mime.c
===================================================================
RCS file: /cvs/gnome/balsa/libbalsa/mime.c,v
retrieving revision 1.41
diff -u -b -r1.41 mime.c
--- libbalsa/mime.c	2001/09/23 18:04:48	1.41
+++ libbalsa/mime.c	2001/11/13 09:18:24
@@ -20,21 +20,216 @@
  * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  
  * 02111-1307, USA.
  */
+#include <stdlib.h>
 
 #include <string.h>
+#include <ctype.h>
 #include "config.h"
 
 #include "libbalsa.h"
 #include "mailbackend.h"
 
-/* FIXME: The content of this file could go to message.c */
+#include <sys/wait.h>
+#include <sys/types.h>
+#include <libgnome/libgnome.h>
+
+
+
+
+#define HTML_TO_TEXT "lynx -force_html -dump file:%s > %s" /* *** FIXME: Set via ./configure */
 
-static GString *process_mime_multipart(LibBalsaMessage * message,
+
+static GString *
+process_mime_multipart(LibBalsaMessage * message,
                                        LibBalsaMessageBody * body,
-				       gchar * reply_prefix_str,
-				       gint llen, gboolean ignore_html,
-                                       gboolean flow);
+		       gchar * reply_prefix_str, gint llen,
+		       gboolean ignore_html, gboolean flow);
+
+/* FIXME: The content of this file could go to message.c */
+
+static gchar char_ref_char(const gchar *char_ref) 
+/* Pre : char_ref is the text between '&' and ';', e.g. "lt" or "#229".
+   Post: Returns the single character the reference is mapped to. */
+{
+    if(char_ref[0]=='#')	/* Numeric reference: digits follow the '#'. */
+	return (gchar )atoi(char_ref+1);
+    else if(g_strcasecmp(char_ref, "lt")==0)	/* These are essential: */
+	return '<';
+    else if(g_strcasecmp(char_ref, "gt")==0)
+	return '>';
+    else if(g_strcasecmp(char_ref, "amp")==0)
+	return '&';
+    else if(g_strcasecmp(char_ref, "nbsp")==0)
+	return ' ';
+    /* Some special characters I'm using a lot (ISO-8859-1 byte values). */
+    else if(strcmp(char_ref, "aelig")==0)
+	return '\xe6';
+    else if(strcmp(char_ref, "AElig")==0)
+	return '\xc6';
+    else if(strcmp(char_ref, "oslash")==0)
+	return '\xf8';
+    else if(strcmp(char_ref, "Oslash")==0)
+	return '\xd8';
+    else if(strcmp(char_ref, "aring")==0)
+	return '\xe5';
+    else if(strcmp(char_ref, "Aring")==0)
+	return '\xc5';
+    /* Rest: first character, as reference is often "<base><accent info>". */
+    return char_ref[0];
+}
+
+static gchar *extract_tag(gchar **html, gchar end_sep)
+/* Pre : *html is at start of tag, i.e. points to '<' or similar
+   Post: *html points to position after end of tag, or at the terminating
+         NUL if end_sep was not found. Returns the text between the start
+         character and end_sep as a new string; caller must g_free() it. */
+{
+    gchar *end=strchr(*html, end_sep), *tag;
+
+    if(!end)			/* No end_sep, take rest of string. */
+	end=*html+strlen(*html);
+    tag=g_strndup(*html+1, end-*html-1);
+    *html=(*end ? end+1 : end);	/* Never step past the terminating NUL. */
+    return tag;
+}
+
+static gchar *convert_html_internal(const gchar *html)
+/* Remarks: Fall-back used when no real HTML converter was configured, or 
+            call failed. 
+   Pre    : html is a NUL-terminated string.
+   Post   : Returns a new string holding a rough plain-text rendering of
+            html: tags are dropped or turned into line breaks, link targets
+            are shown in brackets, character references are decoded and
+            whitespace runs are collapsed. Caller must g_free() it. */
+{
+    /* No construct below emits more characters than it consumes, so a
+       zero-filled buffer of the input's size suffices. g_malloc0() rather
+       than calloc() since callers release the result with g_free(). */
+    gchar *txt=g_malloc0(strlen(html)+1);
+    gchar *c_html=(gchar *)html, *c_txt=txt;
+    gboolean skip=FALSE;	/* TRUE from a "head..." tag until "body..." */
+
+    while(*c_html) {
+	if(*c_html=='<') {
+	    gchar *tag=extract_tag(&c_html, '>');
+	    if(g_strncasecmp(tag, "head", 4)==0) {
+		skip=TRUE;
+	    } else if(g_strncasecmp(tag, "body", 4)==0) {
+		skip=FALSE;
+	    } 
+	    if(!skip) {
+		if(g_strcasecmp(tag, "br")==0 || 
+		   g_strcasecmp(tag, "tr")==0) {
+		    *c_txt++='\n';
+		} else if(g_strcasecmp(tag, "p")==0 ||
+			  (tolower((unsigned char)tag[0])=='h' && 
+			   isdigit((unsigned char)tag[1]))) {
+		    *c_txt++='\n';
+		    *c_txt++='\n';
+		} else if(g_strcasecmp(tag, "li")==0) {
+		    *c_txt++='\n';
+		    *c_txt++='*';
+		    *c_txt++='\t';
+		} else if(g_strcasecmp(tag, "td")==0 || 
+			  g_strcasecmp(tag, "th")==0) {
+		 /* Note: Can't do a lot more than starting a new
+			  paragraph (more or less), since we are not
+			  able to interleave text from different cells. */
+		    *c_txt++='\n';
+		    *c_txt++='\n';
+		    *c_txt++='\t';
+		} else {
+		    gchar *ref=strstr(tag, "href=");
+		    if(!ref) { /* *** Want 'strcasestr' */
+			ref=strstr(tag, "HREF=");
+		    }
+		    if(ref) {	/* Show link target as "[target]" */
+			*c_txt++='[';
+			ref+=5;
+			while(*ref && *ref!=' ')
+			    *c_txt++=*ref++;
+			*c_txt++=']';
+		    }
+		}
+	    }
+	    g_free(tag);
+	} else if(skip) {
+	    c_html++;
+	} else if(*c_html=='&') {
+	    gchar *char_ref=extract_tag(&c_html, ';');
+	    *c_txt++=char_ref_char(char_ref);
+	    g_free(char_ref);
+	} else if(*c_html=='\n' || *c_html==' ' || *c_html=='\t') {
+	    /* Collapse whitespace runs and drop leading whitespace. Only look
+	       at the previous output character once we know there is one. */
+	    c_html++;
+	    if(c_txt!=txt && 
+	       c_txt[-1]!=' ' && c_txt[-1]!='\n' && c_txt[-1]!='\t')
+		*c_txt++=' ';
+	} else {
+	    *c_txt++=*c_html++;
+	}
+    }
+    return txt;
+}
+
 
+
+gchar *convert_html(const gchar *html)
+/* Pre : html is a NUL-terminated string.
+   Post: Returns new plain text for html, produced by the HTML_TO_TEXT command
+         if that runs successfully, else by convert_html_internal(). */
+{
+#ifdef HTML_TO_TEXT
+    gchar html_file[PATH_MAX + 1], txt_file[PATH_MAX + 1];
+    gchar *txt=NULL;
+    FILE *fp;
+
+    libbalsa_lock_mutt();
+    mutt_mktemp(html_file);
+    mutt_mktemp(txt_file);
+    libbalsa_unlock_mutt();
+
+    fp=safe_fopen(html_file, "w");
+    if(fp) {
+	/* Sized for the template plus both names: cmd is never truncated. */
+	gchar cmd[sizeof(HTML_TO_TEXT) + 2*PATH_MAX];
+	gboolean written=(fputs(html, fp)!=EOF);
+	pid_t cmd_pid=-1;
+	int status;
+	if(fclose(fp)!=0)	/* Buffered data may fail to reach the file. */
+	    written=FALSE;
+	snprintf(cmd, sizeof(cmd), HTML_TO_TEXT, html_file, txt_file);
+
+	/* Note: Should probably find gnome_ alternative to fork() + exec(),
+	         but this is at least better than system() (gnome_execute_shell
+		 is not what we want as it will start process in the background 
+	*/
+	if(written && (cmd_pid=fork())==0) {
+	    gchar *shell=gnome_util_user_shell();
+	    execl(shell, shell, "-c", cmd, (char *)NULL);
+	    g_free(shell);	/* In case exec fails */
+	    _exit(1);
+	} else if(cmd_pid>0 && waitpid(cmd_pid, &status, 0)==cmd_pid &&
+		  WIFEXITED(status) && WEXITSTATUS(status)==0) {
+	    /* Only trust the output file if the converter succeeded. */
+	    fp=fopen(txt_file, "r");
+	    if(fp) {
+		libbalsa_readfile(fp, &txt);
+		fclose(fp);
+	    }
+	}
+    }
+    unlink(txt_file);
+    unlink(html_file);
+    if(txt)
+	return txt;
+#endif
+    return convert_html_internal(html);
+}
+
+
+
 /* process_mime_part:
    returns string representation of given message part.
    NOTE: may return NULL(!).
@@ -48,6 +243,9 @@
     size_t alloced;
     gchar *res = NULL;
     GString *reply = NULL;
+    gchar *content_type = libbalsa_message_body_get_content_type(body);
+    gboolean ishtml=(g_strcasecmp(content_type, "text/html") == 0);
+
 
     switch (libbalsa_message_body_type(body)) {
     case LIBBALSA_MESSAGE_BODY_TYPE_OTHER:
@@ -63,11 +261,6 @@
                                        llen, ignore_html, flow);
 	break;
     case LIBBALSA_MESSAGE_BODY_TYPE_TEXT:
-	/* don't return text/html stuff... */
-	if (ignore_html && body->mutt_body->subtype &&
-	    !strcmp("html", body->mutt_body->subtype))
-	    break;
-
 	libbalsa_message_body_save_temporary(body, NULL);
 
 	part = fopen(body->temp_filename, "r");
@@ -78,6 +271,16 @@
 	if (!res)
 	    break;
 
+	if(ishtml) {
+	    gchar *res_ascii=convert_html(res);
+	    
+	    if(res_ascii) {
+		g_free(res);
+		res=res_ascii;
+	    }
+	}
+	    
+
 	if (llen > 0) {
             if (flow && libbalsa_flowed_rfc2646(body)) {
                 /* we're making a `format=flowed' message, and the
@@ -126,6 +329,8 @@
 	g_free(res);
 	break;
     }
+    g_free(content_type);	
+
     return reply;
 }
 
@@ -137,10 +342,22 @@
 {
     LibBalsaMessageBody *part;
     GString *res = NULL, *s;
+    gchar *content_type = libbalsa_message_body_get_content_type(body);
 
+    fprintf(stderr, "process_mime_multipart\n");
+
+    if(g_strcasecmp(content_type, "multipart/alternative")==0) {
+	part = preferred_part(body->parts, ignore_html);
+	if(part)
+	    res = process_mime_part(message, part, reply_prefix_str, llen,
+				    ignore_html, flow);
+    } else {
     for (part = body->parts; part; part = part->next) {
+	    if(part->mutt_body->disposition==DISPINLINE)
 	s = process_mime_part(message, part, reply_prefix_str, llen,
                           ignore_html, flow);
+	    else
+		s = NULL;
 	if (!s)
 	    continue;
 	if (res) {
@@ -149,6 +366,9 @@
 	} else
 	    res = s;
     }
+    }
+
+    g_free(content_type);
     return res;
 }
 
@@ -173,4 +393,38 @@
     }
 
     return reply;
+}
+
+LibBalsaMessageBody*
+preferred_part(LibBalsaMessageBody *parts, gboolean ignore_html)
+/* Remarks: Walks the list starting at parts and returns the LAST part that
+            is acceptable, which should be considered the best according to
+            RFC 1341. text/plain is always acceptable; text/html and
+            multipart/related only when ignore_html is FALSE. A text/...
+            part of any other subtype is taken only while nothing has been
+            picked yet. If no part qualifies, the last part in the list is
+            returned (NULL for an empty list). */
+{
+    /* TODO: - Look for additional specific types, and have more flags 
+               to control their selection?
+             - Let user preferences or previous selections override order 
+	       (e.g. choose text/html part even when text/plain is included
+	       after it)? */
+    LibBalsaMessageBody *cur, *chosen=NULL, *tail=NULL;
+
+    for(cur=parts; cur; tail=cur, cur=cur->next) {
+	gchar *type=libbalsa_message_body_get_content_type(cur);
+	gboolean is_plain=(g_strcasecmp(type, "text/plain")==0);
+	gboolean is_rich=(!ignore_html &&
+			  (g_strcasecmp(type, "text/html")==0 ||
+			   g_strcasecmp(type, "multipart/related")==0));
+	gboolean is_text=(!chosen && g_strncasecmp(type, "text/", 5)==0);
+
+	g_free(type);
+	if(is_plain || is_rich || is_text)
+	    chosen=cur;
+    }
+    if(!chosen)
+	chosen=tail;
+    return chosen;
 }
Index: libbalsa/mime.h
===================================================================
RCS file: /cvs/gnome/balsa/libbalsa/mime.h,v
retrieving revision 1.16
diff -u -b -r1.16 mime.h
--- libbalsa/mime.h	2001/09/23 18:04:48	1.16
+++ libbalsa/mime.h	2001/11/13 09:18:24
@@ -31,8 +31,14 @@
 			   LibBalsaMessageBody * body,
 			   gchar * reply_prefix_str, gint llen,
 			   gboolean ignore_html, gboolean flow);
+
 GString *content2reply(LibBalsaMessage * message,
 		       gchar * reply_prefix_str, gint llen,
 		       gboolean ignore_html, gboolean flow);	/* arp */
+
+gchar *convert_html(const gchar *html);
+
+LibBalsaMessageBody*
+preferred_part(LibBalsaMessageBody *parts, gboolean ignore_html);
 
 #endif				/* __LIBBALSA_MIME_H__ */

Follow-Ups:
- Re: Patch: Better multipart handling + HTML-to-text conversion on reply
  - From: Toralf Lund

[Date Prev][Date Next] [Thread Prev][Thread Next] [Thread Index] [Date Index] [Author Index]