[geary/wip/362-utf8-attachments: 3/3] Don't munge Base64 encoded text parts when saving to disk



commit 2cb0a83342bcdab3375b866fd549ec5b5c9dd744
Author: Michael Gratton <mike vee net>
Date:   Thu Jul 18 14:38:38 2019 +1000

    Don't munge Base64 encoded text parts when saving to disk
    
    Avoid doing character encoding conversion and CRLF stripping when saving
    text parts to disk. This prevents UTF-8 encoded text attachments from
    being broken when base64 encoded and no encoding is specified.
    
    See #362

 .../conversation-viewer/conversation-message.vala  |  5 +-
 src/engine/imap-db/imap-db-attachment.vala         |  2 +-
 src/engine/rfc822/rfc822-message-data.vala         |  4 +-
 src/engine/rfc822/rfc822-message.vala              |  1 +
 src/engine/rfc822/rfc822-part.vala                 | 63 +++++++++++++++++-----
 src/engine/rfc822/rfc822-utils.vala                | 30 -----------
 test/engine/rfc822-part-test.vala                  | 16 ++++--
 7 files changed, 73 insertions(+), 48 deletions(-)
---
diff --git a/src/client/conversation-viewer/conversation-message.vala 
b/src/client/conversation-viewer/conversation-message.vala
index c8fef70b..a52b6ff8 100644
--- a/src/client/conversation-viewer/conversation-message.vala
+++ b/src/client/conversation-viewer/conversation-message.vala
@@ -943,7 +943,10 @@ public class ConversationMessage : Gtk.Grid, Geary.BaseInterface {
         }
 
         try {
-            this.web_view.add_internal_resource(id, part.write_to_buffer());
+            this.web_view.add_internal_resource(
+                id,
+                part.write_to_buffer(Geary.RFC822.Part.EncodingConversion.UTF8)
+            );
         } catch (Geary.RFC822Error err) {
             debug("Failed to get inline buffer: %s", err.message);
             return null;
diff --git a/src/engine/imap-db/imap-db-attachment.vala b/src/engine/imap-db/imap-db-attachment.vala
index 540222d7..f2aacc25 100644
--- a/src/engine/imap-db/imap-db-attachment.vala
+++ b/src/engine/imap-db/imap-db-attachment.vala
@@ -183,7 +183,7 @@ private class Geary.ImapDB.Attachment : Geary.Attachment {
             stream, GMime.StreamBufferMode.BLOCK_WRITE
         );
 
-        part.write_to_stream(stream);
+        part.write_to_stream(stream, RFC822.Part.EncodingConversion.NONE);
 
         // Using the stream's length is a bit of a hack, but at
         // least on one system we are getting 0 back for the file
diff --git a/src/engine/rfc822/rfc822-message-data.vala b/src/engine/rfc822/rfc822-message-data.vala
index 9476ce78..f5eeb628 100644
--- a/src/engine/rfc822/rfc822-message-data.vala
+++ b/src/engine/rfc822/rfc822-message-data.vala
@@ -414,7 +414,9 @@ public class Geary.RFC822.PreviewText : Geary.RFC822.Text {
                 output_stream.set_owner(false);
 
                 try {
-                    part.write_to_stream(output_stream);
+                    part.write_to_stream(
+                        output_stream, Part.EncodingConversion.NONE
+                    );
                     uint8[] data = output.data;
                     data += (uint8) '\0';
 
diff --git a/src/engine/rfc822/rfc822-message.vala b/src/engine/rfc822/rfc822-message.vala
index 9eba825b..e140fad3 100644
--- a/src/engine/rfc822/rfc822-message.vala
+++ b/src/engine/rfc822/rfc822-message.vala
@@ -596,6 +596,7 @@ public class Geary.RFC822.Message : BaseObject, EmailHeaderSet {
 
             if (content_type.is_type("text", text_subtype)) {
                 body = part.write_to_buffer(
+                    Part.EncodingConversion.UTF8,
                     to_html ? Part.BodyFormatting.HTML : Part.BodyFormatting.NONE
                 ).to_string();
             } else if (replacer != null &&
diff --git a/src/engine/rfc822/rfc822-part.vala b/src/engine/rfc822/rfc822-part.vala
index c7d941fd..cfbdde90 100644
--- a/src/engine/rfc822/rfc822-part.vala
+++ b/src/engine/rfc822/rfc822-part.vala
@@ -16,7 +16,17 @@
 public class Geary.RFC822.Part : Object {
 
 
-    /** Specifies a format to apply to body data when writing it. */
+    /** Specifies a character set encoding when writing text bodies. */
+    public enum EncodingConversion {
+
+        /** No conversion will be applied. */
+        NONE,
+
+        /** Plain text bodies will be converted to UTF-8. */
+        UTF8;
+    }
+
+    /** Specifies a format to apply when writing text bodies. */
     public enum BodyFormatting {
 
         /** No formatting will be applied. */
@@ -133,18 +143,20 @@ public class Geary.RFC822.Part : Object {
         return filename;
     }
 
-    public Memory.Buffer write_to_buffer(BodyFormatting format = BodyFormatting.NONE)
+    public Memory.Buffer write_to_buffer(EncodingConversion conversion,
+                                         BodyFormatting format = BodyFormatting.NONE)
         throws RFC822Error {
         ByteArray byte_array = new ByteArray();
         GMime.StreamMem stream = new GMime.StreamMem.with_byte_array(byte_array);
         stream.set_owner(false);
 
-        write_to_stream(stream, format);
+        write_to_stream(stream, conversion, format);
 
         return new Geary.Memory.ByteBuffer.from_byte_array(byte_array);
     }
 
     internal void write_to_stream(GMime.Stream destination,
+                                  EncodingConversion conversion,
                                   BodyFormatting format = BodyFormatting.NONE)
         throws RFC822Error {
         GMime.DataWrapper? wrapper = (this.source_part != null)
@@ -156,22 +168,40 @@ public class Geary.RFC822.Part : Object {
             );
         }
 
-        Mime.ContentType content_type = this.get_effective_content_type();
-        if (content_type.is_type("text", Mime.ContentType.WILDCARD)) {
-            // Assume encoded text, convert to unencoded UTF-8
+        if (this.content_type.is_type("text", Mime.ContentType.WILDCARD)) {
             GMime.StreamFilter filter = new GMime.StreamFilter(destination);
-            string? charset = content_type.params.get_value("charset");
-            filter.add(
-                Geary.RFC822.Utils.create_utf8_filter_charset(charset)
-            );
+
+            // Do charset conversion if needed
+            string? charset = this.content_type.params.get_value("charset");
+            if (String.is_empty(charset)) {
+                // Fallback charset per RFC 2045, Section 5.2
+                charset = "US-ASCII";
+            }
+            if (conversion == UTF8 && !is_utf_8(charset)) {
+                print("\nconverting!\n");
+                GMime.FilterCharset? filter_charset = new GMime.FilterCharset(
+                    charset, Geary.RFC822.UTF8_CHARSET
+                );
+                if (filter_charset == null) {
+                    // Source charset not supported, so assume
+                    // US-ASCII
+                    filter_charset = new GMime.FilterCharset(
+                        "US-ASCII", Geary.RFC822.UTF8_CHARSET
+                    );
+                }
+                filter.add(filter_charset);
+            }
 
             bool flowed = content_type.params.has_value_ci("format", "flowed");
             bool delsp = content_type.params.has_value_ci("DelSp", "yes");
 
             // Remove the CR's in any CRLF sequence since they are
             // effectively a wire encoding, unless the format requires
-            // them.
-            if (!(content_type.media_subtype in CR_PRESERVING_TEXT_TYPES)) {
+            // them or the content encoding is Base64 (being a binary
+            // format)
+            if ((this.source_part == null ||
+                 this.source_part.encoding != BASE64) &&
+                !(content_type.media_subtype in CR_PRESERVING_TEXT_TYPES)) {
                 filter.add(new GMime.FilterCRLF(false, false));
             }
 
@@ -207,3 +237,12 @@ public class Geary.RFC822.Part : Object {
     }
 
 }
+
+private inline bool is_utf_8(string charset) {
+    return (
+        charset == "UTF-8" ||
+        charset == "utf-8" ||
+        charset == "UTF8" ||
+        charset == "utf8"
+    );
+}
\ No newline at end of file
diff --git a/src/engine/rfc822/rfc822-utils.vala b/src/engine/rfc822/rfc822-utils.vala
index 7672d8ae..dfa4180e 100644
--- a/src/engine/rfc822/rfc822-utils.vala
+++ b/src/engine/rfc822/rfc822-utils.vala
@@ -11,36 +11,6 @@ namespace Geary.RFC822.Utils {
 // in UTF-8, and is unmolested by GMime.FilterHTML.
 public const char QUOTE_MARKER = '\x7f';
 
-/**
- * Charset to use when it is otherwise missing or invalid
- *
- * Per RFC 2045, Section 5.2.
- */
-public const string DEFAULT_MIME_CHARSET = "us-ascii";
-
-/**
- * Creates a filter to convert a MIME charset to UTF-8.
- *
- * Param `from_charset` may be null, empty or invalid, in which case
- * `DEFAULT_MIME_CHARSET` will be used instead.
- */
-public GMime.FilterCharset create_utf8_filter_charset(string? from_charset) {
-    string actual_charset = from_charset != null ? from_charset.strip() : "";
-    if (Geary.String.is_empty(actual_charset)) {
-        actual_charset = DEFAULT_MIME_CHARSET;
-    }
-    GMime.FilterCharset? filter_charset = new GMime.FilterCharset(
-        actual_charset, Geary.RFC822.UTF8_CHARSET
-    );
-    if (filter_charset == null) {
-        debug("Unknown charset: %s; using RFC 2045 default instead", from_charset);
-        filter_charset = new GMime.FilterCharset(
-            DEFAULT_MIME_CHARSET, Geary.RFC822.UTF8_CHARSET
-        );
-        assert(filter_charset != null);
-    }
-    return filter_charset;
-}
 
 /**
  * Uses the best-possible transfer of bytes from the Memory.Buffer to the GMime.StreamMem object.
diff --git a/test/engine/rfc822-part-test.vala b/test/engine/rfc822-part-test.vala
index ab5f026e..3af30ec0 100644
--- a/test/engine/rfc822-part-test.vala
+++ b/test/engine/rfc822-part-test.vala
@@ -10,6 +10,7 @@ class Geary.RFC822.PartTest : TestCase {
     private const string CR_BODY = "This is an attachment.\n";
     private const string CRLF_BODY = "This is an attachment.\r\n";
     private const string ICAL_BODY = "BEGIN:VCALENDAR\r\nEND:VCALENDAR\r\n";
+    private const string UTF8_BODY = "Тест.";
 
 
     public PartTest() {
@@ -19,6 +20,7 @@ class Geary.RFC822.PartTest : TestCase {
         add_test("write_to_buffer_plain", write_to_buffer_plain);
         add_test("write_to_buffer_plain_crlf", write_to_buffer_plain_crlf);
         add_test("write_to_buffer_plain_ical", write_to_buffer_plain_ical);
+        add_test("write_to_buffer_plain_utf8", write_to_buffer_plain_utf8);
     }
 
     public void new_from_minimal_mime_part() throws Error {
@@ -56,7 +58,7 @@ class Geary.RFC822.PartTest : TestCase {
     public void write_to_buffer_plain() throws Error {
         Part test = new Part(new_part("text/plain", CR_BODY.data));
 
-        Memory.Buffer buf = test.write_to_buffer();
+        Memory.Buffer buf = test.write_to_buffer(Part.EncodingConversion.NONE);
 
         assert_string(CR_BODY, buf.to_string());
     }
@@ -64,7 +66,7 @@ class Geary.RFC822.PartTest : TestCase {
     public void write_to_buffer_plain_crlf() throws Error {
         Part test = new Part(new_part("text/plain", CRLF_BODY.data));
 
-        Memory.Buffer buf = test.write_to_buffer();
+        Memory.Buffer buf = test.write_to_buffer(Part.EncodingConversion.NONE);
 
         // CRLF should be stripped
         assert_string(CR_BODY, buf.to_string());
@@ -73,12 +75,20 @@ class Geary.RFC822.PartTest : TestCase {
     public void write_to_buffer_plain_ical() throws Error {
         Part test = new Part(new_part("text/calendar", ICAL_BODY.data));
 
-        Memory.Buffer buf = test.write_to_buffer();
+        Memory.Buffer buf = test.write_to_buffer(Part.EncodingConversion.NONE);
 
         // CRLF should not be stripped
         assert_string(ICAL_BODY, buf.to_string());
     }
 
+    public void write_to_buffer_plain_utf8() throws GLib.Error {
+        Part test = new Part(new_part("text/plain", UTF8_BODY.data));
+
+        Memory.Buffer buf = test.write_to_buffer(Part.EncodingConversion.NONE);
+
+        assert_string(UTF8_BODY, buf.to_string());
+    }
+
     private GMime.Part new_part(string? mime_type,
                                 uint8[] body,
                                 GMime.ContentEncoding encoding = GMime.ContentEncoding.DEFAULT) {


[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]