[evolution-data-server] evo-I#1621 - Prevent IDN homograph attacks



commit 9a330f2831c5d50f3e902166ad819daf93d53c8d
Author: Milan Crha <mcrha redhat com>
Date:   Thu Sep 30 14:39:01 2021 +0200

    evo-I#1621 - Prevent IDN homograph attacks
    
    Add methods to help to recognize when the domain should be in ASCII
    and use it on appropriate places.
    
    Related to https://gitlab.gnome.org/GNOME/evolution/-/issues/1621

 src/addressbook/libebook/e-destination.c |  42 +-
 src/camel/CMakeLists.txt                 |   2 +
 src/camel/camel-hostname-utils.c         | 681 +++++++++++++++++++++++++++++++
 src/camel/camel-hostname-utils.h         |  33 ++
 src/camel/camel-internet-address.c       |  54 ++-
 src/camel/camel-internet-address.h       |   2 +
 src/camel/camel-message-info-base.c      |  53 +--
 src/camel/camel-mime-filter-tohtml.c     |  32 +-
 src/camel/camel-net-utils.c              | 143 +++++++
 src/camel/camel-net-utils.h              |  11 +
 src/camel/camel-string-utils.c           |  25 ++
 src/camel/camel-string-utils.h           |   2 +
 src/camel/camel.h                        |   1 +
 src/camel/tests/misc/CMakeLists.txt      |   1 +
 src/camel/tests/misc/test3.c             | 132 ++++++
 15 files changed, 1165 insertions(+), 49 deletions(-)
---
diff --git a/src/addressbook/libebook/e-destination.c b/src/addressbook/libebook/e-destination.c
index 475bbe45f..cf841004f 100644
--- a/src/addressbook/libebook/e-destination.c
+++ b/src/addressbook/libebook/e-destination.c
@@ -487,12 +487,14 @@ e_destination_set_contact (EDestination *dest,
 
                                                raw = e_vcard_attribute_get_value (attr->data);
                                                addr = camel_internet_address_new ();
-                                               if (camel_address_unformat (CAMEL_ADDRESS (addr), raw) > 0 &&
-                                                   camel_internet_address_get (addr, 0, &name, &email)) {
-                                                       e_destination_set_name (s_dest, name);
-                                                       e_destination_set_email (s_dest, email);
-
-                                                       dest->priv->list_alldests = g_list_append 
(dest->priv->list_alldests, s_dest);
+                                               if (camel_address_unformat (CAMEL_ADDRESS (addr), raw) > 0) {
+                                                       camel_internet_address_sanitize_ascii_domain (addr);
+                                                       if (camel_internet_address_get (addr, 0, &name, 
&email)) {
+                                                               e_destination_set_name (s_dest, name);
+                                                               e_destination_set_email (s_dest, email);
+
+                                                               dest->priv->list_alldests = g_list_append 
(dest->priv->list_alldests, s_dest);
+                                                       }
                                                }
 
                                                g_object_unref (addr);
@@ -743,13 +745,15 @@ e_destination_set_email (EDestination *dest,
 
        if (email == NULL) {
                if (dest->priv->email != NULL) {
-                       g_free (dest->priv->addr);
-                       dest->priv->addr = NULL;
+                       g_free (dest->priv->email);
+                       dest->priv->email = NULL;
                        changed = TRUE;
                }
        } else if (dest->priv->email == NULL || strcmp (dest->priv->email, email)) {
                g_free (dest->priv->email);
-               dest->priv->email = g_strdup (email);
+               dest->priv->email = camel_utils_sanitize_ascii_domain_in_address (email, TRUE);
+               if (!dest->priv->email)
+                       dest->priv->email = g_strdup (email);
                changed = TRUE;
        }
 
@@ -995,6 +999,7 @@ e_destination_get_email (const EDestination *dest)
 
                        if (camel_address_unformat (CAMEL_ADDRESS (addr), priv->raw)) {
                                const gchar *camel_email = NULL;
+                               camel_internet_address_sanitize_ascii_domain (addr);
                                if (camel_internet_address_get (addr, 0, NULL, &camel_email))
                                        priv->email = g_strdup (camel_email);
                        }
@@ -1066,12 +1071,16 @@ e_destination_get_address (const EDestination *dest)
 
        if (e_destination_is_evolution_list (dest)) {
                destination_get_address (dest, addr);
+               camel_internet_address_sanitize_ascii_domain (addr);
                priv->addr = camel_address_encode (CAMEL_ADDRESS (addr));
        } else if (priv->raw) {
-               if (camel_address_unformat (CAMEL_ADDRESS (addr), priv->raw))
+               if (camel_address_unformat (CAMEL_ADDRESS (addr), priv->raw)) {
+                       camel_internet_address_sanitize_ascii_domain (addr);
                        priv->addr = camel_address_encode (CAMEL_ADDRESS (addr));
+               }
        } else {
                destination_get_address (dest, addr);
+               camel_internet_address_sanitize_ascii_domain (addr);
                priv->addr = camel_address_encode (CAMEL_ADDRESS (addr));
        }
 
@@ -1096,9 +1105,17 @@ e_destination_set_raw (EDestination *dest,
        g_return_if_fail (raw != NULL);
 
        if (dest->priv->raw == NULL || strcmp (dest->priv->raw, raw)) {
+               CamelInternetAddress *addr = camel_internet_address_new ();
 
                e_destination_clear (dest);
-               dest->priv->raw = g_strdup (raw);
+
+               if (camel_address_unformat (CAMEL_ADDRESS (addr), raw) > 0 &&
+                   camel_internet_address_sanitize_ascii_domain (addr))
+                       dest->priv->raw = camel_address_format (CAMEL_ADDRESS (addr));
+               else
+                       dest->priv->raw = g_strdup (raw);
+
+               g_object_unref (addr);
 
                g_signal_emit (dest, signals[CHANGED], 0);
        }
@@ -1133,11 +1150,12 @@ e_destination_get_textrep (const EDestination *dest,
                return name;
 
        /* Make sure that our address gets quoted properly */
-       if (name && email && dest->priv->textrep == NULL) {
+       if (email && dest->priv->textrep == NULL) {
                CamelInternetAddress *addr = camel_internet_address_new ();
 
                camel_internet_address_add (addr, name, email);
                g_free (dest->priv->textrep);
+               camel_internet_address_sanitize_ascii_domain (addr);
                dest->priv->textrep = camel_address_format (CAMEL_ADDRESS (addr));
                g_object_unref (addr);
        }
diff --git a/src/camel/CMakeLists.txt b/src/camel/CMakeLists.txt
index 7058b9c08..df537f992 100644
--- a/src/camel/CMakeLists.txt
+++ b/src/camel/CMakeLists.txt
@@ -46,6 +46,7 @@ set(SOURCES
        camel-folder-thread.c
        camel-folder.c
        camel-gpg-context.c
+       camel-hostname-utils.c
        camel-html-parser.c
        camel-iconv.c
        camel-index.c
@@ -185,6 +186,7 @@ set(HEADERS
        camel-folder-thread.h
        camel-folder.h
        camel-gpg-context.h
+       camel-hostname-utils.h
        camel-html-parser.h
        camel-iconv.h
        camel-index.h
diff --git a/src/camel/camel-hostname-utils.c b/src/camel/camel-hostname-utils.c
new file mode 100644
index 000000000..956f6c1cf
--- /dev/null
+++ b/src/camel/camel-hostname-utils.c
@@ -0,0 +1,681 @@
+/* -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */
+/*
+ * Copyright (C) 2021 Red Hat (www.redhat.com)
+ *
+ * This library is free software: you can redistribute it and/or modify it
+ * under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation.
+ *
+ * This library is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this library. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+/*
+ * This code is based on WebKit's URL Helpers:
+ * https://trac.webkit.org/browser/webkit/trunk/Source/WTF/wtf/URLHelpers.cpp?rev=278879
+ */
+
+#include "evolution-data-server-config.h"
+
+#include <unicode/uchar.h>
+#include <unicode/uscript.h>
+
+#include "camel-string-utils.h"
+#include "camel-hostname-utils.h"
+
+/* This needs to be higher than the UScriptCode for any of the scripts on the IDN allowed list.
+ * At one point we used USCRIPT_CODE_LIMIT from ICU, but there are two reasons not to use it.
+ * 1) ICU considers it deprecated, so by setting U_HIDE_DEPRECATED we can't see it.
+ * 2) No good reason to limit ourselves to scripts that existed in the ICU headers when
+ *    WebKit was compiled.
+ * This is only really important for platforms that load an external IDN allowed script list.
+ * Not important for the compiled-in one.
+ */
+#define SCRIPT_CODE_LIMIT 256
+
+static guint32 allowed_idn_script_bits[(SCRIPT_CODE_LIMIT + 31) / 32];
+
+static gpointer
+camel_hostname_utils_init_global_memory (gpointer user_data)
+{
+       const UScriptCode scripts[] = {
+               USCRIPT_COMMON,
+               USCRIPT_INHERITED,
+               USCRIPT_ARABIC,
+               USCRIPT_ARMENIAN,
+               USCRIPT_BOPOMOFO,
+               USCRIPT_CANADIAN_ABORIGINAL,
+               USCRIPT_DEVANAGARI,
+               USCRIPT_DESERET,
+               USCRIPT_GUJARATI,
+               USCRIPT_GURMUKHI,
+               USCRIPT_HANGUL,
+               USCRIPT_HAN,
+               USCRIPT_HEBREW,
+               USCRIPT_HIRAGANA,
+               USCRIPT_KATAKANA_OR_HIRAGANA,
+               USCRIPT_KATAKANA,
+               USCRIPT_LATIN,
+               USCRIPT_TAMIL,
+               USCRIPT_THAI,
+               USCRIPT_YI
+       };
+       guint ii;
+
+       for (ii = 0; ii < G_N_ELEMENTS (scripts); ii++) {
+               gint32 script = (gint32) scripts[ii];
+               if (script >= 0 && script < SCRIPT_CODE_LIMIT) {
+                       guint32 index = script / 32;
+                       guint32 mask = 1 << (script % 32);
+                       allowed_idn_script_bits[index] |= mask;
+               }
+       }
+
+       return NULL;
+}
+
+static gboolean
+is_lookalike_character_for_script (UScriptCode expected_script,
+                                  UChar32 code_point)
+{
+       switch (code_point) {
+       case 0x0548: /* ARMENIAN CAPITAL LETTER VO */
+       case 0x054D: /* ARMENIAN CAPITAL LETTER SEH */
+       case 0x0551: /* ARMENIAN CAPITAL LETTER CO */
+       case 0x0555: /* ARMENIAN CAPITAL LETTER OH */
+       case 0x0578: /* ARMENIAN SMALL LETTER VO */
+       case 0x057D: /* ARMENIAN SMALL LETTER SEH */
+       case 0x0581: /* ARMENIAN SMALL LETTER CO */
+       case 0x0585: /* ARMENIAN SMALL LETTER OH */
+               return expected_script == USCRIPT_ARMENIAN;
+       case 0x0BE6: /* TAMIL DIGIT ZERO */
+               return expected_script == USCRIPT_TAMIL;
+       default:
+               return FALSE;
+       }
+}
+
+static gboolean
+is_of_script_type (UScriptCode expected_script,
+                  UChar32 code_point)
+{
+       UErrorCode error = U_ZERO_ERROR;
+       UScriptCode script = uscript_getScript (code_point, &error);
+       if (error != U_ZERO_ERROR)
+               return FALSE;
+
+       return script == expected_script;
+}
+
+static gboolean
+is_ascii_digit_or_punctuation (UChar32 character)
+{
+       return (character >= '!' && character <= '@') || (character >= '[' && character <= '`') || (character 
= '{' && character <= '~');
+}
+
+static gboolean
+is_ascii_digit_or_valid_host_character (UChar32 character)
+{
+       if (!is_ascii_digit_or_punctuation (character))
+               return FALSE;
+
+       /* Things the URL Parser rejects: */
+       switch (character) {
+       case '#':
+       case '%':
+       case '/':
+       case ':':
+       case '?':
+       case '@':
+       case '[':
+       case '\\':
+       case ']':
+               return FALSE;
+       default:
+               return TRUE;
+       }
+}
+
+static gboolean
+is_lookalike_sequence (UScriptCode expected_script,
+                      UChar32 previous_code_point,
+                      UChar32 code_point)
+{
+       if (!previous_code_point || previous_code_point == '/')
+               return FALSE;
+
+       return (is_lookalike_character_for_script (expected_script, code_point) && !(is_of_script_type 
(expected_script, previous_code_point) ||
+               is_ascii_digit_or_valid_host_character (previous_code_point))) ||
+              (is_lookalike_character_for_script (expected_script, previous_code_point) && 
!(is_of_script_type (expected_script, code_point) ||
+               is_ascii_digit_or_valid_host_character (code_point)));
+}
+
+static gboolean
+is_lookalike_character (UChar32 previous_code_point,
+                       UChar32 code_point)
+{
+       /* This function treats the following as unsafe, lookalike characters:
+        * any non-printable character, any character considered as whitespace,
+        * any ignorable character, and emoji characters related to locks.
+        *
+        *  We also considered the characters in Mozilla's list of characters 
<http://kb.mozillazine.org/Network.IDN.blacklist_chars>.
+        *
+        * Some of the characters here will never appear once ICU has encoded.
+        * For example, ICU transforms most spaces into an ASCII space and most
+        * slashes into an ASCII solidus. But one of the two callers uses this
+        * on characters that have not been processed by ICU, so they are needed here.
+        */
+
+       if (!u_isprint (code_point) || u_isUWhiteSpace (code_point) || u_hasBinaryProperty (code_point, 
UCHAR_DEFAULT_IGNORABLE_CODE_POINT))
+               return TRUE;
+
+       switch (code_point) {
+       case 0x00BC: /* VULGAR FRACTION ONE QUARTER */
+       case 0x00BD: /* VULGAR FRACTION ONE HALF */
+       case 0x00BE: /* VULGAR FRACTION THREE QUARTERS */
+       case 0x00ED: /* LATIN SMALL LETTER I WITH ACUTE */
+       /* 0x0131 LATIN SMALL LETTER DOTLESS I is intentionally not considered a lookalike character because 
it is visually distinguishable from i and it has legitimate use in the Turkish language. */
+       case 0x01C0: /* LATIN LETTER DENTAL CLICK */
+       case 0x01C3: /* LATIN LETTER RETROFLEX CLICK */
+       case 0x0237: /* LATIN SMALL LETTER DOTLESS J */
+       case 0x0251: /* LATIN SMALL LETTER ALPHA */
+       case 0x0261: /* LATIN SMALL LETTER SCRIPT G */
+       case 0x0274: /* LATIN LETTER SMALL CAPITAL N */
+       case 0x027E: /* LATIN SMALL LETTER R WITH FISHHOOK */
+       case 0x02D0: /* MODIFIER LETTER TRIANGULAR COLON */
+       case 0x0335: /* COMBINING SHORT STROKE OVERLAY */
+       case 0x0337: /* COMBINING SHORT SOLIDUS OVERLAY */
+       case 0x0338: /* COMBINING LONG SOLIDUS OVERLAY */
+       case 0x0589: /* ARMENIAN FULL STOP */
+       case 0x05B4: /* HEBREW POINT HIRIQ */
+       case 0x05BC: /* HEBREW POINT DAGESH OR MAPIQ */
+       case 0x05C3: /* HEBREW PUNCTUATION SOF PASUQ */
+       case 0x05F4: /* HEBREW PUNCTUATION GERSHAYIM */
+       case 0x0609: /* ARABIC-INDIC PER MILLE SIGN */
+       case 0x060A: /* ARABIC-INDIC PER TEN THOUSAND SIGN */
+       case 0x0650: /* ARABIC KASRA */
+       case 0x0660: /* ARABIC INDIC DIGIT ZERO */
+       case 0x066A: /* ARABIC PERCENT SIGN */
+       case 0x06D4: /* ARABIC FULL STOP */
+       case 0x06F0: /* EXTENDED ARABIC INDIC DIGIT ZERO */
+       case 0x0701: /* SYRIAC SUPRALINEAR FULL STOP */
+       case 0x0702: /* SYRIAC SUBLINEAR FULL STOP */
+       case 0x0703: /* SYRIAC SUPRALINEAR COLON */
+       case 0x0704: /* SYRIAC SUBLINEAR COLON */
+       case 0x1735: /* PHILIPPINE SINGLE PUNCTUATION */
+       case 0x1D04: /* LATIN LETTER SMALL CAPITAL C */
+       case 0x1D0F: /* LATIN LETTER SMALL CAPITAL O */
+       case 0x1D1C: /* LATIN LETTER SMALL CAPITAL U */
+       case 0x1D20: /* LATIN LETTER SMALL CAPITAL V */
+       case 0x1D21: /* LATIN LETTER SMALL CAPITAL W */
+       case 0x1D22: /* LATIN LETTER SMALL CAPITAL Z */
+       case 0x1ECD: /* LATIN SMALL LETTER O WITH DOT BELOW */
+       case 0x2010: /* HYPHEN */
+       case 0x2011: /* NON-BREAKING HYPHEN */
+       case 0x2024: /* ONE DOT LEADER */
+       case 0x2027: /* HYPHENATION POINT */
+       case 0x2039: /* SINGLE LEFT-POINTING ANGLE QUOTATION MARK */
+       case 0x203A: /* SINGLE RIGHT-POINTING ANGLE QUOTATION MARK */
+       case 0x2041: /* CARET INSERTION POINT */
+       case 0x2044: /* FRACTION SLASH */
+       case 0x2052: /* COMMERCIAL MINUS SIGN */
+       case 0x2153: /* VULGAR FRACTION ONE THIRD */
+       case 0x2154: /* VULGAR FRACTION TWO THIRDS */
+       case 0x2155: /* VULGAR FRACTION ONE FIFTH */
+       case 0x2156: /* VULGAR FRACTION TWO FIFTHS */
+       case 0x2157: /* VULGAR FRACTION THREE FIFTHS */
+       case 0x2158: /* VULGAR FRACTION FOUR FIFTHS */
+       case 0x2159: /* VULGAR FRACTION ONE SIXTH */
+       case 0x215A: /* VULGAR FRACTION FIVE SIXTHS */
+       case 0x215B: /* VULGAR FRACTION ONE EIGHT */
+       case 0x215C: /* VULGAR FRACTION THREE EIGHTHS */
+       case 0x215D: /* VULGAR FRACTION FIVE EIGHTHS */
+       case 0x215E: /* VULGAR FRACTION SEVEN EIGHTHS */
+       case 0x215F: /* FRACTION NUMERATOR ONE */
+       case 0x2212: /* MINUS SIGN */
+       case 0x2215: /* DIVISION SLASH */
+       case 0x2216: /* SET MINUS */
+       case 0x2236: /* RATIO */
+       case 0x233F: /* APL FUNCTIONAL SYMBOL SLASH BAR */
+       case 0x23AE: /* INTEGRAL EXTENSION */
+       case 0x244A: /* OCR DOUBLE BACKSLASH */
+       case 0x2571: /* BOX DRAWINGS LIGHT DIAGONAL UPPER RIGHT TO LOWER LEFT */
+       case 0x2572: /* BOX DRAWINGS LIGHT DIAGONAL UPPER LEFT TO LOWER RIGHT */
+       case 0x29F6: /* SOLIDUS WITH OVERBAR */
+       case 0x29F8: /* BIG SOLIDUS */
+       case 0x2AFB: /* TRIPLE SOLIDUS BINARY RELATION */
+       case 0x2AFD: /* DOUBLE SOLIDUS OPERATOR */
+       case 0x2FF0: /* IDEOGRAPHIC DESCRIPTION CHARACTER LEFT TO RIGHT */
+       case 0x2FF1: /* IDEOGRAPHIC DESCRIPTION CHARACTER ABOVE TO BELOW */
+       case 0x2FF2: /* IDEOGRAPHIC DESCRIPTION CHARACTER LEFT TO MIDDLE AND RIGHT */
+       case 0x2FF3: /* IDEOGRAPHIC DESCRIPTION CHARACTER ABOVE TO MIDDLE AND BELOW */
+       case 0x2FF4: /* IDEOGRAPHIC DESCRIPTION CHARACTER FULL SURROUND */
+       case 0x2FF5: /* IDEOGRAPHIC DESCRIPTION CHARACTER SURROUND FROM ABOVE */
+       case 0x2FF6: /* IDEOGRAPHIC DESCRIPTION CHARACTER SURROUND FROM BELOW */
+       case 0x2FF7: /* IDEOGRAPHIC DESCRIPTION CHARACTER SURROUND FROM LEFT */
+       case 0x2FF8: /* IDEOGRAPHIC DESCRIPTION CHARACTER SURROUND FROM UPPER LEFT */
+       case 0x2FF9: /* IDEOGRAPHIC DESCRIPTION CHARACTER SURROUND FROM UPPER RIGHT */
+       case 0x2FFA: /* IDEOGRAPHIC DESCRIPTION CHARACTER SURROUND FROM LOWER LEFT */
+       case 0x2FFB: /* IDEOGRAPHIC DESCRIPTION CHARACTER OVERLAID */
+       case 0x3002: /* IDEOGRAPHIC FULL STOP */
+       case 0x3008: /* LEFT ANGLE BRACKET */
+       case 0x3014: /* LEFT TORTOISE SHELL BRACKET */
+       case 0x3015: /* RIGHT TORTOISE SHELL BRACKET */
+       case 0x3033: /* VERTICAL KANA REPEAT MARK UPPER HALF */
+       case 0x3035: /* VERTICAL KANA REPEAT MARK LOWER HALF */
+       case 0x321D: /* PARENTHESIZED KOREAN CHARACTER OJEON */
+       case 0x321E: /* PARENTHESIZED KOREAN CHARACTER O HU */
+       case 0x33AE: /* SQUARE RAD OVER S */
+       case 0x33AF: /* SQUARE RAD OVER S SQUARED */
+       case 0x33C6: /* SQUARE C OVER KG */
+       case 0x33DF: /* SQUARE A OVER M */
+       case 0x05B9: /* HEBREW POINT HOLAM */
+       case 0x05BA: /* HEBREW POINT HOLAM HASER FOR VAV */
+       case 0x05C1: /* HEBREW POINT SHIN DOT */
+       case 0x05C2: /* HEBREW POINT SIN DOT */
+       case 0x05C4: /* HEBREW MARK UPPER DOT */
+       case 0xA731: /* LATIN LETTER SMALL CAPITAL S */
+       case 0xA771: /* LATIN SMALL LETTER DUM */
+       case 0xA789: /* MODIFIER LETTER COLON */
+       case 0xFE14: /* PRESENTATION FORM FOR VERTICAL SEMICOLON */
+       case 0xFE15: /* PRESENTATION FORM FOR VERTICAL EXCLAMATION MARK */
+       case 0xFE3F: /* PRESENTATION FORM FOR VERTICAL LEFT ANGLE BRACKET */
+       case 0xFE5D: /* SMALL LEFT TORTOISE SHELL BRACKET */
+       case 0xFE5E: /* SMALL RIGHT TORTOISE SHELL BRACKET */
+       case 0xFF0E: /* FULLWIDTH FULL STOP */
+       case 0xFF0F: /* FULL WIDTH SOLIDUS */
+       case 0xFF61: /* HALFWIDTH IDEOGRAPHIC FULL STOP */
+       case 0xFFFC: /* OBJECT REPLACEMENT CHARACTER */
+       case 0xFFFD: /* REPLACEMENT CHARACTER */
+       case 0x1F50F: /* LOCK WITH INK PEN */
+       case 0x1F510: /* CLOSED LOCK WITH KEY */
+       case 0x1F511: /* KEY */
+       case 0x1F512: /* LOCK */
+       case 0x1F513: /* OPEN LOCK */
+               return TRUE;
+       case 0x0307: /* COMBINING DOT ABOVE */
+               return previous_code_point == 0x0237 || /* LATIN SMALL LETTER DOTLESS J */
+                       previous_code_point == 0x0131 || /* LATIN SMALL LETTER DOTLESS I */
+                       previous_code_point == 0x05D5; /* HEBREW LETTER VAV */
+       case '.':
+               return FALSE;
+       default:
+               return is_lookalike_sequence (USCRIPT_ARMENIAN, previous_code_point, code_point) ||
+                       is_lookalike_sequence (USCRIPT_TAMIL, previous_code_point, code_point);
+       }
+}
+
+static gboolean
+all_characters_in_allowed_idn_script_list (const UChar *buffer,
+                                          gint32 length)
+{
+       gint32 ii = 0;
+       UChar32 previous_code_point = 0;
+
+       while (ii < length) {
+               UChar32 cc;
+               UErrorCode error;
+               UScriptCode script;
+               guint32 index, mask;
+
+               U16_NEXT (buffer, ii, length, cc);
+               error = U_ZERO_ERROR;
+               script = uscript_getScript (cc, &error);
+               if (error != U_ZERO_ERROR) {
+                       return FALSE;
+               }
+               if (script < 0) {
+                       return FALSE;
+               }
+               if (script >= SCRIPT_CODE_LIMIT)
+                       return FALSE;
+
+               index = script / 32;
+               mask = 1 << (script % 32);
+
+               if (!(allowed_idn_script_bits[index] & mask))
+                       return FALSE;
+
+               if (is_lookalike_character (previous_code_point, cc))
+                       return FALSE;
+
+               previous_code_point = cc;
+       }
+
+       return TRUE;
+}
+
+static gboolean
+is_second_level_domain_name_allowed_by_tld_rules (const UChar *buffer,
+                                                 gint32 length,
+                                                 gboolean (* character_is_allowed) (UChar ch))
+{
+       gint32 ii;
+
+       g_return_val_if_fail (length > 0, FALSE);
+
+       for (ii = length - 1; ii >= 0; ii--) {
+               UChar ch = buffer[ii];
+
+               if (character_is_allowed (ch))
+                       continue;
+
+               /* Only check the second level domain. Lower level registrars may have different rules. */
+               if (ch == '.')
+                       break;
+
+               return FALSE;
+       }
+
+       return TRUE;
+}
+
+static gboolean
+check_rules_if_suffix_matches (const UChar *buffer,
+                              gint length,
+                              const UChar suffix[],
+                              guint n_suffix,
+                              guint sizeof_suffix,
+                              gboolean (* func) (const UChar ch),
+                              gboolean *out_result)
+{
+        if (length > n_suffix && !memcmp (buffer + length - n_suffix, suffix, sizeof_suffix)) {
+               *out_result = is_second_level_domain_name_allowed_by_tld_rules (buffer, length - n_suffix, 
func);
+               return TRUE;
+       }
+
+       return FALSE;
+}
+
+static gboolean
+is_russian_domain_name_character (const UChar ch)
+{
+       /* Only modern Russian letters, digits and dashes are allowed. */
+       return (ch >= 0x0430 && ch <= 0x044f) || ch == 0x0451 || g_ascii_isdigit (ch) || ch == '-';
+}
+
+static gboolean
+is_russian_and_byelorussian_domain_name_character (const UChar ch)
+{
+       /* Russian and Byelorussian letters, digits and dashes are allowed. */
+       return (ch >= 0x0430 && ch <= 0x044f) || ch == 0x0451 || ch == 0x0456 || ch == 0x045E || ch == 0x2019 
|| g_ascii_isdigit (ch) || ch == '-';
+}
+
+static gboolean
+is_kazakh_domain_name_character (const UChar ch)
+{
+       /* Kazakh letters, digits and dashes are allowed. */
+       return (ch >= 0x0430 && ch <= 0x044f) || ch == 0x0451 || ch == 0x04D9 || ch == 0x0493 || ch == 0x049B 
|| ch == 0x04A3 ||
+               ch == 0x04E9 || ch == 0x04B1 || ch == 0x04AF || ch == 0x04BB || ch == 0x0456 || 
g_ascii_isdigit (ch) || ch == '-';
+}
+
+static gboolean
+is_russian_and_ukrainian_domain_name_character (const UChar ch)
+{
+       /* Russian and Ukrainian letters, digits and dashes are allowed. */
+       return (ch >= 0x0430 && ch <= 0x044f) || ch == 0x0451 || ch == 0x0491 || ch == 0x0404 || ch == 0x0456 
|| ch == 0x0457 || g_ascii_isdigit (ch) || ch == '-';
+}
+
+static gboolean
+is_serbian_domain_name_character (const UChar ch)
+{
+       /* Serbian letters, digits and dashes are allowed. */
+       return (ch >= 0x0430 && ch <= 0x0438) || (ch >= 0x043A && ch <= 0x0448) || ch == 0x0452 || ch == 
0x0458 || ch == 0x0459 ||
+               ch == 0x045A || ch == 0x045B || ch == 0x045F || g_ascii_isdigit (ch) || ch == '-';
+}
+
+static gboolean
+is_macedonian_domain_name_character (const UChar ch)
+{
+       /* Macedonian letters, digits and dashes are allowed. */
+       return (ch >= 0x0430 && ch <= 0x0438) || (ch >= 0x043A && ch <= 0x0448) || ch == 0x0453 || ch == 
0x0455 || ch == 0x0458 ||
+               ch == 0x0459 || ch == 0x045A || ch == 0x045C || ch == 0x045F || g_ascii_isdigit (ch) || ch == 
'-';
+}
+
+static gboolean
+is_mongolian_domain_name_character (const UChar ch)
+{
+       /* Mongolian letters, digits and dashes are allowed. */
+       return (ch >= 0x0430 && ch <= 0x044f) || ch == 0x0451 || ch == 0x04E9 || ch == 0x04AF || 
g_ascii_isdigit (ch) || ch == '-';
+}
+
+static gboolean
+is_bulgarian_domain_name_character (const UChar ch)
+{
+       /* Bulgarian letters, digits and dashes are allowed. */
+       return (ch >= 0x0430 && ch <= 0x044A) || ch == 0x044C || (ch >= 0x044E && ch <= 0x0450) || ch == 
0x045D || g_ascii_isdigit (ch) || ch == '-';
+}
+
+static gboolean
+all_characters_allowed_by_tld_rules (const UChar *buffer,
+                                    gint32 length)
+{
+       /* Skip trailing dot for root domain. */
+       if (buffer[length - 1] == '.')
+               length--;
+
+       #define CHECK_RULES_IF_SUFFIX_MATCHES(suffix, func)  G_STMT_START { \
+               gboolean result = FALSE; \
+               if (check_rules_if_suffix_matches (buffer, length, suffix, G_N_ELEMENTS (suffix), sizeof 
(suffix), func, &result)) \
+                       return result; \
+               } G_STMT_END
+
+       {
+       /* http://cctld.ru/files/pdf/docs/rules_ru-rf.pdf */
+       static const UChar cyrillic_RF[] = {
+               '.',
+               0x0440, /* CYRILLIC SMALL LETTER ER */
+               0x0444, /* CYRILLIC SMALL LETTER EF */
+               };
+       CHECK_RULES_IF_SUFFIX_MATCHES (cyrillic_RF, is_russian_domain_name_character);
+       }
+
+       {
+       /* http://rusnames.ru/rules.pl */
+       static const UChar cyrillic_RUS[] = {
+               '.',
+               0x0440, /* CYRILLIC SMALL LETTER ER */
+               0x0443, /* CYRILLIC SMALL LETTER U */
+               0x0441, /* CYRILLIC SMALL LETTER ES */
+               };
+       CHECK_RULES_IF_SUFFIX_MATCHES (cyrillic_RUS, is_russian_domain_name_character);
+       }
+
+       {
+       /* http://ru.faitid.org/projects/moscow/documents/moskva/idn */
+       static const UChar cyrillic_MOSKVA[] = {
+               '.',
+               0x043C, /* CYRILLIC SMALL LETTER EM */
+               0x043E, /* CYRILLIC SMALL LETTER O */
+               0x0441, /* CYRILLIC SMALL LETTER ES */
+               0x043A, /* CYRILLIC SMALL LETTER KA */
+               0x0432, /* CYRILLIC SMALL LETTER VE */
+               0x0430, /* CYRILLIC SMALL LETTER A */
+               };
+       CHECK_RULES_IF_SUFFIX_MATCHES (cyrillic_MOSKVA, is_russian_domain_name_character);
+       }
+
+       {
+       /* http://www.dotdeti.ru/foruser/docs/regrules.php */
+       static const UChar cyrillic_DETI[] = {
+               '.',
+               0x0434, /* CYRILLIC SMALL LETTER DE */
+               0x0435, /* CYRILLIC SMALL LETTER IE */
+               0x0442, /* CYRILLIC SMALL LETTER TE */
+               0x0438, /* CYRILLIC SMALL LETTER I */
+               };
+       CHECK_RULES_IF_SUFFIX_MATCHES (cyrillic_DETI, is_russian_domain_name_character);
+       }
+
+       {
+       /* http://corenic.org - rules not published. The word is Russian, so only allowing Russian at this 
time,
+          although we may need to revise the checks if this ends up being used with other languages spoken 
in Russia. */
+       static const UChar cyrillic_ONLAYN[] = {
+               '.',
+               0x043E, /* CYRILLIC SMALL LETTER O */
+               0x043D, /* CYRILLIC SMALL LETTER EN */
+               0x043B, /* CYRILLIC SMALL LETTER EL */
+               0x0430, /* CYRILLIC SMALL LETTER A */
+               0x0439, /* CYRILLIC SMALL LETTER SHORT I */
+               0x043D, /* CYRILLIC SMALL LETTER EN */
+               };
+       CHECK_RULES_IF_SUFFIX_MATCHES (cyrillic_ONLAYN, is_russian_domain_name_character);
+       }
+
+       {
+       /* http://corenic.org - same as above. */
+       static const UChar cyrillic_SAYT[] = {
+               '.',
+               0x0441, /* CYRILLIC SMALL LETTER ES */
+               0x0430, /* CYRILLIC SMALL LETTER A */
+               0x0439, /* CYRILLIC SMALL LETTER SHORT I */
+               0x0442, /* CYRILLIC SMALL LETTER TE */
+               };
+       CHECK_RULES_IF_SUFFIX_MATCHES (cyrillic_SAYT, is_russian_domain_name_character);
+       }
+
+       {
+       /* http://pir.org/products/opr-domain/ - rules not published. According to the registry site,
+          the intended audience is "Russian and other Slavic-speaking markets".
+          Chrome appears to only allow Russian, so sticking with that for now. */
+       static const UChar cyrillic_ORG[] = {
+               '.',
+               0x043E, /* CYRILLIC SMALL LETTER O */
+               0x0440, /* CYRILLIC SMALL LETTER ER */
+               0x0433, /* CYRILLIC SMALL LETTER GHE */
+               };
+       CHECK_RULES_IF_SUFFIX_MATCHES (cyrillic_ORG, is_russian_domain_name_character);
+       }
+
+       {
+       /* http://cctld.by/rules.html */
+       static const UChar cyrillic_BEL[] = {
+               '.',
+               0x0431, /* CYRILLIC SMALL LETTER BE */
+               0x0435, /* CYRILLIC SMALL LETTER IE */
+               0x043B, /* CYRILLIC SMALL LETTER EL */
+               };
+       CHECK_RULES_IF_SUFFIX_MATCHES (cyrillic_BEL, is_russian_and_byelorussian_domain_name_character);
+       }
+
+       {
+       /* http://www.nic.kz/docs/poryadok_vnedreniya_kaz_ru.pdf */
+       static const UChar cyrillic_KAZ[] = {
+               '.',
+               0x049B, /* CYRILLIC SMALL LETTER KA WITH DESCENDER */
+               0x0430, /* CYRILLIC SMALL LETTER A */
+               0x0437, /* CYRILLIC SMALL LETTER ZE */
+               };
+       CHECK_RULES_IF_SUFFIX_MATCHES (cyrillic_KAZ, is_kazakh_domain_name_character);
+       }
+
+       {
+       /* http://uanic.net/docs/documents-ukr/Rules%20of%20UKR_v4.0.pdf */
+       static const UChar cyrillic_UKR[] = {
+               '.',
+               0x0443, /* CYRILLIC SMALL LETTER U */
+               0x043A, /* CYRILLIC SMALL LETTER KA */
+               0x0440, /* CYRILLIC SMALL LETTER ER */
+               };
+       CHECK_RULES_IF_SUFFIX_MATCHES (cyrillic_UKR, is_russian_and_ukrainian_domain_name_character);
+       }
+
+       {
+       /* http://www.rnids.rs/data/DOKUMENTI/idn-srb-policy-termsofuse-v1.4-eng.pdf */
+       static const UChar cyrillic_SRB[] = {
+               '.',
+               0x0441, /* CYRILLIC SMALL LETTER ES */
+               0x0440, /* CYRILLIC SMALL LETTER ER */
+               0x0431, /* CYRILLIC SMALL LETTER BE */
+               };
+       CHECK_RULES_IF_SUFFIX_MATCHES (cyrillic_SRB, is_serbian_domain_name_character);
+       }
+
+       {
+       /* http://marnet.mk/doc/pravilnik-mk-mkd.pdf */
+       static const UChar cyrillic_MKD[] = {
+               '.',
+               0x043C, /* CYRILLIC SMALL LETTER EM */
+               0x043A, /* CYRILLIC SMALL LETTER KA */
+               0x0434, /* CYRILLIC SMALL LETTER DE */
+               };
+       CHECK_RULES_IF_SUFFIX_MATCHES (cyrillic_MKD, is_macedonian_domain_name_character);
+       }
+
+       {
+       /* https://www.mon.mn/cs/ */
+       static const UChar cyrillic_MON[] = {
+               '.',
+               0x043C, /* CYRILLIC SMALL LETTER EM */
+               0x043E, /* CYRILLIC SMALL LETTER O */
+               0x043D, /* CYRILLIC SMALL LETTER EN */
+               };
+       CHECK_RULES_IF_SUFFIX_MATCHES (cyrillic_MON, is_mongolian_domain_name_character);
+       }
+
+       {
+       /* https://www.icann.org/sites/default/files/packages/lgr/lgr-second-level-bulgarian-30aug16-en.html 
*/
+       static const UChar cyrillic_BG[] = {
+               '.',
+               0x0431, /* CYRILLIC SMALL LETTER BE */
+               0x0433 /* CYRILLIC SMALL LETTER GHE */
+               };
+       CHECK_RULES_IF_SUFFIX_MATCHES (cyrillic_BG, is_bulgarian_domain_name_character);
+       }
+
+    /* Not a known top level domain with special rules. */
+    return FALSE;
+}
+
+/**
+ * camel_hostname_utils_requires_ascii:
+ * @hostname: a host name
+ *
+ * Check whether the @hostname requires conversion to ASCII. That can
+ * be when a character in it can look like an ASCII character, even
+ * it being a Unicode letter. This can be used to display host names
+ * in a way of invulnerable to IDN homograph attacks.
+ *
+ * Returns: %TRUE, when the @hostname should be converted to an ASCII equivalent,
+ *    %FALSE, when it can be shown as is.
+ *
+ * Since: 3.44
+ **/
+gboolean
+camel_hostname_utils_requires_ascii (const gchar *hostname)
+{
+       static GOnce initialized = G_ONCE_INIT;
+       UErrorCode uerror = U_ZERO_ERROR;
+       int32_t uhost_len = 0;
+       gboolean needs_conversion = FALSE;
+
+       if (camel_string_is_all_ascii (hostname))
+               return FALSE;
+
+       g_once (&initialized, camel_hostname_utils_init_global_memory, NULL);
+
+       u_strFromUTF8 (NULL, 0, &uhost_len, hostname, -1, &uerror);
+       if (uhost_len > 0) {
+               UChar *uhost = g_new0 (UChar, uhost_len + 2);
+
+               uerror = U_ZERO_ERROR;
+               u_strFromUTF8 (uhost, uhost_len + 1, &uhost_len, hostname, -1, &uerror);
+               if (uerror == U_ZERO_ERROR && uhost_len > 0) {
+                       needs_conversion = !all_characters_in_allowed_idn_script_list (uhost, uhost_len) ||
+                                          !all_characters_allowed_by_tld_rules (uhost, uhost_len);
+               } else {
+                       needs_conversion = uerror != U_ZERO_ERROR;
+               }
+
+               g_free (uhost);
+       } else {
+               needs_conversion = TRUE;
+       }
+
+       return needs_conversion;
+}
diff --git a/src/camel/camel-hostname-utils.h b/src/camel/camel-hostname-utils.h
new file mode 100644
index 000000000..dff7eee45
--- /dev/null
+++ b/src/camel/camel-hostname-utils.h
@@ -0,0 +1,33 @@
+/* -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */
+/*
+ * Copyright (C) 2021 Red Hat (www.redhat.com)
+ *
+ * This library is free software: you can redistribute it and/or modify it
+ * under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation.
+ *
+ * This library is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this library. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#if !defined (__CAMEL_H_INSIDE__) && !defined (CAMEL_COMPILATION)
+#error "Only <camel/camel.h> can be included directly."
+#endif
+
+#ifndef CAMEL_HOSTNAME_UTILS_H
+#define CAMEL_HOSTNAME_UTILS_H
+
+#include <glib.h>
+
+G_BEGIN_DECLS
+
+gboolean       camel_hostname_utils_requires_ascii     (const gchar *hostname);
+
+G_END_DECLS
+
+#endif /* CAMEL_HOSTNAME_UTILS_H */
diff --git a/src/camel/camel-internet-address.c b/src/camel/camel-internet-address.c
index 790730738..bfb7e4ab6 100644
--- a/src/camel/camel-internet-address.c
+++ b/src/camel/camel-internet-address.c
@@ -20,6 +20,7 @@
 #include <string.h>
 
 #include "camel-internet-address.h"
+#include "camel-hostname-utils.h"
 #include "camel-mime-utils.h"
 #include "camel-net-utils.h"
 
@@ -452,8 +453,7 @@ camel_internet_address_ensure_ascii_domains (CamelInternetAddress *addr)
 
                        domain = camel_host_idna_to_ascii (a->address + at_pos + 1);
                        if (at_pos >= 0) {
-                               gchar *name = g_strndup (a->address, at_pos);
-                               address = g_strconcat (name, "@", domain, NULL);
+                               address = g_strdup_printf ("%.*s@%s", at_pos, a->address, domain);
                        } else {
                                address = domain;
                                domain = NULL;
@@ -466,6 +466,56 @@ camel_internet_address_ensure_ascii_domains (CamelInternetAddress *addr)
        }
 }
 
+/**
+ * camel_internet_address_sanitize_ascii_domain:
+ * @addr: a #CamelInternetAddress
+ *
+ * Checks the addresses in @addr for any suspicious characters in the domain
+ * name and coverts those domains into their representation. In contrast to
+ * camel_internet_address_ensure_ascii_domains(), this converts the domains
+ * into ASCII only when needed, as returned by camel_hostname_utils_requires_ascii().
+ *
+ * Returns: %TRUE, when converted at least one address
+ *
+ * Since: 3.44
+ **/
+gboolean
+camel_internet_address_sanitize_ascii_domain (CamelInternetAddress *addr)
+{
+       struct _address *a;
+       gboolean did_convert = FALSE;
+       gint ii, len;
+
+       g_return_val_if_fail (CAMEL_IS_INTERNET_ADDRESS (addr), FALSE);
+
+       len = addr->priv->addresses->len;
+       for (ii = 0; ii < len; ii++) {
+               gint at_pos = -1;
+               a = g_ptr_array_index (addr->priv->addresses, ii);
+               if (a->address && !domain_contains_only_ascii (a->address, &at_pos) &&
+                   at_pos >= 0 && at_pos + 1 < strlen (a->address) &&
+                   camel_hostname_utils_requires_ascii (a->address + at_pos + 1)) {
+                       gchar *address, *domain;
+
+                       did_convert = TRUE;
+
+                       domain = camel_host_idna_to_ascii (a->address + at_pos + 1);
+                       if (at_pos >= 0) {
+                               address = g_strdup_printf ("%.*s@%s", at_pos, a->address, domain);
+                       } else {
+                               address = domain;
+                               domain = NULL;
+                       }
+
+                       g_free (domain);
+                       g_free (a->address);
+                       a->address = address;
+               }
+       }
+
+       return did_convert;
+}
+
 /**
  * camel_internet_address_find_address:
  * @addr: a #CamelInternetAddress object
diff --git a/src/camel/camel-internet-address.h b/src/camel/camel-internet-address.h
index 956246025..93996d2b8 100644
--- a/src/camel/camel-internet-address.h
+++ b/src/camel/camel-internet-address.h
@@ -82,6 +82,8 @@ gint          camel_internet_address_find_address
                                                 const gchar **namep);
 void           camel_internet_address_ensure_ascii_domains
                                                (CamelInternetAddress *addr);
+gboolean       camel_internet_address_sanitize_ascii_domain
+                                               (CamelInternetAddress *addr);
 
 /* utility functions, for network/display formatting */
 gchar *                camel_internet_address_encode_address
diff --git a/src/camel/camel-message-info-base.c b/src/camel/camel-message-info-base.c
index 432655b84..76ebefe3b 100644
--- a/src/camel/camel-message-info-base.c
+++ b/src/camel/camel-message-info-base.c
@@ -21,6 +21,7 @@
 
 #include "camel-folder-summary.h"
 #include "camel-message-info.h"
+#include "camel-net-utils.h"
 #include "camel-string-utils.h"
 
 #include "camel-message-info-base.h"
@@ -364,6 +365,28 @@ message_info_base_get_from (const CamelMessageInfo *mi)
        return result;
 }
 
+#define SET_ADDRESS(_member, _value) G_STMT_START { \
+       changed = g_strcmp0 (bmi->_member, _value) != 0; \
+       \
+       if (changed) { \
+               gchar *in_ascii; \
+       \
+               in_ascii = camel_utils_sanitize_ascii_domain_in_address (_value, TRUE); \
+               if (in_ascii) { \
+                       if (g_strcmp0 (in_ascii, bmi->_member) == 0) { \
+                               changed = FALSE; \
+                               g_free (in_ascii); \
+                       } else { \
+                               camel_pstring_free (bmi->_member); \
+                               bmi->_member = camel_pstring_add (in_ascii, TRUE); \
+                       } \
+               } else { \
+                       camel_pstring_free (bmi->_member); \
+                       bmi->_member = camel_pstring_strdup (_value); \
+               } \
+       } \
+       } G_STMT_END
+
 static gboolean
 message_info_base_set_from (CamelMessageInfo *mi,
                            const gchar *from)
@@ -377,12 +400,7 @@ message_info_base_set_from (CamelMessageInfo *mi,
 
        camel_message_info_property_lock (mi);
 
-       changed = g_strcmp0 (bmi->priv->from, from) != 0;
-
-       if (changed) {
-               camel_pstring_free (bmi->priv->from);
-               bmi->priv->from = camel_pstring_strdup (from);
-       }
+       SET_ADDRESS (priv->from, from);
 
        camel_message_info_property_unlock (mi);
 
@@ -419,12 +437,7 @@ message_info_base_set_to (CamelMessageInfo *mi,
 
        camel_message_info_property_lock (mi);
 
-       changed = g_strcmp0 (bmi->priv->to, to) != 0;
-
-       if (changed) {
-               camel_pstring_free (bmi->priv->to);
-               bmi->priv->to = camel_pstring_strdup (to);
-       }
+       SET_ADDRESS (priv->to, to);
 
        camel_message_info_property_unlock (mi);
 
@@ -461,12 +474,7 @@ message_info_base_set_cc (CamelMessageInfo *mi,
 
        camel_message_info_property_lock (mi);
 
-       changed = g_strcmp0 (bmi->priv->cc, cc) != 0;
-
-       if (changed) {
-               camel_pstring_free (bmi->priv->cc);
-               bmi->priv->cc = camel_pstring_strdup (cc);
-       }
+       SET_ADDRESS (priv->cc, cc);
 
        camel_message_info_property_unlock (mi);
 
@@ -503,18 +511,15 @@ message_info_base_set_mlist (CamelMessageInfo *mi,
 
        camel_message_info_property_lock (mi);
 
-       changed = g_strcmp0 (bmi->priv->mlist, mlist) != 0;
-
-       if (changed) {
-               camel_pstring_free (bmi->priv->mlist);
-               bmi->priv->mlist = camel_pstring_strdup (mlist);
-       }
+       SET_ADDRESS (priv->mlist, mlist);
 
        camel_message_info_property_unlock (mi);
 
        return changed;
 }
 
+#undef SET_ADDRESS
+
 static guint32
 message_info_base_get_size (const CamelMessageInfo *mi)
 {
diff --git a/src/camel/camel-mime-filter-tohtml.c b/src/camel/camel-mime-filter-tohtml.c
index 6731635b7..25f216fc7 100644
--- a/src/camel/camel-mime-filter-tohtml.c
+++ b/src/camel/camel-mime-filter-tohtml.c
@@ -23,6 +23,7 @@
 #include <string.h>
 
 #include "camel-mime-filter-tohtml.h"
+#include "camel-net-utils.h"
 #include "camel-url-scanner.h"
 #include "camel-utf8.h"
 
@@ -459,6 +460,9 @@ html_convert (CamelMimeFilter *mime_filter,
 
                        do {
                                if (camel_url_scanner_scan (priv->scanner, start, len - (len > 0 && start[len 
- 1] == 0 ? 1 : 0), &match)) {
+                                       gchar *url_str, *sanitized_url;
+                                       gint prefix_len = strlen (match.prefix), url_len;
+
                                        /* write out anything before the first regex match */
                                        outptr = writeln (
                                                mime_filter,
@@ -471,18 +475,22 @@ html_convert (CamelMimeFilter *mime_filter,
 
                                        matchlen = match.um_eo - match.um_so;
 
+                                       url_str = g_strdup_printf ("%s%.*s", match.prefix, (gint) matchlen, 
start);
+                                       sanitized_url = camel_utils_sanitize_ascii_domain_in_url_str 
(url_str);
+                                       if (sanitized_url) {
+                                               g_free (url_str);
+                                               url_str = sanitized_url;
+                                               sanitized_url = NULL;
+                                       }
+
+                                       url_len = strlen (url_str);
+
                                        /* write out the href tag */
                                        outptr = append_string_verbatim (mime_filter, "<a href=\"", outptr, 
&outend);
-                                       /* prefix shouldn't need escaping, but let's be safe */
-                                       outptr = writeln (
-                                               mime_filter,
-                                               match.prefix,
-                                               match.prefix + strlen (match.prefix),
-                                               outptr, &outend);
                                        outptr = writeln (
                                                mime_filter,
-                                               start,
-                                               start + matchlen,
+                                               url_str,
+                                               url_str + url_len,
                                                outptr, &outend);
                                        outptr = append_string_verbatim (
                                                mime_filter, "\">",
@@ -491,10 +499,10 @@ html_convert (CamelMimeFilter *mime_filter,
                                        /* now write the matched string */
                                        outptr = writeln (
                                                mime_filter,
-                                               start,
-                                               start + matchlen,
+                                               url_str + prefix_len,
+                                               url_str + url_len,
                                                outptr, &outend);
-                                       priv->column += matchlen;
+                                       priv->column += url_len - prefix_len;
                                        start += matchlen;
                                        len -= matchlen;
 
@@ -502,6 +510,8 @@ html_convert (CamelMimeFilter *mime_filter,
                                        outptr = append_string_verbatim (
                                                mime_filter, "</a>",
                                                outptr, &outend);
+
+                                       g_free (url_str);
                                } else {
                                        /* nothing matched so write out the remainder of this line buffer */
                                        outptr = writeln (
diff --git a/src/camel/camel-net-utils.c b/src/camel/camel-net-utils.c
index 93e1409c4..ef37877f1 100644
--- a/src/camel/camel-net-utils.c
+++ b/src/camel/camel-net-utils.c
@@ -40,6 +40,8 @@
 #include "camel-object.h"
 #include "camel-operation.h"
 #include "camel-service.h"
+#include "camel-hostname-utils.h"
+#include "camel-string-utils.h"
 
 #define d(x)
 
@@ -861,3 +863,144 @@ camel_host_idna_to_ascii (const gchar *host)
 
        return ascii;
 }
+
+/**
+ * camel_utils_sanitize_ascii_domain_in_address:
+ * @email_address: an email address as string
+ * @do_format: what format will be returned
+ *
+ * Checks whether the domain in the @email_address requires
+ * conversion to ASCII and if it does it also converts it.
+ * When the @do_format is %TRUE, the output string is formatted
+ * for display, otherwise it's encoded for use in the message
+ * headers. A %NULL is returned when no conversion was needed.
+ *
+ * Returns: (nullable): the @email_address with only ASCII letters,
+ *    if such conversion is needed or %NULL, when no conversion was
+ *    required.
+ *
+ * See: camel_hostname_utils_requires_ascii(), camel_internet_address_sanitize_ascii_domain(),
+ *    camel_utils_sanitize_ascii_domain_in_url_str()
+ *
+ * Since: 3.44
+ **/
+gchar *
+camel_utils_sanitize_ascii_domain_in_address (const gchar *email_address,
+                                             gboolean do_format)
+{
+       CamelInternetAddress *addr;
+       gchar *res = NULL;
+
+       g_return_val_if_fail (email_address != NULL, NULL);
+
+       if (camel_string_is_all_ascii (email_address))
+               return NULL;
+
+       addr = camel_internet_address_new ();
+
+       if (camel_address_decode (CAMEL_ADDRESS (addr), email_address) == -1)
+               camel_address_unformat (CAMEL_ADDRESS (addr), email_address);
+
+       if (camel_internet_address_sanitize_ascii_domain (addr)) {
+               if (do_format)
+                       res = camel_address_format (CAMEL_ADDRESS (addr));
+               else
+                       res = camel_address_encode (CAMEL_ADDRESS (addr));
+       }
+
+       g_clear_object (&addr);
+
+       return res;
+}
+
+/**
+ * camel_utils_sanitize_ascii_domain_in_url_str:
+ * @url_str: a URL as string
+ *
+ * Checks whether the host name of the @url_str requires conversion
+ * to ASCII and converts it if needed. Returns %NULL, when no conversion
+ * was required.
+ *
+ * Returns: (nullable): converted @url_str to ASCII host name, or %NULL, when
+ *    no conversion was needed.
+ *
+ * See: camel_hostname_utils_requires_ascii(), camel_utils_sanitize_ascii_domain_in_url()
+ *
+ * Since: 3.44
+ **/
+gchar *
+camel_utils_sanitize_ascii_domain_in_url_str (const gchar *url_str)
+{
+       CamelURL *url;
+       gchar *res = NULL;
+
+       g_return_val_if_fail (url_str != NULL, NULL);
+
+       if (camel_string_is_all_ascii (url_str))
+               return NULL;
+
+       url = camel_url_new (url_str, NULL);
+       if (!url)
+               return NULL;
+
+       if (camel_utils_sanitize_ascii_domain_in_url (url))
+               res = camel_url_to_string (url, 0);
+
+       camel_url_free (url);
+
+       return res;
+}
+
+/**
+ * camel_utils_sanitize_ascii_domain_in_url:
+ * @url: a #CamelURL
+ *
+ * Checks whether the host name of the @url requires conversion
+ * to ASCII and converts it, if needed.
+ *
+ * Returns: %TRUE, when the conversion was required.
+ *
+ * See: camel_hostname_utils_requires_ascii(), camel_utils_sanitize_ascii_domain_in_url_str()
+ *
+ * Since: 3.44
+ **/
+gboolean
+camel_utils_sanitize_ascii_domain_in_url (CamelURL *url)
+{
+       g_return_val_if_fail (url != NULL, FALSE);
+
+       if (!url->host && url->path && url->protocol && g_ascii_strcasecmp (url->protocol, "mailto") == 0) {
+               const gchar *at_pos = strchr (url->path, '@');
+               gboolean res = FALSE;
+
+               if (at_pos && camel_hostname_utils_requires_ascii (at_pos + 1)) {
+                       gchar *ascii_domain, *tmp;
+
+                       ascii_domain = camel_host_idna_to_ascii (at_pos + 1);
+                       tmp = g_strdup_printf ("%.*s@%s", (gint) (at_pos - url->path), url->path, 
ascii_domain);
+                       g_free (ascii_domain);
+                       g_free (url->path);
+                       url->path = tmp;
+
+                       res = TRUE;
+               } else if (camel_hostname_utils_requires_ascii (url->path)) {
+                       gchar *ascii_path = camel_host_idna_to_ascii (url->path);
+                       g_free (url->path);
+                       url->path = ascii_path;
+
+                       res = TRUE;
+               }
+
+               return res;
+       }
+
+       if (camel_hostname_utils_requires_ascii (url->host)) {
+               gchar *ascii_host = camel_host_idna_to_ascii (url->host);
+               g_free (url->host);
+               url->host = ascii_host;
+
+               return TRUE;
+       }
+
+       return FALSE;
+}
diff --git a/src/camel/camel-net-utils.h b/src/camel/camel-net-utils.h
index 8090263f2..0432c1096 100644
--- a/src/camel/camel-net-utils.h
+++ b/src/camel/camel-net-utils.h
@@ -37,6 +37,9 @@ struct sockaddr;
 struct addrinfo;
 #endif
 
+#include <camel/camel-internet-address.h>
+#include <camel/camel-url.h>
+
 G_BEGIN_DECLS
 
 #ifndef _WIN32
@@ -96,6 +99,14 @@ void         camel_freeaddrinfo              (struct addrinfo *host);
 
 gchar *                camel_host_idna_to_ascii        (const gchar *host);
 
+gchar *                camel_utils_sanitize_ascii_domain_in_address
+                                               (const gchar *email_address,
+                                                gboolean do_format);
+gchar *                camel_utils_sanitize_ascii_domain_in_url_str
+                                               (const gchar *url_str);
+gboolean       camel_utils_sanitize_ascii_domain_in_url
+                                               (CamelURL *url);
+
 G_END_DECLS
 
 #ifdef _WIN32
diff --git a/src/camel/camel-string-utils.c b/src/camel/camel-string-utils.c
index f362c853d..2dfcdf23d 100644
--- a/src/camel/camel-string-utils.c
+++ b/src/camel/camel-string-utils.c
@@ -374,3 +374,28 @@ camel_pstring_dump_stat (void)
 
        g_mutex_unlock (&string_pool_lock);
 }
+
+/**
+ * camel_string_is_all_ascii:
+ * @str: (nullable): a string to check, or %NULL
+ *
+ * Returns: %TRUE, when the @str is %NULL, an empty string or when
+ *    it contains only ASCII letters.
+ *
+ * Since: 3.44
+ **/
+gboolean
+camel_string_is_all_ascii (const gchar *str)
+{
+       gint ii;
+
+       if (!str || !*str)
+               return TRUE;
+
+       for (ii = 0; str[ii]; ii++) {
+               if (str[ii] < 0)
+                       break;
+       }
+
+       return str[ii] == '\0';
+}
diff --git a/src/camel/camel-string-utils.h b/src/camel/camel-string-utils.h
index a66fcdf36..33306e833 100644
--- a/src/camel/camel-string-utils.h
+++ b/src/camel/camel-string-utils.h
@@ -42,6 +42,8 @@ const gchar * camel_pstring_peek (const gchar *string);
 gboolean camel_pstring_contains (const gchar *string);
 void camel_pstring_dump_stat (void);
 
+gboolean       camel_string_is_all_ascii       (const gchar *str);
+
 G_END_DECLS
 
 #endif /* CAMEL_STRING_UTILS_H */
diff --git a/src/camel/camel.h b/src/camel/camel.h
index 0df1baf22..f5685f653 100644
--- a/src/camel/camel.h
+++ b/src/camel/camel.h
@@ -44,6 +44,7 @@
 #include <camel/camel-folder-summary.h>
 #include <camel/camel-folder-thread.h>
 #include <camel/camel-gpg-context.h>
+#include <camel/camel-hostname-utils.h>
 #include <camel/camel-html-parser.h>
 #include <camel/camel-iconv.h>
 #include <camel/camel-index.h>
diff --git a/src/camel/tests/misc/CMakeLists.txt b/src/camel/tests/misc/CMakeLists.txt
index 272925cc3..7aae19694 100644
--- a/src/camel/tests/misc/CMakeLists.txt
+++ b/src/camel/tests/misc/CMakeLists.txt
@@ -1,6 +1,7 @@
 set(TESTS
        test1
        test2
+       test3
        utf7
        split
        rfc2047
diff --git a/src/camel/tests/misc/test3.c b/src/camel/tests/misc/test3.c
new file mode 100644
index 000000000..42eb57743
--- /dev/null
+++ b/src/camel/tests/misc/test3.c
@@ -0,0 +1,132 @@
+/*
+ * This library is free software: you can redistribute it and/or modify it
+ * under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation.
+ *
+ * This library is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this library. If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+#include "evolution-data-server-config.h"
+
+#include <ctype.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "camel-test.h"
+
+static void
+detect_hostname_bad_chars (void)
+{
+       struct _data {
+               const gchar *hostname;
+               gboolean needs_convert;
+       } data[] = {
+               { "example.com", FALSE },
+               { "ex\xd0\xb0" "mple.com", TRUE }
+       };
+       gint ii;
+
+       camel_test_start ("Detect hostname bad chars");
+
+       for (ii = 0; ii < G_N_ELEMENTS (data); ii++) {
+               gboolean needs_convert = camel_hostname_utils_requires_ascii (data[ii].hostname);
+               check_msg (needs_convert == data[ii].needs_convert,
+                       "Failed on [%d] (%s): returns %d, expected %d", ii, data[ii].hostname, needs_convert, 
data[ii].needs_convert);
+       }
+
+       camel_test_end ();
+}
+
+static void
+convert_hostname_bad_chars_email (void)
+{
+       struct _data {
+               const gchar *value;
+               const gchar *fmt_expected;
+               const gchar *enc_expected;
+       } data[] = {
+               { "user example com", NULL, NULL },
+               { "user@ex\xd0\xb0" "mple.com",
+                 "user xn--exmple-4nf com",
+                 "user xn--exmple-4nf com" },
+               { "Žába1 <1st@žába.no.where>",
+                 "Žába1 <1st xn--ba-lia14d no.where>",
+                 "=?iso-8859-2?Q?=AE=E1ba1?= <1st xn--ba-lia14d no.where>" },
+               { "Zaba2 <2nd@zab\xd0\xb0" ".no.where>",
+                 "Zaba2 <2nd xn--zab-8cd no.where>",
+                 "Zaba2 <2nd xn--zab-8cd no.where>" },
+               { "Žába1 <1st@žába.no.where>, Zaba2 <2nd@zab\xd0\xb0" ".no.where>",
+                 "Žába1 <1st xn--ba-lia14d no.where>, Zaba2 <2nd xn--zab-8cd no.where>",
+                 "=?iso-8859-2?Q?=AE=E1ba1?= <1st xn--ba-lia14d no.where>, Zaba2\n\t <2nd xn--zab-8cd 
no.where>" }
+       };
+       gint ii;
+
+       camel_test_start ("Convert hostname bad chars in email");
+
+       for (ii = 0; ii < G_N_ELEMENTS (data); ii++) {
+               gchar *converted = camel_utils_sanitize_ascii_domain_in_address (data[ii].value, TRUE);
+               check_msg (g_strcmp0 (converted, data[ii].fmt_expected) == 0,
+                       "Failed on [%d] (%s): returns '%s', expected formatted '%s'", ii, data[ii].value, 
converted, data[ii].fmt_expected);
+               g_free (converted);
+
+               converted = camel_utils_sanitize_ascii_domain_in_address (data[ii].value, FALSE);
+               check_msg (g_strcmp0 (converted, data[ii].enc_expected) == 0,
+                       "Failed on [%d] (%s): returns '%s', expected encoded '%s'", ii, data[ii].value, 
converted, data[ii].enc_expected);
+               g_free (converted);
+       }
+
+       camel_test_end ();
+}
+
+static void
+convert_hostname_bad_chars_url (void)
+{
+       struct _data {
+               const gchar *value;
+               const gchar *expected;
+       } data[] = {
+               { "mailto:user example com", NULL },
+               { "mailto:user@ex\xd0\xb0"; "mple.com?subject=Tést",
+                 "mailto:user xn--exmple-4nf com?subject=T%c3%a9st" },
+               { "http://žába.no.where/index.html?param1=a&amp;param2=b#fragment";,
+                 "http://xn--ba-lia14d.no.where/index.html?param1=a&amp;param2=b#fragment"; },
+               { "https://1st@žába.no.where/";,
+                 "https://1st xn--ba-lia14d no.where/" },
+               { "ftp://2nd@zab\xd0\xb0"; ".no.where/index.html",
+                 "ftp://2nd xn--zab-8cd no.where/index.html" }
+       };
+       gint ii;
+
+       camel_test_start ("Convert hostname bad chars in URL");
+
+       for (ii = 0; ii < G_N_ELEMENTS (data); ii++) {
+               gchar *converted = camel_utils_sanitize_ascii_domain_in_url_str (data[ii].value);
+               check_msg (g_strcmp0 (converted, data[ii].expected) == 0,
+                       "Failed on [%d] (%s): returns '%s', expected '%s'", ii, data[ii].value, converted, 
data[ii].expected);
+               g_free (converted);
+       }
+
+       camel_test_end ();
+}
+
+gint
+main (gint argc,
+      gchar **argv)
+{
+
+       camel_test_init (argc, argv);
+
+       detect_hostname_bad_chars ();
+       convert_hostname_bad_chars_email ();
+       convert_hostname_bad_chars_url ();
+
+       return 0;
+}


[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]