[tepl] File loading: add fallback mode to determine encoding if uchardet fails



commit 4220d5e1eb61532d7d6c5d5341c456827b8393a6
Author: Sébastien Wilmet <swilmet gnome org>
Date:   Fri Oct 20 18:17:43 2017 +0200

    File loading: add fallback mode to determine encoding if uchardet fails

 tepl/tepl-file-content.c |  107 ++++++++++++++++++++++++++++++++++++++++++----
 1 files changed, 98 insertions(+), 9 deletions(-)
---
diff --git a/tepl/tepl-file-content.c b/tepl/tepl-file-content.c
index 5cdfab8..1717f7f 100644
--- a/tepl/tepl-file-content.c
+++ b/tepl/tepl-file-content.c
@@ -20,6 +20,7 @@
 #include "tepl-file-content.h"
 #include <uchardet.h>
 #include "tepl-encoding.h"
+#include "tepl-encoding-converter.h"
 
 struct _TeplFileContentPrivate
 {
@@ -91,7 +92,7 @@ _tepl_file_content_get_chunks (TeplFileContent *content)
 }
 
 static TeplEncoding *
-create_encoding_for_charset (const gchar *charset)
+create_encoding_for_uchardet_charset (const gchar *charset)
 {
        TeplEncoding *encoding_for_charset;
        TeplEncoding *ascii_encoding;
@@ -132,19 +133,14 @@ create_encoding_for_charset (const gchar *charset)
        return encoding_for_charset;
 }
 
-/* Returns: (transfer full) (nullable): the encoding, or %NULL if the encoding
- * detection failed.
- */
-TeplEncoding *
-_tepl_file_content_determine_encoding (TeplFileContent *content)
+static TeplEncoding *
+determine_encoding_with_uchardet (TeplFileContent *content)
 {
        uchardet_t ud;
        const gchar *charset;
        TeplEncoding *encoding = NULL;
        GList *l;
 
-       g_return_val_if_fail (TEPL_IS_FILE_CONTENT (content), NULL);
-
        ud = uchardet_new ();
 
        for (l = content->priv->chunks->head; l != NULL; l = l->next)
@@ -163,10 +159,103 @@ _tepl_file_content_determine_encoding (TeplFileContent *content)
        charset = uchardet_get_charset (ud);
        if (charset != NULL && charset[0] != '\0')
        {
-               encoding = create_encoding_for_charset (charset);
+               encoding = create_encoding_for_uchardet_charset (charset);
        }
 
        uchardet_delete (ud);
 
        return encoding;
 }
+
+/* Checks whether all the chunks of @content can be converted to UTF-8 from
+ * @from_encoding. Returns TRUE only if open, every feed and close succeed.
+ */
+static gboolean
+can_convert_successfully_with_encoding (TeplFileContent *content,
+                                       TeplEncoding    *from_encoding)
+{
+       TeplEncodingConverter *conv = _tepl_encoding_converter_new (-1);
+       const gchar *from_charset = tepl_encoding_get_charset (from_encoding);
+       gboolean ok = FALSE;
+       GList *node;
+
+       if (!_tepl_encoding_converter_open (conv, "UTF-8", from_charset, NULL))
+       {
+               goto cleanup;
+       }
+
+       for (node = content->priv->chunks->head; node != NULL; node = node->next)
+       {
+               GBytes *bytes = node->data;
+               gconstpointer data;
+               gsize size;
+
+               g_assert (chunk_is_valid (bytes));
+
+               data = g_bytes_get_data (bytes, NULL);
+               size = g_bytes_get_size (bytes);
+
+               if (!_tepl_encoding_converter_feed (conv, data, size, NULL))
+               {
+                       goto cleanup;
+               }
+       }
+
+       /* A failure can still be reported when closing the converter. */
+       if (_tepl_encoding_converter_close (conv, NULL))
+       {
+               ok = TRUE;
+       }
+
+cleanup:
+       /* Single exit point: the converter is released on every path. */
+       g_object_unref (conv);
+       return ok;
+}
+
+/* Fallback detection: returns a copy of the first default candidate encoding
+ * with which the whole content converts without error, or NULL if none does.
+ */
+static TeplEncoding *
+determine_encoding_with_fallback_mode (TeplFileContent *content)
+{
+       GSList *candidates = tepl_encoding_get_default_candidates ();
+       TeplEncoding *found = NULL;
+       GSList *node = candidates;
+
+       while (node != NULL)
+       {
+               TeplEncoding *candidate = node->data;
+
+               if (can_convert_successfully_with_encoding (content, candidate))
+               {
+                       found = tepl_encoding_copy (candidate);
+                       break;
+               }
+
+               node = node->next;
+       }
+
+       g_slist_free_full (candidates, (GDestroyNotify)tepl_encoding_free);
+
+       return found;
+}
+
+/* Tries uchardet first, then the fallback mode if uchardet gives nothing.
+ * Returns: (transfer full) (nullable): the encoding, or %NULL if the encoding
+ * detection failed.
+ */
+TeplEncoding *
+_tepl_file_content_determine_encoding (TeplFileContent *content)
+{
+       TeplEncoding *detected;
+
+       g_return_val_if_fail (TEPL_IS_FILE_CONTENT (content), NULL);
+
+       detected = determine_encoding_with_uchardet (content);
+       if (detected != NULL)
+       {
+               return detected;
+       }
+
+       return determine_encoding_with_fallback_mode (content);
+}


[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]