[tepl] File loading: delegate determine_encoding() to FileContent



commit 0b1238796a70f7ac7a6973b3a76816388d53c032
Author: Sébastien Wilmet <swilmet gnome org>
Date:   Fri Oct 20 16:33:14 2017 +0200

    File loading: delegate determine_encoding() to FileContent
    
    That function will become more complex in the future, to have a fallback
    mode in case uchardet fails, and to take into account candidate
    encodings, and also to support invalid chars.

 tepl/tepl-file-content.c |   84 ++++++++++++++++++++++++++++++++++++++++++++++
 tepl/tepl-file-content.h |   14 +++++---
 tepl/tepl-file-loader.c  |   73 +---------------------------------------
 3 files changed, 94 insertions(+), 77 deletions(-)
---
diff --git a/tepl/tepl-file-content.c b/tepl/tepl-file-content.c
index 74efdae..48bc169 100644
--- a/tepl/tepl-file-content.c
+++ b/tepl/tepl-file-content.c
@@ -18,6 +18,8 @@
  */
 
 #include "tepl-file-content.h"
+#include <uchardet.h>
+#include "tepl-encoding.h"
 
 struct _TeplFileContentPrivate
 {
@@ -80,3 +82,85 @@ _tepl_file_content_get_chunks (TeplFileContent *content)
 
        return content->priv->chunks;
 }
+
+static TeplEncoding *
+create_encoding_for_charset (const gchar *charset)
+{
+       TeplEncoding *encoding_for_charset;
+       TeplEncoding *ascii_encoding;
+       TeplEncoding *locale_encoding;
+
+       g_assert (charset != NULL);
+
+       encoding_for_charset = tepl_encoding_new (charset);
+
+       ascii_encoding = tepl_encoding_new ("ASCII"); /* only for the comparison below */
+       locale_encoding = tepl_encoding_new_from_locale (); /* only for the UTF-8 check below */
+
+       /* Promote ASCII to UTF-8 when the locale encoding is UTF-8; any other
+        * charset is returned as-is. The caller owns the returned encoding.
+        *
+        * uchardet returns ASCII if only ASCII chars are present. But since any
+        * UTF-8 char can be inserted in a GtkTextView, it would be annoying for
+        * the user to get an error each time the text becomes UTF-8. Most users
+        * probably expect their files to be UTF-8 if their locale is UTF-8.
+        * The exception is e.g. a project that wants to keep its source code
+        * ASCII-only, but that is presumably a minority of users.
+        *
+        * TODO: have a list of candidate encodings, and if ASCII is before
+        * UTF-8, keep ASCII. This could be configurable if there is a GSetting
+        * for the candidate encodings, with a GUI to configure the list, like
+        * in gedit.
+        */
+       if (tepl_encoding_equals (encoding_for_charset, ascii_encoding) &&
+           tepl_encoding_is_utf8 (locale_encoding))
+       {
+               tepl_encoding_free (encoding_for_charset);
+               encoding_for_charset = tepl_encoding_new_utf8 ();
+       }
+
+       tepl_encoding_free (ascii_encoding);
+       tepl_encoding_free (locale_encoding);
+
+       return encoding_for_charset;
+}
+
+/* Feeds every chunk of @content to uchardet. Returns: (transfer full)
+ * (nullable): the encoding, or %NULL if the encoding detection failed.
+ */
+TeplEncoding *
+_tepl_file_content_determine_encoding (TeplFileContent *content)
+{
+       uchardet_t ud;
+       const gchar *charset;
+       TeplEncoding *encoding = NULL;
+       GList *l;
+
+       g_return_val_if_fail (TEPL_IS_FILE_CONTENT (content), NULL);
+
+       ud = uchardet_new ();
+
+       for (l = content->priv->chunks->head; l != NULL; l = l->next)
+       {
+               GBytes *chunk = l->data;
+
+               g_assert (chunk != NULL);
+               g_assert (g_bytes_get_size (chunk) > 0);
+
+               uchardet_handle_data (ud, /* NOTE(review): return value is not checked */
+                                     g_bytes_get_data (chunk, NULL),
+                                     g_bytes_get_size (chunk));
+       }
+
+       uchardet_data_end (ud); /* all chunks fed: tell the detector there is no more data */
+
+       charset = uchardet_get_charset (ud); /* presumably owned by ud: use before uchardet_delete() */
+       if (charset != NULL && charset[0] != '\0') /* NULL/empty is treated as "detection failed" */
+       {
+               encoding = create_encoding_for_charset (charset);
+       }
+
+       uchardet_delete (ud);
+
+       return encoding;
+}
diff --git a/tepl/tepl-file-content.h b/tepl/tepl-file-content.h
index a9ee7d2..6cdfc9e 100644
--- a/tepl/tepl-file-content.h
+++ b/tepl/tepl-file-content.h
@@ -21,6 +21,7 @@
 #define TEPL_FILE_CONTENT_H
 
 #include <glib-object.h>
+#include "tepl-types.h"
 
 G_BEGIN_DECLS
 
@@ -48,17 +49,20 @@ struct _TeplFileContentClass
 };
 
 G_GNUC_INTERNAL
-GType                  _tepl_file_content_get_type     (void);
+GType                  _tepl_file_content_get_type             (void);
 
 G_GNUC_INTERNAL
-TeplFileContent *      _tepl_file_content_new          (void);
+TeplFileContent *      _tepl_file_content_new                  (void);
 
 G_GNUC_INTERNAL
-void                   _tepl_file_content_add_chunk    (TeplFileContent *content,
-                                                        GBytes          *chunk);
+void                   _tepl_file_content_add_chunk            (TeplFileContent *content,
+                                                                GBytes          *chunk);
 
 G_GNUC_INTERNAL
-GQueue *               _tepl_file_content_get_chunks   (TeplFileContent *content);
+GQueue *               _tepl_file_content_get_chunks           (TeplFileContent *content);
+
+G_GNUC_INTERNAL
+TeplEncoding *         _tepl_file_content_determine_encoding   (TeplFileContent *content);
 
 G_END_DECLS
 
diff --git a/tepl/tepl-file-loader.c b/tepl/tepl-file-loader.c
index 435d609..3f7bd1e 100644
--- a/tepl/tepl-file-loader.c
+++ b/tepl/tepl-file-loader.c
@@ -19,7 +19,6 @@
 
 #include "config.h"
 #include "tepl-file-loader.h"
-#include <uchardet.h>
 #include <glib/gi18n-lib.h>
 #include "tepl-buffer.h"
 #include "tepl-file.h"
@@ -875,94 +874,24 @@ out:
        g_clear_object (&converter);
 }
 
-static TeplEncoding *
-create_encoding_for_charset (const gchar *charset)
-{
-       TeplEncoding *encoding_for_charset;
-       TeplEncoding *ascii_encoding;
-       TeplEncoding *locale_encoding;
-
-       g_assert (charset != NULL);
-
-       encoding_for_charset = tepl_encoding_new (charset);
-
-       ascii_encoding = tepl_encoding_new ("ASCII");
-       locale_encoding = tepl_encoding_new_from_locale ();
-
-       /* ASCII -> UTF-8 if locale is UTF-8.
-        *
-        * uchardet returns ASCII if only ASCII chars are present. But since any
-        * UTF-8 char can be inserted in a GtkTextView, it would be annoying for
-        * the user to have a warning each time the text becomes UTF-8. I think
-        * most users expect their files to be UTF-8 if their locale is UTF-8.
-        * The exception here is for example to keep source code ASCII-only,
-        * maybe some projects prefer that, but I think that's the minority of
-        * users.
-        *
-        * TODO: have a list of candidate encodings, and if ASCII is before
-        * UTF-8, keep ASCII. This could be configurable if there is a GSetting
-        * for the candidate encodings, with a GUI to configure the list, like
-        * in gedit.
-        */
-       if (tepl_encoding_equals (encoding_for_charset, ascii_encoding) &&
-           tepl_encoding_is_utf8 (locale_encoding))
-       {
-               tepl_encoding_free (encoding_for_charset);
-               encoding_for_charset = tepl_encoding_new_utf8 ();
-       }
-
-       tepl_encoding_free (ascii_encoding);
-       tepl_encoding_free (locale_encoding);
-
-       return encoding_for_charset;
-}
-
 static void
 determine_encoding (GTask *task)
 {
        TeplFileLoader *loader;
        TeplFileLoaderPrivate *priv;
        TaskData *task_data;
-       uchardet_t ud;
-       const gchar *charset;
        TeplFileContent *content;
-       GQueue *chunks;
-       GList *l;
 
        loader = g_task_get_source_object (task);
        priv = tepl_file_loader_get_instance_private (loader);
 
        task_data = g_task_get_task_data (task);
 
-       ud = uchardet_new ();
-
        content = _tepl_file_content_loader_get_content (task_data->content_loader);
-       chunks = _tepl_file_content_get_chunks (content);
-
-       for (l = chunks->head; l != NULL; l = l->next)
-       {
-               GBytes *chunk = l->data;
-
-               g_assert (chunk != NULL);
-               g_assert (g_bytes_get_size (chunk) > 0);
-
-               uchardet_handle_data (ud,
-                                     g_bytes_get_data (chunk, NULL),
-                                     g_bytes_get_size (chunk));
-       }
-
-       uchardet_data_end (ud);
 
        /* reset() must have been called before launching the task. */
        g_assert (priv->detected_encoding == NULL);
-
-       charset = uchardet_get_charset (ud);
-       if (charset != NULL && charset[0] != '\0')
-       {
-               priv->detected_encoding = create_encoding_for_charset (charset);
-       }
-
-       uchardet_delete (ud);
+       priv->detected_encoding = _tepl_file_content_determine_encoding (content);
 
        if (priv->detected_encoding == NULL)
        {


[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]