[tepl] File loading: delegate determine_encoding() to FileContent
- From: Sébastien Wilmet <swilmet src gnome org>
- To: commits-list gnome org
- Cc:
- Subject: [tepl] File loading: delegate determine_encoding() to FileContent
- Date: Fri, 20 Oct 2017 14:50:16 +0000 (UTC)
commit 0b1238796a70f7ac7a6973b3a76816388d53c032
Author: Sébastien Wilmet <swilmet gnome org>
Date: Fri Oct 20 16:33:14 2017 +0200
File loading: delegate determine_encoding() to FileContent
That function will become more complex in the future, to have a fallback
mode in case uchardet fails, and to take into account candidate
encodings, and also to support invalid chars.
tepl/tepl-file-content.c | 84 ++++++++++++++++++++++++++++++++++++++++++++++
tepl/tepl-file-content.h | 14 +++++---
tepl/tepl-file-loader.c | 73 +---------------------------------------
3 files changed, 94 insertions(+), 77 deletions(-)
---
diff --git a/tepl/tepl-file-content.c b/tepl/tepl-file-content.c
index 74efdae..48bc169 100644
--- a/tepl/tepl-file-content.c
+++ b/tepl/tepl-file-content.c
@@ -18,6 +18,8 @@
*/
#include "tepl-file-content.h"
+#include <uchardet.h>
+#include "tepl-encoding.h"
struct _TeplFileContentPrivate
{
@@ -80,3 +82,85 @@ _tepl_file_content_get_chunks (TeplFileContent *content)
return content->priv->chunks;
}
+
+static TeplEncoding *
+create_encoding_for_charset (const gchar *charset)
+{
+ TeplEncoding *encoding_for_charset;
+ TeplEncoding *ascii_encoding;
+ TeplEncoding *locale_encoding;
+
+ g_assert (charset != NULL);
+
+ encoding_for_charset = tepl_encoding_new (charset);
+
+ ascii_encoding = tepl_encoding_new ("ASCII");
+ locale_encoding = tepl_encoding_new_from_locale ();
+
+ /* ASCII -> UTF-8 if locale is UTF-8.
+ *
+ * uchardet returns ASCII if only ASCII chars are present. But since any
+ * UTF-8 char can be inserted in a GtkTextView, it would be annoying for
+ * the user to have an error each time the text becomes UTF-8. I think
+ * most users expect their files to be UTF-8 if their locale is UTF-8.
+ * The exception here is for example to keep source code ASCII-only,
+ * maybe some projects prefer that, but I think that's the minority of
+ * users.
+ *
+ * TODO: have a list of candidate encodings, and if ASCII is before
+ * UTF-8, keep ASCII. This could be configurable if there is a GSetting
+ * for the candidate encodings, with a GUI to configure the list, like
+ * in gedit.
+ */
+ if (tepl_encoding_equals (encoding_for_charset, ascii_encoding) &&
+ tepl_encoding_is_utf8 (locale_encoding))
+ {
+ tepl_encoding_free (encoding_for_charset);
+ encoding_for_charset = tepl_encoding_new_utf8 ();
+ }
+
+ tepl_encoding_free (ascii_encoding);
+ tepl_encoding_free (locale_encoding);
+
+ return encoding_for_charset;
+}
+
+/* Returns: (transfer full) (nullable): the encoding, or %NULL if the encoding
+ * detection failed.
+ */
+TeplEncoding *
+_tepl_file_content_determine_encoding (TeplFileContent *content)
+{
+ uchardet_t ud;
+ const gchar *charset;
+ TeplEncoding *encoding = NULL;
+ GList *l;
+
+ g_return_val_if_fail (TEPL_IS_FILE_CONTENT (content), NULL);
+
+ ud = uchardet_new ();
+
+ for (l = content->priv->chunks->head; l != NULL; l = l->next)
+ {
+ GBytes *chunk = l->data;
+
+ g_assert (chunk != NULL);
+ g_assert (g_bytes_get_size (chunk) > 0);
+
+ uchardet_handle_data (ud,
+ g_bytes_get_data (chunk, NULL),
+ g_bytes_get_size (chunk));
+ }
+
+ uchardet_data_end (ud);
+
+ charset = uchardet_get_charset (ud);
+ if (charset != NULL && charset[0] != '\0')
+ {
+ encoding = create_encoding_for_charset (charset);
+ }
+
+ uchardet_delete (ud);
+
+ return encoding;
+}
diff --git a/tepl/tepl-file-content.h b/tepl/tepl-file-content.h
index a9ee7d2..6cdfc9e 100644
--- a/tepl/tepl-file-content.h
+++ b/tepl/tepl-file-content.h
@@ -21,6 +21,7 @@
#define TEPL_FILE_CONTENT_H
#include <glib-object.h>
+#include "tepl-types.h"
G_BEGIN_DECLS
@@ -48,17 +49,20 @@ struct _TeplFileContentClass
};
G_GNUC_INTERNAL
-GType _tepl_file_content_get_type (void);
+GType _tepl_file_content_get_type (void);
G_GNUC_INTERNAL
-TeplFileContent * _tepl_file_content_new (void);
+TeplFileContent * _tepl_file_content_new (void);
G_GNUC_INTERNAL
-void _tepl_file_content_add_chunk (TeplFileContent *content,
- GBytes *chunk);
+void _tepl_file_content_add_chunk (TeplFileContent *content,
+ GBytes *chunk);
G_GNUC_INTERNAL
-GQueue * _tepl_file_content_get_chunks (TeplFileContent *content);
+GQueue * _tepl_file_content_get_chunks (TeplFileContent *content);
+
+G_GNUC_INTERNAL
+TeplEncoding * _tepl_file_content_determine_encoding (TeplFileContent *content);
G_END_DECLS
diff --git a/tepl/tepl-file-loader.c b/tepl/tepl-file-loader.c
index 435d609..3f7bd1e 100644
--- a/tepl/tepl-file-loader.c
+++ b/tepl/tepl-file-loader.c
@@ -19,7 +19,6 @@
#include "config.h"
#include "tepl-file-loader.h"
-#include <uchardet.h>
#include <glib/gi18n-lib.h>
#include "tepl-buffer.h"
#include "tepl-file.h"
@@ -875,94 +874,24 @@ out:
g_clear_object (&converter);
}
-static TeplEncoding *
-create_encoding_for_charset (const gchar *charset)
-{
- TeplEncoding *encoding_for_charset;
- TeplEncoding *ascii_encoding;
- TeplEncoding *locale_encoding;
-
- g_assert (charset != NULL);
-
- encoding_for_charset = tepl_encoding_new (charset);
-
- ascii_encoding = tepl_encoding_new ("ASCII");
- locale_encoding = tepl_encoding_new_from_locale ();
-
- /* ASCII -> UTF-8 if locale is UTF-8.
- *
- * uchardet returns ASCII if only ASCII chars are present. But since any
- * UTF-8 char can be inserted in a GtkTextView, it would be annoying for
- * the user to have a warning each time the text becomes UTF-8. I think
- * most users expect their files to be UTF-8 if their locale is UTF-8.
- * The exception here is for example to keep source code ASCII-only,
- * maybe some projects prefer that, but I think that's the minority of
- * users.
- *
- * TODO: have a list of candidate encodings, and if ASCII is before
- * UTF-8, keep ASCII. This could be configurable if there is a GSetting
- * for the candidate encodings, with a GUI to configure the list, like
- * in gedit.
- */
- if (tepl_encoding_equals (encoding_for_charset, ascii_encoding) &&
- tepl_encoding_is_utf8 (locale_encoding))
- {
- tepl_encoding_free (encoding_for_charset);
- encoding_for_charset = tepl_encoding_new_utf8 ();
- }
-
- tepl_encoding_free (ascii_encoding);
- tepl_encoding_free (locale_encoding);
-
- return encoding_for_charset;
-}
-
static void
determine_encoding (GTask *task)
{
TeplFileLoader *loader;
TeplFileLoaderPrivate *priv;
TaskData *task_data;
- uchardet_t ud;
- const gchar *charset;
TeplFileContent *content;
- GQueue *chunks;
- GList *l;
loader = g_task_get_source_object (task);
priv = tepl_file_loader_get_instance_private (loader);
task_data = g_task_get_task_data (task);
- ud = uchardet_new ();
-
content = _tepl_file_content_loader_get_content (task_data->content_loader);
- chunks = _tepl_file_content_get_chunks (content);
-
- for (l = chunks->head; l != NULL; l = l->next)
- {
- GBytes *chunk = l->data;
-
- g_assert (chunk != NULL);
- g_assert (g_bytes_get_size (chunk) > 0);
-
- uchardet_handle_data (ud,
- g_bytes_get_data (chunk, NULL),
- g_bytes_get_size (chunk));
- }
-
- uchardet_data_end (ud);
/* reset() must have been called before launching the task. */
g_assert (priv->detected_encoding == NULL);
-
- charset = uchardet_get_charset (ud);
- if (charset != NULL && charset[0] != '\0')
- {
- priv->detected_encoding = create_encoding_for_charset (charset);
- }
-
- uchardet_delete (ud);
+ priv->detected_encoding = _tepl_file_content_determine_encoding (content);
if (priv->detected_encoding == NULL)
{
[Date Prev][Date Next] [Thread Prev][Thread Next] [Thread Index] [Date Index] [Author Index]