[totem-pl-parser/wip/hadess/more-encoding: 5/6] plparser: Detect character encoding when UTF-8 validation fails
- From: Bastien Nocera <hadess src gnome org>
- To: commits-list gnome org
- Cc:
- Subject: [totem-pl-parser/wip/hadess/more-encoding: 5/6] plparser: Detect character encoding when UTF-8 validation fails
- Date: Thu, 4 Mar 2021 14:55:04 +0000 (UTC)
commit 3561c001544c3239180b5133ba54ff9cb989ed64
Author: Bastien Nocera <hadess hadess net>
Date: Thu Mar 4 15:42:06 2021 +0100
plparser: Detect character encoding when UTF-8 validation fails
Use uchardet when available to detect the encoding of XML data when the
declared encoding doesn't match the data passed.
meson.build | 19 ++++++++++++++++++-
meson_options.txt | 2 ++
plparse/totem-pl-parser.c | 38 ++++++++++++++++++++++++++++++++++++--
3 files changed, 56 insertions(+), 3 deletions(-)
---
diff --git a/meson.build b/meson.build
index df547ad..a037581 100644
--- a/meson.build
+++ b/meson.build
@@ -136,6 +136,21 @@ foreach cflag: test_cflags
endif
endforeach
+# uchardet dependency: optional, gated by the 'enable-uchardet' option ('yes', 'no' or 'auto')
+enable_uchardet = get_option('enable-uchardet')
+have_uchardet = false
+if enable_uchardet != 'no'  # both 'yes' and 'auto' probe for the library
+ uchardet_dep = dependency('uchardet', required: false)  # not required here: a missing library is handled just below
+ if enable_uchardet == 'yes' and not uchardet_dep.found()  # explicitly requested but missing: abort configuration
+ error('uchardet support requested but not available.')
+ endif
+ if uchardet_dep.found()
+ cdata.set('HAVE_UCHARDET', true,
+ description: 'uchardet available in the system')
+ have_uchardet = true  # reported in the configuration summary message
+ endif
+endif
+
# quvi dependency
enable_quvi = get_option('enable-quvi')
have_quvi = false
@@ -238,7 +253,9 @@ message('''
Quvi video link parsing : @0@
ISO detection with libarchive : @1@
AmazonAMZ decoding with libgcrypt : @2@
+ uchardet encoding detection : @3@
'''.format(have_quvi.to_string('yes', 'no'),
have_libarchive.to_string('yes', 'no'),
- have_libgcrypt.to_string('yes', 'no')))
+ have_libgcrypt.to_string('yes', 'no'),
+ have_uchardet.to_string('yes', 'no')))
diff --git a/meson_options.txt b/meson_options.txt
index 81a02ba..9026e35 100644
--- a/meson_options.txt
+++ b/meson_options.txt
@@ -4,6 +4,8 @@ option('enable-libarchive', type: 'combo', choices : ['yes', 'no', 'auto'], valu
description : 'Enable libarchive support.')
option('enable-libgcrypt', type: 'combo', choices : ['yes', 'no', 'auto'], value : 'auto',
description : 'Enable libgcrypt support.')
+option('enable-uchardet', type: 'combo', choices : ['yes', 'no', 'auto'], value : 'no',
+ description : 'Enable uchardet support.')
option('enable-gtk-doc', type: 'boolean', value: 'false',
description : 'Generate the API reference (depends on GTK-Doc)')
option('introspection', type: 'boolean', value: 'true',
diff --git a/plparse/totem-pl-parser.c b/plparse/totem-pl-parser.c
index e8a6b61..faf89a4 100644
--- a/plparse/totem-pl-parser.c
+++ b/plparse/totem-pl-parser.c
@@ -132,6 +132,9 @@
#ifndef TOTEM_PL_PARSER_MINI
#include <gobject/gvaluecollector.h>
+#ifdef HAVE_UCHARDET
+#include <uchardet.h>
+#endif
#include "totem-pl-parser.h"
#include "totemplparser-marshal.h"
@@ -1846,6 +1849,34 @@ totem_pl_parser_cleanup_xml (char *contents)
}
}
+#ifdef HAVE_UCHARDET
+/*
+ * guess_text_encoding:
+ * @text: raw bytes whose character encoding should be detected
+ * @len: number of bytes in @text
+ *
+ * Feeds @text to uchardet and returns the charset name it detected.
+ *
+ * Returns: a newly allocated charset name, to be released with g_free(),
+ * or %NULL when uchardet could not process the data or could not settle
+ * on a charset.
+ */
+static char *
+guess_text_encoding (const char *text,
+		     gsize       len)
+{
+	uchardet_t handle;
+	const char *charset;
+	char *encoding = NULL;
+
+	handle = uchardet_new ();
+	if (uchardet_handle_data (handle, text, len) == 0) {
+		uchardet_data_end (handle);
+		/* uchardet signals a failed detection with an empty string
+		 * rather than NULL; map it to NULL so that callers checking
+		 * for a NULL result do not carry on with "" as the encoding. */
+		charset = uchardet_get_charset (handle);
+		if (charset != NULL && *charset != '\0')
+			encoding = g_strdup (charset);
+	}
+
+	uchardet_delete (handle);
+	return encoding;
+}
+#else
+/*
+ * guess_text_encoding:
+ * @text: unused
+ * @len: unused
+ *
+ * Stub used when built without uchardet: no detection is attempted.
+ *
+ * Returns: always %NULL.
+ */
+static char *
+guess_text_encoding (const char *text,
+		     gsize       len)
+{
+	return NULL;
+}
+#endif /* HAVE_UCHARDET */
+
xml_node_t *
totem_pl_parser_parse_xml_relaxed (char *contents,
gsize size)
@@ -1878,8 +1909,11 @@ totem_pl_parser_parse_xml_relaxed (char *contents,
if (g_utf8_validate (contents, -1, NULL))
return doc;
g_debug ("Document pretended to be in UTF-8 but didn't validate");
- /* FIXME detect encoding using uchardet */
- return NULL;
+ g_free (encoding);
+ encoding = guess_text_encoding (contents, size);
+ if (!encoding)
+ return NULL;
+ /* fall-through with the detected encoding */
}
xml_parser_free_tree (doc);
[Date Prev][Date Next] [Thread Prev][Thread Next] [Thread Index] [Date Index] [Author Index]