[totem-pl-parser/wip/hadess/more-encoding: 5/6] plparser: Detect character encoding when UTF-8 validation fails

From: Bastien Nocera <hadess src gnome org>
To: commits-list gnome org
Cc:
Subject: [totem-pl-parser/wip/hadess/more-encoding: 5/6] plparser: Detect character encoding when UTF-8 validation fails
Date: Thu, 4 Mar 2021 14:55:04 +0000 (UTC)

commit 3561c001544c3239180b5133ba54ff9cb989ed64
Author: Bastien Nocera <hadess hadess net>
Date:   Thu Mar 4 15:42:06 2021 +0100

    plparser: Detect character encoding when UTF-8 validation fails
    
    Use uchardet when available to detect the encoding of XML data when the
    declared encoding doesn't match the data passed.

 meson.build               | 19 ++++++++++++++++++-
 meson_options.txt         |  2 ++
 plparse/totem-pl-parser.c | 38 ++++++++++++++++++++++++++++++++++++--
 3 files changed, 56 insertions(+), 3 deletions(-)
---
diff --git a/meson.build b/meson.build
index df547ad..a037581 100644
--- a/meson.build
+++ b/meson.build
@@ -136,6 +136,21 @@ foreach cflag: test_cflags
   endif
 endforeach
 
+# uchardet dependency (optional), controlled by the 'enable-uchardet' option
+enable_uchardet = get_option('enable-uchardet')
+have_uchardet = false  # printed in the configuration summary message
+if enable_uchardet != 'no'
+  uchardet_dep = dependency('uchardet', required: false)  # NOTE(review): not visibly added to any target's dependencies in this patch - confirm it gets linked
+  if enable_uchardet == 'yes' and not uchardet_dep.found()
+    error('uchardet support requested but not available.')
+  endif
+  if uchardet_dep.found()  # defines HAVE_UCHARDET, tested with #ifdef in plparse/totem-pl-parser.c
+    cdata.set('HAVE_UCHARDET', true,
+      description: 'uchardet available in the system')
+    have_uchardet = true
+  endif
+endif
+
 # quvi dependency
 enable_quvi = get_option('enable-quvi')
 have_quvi = false
@@ -238,7 +253,9 @@ message('''
       Quvi video link parsing           : @0@
       ISO detection with libarchive     : @1@
       AmazonAMZ decoding with libgcrypt : @2@
+      uchardet encoding detection       : @3@
 '''.format(have_quvi.to_string('yes', 'no'),
            have_libarchive.to_string('yes', 'no'),
-           have_libgcrypt.to_string('yes', 'no')))
+           have_libgcrypt.to_string('yes', 'no'),
+           have_uchardet.to_string('yes', 'no')))
 
diff --git a/meson_options.txt b/meson_options.txt
index 81a02ba..9026e35 100644
--- a/meson_options.txt
+++ b/meson_options.txt
@@ -4,6 +4,8 @@ option('enable-libarchive', type: 'combo', choices : ['yes', 'no', 'auto'], valu
   description : 'Enable libarchive support.')
 option('enable-libgcrypt', type: 'combo', choices : ['yes', 'no', 'auto'], value : 'auto',
   description : 'Enable libgcrypt support.')
+option('enable-uchardet', type: 'combo', choices : ['yes', 'no', 'auto'], value : 'no',
+  description : 'Enable uchardet support.')  # defaults to 'no', unlike enable-libgcrypt which defaults to 'auto'
 option('enable-gtk-doc', type: 'boolean', value: 'false',
   description : 'Generate the API reference (depends on GTK-Doc)')
 option('introspection', type: 'boolean', value: 'true',
diff --git a/plparse/totem-pl-parser.c b/plparse/totem-pl-parser.c
index e8a6b61..faf89a4 100644
--- a/plparse/totem-pl-parser.c
+++ b/plparse/totem-pl-parser.c
@@ -132,6 +132,9 @@
 
 #ifndef TOTEM_PL_PARSER_MINI
 #include <gobject/gvaluecollector.h>
+#ifdef HAVE_UCHARDET
+#include <uchardet.h>
+#endif
 
 #include "totem-pl-parser.h"
 #include "totemplparser-marshal.h"
@@ -1846,6 +1849,34 @@ totem_pl_parser_cleanup_xml (char *contents)
        }
 }
 
+#ifdef HAVE_UCHARDET
+/* Guess the charset of @text (@len bytes); returns a g_free()able name, or NULL if unknown. */
+static char *
+guess_text_encoding (const char *text,
+                    gsize       len)
+{
+       uchardet_t handle = uchardet_new ();
+       const char *charset;
+       char *encoding = NULL;
+
+       if (uchardet_handle_data (handle, text, len) == 0) {
+               uchardet_data_end (handle);
+               charset = uchardet_get_charset (handle); /* "" when detection failed */
+               if (charset != NULL && *charset != '\0')
+                       encoding = g_strdup (charset); /* copy before the handle is deleted */
+       }
+       uchardet_delete (handle);
+       return encoding;
+}
+#else
+static char *
+guess_text_encoding (const char *text,
+                    gsize       len)
+{
+       return NULL; /* built without uchardet: no detection available */
+}
+#endif /* HAVE_UCHARDET */
+
 xml_node_t *
 totem_pl_parser_parse_xml_relaxed (char *contents,
                                   gsize size)
@@ -1878,8 +1909,11 @@ totem_pl_parser_parse_xml_relaxed (char *contents,
                if (g_utf8_validate (contents, -1, NULL))
                        return doc;
                g_debug ("Document pretended to be in UTF-8 but didn't validate");
-               /* FIXME detect encoding using uchardet */
-               return NULL;
+               g_free (encoding);
+               encoding = guess_text_encoding (contents, size);
+               if (!encoding)
+                       return NULL;
+               /* fall-through with the detected encoding */
        }
 
        xml_parser_free_tree (doc);
[Date Prev][Date Next] [Thread Prev][Thread Next] [Thread Index] [Date Index] [Author Index]