[totem-pl-parser/wip/hadess/more-encoding: 4/6] plparser: Validate UTF-8 before returning it




commit 27950578cd1c34c95dcded313030403d9e53ad37
Author: Bastien Nocera <hadess hadess net>
Date:   Thu Mar 4 15:25:04 2021 +0100

    plparser: Validate UTF-8 before returning it
    
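    We shouldn't blindly return data as UTF-8 simply because the XML header
    says that it is UTF-8, or because it declares no encoding at all.
    Validate the contents with g_utf8_validate() first, and return NULL if
    they do not validate.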

 plparse/totem-pl-parser.c | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)
---
diff --git a/plparse/totem-pl-parser.c b/plparse/totem-pl-parser.c
index a1bec2b..e8a6b61 100644
--- a/plparse/totem-pl-parser.c
+++ b/plparse/totem-pl-parser.c
@@ -1874,8 +1874,13 @@ totem_pl_parser_parse_xml_relaxed (char *contents,
                break;
        }
 
-       if (encoding == NULL || g_ascii_strcasecmp (encoding, "UTF-8") == 0)
-               return doc;
+       if (encoding == NULL || g_ascii_strcasecmp (encoding, "UTF-8") == 0) {
+               if (g_utf8_validate (contents, -1, NULL))
+                       return doc;
+               g_debug ("Document pretended to be in UTF-8 but didn't validate");
+               /* FIXME detect encoding using uchardet */
+               return NULL;
+       }
 
        xml_parser_free_tree (doc);
 


[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]