[totem-pl-parser/wip/hadess/more-encoding: 1/4] plparser: Validate UTF-8 before returning it




commit a19de02ba7586b576c42d8f0758eeaef94652d2b
Author: Bastien Nocera <hadess hadess net>
Date:   Thu Mar 4 15:25:04 2021 +0100

    plparser: Validate UTF-8 before returning it
    
    We shouldn't blindly return data as UTF-8 simply because the XML header
    says that it is UTF-8, or because it declares no encoding at all.

 plparse/totem-pl-parser.c | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)
---
diff --git a/plparse/totem-pl-parser.c b/plparse/totem-pl-parser.c
index a1bec2b..9869701 100644
--- a/plparse/totem-pl-parser.c
+++ b/plparse/totem-pl-parser.c
@@ -1874,8 +1874,14 @@ totem_pl_parser_parse_xml_relaxed (char *contents,
                break;
        }
 
-       if (encoding == NULL || g_ascii_strcasecmp (encoding, "UTF-8") == 0)
-               return doc;
+       if (encoding == NULL || g_ascii_strcasecmp (encoding, "UTF-8") == 0) {
+               if (g_utf8_validate (contents, -1, NULL))
+                       return doc;
+               g_debug ("Document %s pretended to be in UTF-8 but didn't validate",
+                        encoding ? "explicitly" : "implicitly");
+               /* FIXME detect encoding using uchardet */
+               return NULL;
+       }
 
        xml_parser_free_tree (doc);
 


[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]