[glibmm] Added ustring::make_valid() which fixes non-UTF8 strings.



commit 0797bf2954177f58b7ac6ebecce7264310481c55
Author: Krzysztof Piecuch <piecuch protonmail com>
Date:   Wed Mar 15 02:13:15 2017 +0100

    Added ustring::make_valid() which fixes non-UTF8 strings.
    
    make_valid() returns a copy in which every invalid UTF-8 byte is
    replaced with the replacement character (U+FFFD). This lets you keep
    working with a ustring after ustring::validate() has told you that it
    is not a valid UTF-8 string and you need to rescue it somehow.
    
    This wraps g_utf8_make_valid().
    
    Bug #780075

 glib/glibmm/ustring.cc                  |    6 +++
 glib/glibmm/ustring.h                   |    6 +++
 tests/Makefile.am                       |    4 ++-
 tests/glibmm_ustring_make_valid/main.cc |   58 +++++++++++++++++++++++++++++++
 4 files changed, 73 insertions(+), 1 deletions(-)
---
diff --git a/glib/glibmm/ustring.cc b/glib/glibmm/ustring.cc
index eae9802..d05e986 100644
--- a/glib/glibmm/ustring.cc
+++ b/glib/glibmm/ustring.cc
@@ -1219,6 +1219,24 @@ ustring::validate(ustring::const_iterator& first_invalid) const
   return (is_valid != 0);
 }
 
+// Return a copy of this string in which the bytes that are not valid UTF-8
+// have been replaced by U+FFFD, as produced by g_utf8_make_valid().
+ustring
+ustring::make_valid() const
+{
+  // g_utf8_make_valid() returns a newly allocated, nul-terminated buffer that
+  // the caller owns. The guard releases it with g_free() on every exit path,
+  // including the case where constructing the ustring throws.
+  struct GFreeGuard
+  {
+    gchar* ptr;
+    ~GFreeGuard() { g_free(ptr); }
+  };
+
+  const GFreeGuard valid = { g_utf8_make_valid(string_.data(), string_.size()) };
+  return ustring(valid.ptr);
+}
+
 bool
 ustring::is_ascii() const
 {
diff --git a/glib/glibmm/ustring.h b/glib/glibmm/ustring.h
index ba6289d..7cd19f0 100644
--- a/glib/glibmm/ustring.h
+++ b/glib/glibmm/ustring.h
@@ -596,6 +596,12 @@ public:
   /*! Check whether the string is valid UTF-8. */
   bool validate(const_iterator& first_invalid) const;
 
+  /*! Return a copy that is a valid UTF-8 string, replacing invalid bytes in the
+   *  original with the Unicode replacement character (U+FFFD).
+   *  If the string is already valid, return a copy of it. Wraps g_utf8_make_valid().
+   */
+  ustring make_valid() const;
+
   /*! Check whether the string is plain 7-bit ASCII. @par
    * Unlike any other ustring method, is_ascii() is safe to use on invalid
    * UTF-8 strings.  If the string isn't valid UTF-8, it cannot be valid
diff --git a/tests/Makefile.am b/tests/Makefile.am
index bbe85ae..4a0d2fe 100644
--- a/tests/Makefile.am
+++ b/tests/Makefile.am
@@ -48,7 +48,8 @@ check_PROGRAMS =                              \
        glibmm_refptr/test              \
        glibmm_refptr_sigc_bind/test            \
        glibmm_weakref/test             \
-       glibmm_bytearray/test
+       glibmm_bytearray/test                   \
+       glibmm_ustring_make_valid/test
 
 TESTS =        $(check_PROGRAMS)
 
@@ -123,3 +124,4 @@ glibmm_refptr_sigc_bind_test_SOURCES     = glibmm_refptr_sigc_bind/main.cc
 glibmm_weakref_test_SOURCES              = glibmm_weakref/main.cc
 glibmm_weakref_test_LDADD                = $(giomm_ldadd)
 glibmm_bytearray_test_SOURCES            = glibmm_bytearray/main.cc
+glibmm_ustring_make_valid_test_SOURCES   = glibmm_ustring_make_valid/main.cc
diff --git a/tests/glibmm_ustring_make_valid/main.cc b/tests/glibmm_ustring_make_valid/main.cc
new file mode 100644
index 0000000..3f94122
--- /dev/null
+++ b/tests/glibmm_ustring_make_valid/main.cc
@@ -0,0 +1,58 @@
+#include <iostream>
+#include <glibmm.h>
+
+int
+main()
+{
+  Glib::init();
+
+  // Input that mixes valid and invalid UTF-8. The ranges in the comments are
+  // byte offsets into the array; the bytes 0xEF 0x80 0x80 together encode one
+  // valid three-byte character.
+  const char not_utf8[] = {
+    '\x80',                 //  0-1:  bad byte
+    '\xef', '\x80', '\x80', //  1-4:  good three bytes (one character)
+    '\xef',                 //  4-5:  bad byte
+    '\x80',                 //  5-6:  bad byte
+    'a',                    //  6-7:  good character
+    '\0',                   //  7-8:  embedded nul, expected to be replaced too
+    'd',                    //  8-9:  good character
+    '\x80',                 //  9-10: bad byte
+    '\xef', '\x80', '\x80', // 10-13: good three bytes (one character)
+    '\xef', '\x80'          // 13-15: two bad bytes
+  };
+
+  // Expected result: the good bytes are kept as they are and each bad byte
+  // is replaced by U+FFFD, which is encoded as 0xEF 0xBF 0xBD.
+  const char fixed_utf8[] = {
+    '\xef', '\xbf', '\xbd', // replaces  0-1
+    '\xef', '\x80', '\x80', // keeps     1-4
+    '\xef', '\xbf', '\xbd', // replaces  4-5
+    '\xef', '\xbf', '\xbd', // replaces  5-6
+    'a',                    // keeps     6-7
+    '\xef', '\xbf', '\xbd', // replaces  7-8
+    'd',                    // keeps     8-9
+    '\xef', '\xbf', '\xbd', // replaces  9-10
+    '\xef', '\x80', '\x80', // keeps    10-13
+    '\xef', '\xbf', '\xbd', // replaces 13-14
+    '\xef', '\xbf', '\xbd'  // replaces 14-15
+  };
+
+  // Construct from the whole byte range, not from a C string, so that the
+  // embedded nul at offset 7 and everything after it is part of the input.
+  const Glib::ustring s(not_utf8, not_utf8 + sizeof not_utf8);
+  g_assert_false(s.validate());
+
+  // make_valid() has to return a repaired copy and leave the source as it was.
+  const Glib::ustring good_one = s.make_valid();
+  g_assert_false(s.validate());       // we make a copy
+  g_assert_true(good_one.validate()); // this one is good!
+
+  // The repaired copy has to match the expected bytes exactly.
+  const Glib::ustring correct_output(fixed_utf8,
+      fixed_utf8 + sizeof fixed_utf8);
+  g_assert_true(correct_output.validate());
+  g_assert_true(correct_output == good_one);
+
+  return EXIT_SUCCESS;
+}


[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]