[json-glib] scanner: Clean up the UTF-16 surrogate pairs decoding

From: Emmanuele Bassi <ebassi src gnome org>
To: commits-list gnome org
Cc:
Subject: [json-glib] scanner: Clean up the UTF-16 surrogate pairs decoding
Date: Tue, 18 Mar 2014 18:20:24 +0000 (UTC)
commit 36872776b354d2b4b39f9da8e12dd51e949b6c90
Author: Emmanuele Bassi <ebassi gnome org>
Date:   Sun Mar 2 14:26:44 2014 +0000

    scanner: Clean up the UTF-16 surrogate pairs decoding
    
    We over-assert() our pre-conditions, and the conversion is a bit
    obfuscated. We should use a proper function, and de-obfuscate the
    code so that the intent is clear.

 json-glib/json-scanner.c |   42 ++++++++++++++++++++++++++++++++++++------
 1 files changed, 36 insertions(+), 6 deletions(-)
---
diff --git a/json-glib/json-scanner.c b/json-glib/json-scanner.c
index 21339e1..5d7cb89 100644
--- a/json-glib/json-scanner.c
+++ b/json-glib/json-scanner.c
@@ -577,6 +577,30 @@ json_scanner_get_unichar (JsonScanner *scanner,
   return uchar;
 }
 
+/*
+ * decode_utf16_surrogate_pair:
+ * @units: (array length=2): high surrogate in [0], low surrogate in [1]
+ *
+ * Combines a UTF-16 surrogate pair into the single code point it encodes.
+ * Asserts units[0] is in 0xd800-0xdbff and units[1] is in 0xdc00-0xdfff.
+ *
+ * Returns: 0x10000 plus the 20 bits taken from the two surrogates
+ */
+static inline gunichar
+decode_utf16_surrogate_pair (const gunichar units[2])
+{
+  gunichar ucs;
+
+  g_assert (0xd800 <= units[0] && units[0] <= 0xdbff);
+  g_assert (0xdc00 <= units[1] && units[1] <= 0xdfff);
+
+  ucs = 0x10000; /* base value the combined 20 bits are added to */
+  ucs += (units[0] & 0x3ff) << 10; /* low 10 bits of units[0] become bits 10-19 */
+  ucs += (units[1] & 0x3ff); /* low 10 bits of units[1] become bits 0-9 */
+
+  return ucs;
+}
+
 void
 json_scanner_unexp_token (JsonScanner *scanner,
                           GTokenType   expected_token,
@@ -1113,19 +1137,25 @@ json_scanner_get_token_ll (JsonScanner *scanner,
 
                               ucs = json_scanner_get_unichar (scanner, line_p, position_p);
 
+                              /* resolve UTF-16 surrogates for Unicode characters not in the BMP,
+                                * as per ECMA 404, § 9, "String"
+                                */
                               if (g_unichar_type (ucs) == G_UNICODE_SURROGATE)
                                 {
                                   /* read next surrogate */
-                                  if ('\\' == json_scanner_get_char (scanner, line_p, position_p)
-                                      && 'u' == json_scanner_get_char (scanner, line_p, position_p))
+                                  if ('\\' == json_scanner_get_char (scanner, line_p, position_p) &&
+                                      'u' == json_scanner_get_char (scanner, line_p, position_p))
                                     {
-                                      gunichar ucs_lo = json_scanner_get_unichar (scanner, line_p, position_p);
-                                      g_assert (g_unichar_type (ucs_lo) == G_UNICODE_SURROGATE);
-                                      ucs = (((ucs & 0x3ff) << 10) | (ucs_lo & 0x3ff)) + 0x10000;
+                                      gunichar units[2];
+
+                                      units[0] = ucs;
+                                      units[1] = json_scanner_get_unichar (scanner, line_p, position_p);
+
+                                      ucs = decode_utf16_surrogate_pair (units);
+                                      g_assert (g_unichar_validate (ucs));
                                     }
                                 }
 
-                              g_assert (g_unichar_validate (ucs));
                               gstring = g_string_append_unichar (gstring, ucs);
                             }
                           break;
[Date Prev][Date Next] [Thread Prev][Thread Next] [Thread Index] [Date Index] [Author Index]