[libxml2] Document support for the non-standard escape sequences. Support non-BMP code points in surrogate pairs of '\uXXXX\uXXXX'.



commit 37ebf8a8b2789037792cfc0264b814d742cda2d9
Author: Damjan Jovanovic <damjan jov gmail com>
Date:   Mon May 31 07:45:18 2021 +0200

    Document support for the non-standard escape sequences.
    Support non-BMP code points in surrogate pairs of '\uXXXX\uXXXX'.

 xmlregexp.c | 81 ++++++++++++++++++++++++++++++++++++++++++++++++-------------
 1 file changed, 64 insertions(+), 17 deletions(-)
---
diff --git a/xmlregexp.c b/xmlregexp.c
index f1366fd4..f5afa45d 100644
--- a/xmlregexp.c
+++ b/xmlregexp.c
@@ -4907,6 +4907,47 @@ xmlFAParseCharProp(xmlRegParserCtxtPtr ctxt) {
     }
 }
 
+static int parse_escaped_codeunit(xmlRegParserCtxtPtr ctxt)
+{
+    int val = 0, i, cur;
+    for (i = 0; i < 4; i++) {
+       NEXT;
+       val *= 16;
+       cur = CUR;
+       if (cur >= '0' && cur <= '9') {
+           val += cur - '0';
+       } else if (cur >= 'A' && cur <= 'F') {
+           val += cur - 'A' + 10;
+       } else if (cur >= 'a' && cur <= 'f') {
+           val += cur - 'a' + 10;
+       } else {
+           ERROR("Expecting hex digit");
+           return -1;
+       }
+    }
+    return val;
+}
+
+static int parse_escaped_codepoint(xmlRegParserCtxtPtr ctxt)
+{
+    int val = parse_escaped_codeunit(ctxt);
+    if (0xD800 <= val && val <= 0xDBFF) {
+       NEXT;
+       if (CUR == '\\') {
+           NEXT;
+           if (CUR == 'u') {
+               int low = parse_escaped_codeunit(ctxt);
+               if (0xDC00 <= low && low <= 0xDFFF) {
+                   return (val - 0xD800) * 0x400 + (low - 0xDC00) + 0x10000;
+               }
+           }
+       }
+       ERROR("Invalid low surrogate pair code unit");
+       val = -1;
+    }
+    return val;
+}
+
 /**
  * xmlFAParseCharClassEsc:
  * @ctxt:  a regexp parser context
@@ -4969,10 +5010,25 @@ xmlFAParseCharClassEsc(xmlRegParserCtxtPtr ctxt) {
        (cur == '|') || (cur == '.') || (cur == '?') || (cur == '*') ||
        (cur == '+') || (cur == '(') || (cur == ')') || (cur == '{') ||
        (cur == '}') || (cur == 0x2D) || (cur == 0x5B) || (cur == 0x5D) ||
-       (cur == 0x5E) || (cur == '!') || (cur == '"') || (cur == '#') ||
-       (cur == '$') || (cur == '%') || (cur == ',') || (cur == '/') ||
-       (cur == ':') || (cur == ';') || (cur == '=') || (cur == '>') ||
-       (cur == '@') || (cur == '`') || (cur == '~') || (cur == 'u')) {
+       (cur == 0x5E) ||
+
+       /* Non-standard escape sequences:
+        *                  Java 1.8|.NET Core 3.1|MSXML 6 */
+       (cur == '!') ||     /*   +  |     +       |    +   */
+       (cur == '"') ||     /*   +  |     +       |    +   */
+       (cur == '#') ||     /*   +  |     +       |    +   */
+       (cur == '$') ||     /*   +  |     +       |    +   */
+       (cur == '%') ||     /*   +  |     +       |    +   */
+       (cur == ',') ||     /*   +  |     +       |    +   */
+       (cur == '/') ||     /*   +  |     +       |    +   */
+       (cur == ':') ||     /*   +  |     +       |    +   */
+       (cur == ';') ||     /*   +  |     +       |    +   */
+       (cur == '=') ||     /*   +  |     +       |    +   */
+       (cur == '>') ||     /*      |     +       |    +   */
+       (cur == '@') ||     /*   +  |     +       |    +   */
+       (cur == '`') ||     /*   +  |     +       |    +   */
+       (cur == '~') ||     /*   +  |     +       |    +   */
+       (cur == 'u')) {     /*      |     +       |    +   */
        if (ctxt->atom == NULL) {
            ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_CHARVAL);
            if (ctxt->atom != NULL) {
@@ -4987,21 +5043,12 @@ xmlFAParseCharClassEsc(xmlRegParserCtxtPtr ctxt) {
                        ctxt->atom->codepoint = '\t';
                        break;
                    case 'u':
-                   {
-                       char hex_buffer[5];
-                       int loop;
-                       for (loop = 0; loop < 4; loop++) {
-                           NEXT;
-                           if (!('0' <= CUR && CUR <= '9') && !('a' <= CUR && CUR <= 'f') && !('A' <= CUR && CUR <= 'F')) {
-                               ERROR("Expecting hex digit");
-                               return;
-                           }
-                           hex_buffer[loop] = CUR;
+                       cur = parse_escaped_codepoint(ctxt);
+                       if (cur < 0) {
+                           return;
                        }
-                       hex_buffer[4] = 0;
-                       ctxt->atom->codepoint = (int)strtoul(hex_buffer, NULL, 16);
+                       ctxt->atom->codepoint = cur;
                        break;
-                   }
                    default:
                        ctxt->atom->codepoint = cur;
                }


[Date Prev][Date Next]   [Thread Prev][Thread Next]   [Thread Index] [Date Index] [Author Index]