[libxml2] Document support for the non-standard escape sequences. Support non-BMP code points in surrogate pairs of '\uXXXX\uXXXX'.
- From: Nick Wellnhofer <nwellnhof src gnome org>
- To: commits-list gnome org
- Cc:
- Subject: [libxml2] Document support for the non-standard escape sequences. Support non-BMP code points in surrogate pairs of '\uXXXX\uXXXX'.
- Date: Wed, 2 Mar 2022 15:33:00 +0000 (UTC)
commit 37ebf8a8b2789037792cfc0264b814d742cda2d9
Author: Damjan Jovanovic <damjan jov gmail com>
Date: Mon May 31 07:45:18 2021 +0200
Document support for the non-standard escape sequences.
Support non-BMP code points in surrogate pairs of '\uXXXX\uXXXX'.
xmlregexp.c | 81 ++++++++++++++++++++++++++++++++++++++++++++++++-------------
1 file changed, 64 insertions(+), 17 deletions(-)
---
diff --git a/xmlregexp.c b/xmlregexp.c
index f1366fd4..f5afa45d 100644
--- a/xmlregexp.c
+++ b/xmlregexp.c
@@ -4907,6 +4907,47 @@ xmlFAParseCharProp(xmlRegParserCtxtPtr ctxt) {
}
}
+static int parse_escaped_codeunit(xmlRegParserCtxtPtr ctxt)
+{
+ int val = 0, i, cur;
+ for (i = 0; i < 4; i++) {
+ NEXT;
+ val *= 16;
+ cur = CUR;
+ if (cur >= '0' && cur <= '9') {
+ val += cur - '0';
+ } else if (cur >= 'A' && cur <= 'F') {
+ val += cur - 'A' + 10;
+ } else if (cur >= 'a' && cur <= 'f') {
+ val += cur - 'a' + 10;
+ } else {
+ ERROR("Expecting hex digit");
+ return -1;
+ }
+ }
+ return val;
+}
+
+static int parse_escaped_codepoint(xmlRegParserCtxtPtr ctxt)
+{
+ int val = parse_escaped_codeunit(ctxt);
+ if (0xD800 <= val && val <= 0xDBFF) {
+ NEXT;
+ if (CUR == '\\') {
+ NEXT;
+ if (CUR == 'u') {
+ int low = parse_escaped_codeunit(ctxt);
+ if (0xDC00 <= low && low <= 0xDFFF) {
+ return (val - 0xD800) * 0x400 + (low - 0xDC00) + 0x10000;
+ }
+ }
+ }
+ ERROR("Invalid low surrogate pair code unit");
+ val = -1;
+ }
+ return val;
+}
+
/**
* xmlFAParseCharClassEsc:
* @ctxt: a regexp parser context
@@ -4969,10 +5010,25 @@ xmlFAParseCharClassEsc(xmlRegParserCtxtPtr ctxt) {
(cur == '|') || (cur == '.') || (cur == '?') || (cur == '*') ||
(cur == '+') || (cur == '(') || (cur == ')') || (cur == '{') ||
(cur == '}') || (cur == 0x2D) || (cur == 0x5B) || (cur == 0x5D) ||
- (cur == 0x5E) || (cur == '!') || (cur == '"') || (cur == '#') ||
- (cur == '$') || (cur == '%') || (cur == ',') || (cur == '/') ||
- (cur == ':') || (cur == ';') || (cur == '=') || (cur == '>') ||
- (cur == '@') || (cur == '`') || (cur == '~') || (cur == 'u')) {
+ (cur == 0x5E) ||
+
+ /* Non-standard escape sequences:
+ * Java 1.8|.NET Core 3.1|MSXML 6 */
+ (cur == '!') || /* + | + | + */
+ (cur == '"') || /* + | + | + */
+ (cur == '#') || /* + | + | + */
+ (cur == '$') || /* + | + | + */
+ (cur == '%') || /* + | + | + */
+ (cur == ',') || /* + | + | + */
+ (cur == '/') || /* + | + | + */
+ (cur == ':') || /* + | + | + */
+ (cur == ';') || /* + | + | + */
+ (cur == '=') || /* + | + | + */
+ (cur == '>') || /* | + | + */
+ (cur == '@') || /* + | + | + */
+ (cur == '`') || /* + | + | + */
+ (cur == '~') || /* + | + | + */
+ (cur == 'u')) { /* | + | + */
if (ctxt->atom == NULL) {
ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_CHARVAL);
if (ctxt->atom != NULL) {
@@ -4987,21 +5043,12 @@ xmlFAParseCharClassEsc(xmlRegParserCtxtPtr ctxt) {
ctxt->atom->codepoint = '\t';
break;
case 'u':
- {
- char hex_buffer[5];
- int loop;
- for (loop = 0; loop < 4; loop++) {
- NEXT;
- if (!('0' <= CUR && CUR <= '9') && !('a' <= CUR && CUR <= 'f') && !('A' <= CUR && CUR <= 'F')) {
- ERROR("Expecting hex digit");
- return;
- }
- hex_buffer[loop] = CUR;
+ cur = parse_escaped_codepoint(ctxt);
+ if (cur < 0) {
+ return;
}
- hex_buffer[4] = 0;
- ctxt->atom->codepoint = (int)strtoul(hex_buffer, NULL, 16);
+ ctxt->atom->codepoint = cur;
break;
- }
default:
ctxt->atom->codepoint = cur;
}
[Date Prev][Date Next] [Thread Prev][Thread Next] [Thread Index] [Date Index] [Author Index]