From 7cdfe538de6473e38948b4a97c73c8d6c22af109 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Hu=C3=A1ng=20J=C3=B9nli=C3=A0ng?= Date: Thu, 24 Oct 2024 16:42:42 -0400 Subject: [PATCH] fix: unicode escape sequence within RegExpIdentifier is always in unicode mode --- parser.js | 28 +- test/test-data-named-groups.json | 430 ++++++++++++++++++++++++++++++- 2 files changed, 449 insertions(+), 9 deletions(-) diff --git a/parser.js b/parser.js index d12e4b7..e5e51cc 100644 --- a/parser.js +++ b/parser.js @@ -860,7 +860,7 @@ return modifiersGroup; } - function parseUnicodeSurrogatePairEscape(firstEscape) { + function parseUnicodeSurrogatePairEscape(firstEscape, isUnicodeMode) { if (isUnicodeMode) { var first, second; if (firstEscape.kind == 'unicodeEscape' && @@ -1041,12 +1041,13 @@ } } - function parseRegExpUnicodeEscapeSequence() { + function parseRegExpUnicodeEscapeSequence(isUnicodeMode) { var res; if (res = matchReg(/^u([0-9a-fA-F]{4})/)) { // UnicodeEscapeSequence return parseUnicodeSurrogatePairEscape( - createEscaped('unicodeEscape', parseInt(res[1], 16), res[1], 2) + createEscaped('unicodeEscape', parseInt(res[1], 16), res[1], 2), + isUnicodeMode ); } else if (isUnicodeMode && (res = matchReg(/^u\{([0-9a-fA-F]+)\}/))) { // RegExpUnicodeEscapeSequence (ES6 Unicode code point escape) @@ -1059,8 +1060,8 @@ // ControlEscape // c ControlLetter // HexEscapeSequence - // UnicodeEscapeSequence - // IdentityEscape + // UnicodeEscapeSequence[?UnicodeMode] + // IdentityEscape[?UnicodeMode] var res; var from = pos; @@ -1081,7 +1082,7 @@ } else if (res = matchReg(/^x([0-9a-fA-F]{2})/)) { // HexEscapeSequence return createEscaped('hexadecimalEscape', parseInt(res[1], 16), res[1], 2); - } else if (res = parseRegExpUnicodeEscapeSequence()) { + } else if (res = parseRegExpUnicodeEscapeSequence(isUnicodeMode)) { if (!res || res.codePoint > 0x10FFFF) { bail('Invalid escape sequence', null, from, pos); } @@ -1093,11 +1094,22 @@ } function parseIdentifierAtom(check) { + // RegExpIdentifierStart[UnicodeMode] :: + // IdentifierStartChar + // \ RegExpUnicodeEscapeSequence[+UnicodeMode] + // [~UnicodeMode] UnicodeLeadSurrogate UnicodeTrailSurrogate + // + // RegExpIdentifierPart[UnicodeMode] :: + // IdentifierPartChar + // \ RegExpUnicodeEscapeSequence[+UnicodeMode] + // [~UnicodeMode] UnicodeLeadSurrogate UnicodeTrailSurrogate + + var ch = lookahead(); var from = pos; if (ch === '\\') { incr(); - var esc = parseRegExpUnicodeEscapeSequence(); + var esc = parseRegExpUnicodeEscapeSequence(true); if (!esc || !check(esc.codePoint)) { bail('Invalid escape sequence', null, from, pos); } @@ -1366,7 +1378,7 @@ bail('classEscape'); } - return parseUnicodeSurrogatePairEscape(res); + return parseUnicodeSurrogatePairEscape(res, isUnicodeMode); } } diff --git a/test/test-data-named-groups.json b/test/test-data-named-groups.json index 74d328b..992e385 100644 --- a/test/test-data-named-groups.json +++ b/test/test-data-named-groups.json @@ -164,7 +164,7 @@ "(?<\\u{41})": { "type": "error", "name": "SyntaxError", - "message": "Invalid escape sequence at position 3\n (?<\\u{41})\n ^", + "message": "character at position 9: >\n (?<\\u{41})\n ^", "input": "(?<\\u{41})" }, "(?<\\u0041bc\\u0041>)\\k<\\u0041bc\\u0041>": { @@ -219,6 +219,434 @@ "message": "Invalid escape sequence at position 3\n (?<\\u0000>)\n ^", "input": "(?<\\u0000>)" }, + "(?<\\u{10000}>b008-A)\\k<\\u{10000}>": { + "type": "alternative", + "body": [ + { + "type": "group", + "behavior": "normal", + "body": [ + { + "type": "value", + "kind": "symbol", + "codePoint": 98, + "range": [ + 13, + 14 + ], + "raw": "b" + }, + { + "type": "value", + "kind": "symbol", + "codePoint": 48, + "range": [ + 14, + 15 + ], + "raw": "0" + }, + { + "type": "value", + "kind": "symbol", + "codePoint": 48, + "range": [ + 15, + 16 + ], + "raw": "0" + }, + { + "type": "value", + "kind": "symbol", + "codePoint": 56, + "range": [ + 16, + 17 + ], + "raw": "8" + }, + { + "type": "value", + "kind": "symbol", + "codePoint": 45, + "range": [ + 17, + 18 + ], + "raw": "-" + }, + { + "type": "value", + "kind": "symbol", + "codePoint": 65, + "range": [ + 18, + 19 + ], + "raw": "A" + } + ], + "range": [ + 0, + 20 + ], + "raw": "(?<\\u{10000}>b008-A)", + "name": { + "type": "identifier", + "value": "\uD800\uDC00", + "range": [ + 3, + 12 + ], + "raw": "\\u{10000}" + } + }, + { + "type": "reference", + "name": { + "type": "identifier", + "value": "\uD800\uDC00", + "range": [ + 23, + 32 + ], + "raw": "\\u{10000}" + }, + "range": [ + 20, + 33 + ], + "raw": "\\k<\\u{10000}>" + } + ], + "range": [ + 0, + 33 + ], + "raw": "(?<\\u{10000}>b008-A)\\k<\\u{10000}>" + }, + "(?b008-A)\\k": { + "type": "alternative", + "body": [ + { + "type": "group", + "behavior": "normal", + "body": [ + { + "type": "value", + "kind": "symbol", + "codePoint": 98, + "range": [ + 14, + 15 + ], + "raw": "b" + }, + { + "type": "value", + "kind": "symbol", + "codePoint": 48, + "range": [ + 15, + 16 + ], + "raw": "0" + }, + { + "type": "value", + "kind": "symbol", + "codePoint": 48, + "range": [ + 16, + 17 + ], + "raw": "0" + }, + { + "type": "value", + "kind": "symbol", + "codePoint": 56, + "range": [ + 17, + 18 + ], + "raw": "8" + }, + { + "type": "value", + "kind": "symbol", + "codePoint": 45, + "range": [ + 18, + 19 + ], + "raw": "-" + }, + { + "type": "value", + "kind": "symbol", + "codePoint": 65, + "range": [ + 19, + 20 + ], + "raw": "A" + } + ], + "range": [ + 0, + 21 + ], + "raw": "(?b008-A)", + "name": { + "type": "identifier", + "value": "A\uD800\uDC00", + "range": [ + 3, + 13 + ], + "raw": "A\\u{10000}" + } + }, + { + "type": "reference", + "name": { + "type": "identifier", + "value": "A\uD800\uDC00", + "range": [ + 24, + 34 + ], + "raw": "A\\u{10000}" + }, + "range": [ + 21, + 35 + ], + "raw": "\\k" + } + ], + "range": [ + 0, + 35 + ], + "raw": "(?b008-A)\\k" + }, + "(?<\\ud800\\udc00>b008-A)\\k<\\ud800\\udc00>": { + "type": "alternative", + "body": [ + { + "type": "group", + "behavior": "normal", + "body": [ + { + "type": "value", + "kind": "symbol", + "codePoint": 98, + "range": [ + 16, + 17 + ], + "raw": "b" + }, + { + "type": "value", + "kind": "symbol", + "codePoint": 48, + "range": [ + 17, + 18 + ], + "raw": "0" + }, + { + "type": "value", + "kind": "symbol", + "codePoint": 48, + "range": [ + 18, + 19 + ], + "raw": "0" + }, + { + "type": "value", + "kind": "symbol", + "codePoint": 56, + "range": [ + 19, + 20 + ], + "raw": "8" + }, + { + "type": "value", + "kind": "symbol", + "codePoint": 45, + "range": [ + 20, + 21 + ], + "raw": "-" + }, + { + "type": "value", + "kind": "symbol", + "codePoint": 65, + "range": [ + 21, + 22 + ], + "raw": "A" + } + ], + "range": [ + 0, + 23 + ], + "raw": "(?<\\ud800\\udc00>b008-A)", + "name": { + "type": "identifier", + "value": "\uD800\uDC00", + "range": [ + 3, + 15 + ], + "raw": "\\ud800\\udc00" + } + }, + { + "type": "reference", + "name": { + "type": "identifier", + "value": "\uD800\uDC00", + "range": [ + 26, + 38 + ], + "raw": "\\ud800\\udc00" + }, + "range": [ + 23, + 39 + ], + "raw": "\\k<\\ud800\\udc00>" + } + ], + "range": [ + 0, + 39 + ], + "raw": "(?<\\ud800\\udc00>b008-A)\\k<\\ud800\\udc00>" + }, + "(?b008-A)\\k": { + "type": "alternative", + "body": [ + { + "type": "group", + "behavior": "normal", + "body": [ + { + "type": "value", + "kind": "symbol", + "codePoint": 98, + "range": [ + 17, + 18 + ], + "raw": "b" + }, + { + "type": "value", + "kind": "symbol", + "codePoint": 48, + "range": [ + 18, + 19 + ], + "raw": "0" + }, + { + "type": "value", + "kind": "symbol", + "codePoint": 48, + "range": [ + 19, + 20 + ], + "raw": "0" + }, + { + "type": "value", + "kind": "symbol", + "codePoint": 56, + "range": [ + 20, + 21 + ], + "raw": "8" + }, + { + "type": "value", + "kind": "symbol", + "codePoint": 45, + "range": [ + 21, + 22 + ], + "raw": "-" + }, + { + "type": "value", + "kind": "symbol", + "codePoint": 65, + "range": [ + 22, + 23 + ], + "raw": "A" + } + ], + "range": [ + 0, + 24 + ], + "raw": "(?b008-A)", + "name": { + "type": "identifier", + "value": "A\uD800\uDC00", + "range": [ + 3, + 16 + ], + "raw": "A\\ud800\\udc00" + } + }, + { + "type": "reference", + "name": { + "type": "identifier", + "value": "A\uD800\uDC00", + "range": [ + 27, + 40 + ], + "raw": "A\\ud800\\udc00" + }, + "range": [ + 24, + 41 + ], + "raw": "\\k" + } + ], + "range": [ + 0, + 41 + ], + "raw": "(?b008-A)\\k" + }, "{(?)}": { "type": "alternative", "body": [