From 00a570228f60e2f921bd6f265f5baa71d8d36a8b Mon Sep 17 00:00:00 2001 From: Thomas Lively Date: Mon, 5 Feb 2024 16:26:57 -0800 Subject: [PATCH] [Parser] Support string-style identifiers In addition to normal identifiers, support parsing identifiers of the format `$"..."`. This format is not yet allowed by the standard, but it is a popular proposed extension (see https://github.com/WebAssembly/spec/issues/617 and https://github.com/WebAssembly/annotations/issues/21). Binaryen has historically allowed a similar format and has supported arbitrary non-standard identifier characters, so it's much easier to support this extended syntax than to fix everything to use the restricted standard syntax. --- src/parser/lexer.cpp | 81 +++++++++++++++++++++++++--------- src/parser/lexer.h | 16 +++---- test/gtest/wat-lexer.cpp | 27 ++++++++++++ test/lit/wat-kitchen-sink.wast | 11 +++-- 4 files changed, 103 insertions(+), 32 deletions(-) diff --git a/src/parser/lexer.cpp b/src/parser/lexer.cpp index 2625bf8fd73..07931d69ba4 100644 --- a/src/parser/lexer.cpp +++ b/src/parser/lexer.cpp @@ -329,6 +329,25 @@ struct LexStrCtx : LexCtx { } }; +struct LexIdResult : LexResult { + bool isStr = false; + std::optional str; +}; + +struct LexIdCtx : LexCtx { + bool isStr = false; + std::optional str; + + LexIdCtx(std::string_view in) : LexCtx(in) {} + + std::optional lexed() { + if (auto basic = LexCtx::lexed()) { + return LexIdResult{*basic, isStr, str}; + } + return {}; + } +}; + std::optional lparen(std::string_view in) { LexCtx ctx(in); ctx.takePrefix("("sv); @@ -647,26 +666,6 @@ std::optional idchar(std::string_view in) { return ctx.lexed(); } -// id ::= '$' idchar+ -std::optional ident(std::string_view in) { - LexCtx ctx(in); - if (!ctx.takePrefix("$"sv)) { - return {}; - } - if (auto lexed = idchar(ctx.next())) { - ctx.take(*lexed); - } else { - return {}; - } - while (auto lexed = idchar(ctx.next())) { - ctx.take(*lexed); - } - if (ctx.canFinish()) { - return ctx.lexed(); - } - return 
{}; -} - // string ::= '"' (b*:stringelem)* '"' => concat((b*)*) // (if |concat((b*)*)| < 2^32) // stringelem ::= c:stringchar => utf8(c) @@ -741,6 +740,30 @@ std::optional str(std::string_view in) { return ctx.lexed(); } +// id ::= '$' idchar+ | '$' str +std::optional ident(std::string_view in) { + LexIdCtx ctx(in); + if (!ctx.takePrefix("$"sv)) { + return {}; + } + if (auto s = str(ctx.next())) { + ctx.isStr = true; + ctx.str = s->str; + ctx.take(*s); + } else if (auto lexed = idchar(ctx.next())) { + ctx.take(*lexed); + while (auto lexed = idchar(ctx.next())) { + ctx.take(*lexed); + } + } else { + return {}; + } + if (ctx.canFinish()) { + return ctx.lexed(); + } + return {}; +} + // keyword ::= ( 'a' | ... | 'z' ) idchar* (if literal terminal in grammar) // reserved ::= idchar+ // @@ -889,11 +912,27 @@ std::optional Token::getString() const { if (tok->str) { return std::string_view(*tok->str); } + // Remove quotes. return span.substr(1, span.size() - 2); } return {}; } +std::optional Token::getID() const { + if (auto* tok = std::get_if(&data)) { + if (tok->str) { + return std::string_view(*tok->str); + } + if (tok->isStr) { + // Remove '$' and quotes. + return span.substr(2, span.size() - 3); + } + // Remove '$'. 
+ return span.substr(1); + } + return {}; +} + void Lexer::skipSpace() { if (auto ctx = space(next())) { index += ctx->span.size(); @@ -908,7 +947,7 @@ void Lexer::lexToken() { } else if (auto t = rparen(next())) { tok = Token{t->span, RParenTok{}}; } else if (auto t = ident(next())) { - tok = Token{t->span, IdTok{}}; + tok = Token{t->span, IdTok{t->isStr, t->str}}; } else if (auto t = integer(next())) { tok = Token{t->span, IntTok{t->n, t->sign}}; } else if (auto t = float_(next())) { diff --git a/src/parser/lexer.h b/src/parser/lexer.h index 42b18508e46..f0da151f9d7 100644 --- a/src/parser/lexer.h +++ b/src/parser/lexer.h @@ -53,6 +53,12 @@ struct RParenTok { }; struct IdTok { + // Whether this ID has `$"..."` format + bool isStr; + + // If the ID is a string ID and contains escapes, this is its contents. + std::optional str; + bool operator==(const IdTok&) const { return true; } friend std::ostream& operator<<(std::ostream&, const IdTok&); }; @@ -81,6 +87,7 @@ struct FloatTok { }; struct StringTok { + // If the string contains escapes, this is its contents. std::optional str; bool operator==(const StringTok& other) const { return str == other.str; } @@ -111,14 +118,6 @@ struct Token { bool isRParen() const { return std::get_if(&data); } - std::optional getID() const { - if (std::get_if(&data)) { - // Drop leading '$'. 
- return span.substr(1); - } - return {}; - } - std::optional getKeyword() const { if (std::get_if(&data)) { return span; @@ -132,6 +131,7 @@ struct Token { std::optional getF64() const; std::optional getF32() const; std::optional getString() const; + std::optional getID() const; bool operator==(const Token&) const; friend std::ostream& operator<<(std::ostream& os, const Token&); diff --git a/test/gtest/wat-lexer.cpp b/test/gtest/wat-lexer.cpp index b46f9927f38..b626446828b 100644 --- a/test/gtest/wat-lexer.cpp +++ b/test/gtest/wat-lexer.cpp @@ -1377,6 +1377,33 @@ TEST(LexerTest, LexIdent) { Lexer lexer("$"sv); EXPECT_TRUE(lexer.empty()); } + + // String IDs + { + Lexer lexer("$\"\""); + ASSERT_FALSE(lexer.empty()); + Token expected{"$\"\""sv, IdTok{true, std::nullopt}}; + EXPECT_EQ(*lexer, expected); + EXPECT_TRUE(lexer->getID()); + EXPECT_EQ(*lexer->getID(), ""sv); + } + { + Lexer lexer("$\"hello\""); + ASSERT_FALSE(lexer.empty()); + Token expected{"$\"hello\""sv, IdTok{true, std::nullopt}}; + EXPECT_EQ(*lexer, expected); + EXPECT_TRUE(lexer->getID()); + EXPECT_EQ(*lexer->getID(), "hello"sv); + } + { + // _$_£_€_𐍈_ + auto unicode = "$\"_\\u{24}_\\u{00a3}_\\u{20AC}_\\u{10348}_\""sv; + Lexer lexer(unicode); + ASSERT_FALSE(lexer.empty()); + std::string escaped{"_$_\xC2\xA3_\xE2\x82\xAC_\xF0\x90\x8D\x88_"}; + Token expected{unicode, IdTok{true, {escaped}}}; + EXPECT_EQ(*lexer, expected); + } } TEST(LexerTest, LexString) { diff --git a/test/lit/wat-kitchen-sink.wast b/test/lit/wat-kitchen-sink.wast index 1ab8e7516df..32a70d9138d 100644 --- a/test/lit/wat-kitchen-sink.wast +++ b/test/lit/wat-kitchen-sink.wast @@ -380,7 +380,7 @@ ;; CHECK: (elem $passive-2 anyref (struct.new_default $s0) (struct.new_default $s0)) (elem $passive-2 anyref (item struct.new $s0) (struct.new $s0)) - ;; CHECK: (elem declare func $ref-func $ref-is-null $table-fill $table-grow $table-set) + ;; CHECK: (elem declare func $ref-func $table-fill $table-grow $table-set) (elem declare func 0 1 2 3) 
(elem $declare-2 declare funcref (item ref.func 0) (ref.func 1) (item (ref.func 2))) @@ -467,6 +467,11 @@ ;; CHECK-NEXT: ) (func $f4 (type 18) (local i32 i64) (local $l f32)) + ;; CHECK: (func $"[quoted_name]" (type $void) + ;; CHECK-NEXT: (nop) + ;; CHECK-NEXT: ) + (func $"[quoted_name]") + ;; CHECK: (func $nop-skate (type $void) ;; CHECK-NEXT: (nop) ;; CHECK-NEXT: (nop) @@ -3622,13 +3627,13 @@ ;; CHECK-NEXT: (ref.func $ref-func) ;; CHECK-NEXT: ) ;; CHECK-NEXT: (drop - ;; CHECK-NEXT: (ref.func $ref-is-null) + ;; CHECK-NEXT: (ref.func $ref-func) ;; CHECK-NEXT: ) ;; CHECK-NEXT: ) (func $ref-func ref.func $ref-func drop - ref.func 154 + ref.func 156 drop )