From 00a570228f60e2f921bd6f265f5baa71d8d36a8b Mon Sep 17 00:00:00 2001
From: Thomas Lively <tlively@google.com>
Date: Mon, 5 Feb 2024 16:26:57 -0800
Subject: [PATCH] [Parser] Support string-style identifiers

In addition to normal identifiers, support parsing identifiers of the format
`$"..."`. This format is not yet allowed by the standard, but it is a popular
proposed extension (see https://github.com/WebAssembly/spec/issues/617 and
https://github.com/WebAssembly/annotations/issues/21).

Binaryen has historically allowed a similar format and has supported arbitrary
non-standard identifier characters, so it's much easier to support this extended
syntax than to fix everything to use the restricted standard syntax.
---
 src/parser/lexer.cpp           | 81 +++++++++++++++++++++++++---------
 src/parser/lexer.h             | 16 +++----
 test/gtest/wat-lexer.cpp       | 27 ++++++++++++
 test/lit/wat-kitchen-sink.wast | 11 +++--
 4 files changed, 103 insertions(+), 32 deletions(-)
diff --git a/src/parser/lexer.cpp b/src/parser/lexer.cpp
index 2625bf8fd73..07931d69ba4 100644
--- a/src/parser/lexer.cpp
+++ b/src/parser/lexer.cpp
@@ -329,6 +329,25 @@ struct LexStrCtx : LexCtx {
   }
 };
 
+struct LexIdResult : LexResult { // What ident() returns for a lexed identifier.
+  bool isStr = false; // True when the identifier used the `$"..."` form.
+  std::optional<std::string> str; // For `$"..."` IDs, the `str` produced by the string lexer.
+};
+
+struct LexIdCtx : LexCtx { // LexCtx plus the string-ID state that ident() fills in.
+  bool isStr = false; // Set by ident() when the `$"..."` form was lexed.
+  std::optional<std::string> str; // Set by ident() from the string lexer's result.
+
+  LexIdCtx(std::string_view in) : LexCtx(in) {}
+
+  std::optional<LexIdResult> lexed() { // Base result extended with isStr/str.
+    if (auto basic = LexCtx::lexed()) {
+      return LexIdResult{*basic, isStr, str};
+    }
+    return {}; // The base context produced no result.
+  }
+};
+
 std::optional<LexResult> lparen(std::string_view in) {
   LexCtx ctx(in);
   ctx.takePrefix("("sv);
@@ -647,26 +666,6 @@ std::optional<LexResult> idchar(std::string_view in) {
   return ctx.lexed();
 }
 
-// id ::= '$' idchar+
-std::optional<LexResult> ident(std::string_view in) {
-  LexCtx ctx(in);
-  if (!ctx.takePrefix("$"sv)) {
-    return {};
-  }
-  if (auto lexed = idchar(ctx.next())) {
-    ctx.take(*lexed);
-  } else {
-    return {};
-  }
-  while (auto lexed = idchar(ctx.next())) {
-    ctx.take(*lexed);
-  }
-  if (ctx.canFinish()) {
-    return ctx.lexed();
-  }
-  return {};
-}
-
 // string     ::= '"' (b*:stringelem)* '"'  => concat((b*)*)
 //                    (if |concat((b*)*)| < 2^32)
 // stringelem ::= c:stringchar              => utf8(c)
@@ -741,6 +740,30 @@ std::optional<LexStrResult> str(std::string_view in) {
   return ctx.lexed();
 }
 
+// id ::= '$' idchar+ | '$' str
+std::optional<LexIdResult> ident(std::string_view in) {
+  LexIdCtx ctx(in);
+  if (!ctx.takePrefix("$"sv)) { // Both forms start with '$'.
+    return {};
+  }
+  if (auto s = str(ctx.next())) { // Try the quoted `$"..."` form first.
+    ctx.isStr = true;
+    ctx.str = s->str; // Carry over the string lexer's `str` for Token::getID().
+    ctx.take(*s);
+  } else if (auto lexed = idchar(ctx.next())) { // Plain form: at least one idchar.
+    ctx.take(*lexed);
+    while (auto lexed = idchar(ctx.next())) { // Inner `lexed` shadows the outer one.
+      ctx.take(*lexed);
+    }
+  } else {
+    return {}; // Neither a string nor an idchar follows the '$'.
+  }
+  if (ctx.canFinish()) { // NOTE(review): presumably a token-boundary check; confirm in LexCtx.
+    return ctx.lexed();
+  }
+  return {};
+}
+
 // keyword ::= ( 'a' | ... | 'z' ) idchar* (if literal terminal in grammar)
 // reserved ::= idchar+
 //
@@ -889,11 +912,27 @@ std::optional<std::string_view> Token::getString() const {
     if (tok->str) {
       return std::string_view(*tok->str);
     }
+    // Remove quotes.
     return span.substr(1, span.size() - 2);
   }
   return {};
 }
 
+std::optional<std::string_view> Token::getID() const { // ID name without '$' or quotes.
+  if (auto* tok = std::get_if<IdTok>(&data)) {
+    if (tok->str) { // Prefer the contents stored in the token when present.
+      return std::string_view(*tok->str);
+    }
+    if (tok->isStr) {
+      // Drop the leading `$"` and the trailing `"`; the rest of the span is the name.
+      return span.substr(2, span.size() - 3);
+    }
+    // Plain identifier: drop only the leading '$'.
+    return span.substr(1);
+  }
+  return {}; // Not an identifier token.
+}
+
 void Lexer::skipSpace() {
   if (auto ctx = space(next())) {
     index += ctx->span.size();
@@ -908,7 +947,7 @@ void Lexer::lexToken() {
   } else if (auto t = rparen(next())) {
     tok = Token{t->span, RParenTok{}};
   } else if (auto t = ident(next())) {
-    tok = Token{t->span, IdTok{}};
+    tok = Token{t->span, IdTok{t->isStr, t->str}};
   } else if (auto t = integer(next())) {
     tok = Token{t->span, IntTok{t->n, t->sign}};
   } else if (auto t = float_(next())) {
diff --git a/src/parser/lexer.h b/src/parser/lexer.h
index 42b18508e46..f0da151f9d7 100644
--- a/src/parser/lexer.h
+++ b/src/parser/lexer.h
@@ -53,6 +53,12 @@ struct RParenTok {
 };
 
 struct IdTok {
+  // Whether this ID was written in the `$"..."` string format.
+  bool isStr;
+
+  // If the ID is a string ID and contains escapes, this is its contents.
+  std::optional<std::string> str; // NOTE(review): operator== below ignores isStr and str.
+
   bool operator==(const IdTok&) const { return true; }
   friend std::ostream& operator<<(std::ostream&, const IdTok&);
 };
@@ -81,6 +87,7 @@ struct FloatTok {
 };
 
 struct StringTok {
+  // If the string contains escapes, this is its contents.
   std::optional<std::string> str;
 
   bool operator==(const StringTok& other) const { return str == other.str; }
@@ -111,14 +118,6 @@ struct Token {
 
   bool isRParen() const { return std::get_if<RParenTok>(&data); }
 
-  std::optional<std::string_view> getID() const {
-    if (std::get_if<IdTok>(&data)) {
-      // Drop leading '$'.
-      return span.substr(1);
-    }
-    return {};
-  }
-
   std::optional<std::string_view> getKeyword() const {
     if (std::get_if<KeywordTok>(&data)) {
       return span;
@@ -132,6 +131,7 @@ struct Token {
   std::optional<double> getF64() const;
   std::optional<float> getF32() const;
   std::optional<std::string_view> getString() const;
+  std::optional<std::string_view> getID() const;
 
   bool operator==(const Token&) const;
   friend std::ostream& operator<<(std::ostream& os, const Token&);
diff --git a/test/gtest/wat-lexer.cpp b/test/gtest/wat-lexer.cpp
index b46f9927f38..b626446828b 100644
--- a/test/gtest/wat-lexer.cpp
+++ b/test/gtest/wat-lexer.cpp
@@ -1377,6 +1377,33 @@ TEST(LexerTest, LexIdent) {
     Lexer lexer("$"sv);
     EXPECT_TRUE(lexer.empty());
   }
+
+  // String IDs
+  {
+    Lexer lexer("$\"\"");
+    ASSERT_FALSE(lexer.empty());
+    Token expected{"$\"\""sv, IdTok{true, std::nullopt}};
+    EXPECT_EQ(*lexer, expected);
+    EXPECT_TRUE(lexer->getID());
+    EXPECT_EQ(*lexer->getID(), ""sv);
+  }
+  {
+    Lexer lexer("$\"hello\"");
+    ASSERT_FALSE(lexer.empty());
+    Token expected{"$\"hello\""sv, IdTok{true, std::nullopt}};
+    EXPECT_EQ(*lexer, expected);
+    EXPECT_TRUE(lexer->getID());
+    EXPECT_EQ(*lexer->getID(), "hello"sv);
+  }
+  {
+    // _$_£_€_𐍈_
+    auto unicode = "$\"_\\u{24}_\\u{00a3}_\\u{20AC}_\\u{10348}_\""sv;
+    Lexer lexer(unicode);
+    ASSERT_FALSE(lexer.empty());
+    std::string escaped{"_$_\xC2\xA3_\xE2\x82\xAC_\xF0\x90\x8D\x88_"};
+    Token expected{unicode, IdTok{true, {escaped}}};
+    EXPECT_EQ(*lexer, expected);
+  }
 }
 
 TEST(LexerTest, LexString) {
diff --git a/test/lit/wat-kitchen-sink.wast b/test/lit/wat-kitchen-sink.wast
index 1ab8e7516df..32a70d9138d 100644
--- a/test/lit/wat-kitchen-sink.wast
+++ b/test/lit/wat-kitchen-sink.wast
@@ -380,7 +380,7 @@
  ;; CHECK:      (elem $passive-2 anyref (struct.new_default $s0) (struct.new_default $s0))
  (elem $passive-2 anyref (item struct.new $s0) (struct.new $s0))
 
- ;; CHECK:      (elem declare func $ref-func $ref-is-null $table-fill $table-grow $table-set)
+ ;; CHECK:      (elem declare func $ref-func $table-fill $table-grow $table-set)
  (elem declare func 0 1 2 3)
 
  (elem $declare-2 declare funcref (item ref.func 0) (ref.func 1) (item (ref.func 2)))
@@ -467,6 +467,11 @@
  ;; CHECK-NEXT: )
  (func $f4 (type 18) (local i32 i64) (local $l f32))
 
+ ;; CHECK:      (func $"[quoted_name]" (type $void)
+ ;; CHECK-NEXT:  (nop)
+ ;; CHECK-NEXT: )
+ (func $"[quoted_name]")
+
  ;; CHECK:      (func $nop-skate (type $void)
  ;; CHECK-NEXT:  (nop)
  ;; CHECK-NEXT:  (nop)
@@ -3622,13 +3627,13 @@
  ;; CHECK-NEXT:   (ref.func $ref-func)
  ;; CHECK-NEXT:  )
  ;; CHECK-NEXT:  (drop
- ;; CHECK-NEXT:   (ref.func $ref-is-null)
+ ;; CHECK-NEXT:   (ref.func $ref-func)
  ;; CHECK-NEXT:  )
  ;; CHECK-NEXT: )
  (func $ref-func
   ref.func $ref-func
   drop
-  ref.func 154
+  ref.func 156
   drop
  )