Skip to content

Commit

Permalink
[Parser] Support string-style identifiers
Browse files Browse the repository at this point in the history
In addition to normal identifiers, support parsing identifiers of the format
`$"..."`. This format is not yet allowed by the standard, but it is a popular
proposed extension (see WebAssembly/spec#617 and
WebAssembly/annotations#21).

Binaryen has historically allowed a similar format and has supported arbitrary
non-standard identifier characters, so it's much easier to support this extended
syntax than to fix everything to use the restricted standard syntax.
  • Loading branch information
tlively committed Feb 6, 2024
1 parent d3af204 commit 77ff8b6
Show file tree
Hide file tree
Showing 4 changed files with 103 additions and 32 deletions.
81 changes: 60 additions & 21 deletions src/parser/lexer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -329,6 +329,25 @@ struct LexStrCtx : LexCtx {
}
};

// Result of lexing an identifier: the base LexResult extended with the
// identifier-specific information gathered while lexing.
struct LexIdResult : LexResult {
  // True when the identifier was written in the `$"..."` form.
  bool isStr{false};
  // String contents as reported by the string lexer for a `$"..."`
  // identifier; left empty otherwise.
  std::optional<std::string> str{std::nullopt};
};

struct LexIdCtx : LexCtx {
bool isStr = false;
std::optional<std::string> str;

LexIdCtx(std::string_view in) : LexCtx(in) {}

std::optional<LexIdResult> lexed() {
if (auto basic = LexCtx::lexed()) {
return LexIdResult{*basic, isStr, str};
}
return {};
}
};

std::optional<LexResult> lparen(std::string_view in) {
LexCtx ctx(in);
ctx.takePrefix("("sv);
Expand Down Expand Up @@ -647,26 +666,6 @@ std::optional<LexResult> idchar(std::string_view in) {
return ctx.lexed();
}

// id ::= '$' idchar+
std::optional<LexResult> ident(std::string_view in) {
LexCtx ctx(in);
if (!ctx.takePrefix("$"sv)) {
return {};
}
if (auto lexed = idchar(ctx.next())) {
ctx.take(*lexed);
} else {
return {};
}
while (auto lexed = idchar(ctx.next())) {
ctx.take(*lexed);
}
if (ctx.canFinish()) {
return ctx.lexed();
}
return {};
}

// string ::= '"' (b*:stringelem)* '"' => concat((b*)*)
// (if |concat((b*)*)| < 2^32)
// stringelem ::= c:stringchar => utf8(c)
Expand Down Expand Up @@ -741,6 +740,30 @@ std::optional<LexStrResult> str(std::string_view in) {
return ctx.lexed();
}

// id ::= '$' idchar+ | '$' str
std::optional<LexIdResult> ident(std::string_view in) {
LexIdCtx ctx(in);
if (!ctx.takePrefix("$"sv)) {
return {};
}
if (auto s = str(ctx.next())) {
ctx.isStr = true;
ctx.str = s->str;
ctx.take(*s);
} else if (auto lexed = idchar(ctx.next())) {
ctx.take(*lexed);
while (auto lexed = idchar(ctx.next())) {
ctx.take(*lexed);
}
} else {
return {};
}
if (ctx.canFinish()) {
return ctx.lexed();
}
return {};
}

// keyword ::= ( 'a' | ... | 'z' ) idchar* (if literal terminal in grammar)
// reserved ::= idchar+
//
Expand Down Expand Up @@ -889,11 +912,27 @@ std::optional<std::string_view> Token::getString() const {
if (tok->str) {
return std::string_view(*tok->str);
}
// Remove quotes.
return span.substr(1, span.size() - 2);
}
return {};
}

// Return the name of this token if it is an identifier, or an empty optional
// for any other kind of token. The returned view refers either to the string
// stored in the token or to the token's span.
std::optional<std::string_view> Token::getID() const {
  const auto* id = std::get_if<IdTok>(&data);
  if (!id) {
    return std::nullopt;
  }
  // Stored contents take precedence over the raw source text.
  if (id->str) {
    return std::string_view(*id->str);
  }
  if (!id->isStr) {
    // Plain identifier: drop only the leading '$'.
    return span.substr(1);
  }
  // String-style identifier: drop the leading '$' and opening quote as well as
  // the closing quote.
  return span.substr(2, span.size() - 3);
}

void Lexer::skipSpace() {
if (auto ctx = space(next())) {
index += ctx->span.size();
Expand All @@ -908,7 +947,7 @@ void Lexer::lexToken() {
} else if (auto t = rparen(next())) {
tok = Token{t->span, RParenTok{}};
} else if (auto t = ident(next())) {
tok = Token{t->span, IdTok{}};
tok = Token{t->span, IdTok{t->isStr, t->str}};
} else if (auto t = integer(next())) {
tok = Token{t->span, IntTok{t->n, t->sign}};
} else if (auto t = float_(next())) {
Expand Down
16 changes: 8 additions & 8 deletions src/parser/lexer.h
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,12 @@ struct RParenTok {
};

// Token payload for identifiers, in either `$name` or `$"..."` form.
struct IdTok {
  // Whether this ID has `$"..."` format. Defaults to false so that a
  // default-initialized token never holds an indeterminate flag.
  bool isStr = false;

  // If the ID is a string ID and contains escapes, this is its contents.
  std::optional<std::string> str;

  // Two ID tokens are equal only when both the format flag and the stored
  // contents agree, in line with how StringTok compares its `str`. Comparing
  // the fields lets token-equality checks detect wrongly lexed contents.
  bool operator==(const IdTok& other) const {
    return isStr == other.isStr && str == other.str;
  }
  friend std::ostream& operator<<(std::ostream&, const IdTok&);
};
Expand Down Expand Up @@ -81,6 +87,7 @@ struct FloatTok {
};

struct StringTok {
// If the string contains escapes, this is its contents.
std::optional<std::string> str;

bool operator==(const StringTok& other) const { return str == other.str; }
Expand Down Expand Up @@ -111,14 +118,6 @@ struct Token {

// Whether this token is a right parenthesis.
bool isRParen() const {
  return std::holds_alternative<RParenTok>(data);
}

// Return the identifier's name without its leading '$', or an empty optional
// when this token is not an identifier.
std::optional<std::string_view> getID() const {
  if (!std::holds_alternative<IdTok>(data)) {
    return std::nullopt;
  }
  return span.substr(1);
}

std::optional<std::string_view> getKeyword() const {
if (std::get_if<KeywordTok>(&data)) {
return span;
Expand All @@ -132,6 +131,7 @@ struct Token {
std::optional<double> getF64() const;
std::optional<float> getF32() const;
std::optional<std::string_view> getString() const;
std::optional<std::string_view> getID() const;

bool operator==(const Token&) const;
friend std::ostream& operator<<(std::ostream& os, const Token&);
Expand Down
27 changes: 27 additions & 0 deletions test/gtest/wat-lexer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1377,6 +1377,33 @@ TEST(LexerTest, LexIdent) {
Lexer lexer("$"sv);
EXPECT_TRUE(lexer.empty());
}

// String IDs
{
Lexer lexer("$\"\"");
ASSERT_FALSE(lexer.empty());
Token expected{"$\"\""sv, IdTok{true, std::nullopt}};
EXPECT_EQ(*lexer, expected);
EXPECT_TRUE(lexer->getID());
EXPECT_EQ(*lexer->getID(), ""sv);
}
{
Lexer lexer("$\"hello\"");
ASSERT_FALSE(lexer.empty());
Token expected{"$\"hello\""sv, IdTok{true, std::nullopt}};
EXPECT_EQ(*lexer, expected);
EXPECT_TRUE(lexer->getID());
EXPECT_EQ(*lexer->getID(), "hello"sv);
}
{
// _$_£_€_𐍈_
auto unicode = "$\"_\\u{24}_\\u{00a3}_\\u{20AC}_\\u{10348}_\""sv;
Lexer lexer(unicode);
ASSERT_FALSE(lexer.empty());
std::string escaped{"_$_\xC2\xA3_\xE2\x82\xAC_\xF0\x90\x8D\x88_"};
Token expected{unicode, IdTok{true, {escaped}}};
EXPECT_EQ(*lexer, expected);
}
}

TEST(LexerTest, LexString) {
Expand Down
11 changes: 8 additions & 3 deletions test/lit/wat-kitchen-sink.wast
Original file line number Diff line number Diff line change
Expand Up @@ -380,7 +380,7 @@
;; CHECK: (elem $passive-2 anyref (struct.new_default $s0) (struct.new_default $s0))
(elem $passive-2 anyref (item struct.new $s0) (struct.new $s0))

;; CHECK: (elem declare func $ref-func $ref-is-null $table-fill $table-grow $table-set)
;; CHECK: (elem declare func $ref-func $table-fill $table-grow $table-set)
(elem declare func 0 1 2 3)

(elem $declare-2 declare funcref (item ref.func 0) (ref.func 1) (item (ref.func 2)))
Expand Down Expand Up @@ -467,6 +467,11 @@
;; CHECK-NEXT: )
(func $f4 (type 18) (local i32 i64) (local $l f32))

;; CHECK: (func $"[quoted_name]" (type $void)
;; CHECK-NEXT: (nop)
;; CHECK-NEXT: )
(func $"[quoted_name]")

;; CHECK: (func $nop-skate (type $void)
;; CHECK-NEXT: (nop)
;; CHECK-NEXT: (nop)
Expand Down Expand Up @@ -3622,13 +3627,13 @@
;; CHECK-NEXT: (ref.func $ref-func)
;; CHECK-NEXT: )
;; CHECK-NEXT: (drop
;; CHECK-NEXT: (ref.func $ref-is-null)
;; CHECK-NEXT: (ref.func $ref-func)
;; CHECK-NEXT: )
;; CHECK-NEXT: )
(func $ref-func
ref.func $ref-func
drop
ref.func 154
ref.func 156
drop
)

Expand Down

0 comments on commit 77ff8b6

Please sign in to comment.