diff --git a/internal/parser/lexer.go b/internal/parser/lexer.go
index 37434362..a32a663c 100644
--- a/internal/parser/lexer.go
+++ b/internal/parser/lexer.go
@@ -358,7 +358,21 @@ func (l *lexer) resetTokenStart() {
 	l.tokenStartLoc = l.location()
 }
 
+// tokenKindPostprocessors defines a transformation of the lexed token string
+// before it is stored in the tokens list. It is optional for each token kind.
+var tokenKindPostprocessors = map[tokenKind]func(string) string{
+	tokenNumber: func(s string) string {
+		// Get rid of underscore digit separators.
+		return strings.ReplaceAll(s, "_", "")
+	},
+}
+
 func (l *lexer) emitFullToken(kind tokenKind, data, stringBlockIndent, stringBlockTermIndent string) {
+	// Run the postprocessor if the token kind has one defined.
+	if pp, ok := tokenKindPostprocessors[kind]; ok {
+		data = pp(data)
+	}
+
 	l.tokens = append(l.tokens, token{
 		kind:                  kind,
 		fodder:                l.fodder,
@@ -451,7 +465,7 @@ func (l *lexer) lexUntilNewline() (string, int, int) {
 // that the next rune to be served by the lexer will be a leading digit.
 func (l *lexer) lexNumber() error {
 	// This function should be understood with reference to the linked image:
-	// http://www.json.org/number.gif
+	// https://www.json.org/img/number.png
 
 	// Note, we deviate from the json.org documentation as follows:
 	// There is no reason to lex negative numbers as atomic tokens, it is better to parse them
@@ -465,9 +479,11 @@
 		numAfterOneToNine
 		numAfterDot
 		numAfterDigit
+		numAfterUnderscore
 		numAfterE
 		numAfterExpSign
 		numAfterExpDigit
+		numAfterExpUnderscore
 	)
 
 	state := numBegin
@@ -492,6 +508,9 @@
 				state = numAfterDot
 			case 'e', 'E':
 				state = numAfterE
+			case '_':
+				state = numAfterUnderscore
+
 			default:
 				break outerLoop
 			}
@@ -503,6 +522,8 @@
 				state = numAfterE
 			case r >= '0' && r <= '9':
 				state = numAfterOneToNine
+			case r == '_':
+				state = numAfterUnderscore
 			default:
 				break outerLoop
 			}
@@ -521,9 +542,22 @@
 				state = numAfterE
 			case r >= '0' && r <= '9':
 				state = numAfterDigit
+			case r == '_':
+				state = numAfterUnderscore
 			default:
 				break outerLoop
 			}
+
+		case numAfterUnderscore:
+			// The only valid transition out of _ is to a digit.
+			switch {
+			case r >= '0' && r <= '9':
+				state = numAfterOneToNine
+			default:
+				return l.makeStaticErrorPoint(
+					fmt.Sprintf("Couldn't lex number, junk after '_': %v", strconv.QuoteRuneToASCII(r)),
+					l.location())
+			}
 		case numAfterE:
 			switch {
 			case r == '+' || r == '-':
 				state = numAfterExpSign
@@ -545,12 +579,27 @@
 			}
 
 		case numAfterExpDigit:
-			if r >= '0' && r <= '9' {
+			switch {
+			case r >= '0' && r <= '9':
 				state = numAfterExpDigit
-			} else {
+			case r == '_':
+				state = numAfterExpUnderscore
+			default:
 				break outerLoop
 			}
+
+		case numAfterExpUnderscore:
+			// The only valid transition out of _ is to a digit.
+			switch {
+			case r >= '0' && r <= '9':
+				state = numAfterExpDigit
+			default:
+				return l.makeStaticErrorPoint(
+					fmt.Sprintf("Couldn't lex number, junk after '_': %v", strconv.QuoteRuneToASCII(r)),
+					l.location())
+			}
 		}
+
 		l.next()
 	}
@@ -965,7 +1014,6 @@ func Lex(diagnosticFilename ast.DiagnosticFileName, importedFilename, input stri
 					fmt.Sprintf("Could not lex the character %s", strconv.QuoteRuneToASCII(r)),
 					l.location())
 			}
-
 		}
 	}
 
diff --git a/internal/parser/lexer_test.go b/internal/parser/lexer_test.go
index c54ff0ec..8d92f0c0 100644
--- a/internal/parser/lexer_test.go
+++ b/internal/parser/lexer_test.go
@@ -5,7 +5,7 @@ Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
 
-    http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
@@ -16,6 +16,7 @@ limitations under the License.
 package parser
 
 import (
+	"fmt"
 	"testing"
 
 	"github.com/google/go-jsonnet/ast"
@@ -314,6 +315,39 @@ func TestNumber1epExc(t *testing.T) {
 	SingleTest(t, "1e+!", "snippet:1:4 Couldn't lex number, junk after exponent sign: '!'", Tokens{})
 }
 
+func TestNumberSeparators(t *testing.T) {
+	for _, c := range []struct {
+		input  string
+		err    string
+		tokens Tokens
+	}{
+		{"123_456", "", Tokens{{kind: tokenNumber, data: "123456"}}},
+		{"1_750_000", "", Tokens{{kind: tokenNumber, data: "1750000"}}},
+		{"1_2_3", "", Tokens{{kind: tokenNumber, data: "123"}}},
+		{"3.141_592", "", Tokens{{kind: tokenNumber, data: "3.141592"}}},
+		{"01_100", "", Tokens{{kind: tokenNumber, data: "0"}, {kind: tokenNumber, data: "1100"}}},
+		{"1_200.0", "", Tokens{{kind: tokenNumber, data: "1200.0"}}},
+		{"0e1_01", "", Tokens{{kind: tokenNumber, data: "0e101"}}},
+		{"10_10e3", "", Tokens{{kind: tokenNumber, data: "1010e3"}}},
+		{"2_3e1_2", "", Tokens{{kind: tokenNumber, data: "23e12"}}},
+		{"1.1_2e100", "", Tokens{{kind: tokenNumber, data: "1.12e100"}}},
+		{"1.1e-10_1", "", Tokens{{kind: tokenNumber, data: "1.1e-101"}}},
+		{"9.109_383_56e-31", "", Tokens{{kind: tokenNumber, data: "9.10938356e-31"}}},
+		{"123456_!", "snippet:1:8 Couldn't lex number, junk after '_': '!'", Tokens{}},
+		{"123__456", "snippet:1:5 Couldn't lex number, junk after '_': '_'", Tokens{}},
+		{"1_200_.0", "snippet:1:7 Couldn't lex number, junk after '_': '.'", Tokens{}},
+		{"1_200._0", "snippet:1:7 Couldn't lex number, junk after decimal point: '_'", Tokens{}},
+		{"1_200_e2", "snippet:1:7 Couldn't lex number, junk after '_': 'e'", Tokens{}},
+		{"1_200e_2", "snippet:1:7 Couldn't lex number, junk after 'E': '_'", Tokens{}},
+		{"200e-_2", "snippet:1:6 Couldn't lex number, junk after exponent sign: '_'", Tokens{}},
+		{"200e+_2", "snippet:1:6 Couldn't lex number, junk after exponent sign: '_'", Tokens{}},
+	} {
+		t.Run(fmt.Sprintf("number %s", c.input), func(t *testing.T) {
+			SingleTest(t, c.input, c.err, c.tokens)
+		})
+	}
+}
+
 func TestDoublestring1(t *testing.T) {
 	SingleTest(t, "\"hi\"", "", Tokens{
 		{kind: tokenStringDouble, data: "hi"},
@@ -491,6 +525,12 @@ func TestIdentifiers(t *testing.T) {
 	})
 }
 
+func TestIdentifierUnderscore(t *testing.T) {
+	SingleTest(t, "_123", "", Tokens{
+		{kind: tokenIdentifier, data: "_123"},
+	})
+}
+
 func TestCppComment(t *testing.T) {
 	SingleTest(t, "// hi", "", Tokens{
 		{kind: tokenEndOfFile, fodder: ast.Fodder{{Kind: ast.FodderParagraph, Comment: []string{"// hi"}}}},
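The diff keeps validation and normalization separate: the lexer's state machine only decides where '_' may legally appear, and the tokenNumber postprocessor strips the separators before emitFullToken stores the token data. Below is a minimal standalone sketch of that split; it is illustrative only and does not use the go-jsonnet API (stripSeparators and parseNumber are hypothetical names introduced here, not part of the change above).

package main

import (
	"fmt"
	"strconv"
	"strings"
)

// stripSeparators mirrors the idea of the tokenNumber postprocessor: once a
// literal such as "1_750_000" has been accepted, the underscores carry no
// value and can simply be removed from the stored string.
func stripSeparators(lexed string) string {
	return strings.ReplaceAll(lexed, "_", "")
}

// parseNumber is a stand-in for whatever consumes the token data downstream;
// after postprocessing, the literal is an ordinary JSON-style number.
func parseNumber(lexed string) (float64, error) {
	return strconv.ParseFloat(stripSeparators(lexed), 64)
}

func main() {
	for _, lit := range []string{"1_750_000", "9.109_383_56e-31", "2_3e1_2"} {
		f, err := parseNumber(lit)
		if err != nil {
			panic(err)
		}
		fmt.Printf("%s -> %g\n", lit, f)
	}
}

Because the stored token data is already separator-free, downstream consumers such as strconv.ParseFloat never need to know the underscores existed.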