Longest match lexer option to mimic the Unix tool Lex #1490

Open · wants to merge 2 commits into master
docs/grammar.md (2 additions, 0 deletions)
@@ -135,6 +135,8 @@ When using a lexer (basic or contextual), it is the grammar-author's responsibil
3. Length of literal / pattern definition
4. Name

When using the `longest_match` lexer, if several terminals produce matches of the same length, the terminal that is defined first in the grammar is used.
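For illustration (editorial, not part of this diff), here is a minimal sketch of that rule; the grammar, terminal names, and input are made up, and it assumes this PR's `lexer='longest_match'` option. Both terminals are regexes, so equal-length matches are possible: `"cafe"` matches `HEX` and `NAME` at the same length and the first-defined terminal wins, while `"coffee"` gets its longest match from `NAME`.

```python
from lark import Lark

demo = Lark(r"""
    start: (HEX | NAME)+

    HEX: /[0-9a-f]+/
    NAME: /[a-z]+/

    %import common.WS
    %ignore WS
""", parser='lalr', lexer='longest_match')

# Tokenize without parsing, to see which terminal each word resolves to.
print([(tok.type, tok.value) for tok in demo.lex("cafe coffee")])
# expected (roughly): [('HEX', 'cafe'), ('NAME', 'coffee')]
```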

**Examples:**
```perl
IF: "if"
docs/how_to_use.md (1 addition, 1 deletion)
@@ -44,7 +44,7 @@ But if it doesn't, feel free to ask us on gitter, or even open an issue. Post a

### Regex collisions

A likely source of bugs occurs when two regexes in a grammar can match the same input. If both terminals have the same priority, most lexers would arbitrarily choose the first one that matches, which isn't always the desired one. (a notable exception is the `dynamic_complete` lexer, which always tries all variations. But its users pay for that with performance.)
A likely source of bugs occurs when two regexes in a grammar can match the same input. If both terminals have the same priority, most lexers would arbitrarily choose the first one that matches, which isn't always the desired one. (a notable exception is the `dynamic_complete` lexer, which always tries all variations. But its users pay for that with performance.) The `longest_match` lexer chooses the terminal with the longest match, similar to the Unix tool "Lex".
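As an illustration (editorial, not part of this diff, and assuming this PR's option), a small sketch of opting in: `INT` and `FLOAT` can both match the start of `3.14`, and with `lexer='longest_match'` the longer `FLOAT` match is emitted rather than whichever terminal happens to be tried first.

```python
from lark import Lark

calc = Lark(r"""
    start: (INT | FLOAT)+

    INT: /[0-9]+/
    FLOAT: /[0-9]+\.[0-9]+/

    %import common.WS
    %ignore WS
""", parser='lalr', lexer='longest_match')

tree = calc.parse("3 3.14")
print(tree.children)
# roughly: [Token('INT', '3'), Token('FLOAT', '3.14')]
```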

These collisions can be hard to notice, and their effects can be difficult to debug, as they are subtle and sometimes hard to reproduce.

lark/lark.py (8 additions, 3 deletions)
@@ -21,7 +21,7 @@
from .tree import Tree
from .common import LexerConf, ParserConf, _ParserArgType, _LexerArgType

from .lexer import Lexer, BasicLexer, TerminalDef, LexerThread, Token
from .lexer import Lexer, BasicLexer, LongestMatchLexer, TerminalDef, LexerThread, Token
from .parse_tree_builder import ParseTreeBuilder
from .parser_frontends import _validate_frontend_args, _get_lexer_callbacks, _deserialize_parsing_frontend, _construct_parsing_frontend
from .grammar import Rule
@@ -119,6 +119,7 @@ class LarkOptions(Serialize):
- "auto" (default): Choose for me based on the parser
- "basic": Use a basic lexer
- "contextual": Stronger lexer (only works with parser="lalr")
- "longest_match": Uses longest match found, where the precedence of the terminals follow the order they are defined (only works with parser='lalr')
- "dynamic": Flexible and powerful (only with parser="earley")
- "dynamic_complete": Same as dynamic, but tries *every* variation of tokenizing possible.
ambiguity
@@ -378,7 +379,7 @@ def __init__(self, grammar: 'Union[Grammar, str, IO[str]]', **options) -> None:
        if isinstance(lexer, type):
            assert issubclass(lexer, Lexer)  # XXX Is this really important? Maybe just ensure interface compliance
        else:
            assert_config(lexer, ('basic', 'contextual', 'dynamic', 'dynamic_complete'))
            assert_config(lexer, ('basic', 'longest_match', 'contextual', 'dynamic', 'dynamic_complete'))
            if self.options.postlex is not None and 'dynamic' in lexer:
                raise ConfigurationError("Can't use postlex with a dynamic lexer. Use basic or contextual instead")

@@ -462,7 +463,11 @@ def _build_lexer(self, dont_ignore: bool=False) -> BasicLexer:
            from copy import copy
            lexer_conf = copy(lexer_conf)
            lexer_conf.ignore = ()
        return BasicLexer(lexer_conf)
        createLexer = {
            'basic': BasicLexer,
            'longest_match': LongestMatchLexer
        }[self.options.lexer]
        return createLexer(lexer_conf)
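To connect this hunk to the public API, a sketch that assumes this PR: `Lark.lex()` builds its standalone lexer through `_build_lexer` above, so `lexer='longest_match'` should now work for plain tokenizing as well, and passing `dont_ignore=True` exercises the copied, ignore-cleared config a few lines up. The grammar here is made up for illustration.

```python
from lark import Lark

p = Lark(r"""
    start: WORD+

    WORD: /\w+/

    %import common.WS
    %ignore WS
""", parser='lalr', lexer='longest_match')

# With dont_ignore=True, the %ignore'd whitespace terminal is emitted too.
print([tok.type for tok in p.lex("a b", dont_ignore=True)])
# expected (roughly): ['WORD', 'WS', 'WORD']
```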

    def _prepare_callbacks(self) -> None:
        self._callbacks = {}
lark/lexer.py (45 additions, 0 deletions)
@@ -391,6 +391,32 @@ def match(self, text, pos):
        return m.group(0), m.lastgroup


class LongestMatchScanner:
    def __init__(self, terminals, g_regex_flags, re_, use_bytes):
        self.terminals = terminals
        self.g_regex_flags = g_regex_flags
        self.use_bytes = use_bytes
        self.allowed_types = {t.name for t in self.terminals}

        # One compiled regex per terminal, kept in definition order.
        self.name_regex = {}
        for t in self.terminals:
            pattern = t.pattern.to_regexp()
            if self.use_bytes:
                pattern = pattern.encode('latin-1')
            self.name_regex[t.name] = re_.compile(pattern, self.g_regex_flags)

    def match(self, text, pos):
        # Try every terminal at `pos` and keep the longest match; on equal
        # lengths the first-defined terminal wins, since a later match only
        # replaces the current best when it is strictly longer.
        longestMatchLen = -1
        longestMatch = None
        for name, regex in self.name_regex.items():
            m = regex.match(text, pos)
            if m and longestMatchLen < len(m.group()):
                longestMatchLen = len(m.group())
                longestMatch = (m.group(0), name)

        return longestMatch
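For reference (editorial, not part of the diff), the same longest-match rule written with plain `re` and made-up terminal names; it mirrors the loop above, including the strict comparison that lets the first-defined terminal win ties.

```python
import re

# Patterns are tried in definition order, like name_regex above.
patterns = [("ASSIGN", re.compile(r"=")), ("EQ", re.compile(r"=="))]

def longest_match(text, pos):
    best, best_len = None, -1
    for name, regex in patterns:
        m = regex.match(text, pos)
        if m and len(m.group(0)) > best_len:  # strict '>' keeps the first of equal-length matches
            best, best_len = (m.group(0), name), len(m.group(0))
    return best

print(longest_match("==", 0))  # ('==', 'EQ'): EQ wins despite being defined second, its match is longer
print(longest_match("=", 0))   # ('=', 'ASSIGN')
```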


def _regexp_has_newline(r: str):
r"""Expressions that may indicate newlines in a regexp:
- newlines (\n)
@@ -622,6 +648,25 @@ def next_token(self, lex_state: LexerState, parser_state: Any = None) -> Token:
        raise EOFError(self)


class LongestMatchLexer(BasicLexer):
    def __init__(self, conf: 'LexerConf', comparator=None) -> None:
        super().__init__(conf, comparator)
        # Restore grammar definition order (BasicLexer sorts terminals by priority
        # and length), so that equal-length matches go to the first-defined terminal.
        self.terminals = list(conf.terminals)

    def _build_scanner(self):
        terminals, self.callback = _create_unless(self.terminals, self.g_regex_flags, self.re, self.use_bytes)
        assert all(self.callback.values())

        for type_, f in self.user_callbacks.items():
            if type_ in self.callback:
                # Already a callback there, probably UnlessCallback
                self.callback[type_] = CallChain(self.callback[type_], f, lambda t: t.type == type_)
            else:
                self.callback[type_] = f

        self._scanner = LongestMatchScanner(terminals, self.g_regex_flags, self.re, self.use_bytes)


class ContextualLexer(Lexer):
    lexers: Dict[int, AbstractBasicLexer]
    root_lexer: AbstractBasicLexer
lark/parser_frontends.py (7 additions, 2 deletions)
@@ -2,7 +2,7 @@

from .exceptions import ConfigurationError, GrammarError, assert_config
from .utils import get_regexp_width, Serialize
from .lexer import LexerThread, BasicLexer, ContextualLexer, Lexer
from .lexer import LexerThread, BasicLexer, LongestMatchLexer, ContextualLexer, Lexer
from .parsers import earley, xearley, cyk
from .parsers.lalr_parser import LALR_Parser
from .tree import Tree
@@ -74,6 +74,7 @@ def __init__(self, lexer_conf: LexerConf, parser_conf: ParserConf, options, pars
        elif isinstance(lexer_type, str):
            create_lexer = {
                'basic': create_basic_lexer,
                'longest_match': create_longest_match_lexer,
                'contextual': create_contextual_lexer,
            }[lexer_type]
            self.lexer = create_lexer(lexer_conf, self.parser, lexer_conf.postlex, options)
@@ -117,7 +118,7 @@ def _validate_frontend_args(parser, lexer) -> None:
    assert_config(parser, ('lalr', 'earley', 'cyk'))
    if not isinstance(lexer, type):  # not custom lexer?
        expected = {
            'lalr': ('basic', 'contextual'),
            'lalr': ('basic', 'longest_match', 'contextual'),
            'earley': ('basic', 'dynamic', 'dynamic_complete'),
            'cyk': ('basic', ),
        }[parser]
@@ -147,6 +148,10 @@ def create_basic_lexer(lexer_conf, parser, postlex, options) -> BasicLexer:
    cls = (options and options._plugins.get('BasicLexer')) or BasicLexer
    return cls(lexer_conf)

def create_longest_match_lexer(lexer_conf, parser, postlex, options) -> BasicLexer:
    cls = (options and options._plugins.get('LongestMatchLexer')) or LongestMatchLexer
    return cls(lexer_conf)

def create_contextual_lexer(lexer_conf: LexerConf, parser, postlex, options) -> ContextualLexer:
    cls = (options and options._plugins.get('ContextualLexer')) or ContextualLexer
    parse_table: ParseTableBase[int] = parser._parse_table