diff --git a/docs/grammar.md b/docs/grammar.md
index 14179d0c..5c1864f3 100644
--- a/docs/grammar.md
+++ b/docs/grammar.md
@@ -135,6 +135,8 @@ When using a lexer (basic or contextual), it is the grammar-author's responsibil
 3. Length of literal / pattern definition
 4. Name
 
+When using the `longest_match` lexer, if several terminals match with the same length, the one that is defined first is used.
+
 **Examples:**
 ```perl
 IF: "if"
diff --git a/docs/how_to_use.md b/docs/how_to_use.md
index 7ba5acf8..885f5e45 100644
--- a/docs/how_to_use.md
+++ b/docs/how_to_use.md
@@ -44,7 +44,7 @@ But if it doesn't, feel free to ask us on gitter, or even open an issue. Post a
 
 ### Regex collisions
 
-A likely source of bugs occurs when two regexes in a grammar can match the same input. If both terminals have the same priority, most lexers would arbitrarily choose the first one that matches, which isn't always the desired one. (a notable exception is the `dynamic_complete` lexer, which always tries all variations. But its users pay for that with performance.)
+A likely source of bugs occurs when two regexes in a grammar can match the same input. If both terminals have the same priority, most lexers would arbitrarily choose the first one that matches, which isn't always the desired one. (a notable exception is the `dynamic_complete` lexer, which always tries all variations. But its users pay for that with performance.) The `longest_match` lexer chooses the terminal with the longest match, similar to the Unix tool Lex.
 
 These collisions can be hard to notice, and their effects can be difficult to debug, as they are subtle and sometimes hard to reproduce.
 
diff --git a/lark/lark.py b/lark/lark.py
index 0bec71bb..68b6e011 100644
--- a/lark/lark.py
+++ b/lark/lark.py
@@ -21,7 +21,7 @@ from .tree import Tree
 from .common import LexerConf, ParserConf, _ParserArgType, _LexerArgType
 
-from .lexer import Lexer, BasicLexer, TerminalDef, LexerThread, Token
+from .lexer import Lexer, BasicLexer, LongestMatchLexer, TerminalDef, LexerThread, Token
 from .parse_tree_builder import ParseTreeBuilder
 from .parser_frontends import _validate_frontend_args, _get_lexer_callbacks, _deserialize_parsing_frontend, _construct_parsing_frontend
 from .grammar import Rule
 
@@ -119,6 +119,7 @@ class LarkOptions(Serialize):
             - "auto" (default): Choose for me based on the parser
             - "basic": Use a basic lexer
             - "contextual": Stronger lexer (only works with parser="lalr")
+            - "longest_match": Use the longest match found; when matches are equally long, the terminal defined first wins (only works with parser="lalr")
             - "dynamic": Flexible and powerful (only with parser="earley")
             - "dynamic_complete": Same as dynamic, but tries *every* variation of tokenizing possible.
     ambiguity
@@ -378,7 +379,7 @@ def __init__(self, grammar: 'Union[Grammar, str, IO[str]]', **options) -> None:
         if isinstance(lexer, type):
             assert issubclass(lexer, Lexer)  # XXX Is this really important? Maybe just ensure interface compliance
         else:
-            assert_config(lexer, ('basic', 'contextual', 'dynamic', 'dynamic_complete'))
+            assert_config(lexer, ('basic', 'longest_match', 'contextual', 'dynamic', 'dynamic_complete'))
             if self.options.postlex is not None and 'dynamic' in lexer:
                 raise ConfigurationError("Can't use postlex with a dynamic lexer. Use basic or contextual instead")
 
@@ -462,7 +463,11 @@ def _build_lexer(self, dont_ignore: bool=False) -> BasicLexer:
             from copy import copy
             lexer_conf = copy(lexer_conf)
             lexer_conf.ignore = ()
-        return BasicLexer(lexer_conf)
+        create_lexer = {
+            'basic': BasicLexer,
+            'longest_match': LongestMatchLexer,
+        }[self.options.lexer]
+        return create_lexer(lexer_conf)
 
     def _prepare_callbacks(self) -> None:
         self._callbacks = {}
diff --git a/lark/lexer.py b/lark/lexer.py
index 9061d600..769325ca 100644
--- a/lark/lexer.py
+++ b/lark/lexer.py
@@ -391,6 +391,32 @@ def match(self, text, pos):
                 return m.group(0), m.lastgroup
 
 
+class LongestMatchScanner:
+    def __init__(self, terminals, g_regex_flags, re_, use_bytes):
+        self.terminals = terminals
+        self.g_regex_flags = g_regex_flags
+        self.use_bytes = use_bytes
+        self.allowed_types = {t.name for t in self.terminals}
+
+        self.name_regex = {}  # one compiled regex per terminal, in definition order
+        for t in self.terminals:
+            pattern = t.pattern.to_regexp()
+            if self.use_bytes:
+                pattern = pattern.encode('latin-1')
+            self.name_regex[t.name] = re_.compile(pattern, self.g_regex_flags)
+
+    def match(self, text, pos):
+        best_len = -1
+        best = None
+        for name, regex in self.name_regex.items():
+            m = regex.match(text, pos)
+            if m and best_len < len(m.group(0)):  # strictly longer: ties keep the first-defined terminal
+                best_len = len(m.group(0))
+                best = (m.group(0), name)
+
+        return best
+
+
 def _regexp_has_newline(r: str):
     r"""Expressions that may indicate newlines in a regexp:
         - newlines (\n)
@@ -622,6 +648,25 @@ def next_token(self, lex_state: LexerState, parser_state: Any = None) -> Token:
         raise EOFError(self)
 
 
+class LongestMatchLexer(BasicLexer):
+    def __init__(self, conf: 'LexerConf', comparator=None) -> None:
+        super().__init__(conf, comparator)
+        self.terminals = list(conf.terminals)  # keep definition order for tie-breaking
+
+    def _build_scanner(self):
+        terminals, self.callback = _create_unless(self.terminals, self.g_regex_flags, self.re, self.use_bytes)
+        assert all(self.callback.values())
+
+        for type_, f in self.user_callbacks.items():
+            if type_ in self.callback:
+                # Already a callback there, probably UnlessCallback
+                self.callback[type_] = CallChain(self.callback[type_], f, lambda t: t.type == type_)
+            else:
+                self.callback[type_] = f
+
+        self._scanner = LongestMatchScanner(terminals, self.g_regex_flags, self.re, self.use_bytes)
+
+
 class ContextualLexer(Lexer):
     lexers: Dict[int, AbstractBasicLexer]
     root_lexer: AbstractBasicLexer
diff --git a/lark/parser_frontends.py b/lark/parser_frontends.py
index 186058a6..ef3698db 100644
--- a/lark/parser_frontends.py
+++ b/lark/parser_frontends.py
@@ -2,7 +2,7 @@
 from .exceptions import ConfigurationError, GrammarError, assert_config
 from .utils import get_regexp_width, Serialize
-from .lexer import LexerThread, BasicLexer, ContextualLexer, Lexer
+from .lexer import LexerThread, BasicLexer, LongestMatchLexer, ContextualLexer, Lexer
 from .parsers import earley, xearley, cyk
 from .parsers.lalr_parser import LALR_Parser
 from .tree import Tree
@@ -74,6 +74,7 @@ def __init__(self, lexer_conf: LexerConf, parser_conf: ParserConf, options, pars
         elif isinstance(lexer_type, str):
             create_lexer = {
                 'basic': create_basic_lexer,
+                'longest_match': create_longest_match_lexer,
                 'contextual': create_contextual_lexer,
             }[lexer_type]
             self.lexer = create_lexer(lexer_conf, self.parser, lexer_conf.postlex, options)
@@ -117,7 +118,7 @@ def _validate_frontend_args(parser, lexer) -> None:
     assert_config(parser, ('lalr', 'earley', 'cyk'))
     if not isinstance(lexer, type):     # not custom lexer?
         expected = {
-            'lalr': ('basic', 'contextual'),
+            'lalr': ('basic', 'longest_match', 'contextual'),
            'earley': ('basic', 'dynamic', 'dynamic_complete'),
            'cyk': ('basic', ),
         }[parser]
@@ -147,6 +148,10 @@ def create_basic_lexer(lexer_conf, parser, postlex, options) -> BasicLexer:
     cls = (options and options._plugins.get('BasicLexer')) or BasicLexer
     return cls(lexer_conf)
 
+def create_longest_match_lexer(lexer_conf, parser, postlex, options) -> BasicLexer:
+    cls = (options and options._plugins.get('LongestMatchLexer')) or LongestMatchLexer
+    return cls(lexer_conf)
+
 def create_contextual_lexer(lexer_conf: LexerConf, parser, postlex, options) -> ContextualLexer:
     cls = (options and options._plugins.get('ContextualLexer')) or ContextualLexer
     parse_table: ParseTableBase[int] = parser._parse_table
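
Usage sketch (not part of the diff): with the patch applied, the new lexer is selected through the existing `lexer` argument, as documented in the `LarkOptions` docstring above. The grammar, terminal names, and input below are illustrative only.

```python
from lark import Lark

# EQ and EQEQ overlap on the input "==": the longest_match lexer tries every
# terminal at the current position and keeps the longest match, so "==" is
# tokenized as a single EQEQ token rather than two EQ tokens.
grammar = r"""
start: (EQ | EQEQ)+
EQ: "="
EQEQ: "=="
%ignore " "
"""

parser = Lark(grammar, parser='lalr', lexer='longest_match')
print(parser.parse("== = ==").pretty())
```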