Longest match lexer option to mimic the Unix tool Lex #1490

Open · wants to merge 2 commits into master
docs/grammar.md (2 additions, 0 deletions)
@@ -135,6 +135,8 @@ When using a lexer (basic or contextual), it is the grammar-author's responsibil
3. Length of literal / pattern definition
4. Name

When using the `longest_match` lexer, if several terminals produce matches of the same length, the terminal that is defined first in the grammar is used.
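For illustration (editorial, not part of this diff), here is a minimal sketch of that rule; the grammar, terminal names, and input are made up, and it assumes this PR's `lexer='longest_match'` option. Both terminals are regexes, so equal-length matches are possible: `"cafe"` matches `HEX` and `NAME` at the same length and the first-defined terminal wins, while `"coffee"` gets its longest match from `NAME`.

```python
from lark import Lark

demo = Lark(r"""
    start: (HEX | NAME)+

    HEX: /[0-9a-f]+/
    NAME: /[a-z]+/

    %import common.WS
    %ignore WS
""", parser='lalr', lexer='longest_match')

# Tokenize without parsing, to see which terminal each word resolves to.
print([(tok.type, tok.value) for tok in demo.lex("cafe coffee")])
# expected (roughly): [('HEX', 'cafe'), ('NAME', 'coffee')]
```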

**Examples:**
```perl
IF: "if"
docs/how_to_use.md (1 addition, 1 deletion)
@@ -44,7 +44,7 @@ But if it doesn't, feel free to ask us on gitter, or even open an issue. Post a

### Regex collisions

A likely source of bugs occurs when two regexes in a grammar can match the same input. If both terminals have the same priority, most lexers would arbitrarily choose the first one that matches, which isn't always the desired one. (a notable exception is the `dynamic_complete` lexer, which always tries all variations. But its users pay for that with performance.)
A likely source of bugs occurs when two regexes in a grammar can match the same input. If both terminals have the same priority, most lexers would arbitrarily choose the first one that matches, which isn't always the desired one. (a notable exception is the `dynamic_complete` lexer, which always tries all variations. But its users pay for that with performance.) The `longest_match` lexer chooses the terminal with the longest match, similar to the Unix tool "Lex".
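As an illustration (editorial, not part of this diff, and assuming this PR's option), a small sketch of opting in: `INT` and `FLOAT` can both match the start of `3.14`, and with `lexer='longest_match'` the longer `FLOAT` match is emitted rather than whichever terminal happens to be tried first.

```python
from lark import Lark

calc = Lark(r"""
    start: (INT | FLOAT)+

    INT: /[0-9]+/
    FLOAT: /[0-9]+\.[0-9]+/

    %import common.WS
    %ignore WS
""", parser='lalr', lexer='longest_match')

tree = calc.parse("3 3.14")
print(tree.children)
# roughly: [Token('INT', '3'), Token('FLOAT', '3.14')]
```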

These collisions can be hard to notice, and their effects can be difficult to debug, as they are subtle and sometimes hard to reproduce.

lark/lark.py (8 additions, 3 deletions)
@@ -21,7 +21,7 @@
from .tree import Tree
from .common import LexerConf, ParserConf, _ParserArgType, _LexerArgType

from .lexer import Lexer, BasicLexer, TerminalDef, LexerThread, Token
from .lexer import Lexer, BasicLexer, LongestMatchLexer, TerminalDef, LexerThread, Token
from .parse_tree_builder import ParseTreeBuilder
from .parser_frontends import _validate_frontend_args, _get_lexer_callbacks, _deserialize_parsing_frontend, _construct_parsing_frontend
from .grammar import Rule
@@ -119,6 +119,7 @@ class LarkOptions(Serialize):
- "auto" (default): Choose for me based on the parser
- "basic": Use a basic lexer
- "contextual": Stronger lexer (only works with parser="lalr")
- "longest_match": Uses longest match found, where the precedence of the terminals follow the order they are defined (only works with parser='lalr')
- "dynamic": Flexible and powerful (only with parser="earley")
- "dynamic_complete": Same as dynamic, but tries *every* variation of tokenizing possible.
ambiguity
@@ -378,7 +379,7 @@ def __init__(self, grammar: 'Union[Grammar, str, IO[str]]', **options) -> None:
        if isinstance(lexer, type):
            assert issubclass(lexer, Lexer)  # XXX Is this really important? Maybe just ensure interface compliance
        else:
            assert_config(lexer, ('basic', 'contextual', 'dynamic', 'dynamic_complete'))
            assert_config(lexer, ('basic', 'longest_match', 'contextual', 'dynamic', 'dynamic_complete'))
            if self.options.postlex is not None and 'dynamic' in lexer:
                raise ConfigurationError("Can't use postlex with a dynamic lexer. Use basic or contextual instead")

@@ -462,7 +463,11 @@ def _build_lexer(self, dont_ignore: bool=False) -> BasicLexer:
            from copy import copy
            lexer_conf = copy(lexer_conf)
            lexer_conf.ignore = ()
        return BasicLexer(lexer_conf)
        createLexer = {
            'basic': BasicLexer,
            'longest_match': LongestMatchLexer
        }[self.options.lexer]
        return createLexer(lexer_conf)
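To connect this hunk to the public API, a sketch that assumes this PR: `Lark.lex()` builds its standalone lexer through `_build_lexer` above, so `lexer='longest_match'` should now work for plain tokenizing as well, and passing `dont_ignore=True` exercises the copied, ignore-cleared config a few lines up. The grammar here is made up for illustration.

```python
from lark import Lark

p = Lark(r"""
    start: WORD+

    WORD: /\w+/

    %import common.WS
    %ignore WS
""", parser='lalr', lexer='longest_match')

# With dont_ignore=True, the %ignore'd whitespace terminal is emitted too.
print([tok.type for tok in p.lex("a b", dont_ignore=True)])
# expected (roughly): ['WORD', 'WS', 'WORD']
```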

    def _prepare_callbacks(self) -> None:
        self._callbacks = {}
lark/lexer.py (45 additions, 0 deletions)
@@ -391,6 +391,32 @@ def match(self, text, pos):
        return m.group(0), m.lastgroup


class LongestMatchScanner:
    def __init__(self, terminals, g_regex_flags, re_, use_bytes):
        self.terminals = terminals
        self.g_regex_flags = g_regex_flags
        self.use_bytes = use_bytes
        self.allowed_types = {t.name for t in self.terminals}

        # One compiled regex per terminal, kept in definition order.
        self.name_regex = {}
        for t in self.terminals:
            pattern = t.pattern.to_regexp()
            if self.use_bytes:
                pattern = pattern.encode('latin-1')
            self.name_regex[t.name] = re_.compile(pattern, self.g_regex_flags)

    def match(self, text, pos):
        # Try every terminal at `pos` and keep the longest match; on equal
        # lengths the first-defined terminal wins, since a later match only
        # replaces the current best when it is strictly longer.
        longestMatchLen = -1
        longestMatch = None
        for name, regex in self.name_regex.items():
            m = regex.match(text, pos)
            if m and longestMatchLen < len(m.group()):
                longestMatchLen = len(m.group())
                longestMatch = (m.group(0), name)

        return longestMatch
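For reference (editorial, not part of the diff), the same longest-match rule written with plain `re` and made-up terminal names; it mirrors the loop above, including the strict comparison that lets the first-defined terminal win ties.

```python
import re

# Patterns are tried in definition order, like name_regex above.
patterns = [("ASSIGN", re.compile(r"=")), ("EQ", re.compile(r"=="))]

def longest_match(text, pos):
    best, best_len = None, -1
    for name, regex in patterns:
        m = regex.match(text, pos)
        if m and len(m.group(0)) > best_len:  # strict '>' keeps the first of equal-length matches
            best, best_len = (m.group(0), name), len(m.group(0))
    return best

print(longest_match("==", 0))  # ('==', 'EQ'): EQ wins despite being defined second, its match is longer
print(longest_match("=", 0))   # ('=', 'ASSIGN')
```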


def _regexp_has_newline(r: str):
r"""Expressions that may indicate newlines in a regexp:
- newlines (\n)
@@ -622,6 +648,25 @@ def next_token(self, lex_state: LexerState, parser_state: Any = None) -> Token:
        raise EOFError(self)


class LongestMatchLexer(BasicLexer):
    def __init__(self, conf: 'LexerConf', comparator=None) -> None:
        super().__init__(conf, comparator)
        # Restore grammar definition order (BasicLexer sorts terminals by priority
        # and length), so that equal-length matches go to the first-defined terminal.
        self.terminals = list(conf.terminals)

    def _build_scanner(self):
        terminals, self.callback = _create_unless(self.terminals, self.g_regex_flags, self.re, self.use_bytes)
        assert all(self.callback.values())

        for type_, f in self.user_callbacks.items():
            if type_ in self.callback:
                # Already a callback there, probably UnlessCallback
                self.callback[type_] = CallChain(self.callback[type_], f, lambda t: t.type == type_)
            else:
                self.callback[type_] = f

        self._scanner = LongestMatchScanner(terminals, self.g_regex_flags, self.re, self.use_bytes)


class ContextualLexer(Lexer):
    lexers: Dict[int, AbstractBasicLexer]
    root_lexer: AbstractBasicLexer
lark/parser_frontends.py (7 additions, 2 deletions)
@@ -2,7 +2,7 @@

from .exceptions import ConfigurationError, GrammarError, assert_config
from .utils import get_regexp_width, Serialize
from .lexer import LexerThread, BasicLexer, ContextualLexer, Lexer
from .lexer import LexerThread, BasicLexer, LongestMatchLexer, ContextualLexer, Lexer
from .parsers import earley, xearley, cyk
from .parsers.lalr_parser import LALR_Parser
from .tree import Tree
@@ -74,6 +74,7 @@ def __init__(self, lexer_conf: LexerConf, parser_conf: ParserConf, options, pars
        elif isinstance(lexer_type, str):
            create_lexer = {
                'basic': create_basic_lexer,
                'longest_match': create_longest_match_lexer,
                'contextual': create_contextual_lexer,
            }[lexer_type]
            self.lexer = create_lexer(lexer_conf, self.parser, lexer_conf.postlex, options)
@@ -117,7 +118,7 @@ def _validate_frontend_args(parser, lexer) -> None:
    assert_config(parser, ('lalr', 'earley', 'cyk'))
    if not isinstance(lexer, type):  # not custom lexer?
        expected = {
            'lalr': ('basic', 'contextual'),
            'lalr': ('basic', 'longest_match', 'contextual'),
            'earley': ('basic', 'dynamic', 'dynamic_complete'),
            'cyk': ('basic', ),
        }[parser]
@@ -147,6 +148,10 @@ def create_basic_lexer(lexer_conf, parser, postlex, options) -> BasicLexer:
    cls = (options and options._plugins.get('BasicLexer')) or BasicLexer
    return cls(lexer_conf)

def create_longest_match_lexer(lexer_conf, parser, postlex, options) -> BasicLexer:
    cls = (options and options._plugins.get('LongestMatchLexer')) or LongestMatchLexer
    return cls(lexer_conf)

def create_contextual_lexer(lexer_conf: LexerConf, parser, postlex, options) -> ContextualLexer:
    cls = (options and options._plugins.get('ContextualLexer')) or ContextualLexer
    parse_table: ParseTableBase[int] = parser._parse_table