diff --git a/lark/parsers/earley.py b/lark/parsers/earley.py index d173129a..8ac7ecbe 100644 --- a/lark/parsers/earley.py +++ b/lark/parsers/earley.py @@ -75,7 +75,7 @@ def __init__(self, lexer_conf: 'LexerConf', parser_conf: 'ParserConf', term_matc self.term_matcher = term_matcher - def predict_and_complete(self, i, to_scan, columns, transitives): + def predict_and_complete(self, i, to_scan, columns, transitives, node_cache): """The core Earley Predictor and Completer. At each stage of the input, we handling any completed items (things @@ -84,7 +84,6 @@ def predict_and_complete(self, i, to_scan, columns, transitives): non-terminals are recursively processed until we reach a set of, which can be added to the scan list for the next scanner cycle.""" # Held Completions (H in E.Scotts paper). - node_cache = {} held_completions = {} column = columns[i] @@ -203,7 +202,7 @@ def scan(i, token, to_scan): for item in self.Set(to_scan): if match(item.expect, token): new_item = item.advance() - label = (new_item.s, new_item.start, i) + label = (new_item.s, new_item.start, i + 1) # 'terminals' may not contain token.type when using %declare # Additionally, token is not always a Token # For example, it can be a Tree when using TreeMatcher @@ -227,7 +226,7 @@ def scan(i, token, to_scan): expect = {i.expect.name for i in to_scan} raise UnexpectedToken(token, expect, considered_rules=set(to_scan), state=frozenset(i.s for i in to_scan)) - return next_to_scan + return next_to_scan, node_cache # Define parser functions @@ -245,16 +244,17 @@ def scan(i, token, to_scan): # step. expects = {i.expect for i in to_scan} i = 0 + node_cache = {} for token in lexer.lex(expects): - self.predict_and_complete(i, to_scan, columns, transitives) + self.predict_and_complete(i, to_scan, columns, transitives, node_cache) - to_scan = scan(i, token, to_scan) + to_scan, node_cache = scan(i, token, to_scan) i += 1 expects.clear() expects |= {i.expect for i in to_scan} - self.predict_and_complete(i, to_scan, columns, transitives) + self.predict_and_complete(i, to_scan, columns, transitives, node_cache) ## Column is now the final column in the parse. assert i == len(columns)-1 @@ -286,6 +286,9 @@ def parse(self, lexer, start): if not solutions: expected_terminals = [t.expect.name for t in to_scan] raise UnexpectedEOF(expected_terminals, state=frozenset(i.s for i in to_scan)) + if len(solutions) > 1: + raise RuntimeError('Earley should not generate multiple start symbol items! Please report this bug.') + solution ,= solutions if self.debug: from .earley_forest import ForestToPyDotVisitor @@ -294,8 +297,7 @@ def parse(self, lexer, start): except ImportError: logger.warning("Cannot find dependency 'pydot', will not generate sppf debug image") else: - for i, s in enumerate(solutions): - debug_walker.visit(s, f"sppf{i}.png") + debug_walker.visit(solution, "sppf.png") if self.Tree is not None: @@ -304,14 +306,7 @@ def parse(self, lexer, start): # to prevent a tree construction bug. See issue #1283 use_cache = not self.resolve_ambiguity transformer = ForestToParseTree(self.Tree, self.callbacks, self.forest_sum_visitor and self.forest_sum_visitor(), self.resolve_ambiguity, use_cache) - solutions = [transformer.transform(s) for s in solutions] - - if len(solutions) > 1 and not self.resolve_ambiguity: - t: Tree = self.Tree('_ambig', solutions) - t.expand_kids_by_data('_ambig') # solutions may themselves be _ambig nodes - return t - return solutions[0] + return transformer.transform(solution) # return the root of the SPPF - # TODO return a list of solutions, or join them together somehow - return solutions[0] + return solution diff --git a/lark/parsers/xearley.py b/lark/parsers/xearley.py index a0f43acb..13d592dd 100644 --- a/lark/parsers/xearley.py +++ b/lark/parsers/xearley.py @@ -127,7 +127,7 @@ def scan(i, to_scan): considered_rules=considered_rules ) - return next_to_scan + return next_to_scan, node_cache delayed_matches = defaultdict(list) @@ -146,10 +146,11 @@ def scan(i, to_scan): # processed down to terminals/empty nodes to be added to the scanner for the next # step. i = 0 + node_cache = {} for token in stream: - self.predict_and_complete(i, to_scan, columns, transitives) + self.predict_and_complete(i, to_scan, columns, transitives, node_cache) - to_scan = scan(i, to_scan) + to_scan, node_cache = scan(i, to_scan) if token == '\n': text_line += 1 @@ -158,7 +159,7 @@ def scan(i, to_scan): text_column += 1 i += 1 - self.predict_and_complete(i, to_scan, columns, transitives) + self.predict_and_complete(i, to_scan, columns, transitives, node_cache) ## Column is now the final column in the parse. assert i == len(columns)-1 diff --git a/lark/utils.py b/lark/utils.py index 3767a66d..2d33f693 100644 --- a/lark/utils.py +++ b/lark/utils.py @@ -184,7 +184,7 @@ def is_id_start(s: str) -> bool: return _test_unicode_category(s, _ID_START) -def dedup_list(l: Sequence[T]) -> List[T]: +def dedup_list(l: Iterable[T]) -> List[T]: """Given a list (l) will removing duplicates from the list, preserving the original order of the list. Assumes that the list entries are hashable.""" diff --git a/tests/test_parser.py b/tests/test_parser.py index 59e9a718..98290710 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -836,14 +836,14 @@ def test_multiple_start_solutions(self): tree = l.parse('x') expected = Tree('_ambig', [ + Tree('start', [Tree('a', ['x'])]), Tree('start', ['x']), - Tree('start', [Tree('a', ['x'])])] - ) + ]) self.assertEqual(tree, expected) l = Lark(grammar, ambiguity='resolve', lexer=LEXER) tree = l.parse('x') - assert tree == Tree('start', ['x']) + assert tree == Tree('start', [Tree('a', ['x'])]) def test_cycle(self): @@ -872,10 +872,7 @@ def test_cycle2(self): tree = l.parse("ab") expected = ( Tree('start', [ - Tree('_ambig', [ - Tree('v', [Tree('v', [])]), - Tree('v', [Tree('v', [Tree('v', [])])]) - ]) + Tree('v', [Tree('v', [])]), ]) ) self.assertEqual(tree, expected) @@ -990,7 +987,7 @@ def test_consistent_derivation_order1(self): ''', lexer=LEXER) tree = parser.parse('..') - n = Tree('a', [Tree('b', [])]) + n = Tree('a', []) assert tree == Tree('start', [n, n]) _NAME = "TestFullEarley" + LEXER.capitalize()