add guidance on handling comments

lark-parser · Jan 2, 2025 · eb74454 · eb74454
1 parent 2f7c9a4
commit eb74454
Show file tree

Hide file tree

Showing 2 changed files with 28 additions and 10 deletions.
diff --git a/docs/recipes.md b/docs/recipes.md
@@ -79,6 +79,7 @@ Prints out:
 
 *Note: We don't have to return a token, because comments are ignored*
 
+
 ## CollapseAmbiguities
 
 Parsing ambiguous texts with earley and `ambiguity='explicit'` produces a single tree with `_ambig` nodes to mark where the ambiguity occurred.
@@ -193,3 +194,13 @@ def parse_with_progress(parser: Lark, text: str, start=None):
 ```
 
 Keep in mind that this implementation relies on the `InteractiveParser` and, therefore, only works with the `LALR(1)` parser, and not `Earley`.
+
+
+## Parsing a Language with Significant Whitespace
+
+If your grammar needs to support significant whitespace, you will need to use the `Indenter` class.
+Take a look at the [indented tree example][indent] as well as the [Python grammar][python] for
+inspiration.
+
+[indent]: https://github.com/lark-parser/lark/blob/master/examples/indented_tree.py
+[python]: https://github.com/lark-parser/lark/blob/master/lark/grammars/python.lark
diff --git a/examples/indented_tree.py b/examples/indented_tree.py
@@ -3,28 +3,34 @@
 ===================
 
 A demonstration of parsing indentation (a “whitespace-significant” language)
-and the usage of the Indenter class.
+and the usage of the `Indenter` class.
 
 Since indentation is context-sensitive, a postlex stage is introduced to
-manufacture INDENT/DEDENT tokens.
+manufacture `INDENT`/`DEDENT` tokens.
 
-It is crucial for the indenter that the NL_type matches
-the spaces (and tabs) after the newline.
+It is crucial for the indenter that the `NL_type` matches the spaces (and
+tabs) after the newline.
+
+If your whitespace-significant grammar supports comments, then `NL_type`
+must match those comments too. Otherwise, comments that appear in the middle
+of a line will [confuse Lark][1].
+
+[1]: https://github.com/lark-parser/lark/issues/863
 """
 from lark import Lark
 from lark.indenter import Indenter
 
 tree_grammar = r"""
-    ?start: _NL* tree
-
-    tree: NAME _NL [_INDENT tree+ _DEDENT]
-
     %import common.CNAME -> NAME
     %import common.WS_INLINE
-    %declare _INDENT _DEDENT
+    %import common.SH_COMMENT
     %ignore WS_INLINE
+    %ignore SH_COMMENT
+    %declare _INDENT _DEDENT
 
-    _NL: /(\r?\n[\t ]*)+/
+    ?start: _NL* tree
+    tree: NAME _NL [_INDENT tree+ _DEDENT]
+    _NL: (/\r?\n[\t ]*/ | SH_COMMENT)+
 """
 
 class TreeIndenter(Indenter):
@@ -39,6 +45,7 @@ class TreeIndenter(Indenter):
 
 test_tree = """
 a
+    # check this comment out
     b
     c
         d