diff --git a/Cargo.toml b/Cargo.toml
index 5cfcac2..8ca8bee 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -13,10 +13,11 @@ readme = "README.md"
 description = "Grammar framework."
 
 [dependencies]
+cyclotron = "0.0.3"
+elsa = "1.3.2"
 indexmap = "1"
 indexing = "0.3.2"
 proc-macro2 = "0.4.30"
-elsa = "1.3.2"
 
 [lib]
 doctest = false
diff --git a/src/bruteforce.rs b/src/bruteforce.rs
new file mode 100644
index 0000000..014ffbb
--- /dev/null
+++ b/src/bruteforce.rs
@@ -0,0 +1,647 @@
+use crate::context::{Context, IRule, IStr};
+use crate::forest::{
+    dynamic::{CxAndGrammar, OwnedHandle},
+    Node,
+};
+use crate::input::{Input, InputMatch, Range};
+use crate::parser::{ParseResult, Parser};
+use crate::rule::{Rule, SepKind};
+use cyclotron::bruteforce;
+use std::cell::RefCell;
+use std::cmp::Reverse;
+use std::collections::{BTreeSet, BinaryHeap, HashMap, HashSet, VecDeque};
+use std::fmt;
+use std::hash::Hash;
+use std::iter;
+use std::ops::{RangeFrom, RangeTo};
+use std::rc::Rc;
+
+#[derive(Clone, Default, Debug)]
+struct CachedParse {
+    lengths: Rc<BTreeSet<usize>>,
+    approx_forest: Rc<ApproxForest>,
+}
+
+// HACK(eddyb) hide `approx_forest` from the cyclotron, no need to
+// ensure a fixpoint for it (and `HashMap` could make that tricky).
+impl PartialEq for CachedParse {
+    fn eq(&self, other: &Self) -> bool {
+        self.lengths == other.lengths
+    }
+}
+
+impl Eq for CachedParse {}
+
+/// An approximation of a parse forest, erasing the end of ranges
+/// wherever convenient, to minimize allocations (especially for lists,
+/// where this is linear instead of quadratic).
+///
+/// NB: Requires a second validation pass to produce a proper SPPF.
+#[derive(Default, Debug)]
+struct ApproxForest {
+    possibilities: HashMap<(IRule, RangeFrom<usize>), (Option<RangeTo<usize>>, SmallSet<usize>)>,
+}
+
+#[derive(Debug)]
+enum SmallSet<T> {
+    One(T),
+    Many(BTreeSet<T>),
+}
+
+impl<T: Copy + Ord> SmallSet<T> {
+    fn insert(&mut self, value: T) {
+        match self {
+            SmallSet::One(prev) => {
+                if value != *prev {
+                    *self = SmallSet::Many([*prev, value].iter().cloned().collect());
+                }
+            }
+            SmallSet::Many(set) => {
+                set.insert(value);
+            }
+        }
+    }
+
+    fn iter<'a>(&'a self) -> impl Iterator<Item = &'a T> {
+        match self {
+            SmallSet::One(x) => Some(x).into_iter().chain(None.into_iter().flatten()),
+            SmallSet::Many(set) => None
+                .into_iter()
+                .chain(Some(set.iter()).into_iter().flatten()),
+        }
+    }
+}
+
+impl ApproxForest {
+    fn add(&mut self, rule: IRule, start: RangeFrom<usize>, end: Option<RangeTo<usize>>, x: usize) {
+        use std::collections::hash_map::Entry;
+
+        match self.possibilities.entry((rule, start)) {
+            Entry::Vacant(entry) => {
+                entry.insert((end, SmallSet::One(x)));
+            }
+            Entry::Occupied(entry) => {
+                let (old_end, set) = entry.into_mut();
+                if end != *old_end {
+                    *old_end = None;
+                }
+                set.insert(x);
+            }
+        }
+    }
+}
+
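+// Returns every length that `rule` can match at the start of `range` (a
+// set, since an ambiguous grammar can match several), while recording each
+// local split/choice into `approx_forest` for the later `build_sppf` pass.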
+fn parse_inner<'i, Pat: Clone + Ord + Hash + fmt::Debug, I: Input>(
+    // FIXME(eddyb) group some of these in a `struct`.
+    cx: &Context<Pat>,
+    grammar: &crate::Grammar,
+    parser: &RefCell<Parser<'i, CxAndGrammar<'_, Pat>, I, Pat>>,
+    parse_cached: &mut dyn FnMut((IRule, Range<'i>)) -> CachedParse,
+    approx_forest: &mut ApproxForest,
+    rule: IRule,
+    range: Range<'i>,
+) -> BTreeSet<usize>
+where
+    I::Slice: InputMatch<Pat>,
+{
+    match cx[rule] {
+        Rule::Empty => iter::once(0).collect(),
+        // FIXME(eddyb) find a way to avoid cloning the pattern.
+        Rule::Eat(ref pat) => parser
+            .borrow_mut()
+            .with_result_and_remaining(Range(range.frontiers().0), range)
+            .input_consume_left(pat.clone())
+            .map(|parser| parser.result().len())
+            .into_iter()
+            .collect(),
+        // FIXME(eddyb) avoid cloning the set from behind a `Rc`.
+        // May require something like `Cow` but for `T | Rc<T>`?
+        Rule::Call(r) => (*parse_cached((grammar.rules[&r].rule, range)).lengths).clone(),
+        Rule::Concat([left, right]) => {
+            let mut lengths = BTreeSet::new();
+
+            for left_len in parse_inner(
+                cx,
+                grammar,
+                parser,
+                parse_cached,
+                approx_forest,
+                left,
+                range,
+            ) {
+                let (_, after_left, _) = range.split_at(left_len);
+                for right_len in parse_inner(
+                    cx,
+                    grammar,
+                    parser,
+                    parse_cached,
+                    approx_forest,
+                    right,
+                    Range(after_left),
+                ) {
+                    let len = left_len + right_len;
+
+                    approx_forest.add(rule, range.start().., Some(..range.start() + len), left_len);
+
+                    lengths.insert(len);
+                }
+            }
+
+            lengths
+        }
+        Rule::Or(ref cases) => {
+            let mut lengths = BTreeSet::new();
+
+            for (i, &case) in cases.iter().enumerate() {
+                for len in parse_inner(
+                    cx,
+                    grammar,
+                    parser,
+                    parse_cached,
+                    approx_forest,
+                    case,
+                    range,
+                ) {
+                    approx_forest.add(rule, range.start().., Some(..range.start() + len), i);
+
+                    lengths.insert(len);
+                }
+            }
+
+            lengths
+        }
+        Rule::Opt(rule) => iter::once(0)
+            .chain(parse_inner(
+                cx,
+                grammar,
+                parser,
+                parse_cached,
+                approx_forest,
+                rule,
+                range,
+            ))
+            .collect(),
+        Rule::RepeatMany(..) => parse_inner(
+            cx,
+            grammar,
+            parser,
+            parse_cached,
+            approx_forest,
+            rule.expand_repeats(cx),
+            range,
+        ),
+        Rule::RepeatMore(elem, sep) => {
+            let concat_elem_tail_rule = rule.expand_repeats(cx);
+            // FIXME(eddyb) dedup this with `IRule::expand_repeats`.
+            let sep = sep.map(|(sep, kind)| {
+                (
+                    sep,
+                    kind,
+                    cx.intern(Rule::Concat([
+                        sep,
+                        match kind {
+                            SepKind::Simple => rule,
+                            SepKind::Trailing => {
+                                cx.intern(Rule::RepeatMany(elem, Some((sep, SepKind::Trailing))))
+                            }
+                        },
+                    ])),
+                )
+            });
+
+            let mut lengths = BTreeSet::new();
+
+            // To avoid using stack space linear in the list length,
+            // and time/heap space quadratic in the list length,
+            // this uses a min-heap (`BinaryHeap` + `cmp::Reverse`)
+            // as a queue, in a monotonic loop, completing a previous
+            // list with one more element (if possible) at every step.
+            let mut starts = BinaryHeap::new();
+            starts.push(Reverse(0));
+            while let Some(Reverse(start)) = starts.pop() {
+                let range = Range(range.split_at(start).1);
+                for elem_len in parse_inner(
+                    cx,
+                    grammar,
+                    parser,
+                    parse_cached,
+                    approx_forest,
+                    elem,
+                    range,
+                ) {
+                    approx_forest.add(concat_elem_tail_rule, range.start().., None, elem_len);
+
+                    let after_elem = Range(range.split_at(elem_len).1);
+                    let end = start + elem_len;
+                    if !lengths.insert(end) {
+                        // Seen this list before, avoid re-enqueuing it.
+                        continue;
+                    }
+
+                    if let Some((sep, kind, concat_sep_tail_rule)) = sep {
+                        for sep_len in parse_inner(
+                            cx,
+                            grammar,
+                            parser,
+                            parse_cached,
+                            approx_forest,
+                            sep,
+                            after_elem,
+                        ) {
+                            starts.push(Reverse(end + sep_len));
+                            approx_forest.add(
+                                concat_sep_tail_rule,
+                                after_elem.start()..,
+                                None,
+                                sep_len,
+                            );
+
+                            match kind {
+                                SepKind::Simple => {}
+                                SepKind::Trailing => {
+                                    // The list can also end after the
+                                    // separator, not only before.
+                                    lengths.insert(end + sep_len);
+                                }
+                            }
+                        }
+                    } else {
+                        starts.push(Reverse(end));
+                    }
+                }
+            }
+
+            lengths
+        }
+    }
+}
+
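+// Validation pass over a successful parse: `ApproxForest` erases most range
+// ends, so each possibility recorded by `parse_inner` has to be re-checked
+// against the known end of its enclosing range before it can be committed
+// into an exact SPPF.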
+fn build_sppf<'a, 'i, Pat: Clone + Ord + Hash + fmt::Debug, I: Input>(
+    cx: &Context<Pat>,
+    grammar: &crate::Grammar,
+    parser: &RefCell<Parser<'i, CxAndGrammar<'a, Pat>, I, Pat>>,
+    mut parse_cached: impl FnMut((IRule, Range<'i>)) -> CachedParse,
+    root: Node<'i, CxAndGrammar<'a, Pat>>,
+) where
+    I::Slice: InputMatch<Pat>,
+{
+    let full_input = parser.borrow().remaining();
+
+    // Unpack `rule`, knowing it matched `range`, into a simpler
+    // rule, if possible. Only returns `None` for leaves.
+    let trivial_unpack_valid = |mut rule, range: Range<'_>| {
+        loop {
+            match cx[rule] {
+                Rule::Empty | Rule::Eat(_) => return None,
+
+                Rule::Opt(child) => if range.is_empty() {
+                    return None;
+                } else {
+                    rule = child;
+                }
+
+                Rule::RepeatMany(..) |
+                // FIXME(eddyb) handling `RepeatMore` is a waste of time.
+                // Maybe remove once repeats aren't aliases anymore?
+                Rule::RepeatMore(..) => rule = rule.expand_repeats(cx),
+
+                _ => return Some(rule),
+            }
+        }
+    };
+
+    // Build the SPPF starting with "roots" known to be valid, and descending
+    // into their children. Only descendants of ambiguous nodes need validation,
+    // everything else can be assumed valid and inserted into the SPPF directly.
+    let mut roots = VecDeque::new();
+    let mut seen_roots = HashSet::new();
+    let mut valid_cache = HashMap::new();
+    let mut stack = vec![];
+
+    roots.push_back((<Rc<ApproxForest>>::default(), root.kind, root.range));
+    seen_roots.insert((root.kind, root.range));
+
+    'roots: while let Some((approx_forest, mut rule, mut range)) = roots.pop_front() {
+        let mut add_root = |approx_forest: &Rc<_>, rule, range| {
+            if let Some(rule) = trivial_unpack_valid(rule, range) {
+                if seen_roots.insert((rule, range)) {
+                    roots.push_back((Rc::clone(approx_forest), rule, range));
+                }
+            }
+        };
+
+        // Peel off as many unambiguous layers of rules as possible.
+        loop {
+            let old = valid_cache.insert((rule, range), true);
+            if let Some(old) = old {
+                assert!(old);
+                continue 'roots;
+            }
+
+            let possibilities = || {
+                let (end, set) = &approx_forest.possibilities[&(rule, range.start()..)];
+                if let Some(end) = end {
+                    assert_eq!(*end, ..range.end());
+                }
+                let mut possibilities = set.iter().cloned();
+                (possibilities.next().unwrap(), possibilities.next())
+            };
+
+            match cx[rule] {
+                // Handled by `trivial_unpack_valid`.
+                Rule::Empty
+                | Rule::Eat(_)
+                | Rule::Opt(_)
+                | Rule::RepeatMany(..)
+                | Rule::RepeatMore(..) => unreachable!(),
+
+                Rule::Call(r) => {
+                    let rule = grammar.rules[&r].rule;
+                    let result = parse_cached((rule, Range(full_input.split_at(range.start()).1)));
+                    assert!(result.lengths.contains(&range.len()));
+                    add_root(&result.approx_forest, rule, range);
+                    continue 'roots;
+                }
+
+                Rule::Concat([left, right]) => {
+                    let (split, second_split) = possibilities();
+                    assert!(split <= range.len());
+
+                    // Only ambiguous if the second possibility also fits in this range.
+                    if second_split.filter(|&x| x <= range.len()).is_some() {
+                        break;
+                    }
+
+                    let (left_range, right_range, _) = range.split_at(split);
+
+                    add_root(&approx_forest, left, Range(left_range));
+
+                    // HACK(eddyb) need a more ergonomic SPPF builder/parser API.
+                    parser
+                        .borrow_mut()
+                        .with_result_and_remaining(
+                            Range(right_range),
+                            Range(full_input.split_at(range.end()).1),
+                        )
+                        .forest_add_split(
+                            rule,
+                            Node {
+                                kind: left,
+                                range: Range(left_range),
+                            },
+                        );
+
+                    rule = right;
+                    range = Range(right_range);
+                }
+
+                Rule::Or(ref cases) => {
+                    let (choice, second_choice) = possibilities();
+                    if second_choice.is_some() {
+                        break;
+                    }
+
+                    // HACK(eddyb) need a more ergonomic SPPF builder/parser API.
+                    parser
+                        .borrow_mut()
+                        .with_result_and_remaining(range, Range(full_input.split_at(range.end()).1))
+                        .forest_add_choice(rule, choice);
+
+                    rule = cases[choice];
+                }
+            }
+            rule = match trivial_unpack_valid(rule, range) {
+                Some(rule) => rule,
+                None => continue 'roots,
+            };
+        }
+
+        // If we reach this point, we have ambiguities that we need to validate
+        // recursively. To avoid running out of stack, we use an emulated one.
+        stack.clear();
+        stack.push((
+            rule,
+            range,
+            // FIXME(eddyb) reduce the cost of this (already computed above).
+            approx_forest.possibilities[&(rule, range.start()..)]
+                .1
+                .iter()
+                .cloned()
+                .next()
+                .unwrap(),
+            false,
+        ));
+
+        'stack: while let Some(&(rule, range, i, _any_valid)) = stack.last() {
+            let (mut rule, range) = match cx[rule] {
+                Rule::Concat([_, right]) => (right, Range(range.split_at(i).1)),
+
+                Rule::Or(ref cases) => (cases[i], range),
+
+                // Only `Concat` and `Or` can be on the stack.
+                _ => unreachable!(),
+            };
+
+            // Try to unpack the `Concat`/`Or` child - note that this only
+            // exits directly when a leaf is reached, while reaching
+            // `Concat`/`Or` results in `continue 'stack`.
+            let mut valid = loop {
+                let first_possibility = || {
+                    let (end, set) = approx_forest.possibilities.get(&(rule, range.start()..))?;
+                    if let Some(end) = end {
+                        if *end != ..range.end() {
+                            return None;
+                        }
+                    }
+                    set.iter().cloned().next()
+                };
+
+                if let Some(&valid) = valid_cache.get(&(rule, range)) {
+                    break valid;
+                }
+
+                let valid = match cx[rule] {
+                    Rule::Empty => range.is_empty(),
+
+                    // FIXME(eddyb) maybe checking the pattern again would be cheaper?
+                    Rule::Eat(_) => {
+                        parse_cached((rule, Range(full_input.split_at(range.start()).1)))
+                            .lengths
+                            .contains(&range.len())
+                    }
+
+                    Rule::Call(r) => {
+                        let rule = grammar.rules[&r].rule;
+                        let result =
+                            parse_cached((rule, Range(full_input.split_at(range.start()).1)));
+                        let valid = result.lengths.contains(&range.len());
+                        if valid {
+                            add_root(&result.approx_forest, rule, range);
+                        }
+                        valid
+                    }
+
+                    Rule::Concat(_) => match first_possibility().filter(|&x| x <= range.len()) {
+                        Some(split) => {
+                            stack.push((rule, range, split, false));
+                            continue 'stack;
+                        }
+                        None => false,
+                    },
+
+                    Rule::Or(_) => match first_possibility() {
+                        Some(choice) => {
+                            stack.push((rule, range, choice, false));
+                            continue 'stack;
+                        }
+                        None => false,
+                    },
+
+                    Rule::Opt(child) => {
+                        if range.is_empty() {
+                            true
+                        } else {
+                            rule = child;
+                            continue;
+                        }
+                    }
+
+                    Rule::RepeatMany(..) | Rule::RepeatMore(..) => {
+                        rule = rule.expand_repeats(cx);
+                        continue;
+                    }
+                };
+
+                valid_cache.insert((rule, range), valid);
+                break valid;
+            };
+
+            // Commit the validity into the parent frames, advance them to
+            // the next split/choice, and pop them if they're complete.
+            while let Some(&mut (rule, range, ref mut i, ref mut any_valid)) = stack.last_mut() {
+                if valid {
+                    *any_valid = true;
+
+                    // FIXME(eddyb) deduplicate this with the other place which
+                    // does exactly this.
+                    match cx[rule] {
+                        Rule::Concat([left, _]) => {
+                            let split = *i;
+
+                            let (left_range, right_range, _) = range.split_at(split);
+
+                            add_root(&approx_forest, left, Range(left_range));
+
+                            // HACK(eddyb) need a more ergonomic SPPF builder/parser API.
+                            parser
+                                .borrow_mut()
+                                .with_result_and_remaining(
+                                    Range(right_range),
+                                    Range(full_input.split_at(range.end()).1),
+                                )
+                                .forest_add_split(
+                                    rule,
+                                    Node {
+                                        kind: left,
+                                        range: Range(left_range),
+                                    },
+                                );
+                        }
+
+                        Rule::Or(_) => {
+                            let choice = *i;
+
+                            // HACK(eddyb) need a more ergonomic SPPF builder/parser API.
+                            parser
+                                .borrow_mut()
+                                .with_result_and_remaining(
+                                    range,
+                                    Range(full_input.split_at(range.end()).1),
+                                )
+                                .forest_add_choice(rule, choice);
+                        }
+
+                        // Only `Concat` and `Or` can be on the stack.
+                        _ => unreachable!(),
+                    }
+                }
+
+                // Try to advance this frame.
+                let mut next = match &approx_forest.possibilities[&(rule, range.start()..)].1 {
+                    SmallSet::One(_) => None,
+                    SmallSet::Many(set) => {
+                        use std::ops::Bound::*;
+                        set.range((Excluded(*i), Unbounded)).next().cloned()
+                    }
+                };
+                if let Rule::Concat(_) = cx[rule] {
+                    next = next.filter(|&x| x <= range.len());
+                }
+
+                match next {
+                    Some(next) => {
+                        // Advance this frame and keep it around.
+                        *i = next;
+                        continue 'stack;
+                    }
+                    None => {
+                        // Pop this frame and repeat for its parent frame.
+                        valid = *any_valid;
+                        valid_cache.insert((rule, range), valid);
+                        stack.pop();
+                    }
+                }
+            }
+        }
+    }
+}
+
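+// Public entry point: `parse_inner` is wrapped in `cyclotron::bruteforce`'s
+// memoization, which caches results per `(rule, range)` and re-runs
+// recursive (e.g. left-recursive) calls until the cached `lengths` reach a
+// fixpoint; the SPPF is only reconstructed when the whole input matched.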
+pub fn parse<'a, Pat: Clone + Ord + Hash + fmt::Debug, I: Input>(
+    cx: &'a Context<Pat>,
+    grammar: &'a crate::Grammar,
+    named_rule: IStr,
+    input: I,
+) -> ParseResult<I::SourceInfoPoint, Pat, OwnedHandle<'a, Pat, I>>
+where
+    I::Slice: InputMatch<Pat>,
+{
+    Parser::parse_with(CxAndGrammar { cx, grammar }, input, |parser| {
+        let full_input = parser.remaining();
+        let parser = &RefCell::new(parser);
+        let mut parse_cached = bruteforce::memoize(|parse_cached, (rule, range)| {
+            let mut approx_forest = ApproxForest::default();
+            let lengths = parse_inner(
+                cx,
+                grammar,
+                parser,
+                parse_cached,
+                &mut approx_forest,
+                rule,
+                range,
+            );
+            CachedParse {
+                lengths: Rc::new(lengths),
+                approx_forest: Rc::new(approx_forest),
+            }
+        });
+
+        let longest = parse_cached((grammar.rules[&named_rule].rule, full_input))
+            .lengths
+            .iter()
+            .cloned()
+            .rev()
+            .next()?;
+        let node = Node {
+            kind: cx.intern(Rule::Call(named_rule)),
+            range: Range(full_input.split_at(longest).0),
+        };
+
+        // Only construct the SPPF in case of success.
+        if node.range == full_input {
+            build_sppf(cx, grammar, parser, parse_cached, node);
+        }
+
+        Some(node)
+    })
+    .map(|forest_and_node| OwnedHandle { forest_and_node })
+}
diff --git a/src/forest.rs b/src/forest.rs
index 3b9cca2..9aea52c 100644
--- a/src/forest.rs
+++ b/src/forest.rs
@@ -6,6 +6,7 @@ use std::collections::{BTreeSet, HashMap, HashSet, VecDeque};
 use std::fmt;
 use std::hash::{Hash, Hasher};
 use std::io::{self, Write};
+use std::iter;
 use std::str;
 
 #[derive(Copy, Clone, Debug, PartialEq, Eq, PartialOrd, Ord)]
@@ -241,9 +242,12 @@ impl<'i, G: GrammarReflector, I: Input> ParseForest<'i, G, I> {
         }
     }
 
-    pub fn dump_graphviz(&self, out: &mut dyn Write) -> io::Result<()> {
+    pub fn dump_graphviz(&self, root: Option<Node<'i, G>>, out: &mut dyn Write) -> io::Result<()> {
         writeln!(out, "digraph forest {{")?;
-        let mut queue: VecDeque<_> = self.possibilities.keys().cloned().collect();
+        let mut queue: VecDeque<_> = match root {
+            Some(root) => iter::once(root).collect(),
+            None => self.possibilities.keys().cloned().collect(),
+        };
         let mut seen: HashSet<_> = queue.iter().cloned().collect();
         let mut p = 0;
         let node_name = |Node { kind, range }| {
@@ -304,6 +308,627 @@ impl<'i, G: GrammarReflector, I: Input> ParseForest<'i, G, I> {
     }
 }
 
+pub mod dynamic {
+    use super::{
+        GrammarReflector, MoreThanOne, Node, NodeShape, OwnedParseForestAndNode, ParseForest,
+    };
+    use crate::context::{Context, IFields, IRule, IStr};
+    use crate::input::{Input, Range};
+    use crate::rule::{Fields, Rule};
+    use std::fmt;
+    use std::hash::Hash;
+    use std::rc::Rc;
+
+    pub struct CxAndGrammar<'a, Pat> {
+        pub cx: &'a Context<Pat>,
+        pub grammar: &'a crate::Grammar,
+    }
+
+    impl<Pat: Eq + Hash + fmt::Debug> GrammarReflector for CxAndGrammar<'_, Pat> {
+        type NodeKind = IRule;
+
+        fn node_shape(&self, rule: IRule) -> NodeShape<IRule> {
+            rule.node_shape(self.cx, Some(&self.grammar.rules))
+        }
+        fn node_shape_choice_get(&self, rule: IRule, i: usize) -> IRule {
+            match &self.cx[rule] {
+                Rule::Or(cases) => cases[i],
+                _ => unreachable!(),
+            }
+        }
+        fn node_desc(&self, rule: IRule) -> String {
+            rule.node_desc(self.cx)
+        }
+    }
+
+    // TODO(eddyb) remove this entirely, only user of it left is `ListHandle`.
+    #[derive(Clone)]
+    struct ExpandedTree<'i, G: GrammarReflector> {
+        node: Node<'i, G>,
+        kind: ExpandedTreeKind<'i, G>,
+    }
+
+    #[derive(Clone)]
+    enum ExpandedTreeKind<'i, G: GrammarReflector> {
+        Leaf,
+        Or(G::NodeKind, Rc<ExpandedTree<'i, G>>),
+        Opt(Option<Rc<ExpandedTree<'i, G>>>),
+        Concat([Rc<ExpandedTree<'i, G>>; 2]),
+    }
+
+    impl<'i, G: GrammarReflector> ExpandedTree<'i, G> {
+        fn one_from_node<I>(
+            forest: &ParseForest<'i, G, I>,
+            node: Node<'i, G>,
+        ) -> Result<Rc<Self>, MoreThanOne>
+        where
+            I: Input,
+        {
+            let kind = match forest.grammar.node_shape(node.kind) {
+                NodeShape::Opaque | NodeShape::Alias(_) => ExpandedTreeKind::Leaf,
+                NodeShape::Choice(_) => {
+                    let child = forest.one_choice(node)?;
+                    ExpandedTreeKind::Or(child.kind, Self::one_from_node(forest, child)?)
+                }
+                NodeShape::Opt(_) => ExpandedTreeKind::Opt(match forest.unpack_opt(node) {
+                    Some(child) => Some(Self::one_from_node(forest, child)?),
+                    None => None,
+                }),
+                NodeShape::Split(..) => {
+                    let (left, right) = forest.one_split(node)?;
+                    ExpandedTreeKind::Concat([
+                        Self::one_from_node(forest, left)?,
+                        Self::one_from_node(forest, right)?,
+                    ])
+                }
+            };
+            Ok(Rc::new(ExpandedTree { node, kind }))
+        }
+
+        fn all_from_node<I>(forest: &ParseForest<'i, G, I>, node: Node<'i, G>) -> Vec<Rc<Self>>
+        where
+            I: Input,
+        {
+            let new = |kind| Rc::new(ExpandedTree { node, kind });
+            match forest.grammar.node_shape(node.kind) {
+                NodeShape::Opaque | NodeShape::Alias(_) => vec![new(ExpandedTreeKind::Leaf)],
+                NodeShape::Choice(_) => forest
+                    .all_choices(node)
+                    .flat_map(|child| {
+                        Self::all_from_node(forest, child)
+                            .into_iter()
+                            .map(move |child_tree| {
+                                new(ExpandedTreeKind::Or(child.kind, child_tree))
+                            })
+                    })
+                    .collect(),
+                NodeShape::Opt(_) => match forest.unpack_opt(node) {
+                    Some(child) => Self::all_from_node(forest, child)
+                        .into_iter()
+                        .map(|child_tree| new(ExpandedTreeKind::Opt(Some(child_tree))))
+                        .collect(),
+                    None => vec![new(ExpandedTreeKind::Opt(None))],
+                },
+                NodeShape::Split(..) => forest
+                    .all_splits(node)
+                    .flat_map(|(left, right)| {
+                        Self::all_from_node(forest, left)
+                            .into_iter()
+                            .flat_map(move |left_tree| {
+                                Self::all_from_node(forest, right).into_iter().map(
+                                    move |right_tree| {
+                                        new(ExpandedTreeKind::Concat([
+                                            left_tree.clone(),
+                                            right_tree,
+                                        ]))
+                                    },
+                                )
+                            })
+                    })
+                    .collect(),
+            }
+        }
+    }
+
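+    /// Marker for an ambiguous node: wraps the handle (or list tail) at
+    /// which more than one possibility was found, when exactly one was
+    /// needed.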
+    #[derive(Debug)]
+    pub struct Ambiguity<T>(T);
+
+    pub struct OwnedHandle<'a, Pat: Eq + Hash + fmt::Debug, I: Input> {
+        pub forest_and_node: OwnedParseForestAndNode<CxAndGrammar<'a, Pat>, I>,
+    }
+
+    impl<Pat: Eq + Hash + fmt::Debug, I: Input> OwnedHandle<'_, Pat, I> {
+        pub fn source_info(&self) -> I::SourceInfo {
+            self.forest_and_node.unpack_ref(|_, forest_and_node| {
+                let (ref forest, node) = *forest_and_node;
+                forest.source_info(node.range)
+            })
+        }
+
+        pub fn with<R>(&self, f: impl FnOnce(Handle<'_, '_, '_, Pat, I>) -> R) -> R {
+            self.forest_and_node.unpack_ref(|_, forest_and_node| {
+                let (ref forest, node) = *forest_and_node;
+                f(Handle {
+                    forest,
+                    node,
+                    fields: None,
+                    disambiguator: None,
+                })
+            })
+        }
+    }
+
+    impl<Pat: Eq + Hash + fmt::Debug, I: Input> fmt::Debug for OwnedHandle<'_, Pat, I> {
+        fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+            self.with(|handle| handle.fmt(f))
+        }
+    }
+
+    // FIXME(eddyb) figure out how to maybe get rid of the 'a/'b split.
+    pub struct Handle<'a, 'b, 'i, Pat: Eq + Hash + fmt::Debug, I: Input> {
+        pub forest: &'a ParseForest<'i, CxAndGrammar<'b, Pat>, I>,
+        pub node: Node<'i, CxAndGrammar<'b, Pat>>,
+        pub fields: Option<IFields>,
+        // FIXME(eddyb) support an arbitrary number of disambiguations here
+        disambiguator: Option<(Node<'i, CxAndGrammar<'b, Pat>>, usize)>,
+    }
+
+    impl<Pat: Eq + Hash + fmt::Debug, I: Input> Copy for Handle<'_, '_, '_, Pat, I> {}
+
+    impl<Pat: Eq + Hash + fmt::Debug, I: Input> Clone for Handle<'_, '_, '_, Pat, I> {
+        fn clone(&self) -> Self {
+            *self
+        }
+    }
+
+    impl<Pat: Eq + Hash + fmt::Debug, I: Input> fmt::Debug for Handle<'_, '_, '_, Pat, I> {
+        fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+            struct FieldName<'a>(&'a str);
+            impl fmt::Debug for FieldName<'_> {
+                fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+                    f.write_str(self.0)
+                }
+            }
+
+            self.source_info().fmt(f)?;
+
+            let cx = self.forest.grammar.cx;
+            let mut first = true;
+
+            let name = match cx[self.node.kind] {
+                Rule::RepeatMany(..) | Rule::RepeatMore(..) => {
+                    f.write_str(" => ")?;
+                    for x in self.all_lists() {
+                        if !first {
+                            f.write_str(" | ")?;
+                        }
+                        first = false;
+                        x.fmt(f)?;
+                    }
+                    return Ok(());
+                }
+                Rule::Call(name) => Some(name),
+                _ => None,
+            };
+
+            if self.fields.is_some() || name.is_some() {
+                f.write_str(" => ")?;
+                for x in self.all_records() {
+                    if !first {
+                        f.write_str(" | ")?;
+                    }
+                    first = false;
+
+                    if let Some(name) = name {
+                        f.write_str(&cx[name])?;
+                        f.write_str(" ")?;
+                    }
+
+                    let mut f = f.debug_map();
+                    x.visit_fields(&mut |r| match r {
+                        Ok((name, field)) => {
+                            f.entry(&FieldName(&cx[name]), &field);
+                        }
+                        Err(Ambiguity(handle)) => {
+                            // FIXME(eddyb) print this properly, similar to lists.
+                            // (will require reimplementing the `debug_map` adapter)
+                            f.entry(&FieldName(".."), &handle);
+                        }
+                    });
+                    f.finish()?;
+                }
+            }
+            Ok(())
+        }
+    }
+
+    impl<'a, 'b, 'i, Pat: Eq + Hash + fmt::Debug, I: Input> Handle<'a, 'b, 'i, Pat, I> {
+        pub fn source(self) -> &'a I::Slice {
+            self.forest.input(self.node.range)
+        }
+
+        pub fn source_info(self) -> I::SourceInfo {
+            self.forest.source_info(self.node.range)
+        }
+
+        // FIXME(eddyb) make this return an iterator or get rid of somehow.
+        fn all_records(self) -> Vec<Self> {
+            let forest = self.forest;
+            let cx = forest.grammar.cx;
+
+            let mut node = self.node;
+            let fields = self.fields.unwrap_or_else(|| match cx[node.kind] {
+                Rule::Call(name) => {
+                    if let NodeShape::Alias(inner) = forest.grammar.node_shape(node.kind) {
+                        node.kind = inner;
+                    }
+                    forest.grammar.grammar.rules[&name].fields
+                }
+                _ => unreachable!("not a record"),
+            });
+
+            let rec = |disambiguator: Option<usize>| Handle {
+                disambiguator: disambiguator.map(|i| (node, i)),
+                ..self
+            };
+
+            if let Fields::Aggregate(_) = cx[fields] {
+                match &cx[node.kind] {
+                    Rule::Concat(_) => {
+                        return forest
+                            .all_splits(node)
+                            .map(|(left, _)| rec(Some(left.range.len())))
+                            .collect()
+                    }
+                    Rule::Or(rules) => {
+                        return forest
+                            .all_choices(node)
+                            .map(|child| {
+                                // FIXME(eddyb) expose the index from the forest,
+                                // or integrate fielded traversal through the forest.
+                                let i = rules.iter().position(|&rule| child.kind == rule).unwrap();
+                                rec(Some(i))
+                            })
+                            .collect();
+                    }
+                    _ => {}
+                }
+            }
+            vec![rec(None)]
+        }
+
+        pub fn visit_fields(
+            &self,
+            f: &mut impl FnMut(
+                // FIXME(eddyb) maybe make the error case, or Ambiguity itself, an iterator?
+                // Maybe `Ambiguities` would be better? Same for list tails?
+                Result<(IStr, Self), Ambiguity<Self>>,
+            ),
+        ) {
+            let forest = self.forest;
+            let cx = forest.grammar.cx;
+
+            let mut node = self.node;
+            // FIXME(eddyb) remember the name here.
+            let fields = self.fields.unwrap_or_else(|| match cx[node.kind] {
+                Rule::Call(name) => {
+                    if let NodeShape::Alias(inner) = forest.grammar.node_shape(node.kind) {
+                        node.kind = inner;
+                    }
+                    forest.grammar.grammar.rules[&name].fields
+                }
+                _ => unreachable!("not a record"),
+            });
+
+            let children = match &cx[fields] {
+                Fields::Leaf(field) => {
+                    if let Some(field) = field {
+                        f(Ok((
+                            field.name,
+                            Handle {
+                                forest,
+                                node,
+                                fields: if cx[field.sub] == Fields::Leaf(None) {
+                                    // HACK(eddyb) figure out a nicer way to communicate leaves.
+                                    None
+                                } else {
+                                    Some(field.sub)
+                                },
+                                disambiguator: None,
+                            },
+                        )))
+                    }
+                    return;
+                }
+                Fields::Aggregate(children) => children,
+            };
+            let mut visit_child = |child, i| {
+                Handle {
+                    forest,
+                    node: child,
+                    fields: Some(children[i]),
+                    disambiguator: None,
+                }
+                .visit_fields(f);
+            };
+
+            match cx[node.kind] {
+                Rule::Concat([left_rule, right_rule]) => {
+                    let split = match self.disambiguator {
+                        Some((dis_node, dis_split)) if dis_node == node => {
+                            let (left, right, _) = node.range.split_at(dis_split);
+                            Ok((
+                                Node {
+                                    kind: left_rule,
+                                    range: Range(left),
+                                },
+                                Node {
+                                    kind: right_rule,
+                                    range: Range(right),
+                                },
+                            ))
+                        }
+                        _ => forest.one_split(node),
+                    };
+                    match split {
+                        Ok((left, right)) => {
+                            visit_child(left, 0);
+                            visit_child(right, 1);
+                        }
+                        Err(_) => return f(Err(Ambiguity(*self))),
+                    }
+                }
+                Rule::Or(ref rules) => {
+                    let choice = match self.disambiguator {
+                        Some((dis_node, dis_choice)) if dis_node == node => Ok(Node {
+                            kind: rules[dis_choice],
+                            range: node.range,
+                        }),
+                        _ => forest.one_choice(node),
+                    };
+                    match choice {
+                        Ok(child) => {
+                            // FIXME(eddyb) use `IndexSet` in `Rule::Or`.
+                            let i = rules.iter().position(|&rule| child.kind == rule).unwrap();
+                            visit_child(child, i);
+                        }
+                        Err(_) => return f(Err(Ambiguity(*self))),
+                    }
+                }
+                Rule::Opt(_) => {
+                    if let Some(child) = forest.unpack_opt(node) {
+                        visit_child(child, 0);
+                    }
+                }
+                _ => unreachable!("not an aggregate"),
+            }
+        }
+
+        pub fn field(&self, name: IStr) -> Result<Option<Self>, Ambiguity<Self>> {
+            // FIXME(eddyb) speed this up somehow.
+            let mut found = None;
+            let mut ambiguity = None;
+            self.visit_fields(&mut |r| match r {
+                Ok((field_name, field)) => {
+                    if field_name == name {
+                        found = Some(field);
+                    }
+                }
+                Err(a) => {
+                    if ambiguity.is_none() {
+                        ambiguity = Some(a);
+                    }
+                }
+            });
+            match (found, ambiguity) {
+                (Some(field), _) => Ok(Some(field)),
+                (_, Some(ambiguity)) => Err(ambiguity),
+                _ => Ok(None),
+            }
+        }
+
+        pub fn field_by_str(&self, name: &str) -> Result<Option<Self>, Ambiguity<Self>> {
+            self.field(self.forest.grammar.cx.intern(name))
+        }
+
+        // FIXME(eddyb) maybe these should be deep?! then `ExpandedTree` shouldn't
+        // be controlled using `Alias` but something else (and maybe stop using `Alias`
+        // for `Repeat{Many,More}`?). This is all kinda tricky.
+        pub fn as_list(mut self) -> ListHandle<'a, 'b, 'i, Pat, I> {
+            assert_eq!(self.fields, None);
+            let tree = match self.forest.grammar.cx[self.node.kind] {
+                Rule::RepeatMany(..) => {
+                    // Can't be ambiguous, due to being `Opt`.
+                    self.node = self.forest.unpack_alias(self.node);
+                    ExpandedTree::one_from_node(self.forest, self.node).unwrap()
+                }
+                Rule::RepeatMore(..) => {
+                    // Might be ambiguous, fake it being a `Many`.
+                    // NOTE(eddyb) the unwrap is fine because we haven't done `unpack_alias`.
+                    let many = ExpandedTree::one_from_node(self.forest, self.node).unwrap();
+                    Rc::new(ExpandedTree {
+                        node: self.node,
+                        kind: ExpandedTreeKind::Opt(Some(many)),
+                    })
+                }
+                _ => unreachable!("not a list"),
+            };
+            ListHandle {
+                forest: self.forest,
+                tree,
+            }
+        }
+
+        // FIXME(eddyb) move to `ListHandle` *or* make deep.
+        fn all_lists(mut self) -> impl Iterator<Item = ListHandle<'a, 'b, 'i, Pat, I>> {
+            assert_eq!(self.fields, None);
+            match self.forest.grammar.cx[self.node.kind] {
+                Rule::RepeatMany(..) | Rule::RepeatMore(..) => {}
+                _ => unreachable!("not a list"),
+            }
+            self.node = self.forest.unpack_alias(self.node);
+            ExpandedTree::all_from_node(self.forest, self.node)
+                .into_iter()
+                .map(move |tree| ListHandle {
+                    forest: self.forest,
+                    tree,
+                })
+        }
+    }
+
+    pub struct ListHandle<'a, 'b, 'i, Pat: Eq + Hash + fmt::Debug, I: Input> {
+        pub forest: &'a ParseForest<'i, CxAndGrammar<'b, Pat>, I>,
+        tree: Rc<ExpandedTree<'i, CxAndGrammar<'b, Pat>>>,
+    }
+
+    impl<Pat: Eq + Hash + fmt::Debug, I: Input> Clone for ListHandle<'_, '_, '_, Pat, I> {
+        fn clone(&self) -> Self {
+            ListHandle {
+                forest: self.forest,
+                tree: self.tree.clone(),
+            }
+        }
+    }
+
+    impl<'a, 'b, 'i, Pat: Eq + Hash + fmt::Debug, I: Input> Iterator
+        for ListHandle<'a, 'b, 'i, Pat, I>
+    {
+        type Item =
+            Result<Handle<'a, 'b, 'i, Pat, I>, Ambiguity<Handle<'a, 'b, 'i, Pat, I>>>;
+
+        fn next(&mut self) -> Option<Self::Item> {
+            match &self.tree.kind {
+                ExpandedTreeKind::Opt(Some(more)) => {
+                    let more = self.forest.unpack_alias(more.node);
+                    match ExpandedTree::one_from_node(self.forest, more) {
+                        Ok(more) => self.tree = more,
+                        Err(_) => {
+                            return Some(Err(Ambiguity(Handle {
+                                forest: self.forest,
+                                node: more,
+                                fields: None,
+                                disambiguator: None,
+                            })))
+                        }
+                    }
+                }
+                ExpandedTreeKind::Opt(None) => return None,
+                _ => {}
+            }
+            match &self.tree.kind {
+                ExpandedTreeKind::Concat([elem, tail]) => {
+                    let elem = Handle {
+                        forest: self.forest,
+                        node: elem.node,
+                        fields: None,
+                        disambiguator: None,
+                    };
+
+                    self.tree = tail.clone();
+                    loop {
+                        match &self.tree.kind {
+                            // HACK(eddyb) this only works because it's handled first
+                            // in the next `<Self as Iterator>::next` call, even if
+                            // it might be otherwise not the right rule.
+                            ExpandedTreeKind::Opt(None) => return Some(Ok(elem)),
+                            ExpandedTreeKind::Opt(Some(tail))
+                            | ExpandedTreeKind::Concat([_, tail]) => {
+                                self.tree = tail.clone();
+                            }
+                            ExpandedTreeKind::Leaf => {
+                                *self = Handle {
+                                    forest: self.forest,
+                                    node: self.tree.node,
+                                    fields: None,
+                                    disambiguator: None,
+                                }
+                                .as_list();
+                                return Some(Ok(elem));
+                            }
+                            _ => unreachable!(),
+                        }
+                    }
+                }
+                _ => unreachable!(),
+            }
+        }
+    }
+
+    impl<Pat: Eq + Hash + fmt::Debug, I: Input> fmt::Debug for ListHandle<'_, '_, '_, Pat, I> {
+        fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+            struct Spread<T>(T);
+            impl<T: fmt::Debug> fmt::Debug for Spread<T> {
+                fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+                    f.write_str("...(")?;
+                    self.0.fmt(f)?;
+                    f.write_str(")")
+                }
+            }
+
+            let mut f = f.debug_list();
+            for x in self.clone() {
+                match x {
+                    Ok(elem) => {
+                        f.entry(&elem);
+                    }
+                    Err(Ambiguity(tail)) => {
+                        f.entry(&Spread(tail));
+                        break;
+                    }
+                }
+            }
+            f.finish()
+        }
+    }
+
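+    // Example usage of the `handle!` macro defined below, with field names
+    // from the `lyg` meta-grammar (e.g. `RuleDef = { {name:Ident} "=" {rule:Or} ";" }`):
+    //
+    //     handle!(let { name, rule } = rule_def);
+    //
+    // binds each named field to its `Handle`, while
+    //
+    //     handle!(match modifier { {Opt:_} => ..., {Repeat:{ repeat }} => ... })
+    //
+    // dispatches on the cases of an `Or` rule by field name (see `src/lyg.rs`
+    // for real uses).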
+    // HACK(eddyb) work around `macro_rules` not being `use`-able.
+    pub use crate::__forest_dynamic_handle as handle;
+
+    #[macro_export]
+    macro_rules! __forest_dynamic_handle {
+        (let _ = $handle:expr) => {
+            let _ = $handle;
+        };
+        (let $x:ident = $handle:expr) => {
+            let $x = $handle;
+        };
+        (let { $($field:ident),* $(,)? } = $handle:expr) => {
+            let handle = &$handle;
+            $(handle!(let $field = handle.field_by_str(stringify!($field)).unwrap().unwrap());)*
+        };
+
+        (if let _ = $handle:ident $body:block) => {
+            match $handle { _ => $body }
+        };
+        (if let $x:ident = $handle:ident $body:block) => {
+            match $handle { $x => $body }
+        };
+        (if let {} = $handle:ident $body:block) => {
+            match $handle { _ => $body }
+        };
+        (if let { $field:ident: $pat:tt $(, $($rest:tt)*)? } = $handle:ident $body:block) => {
+            if let Some(x) = $handle.field_by_str(stringify!($field)).unwrap() {
+                handle!(if let $pat = x {
+                    handle!(if let { $($($rest)*)? } = $handle $body)
+                })
+            }
+        };
+        (if let { $field:ident $(,)? $(, $($rest:tt)*)? } = $handle:ident $body:block) => {
+            handle!(if let { $field: $field $(, $($rest)*)? } = $handle $body)
+        };
+
+        (match $handle:ident { $($pat:tt => $e:expr),* $(,)? }) => {
+            loop {
+                $(handle!(if let $pat = $handle {
+                    break $e;
+                });)*
+                #[allow(unreachable_code)] {
+                    unreachable!();
+                }
+            }
+        };
+    }
+}
+
 pub mod typed {
     use super::{GrammarReflector, MoreThanOne, Node, ParseForest};
     use crate::input::Input;
diff --git a/src/lib.rs b/src/lib.rs
index 93fd2d8..4b30374 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -7,6 +7,8 @@ mod high;
 #[allow(unsafe_code)]
 mod indexing_str;
 
+#[forbid(unsafe_code)]
+pub mod bruteforce;
 #[forbid(unsafe_code)]
 pub mod context;
 #[forbid(unsafe_code)]
@@ -14,6 +16,8 @@ pub mod forest;
 #[forbid(unsafe_code)]
 pub mod input;
 #[forbid(unsafe_code)]
+pub mod lyg;
+#[forbid(unsafe_code)]
 pub mod parser;
 #[forbid(unsafe_code)]
 pub mod proc_macro;
@@ -78,101 +82,3 @@ impl Grammar {
         }
     }
 }
-
-/// Construct a (meta-)grammar for parsing a grammar.
-pub fn grammar_grammar<Pat: Eq + Hash + From<SPat>>(cx: &Context<Pat>) -> Grammar {
-    use crate::rule::*;
-
-    // HACK(eddyb) more explicit subset of the grammar, for bootstrapping.
-    macro_rules! rule {
-        ({ $start:tt ..= $end:tt }) => {
-            eat($start..=$end)
-        };
-        ({ ! $pat:tt }) => {
-            negative_lookahead($pat)
-        };
-        ({ ! $start:tt ..= $end:tt }) => {
-            negative_lookahead($start..=$end)
-        };
-        ($rule:ident) => {
-            call(stringify!($rule))
-        };
-        ({ $name:ident : $rule:tt }) => {
-            rule!($rule).field(stringify!($name))
-        };
-        ({ $rule:tt ? }) => {
-            rule!($rule).opt()
-        };
-        ({ $elem:tt * }) => {
-            rule!($elem).repeat_many()
-        };
-        ({ $elem:tt + }) => {
-            rule!($elem).repeat_more()
-        };
-        ({ $elem:tt + % $sep:tt }) => {
-            rule!($elem).repeat_more_sep(rule!($sep), SepKind::Simple)
-        };
-        ({ $rule0:tt $(| $rule:tt)+ }) => {
-            rule!($rule0) $(| rule!($rule))+
-        };
-        ({ $rule0:tt $($rule:tt)* }) => {
-            rule!($rule0) $(+ rule!($rule))*
-        };
-        ($pat:expr) => {
-            eat($pat)
-        };
-    }
-
-    macro_rules! grammar {
-        ($($rule_name:ident = $($rule:tt)|+;)*) => ({
-            let mut grammar = Grammar::new();
-            $(grammar.define(
-                cx.intern(stringify!($rule_name)),
-                rule!({ $($rule)|+ }).finish(cx),
-            );)*
-            grammar
-        })
-    }
-
-    // Main grammar.
-    let mut grammar = grammar! {
-        Grammar = { FileStart {rules:{RuleDef*}} FileEnd };
-        RuleDef = { {name:Ident} "=" {rule:Or} ";" };
-        Or = {{"|"?} {rules:{Concat+ % "|"}}};
-        Concat = {rules:{Rule+}};
-        Rule = { {{ {field:Ident} ":" }?} {rule:Primary} {{modifier:Modifier}?} };
-        Primary =
-            {Eat:Pattern} |
-            {Call:Ident} |
-            {Group:{ "{" {{or:Or}?} "}" }};
-        Modifier =
-            {Opt:"?"} |
-            {Repeat:{ {repeat:Repeat} {{ {kind:SepKind} {sep:Primary} }?} }};
-        Repeat =
-            {Many:"*"} |
-            {More:"+"};
-        SepKind =
-            {Simple:"%"} |
-            // HACK(eddyb) should be "%%", but `rustc`'s `proc_macro` server doesn't
-            // always preserve jointness, except within multi-character Rust operators.
-            {Trailing:{"%" "%"}};
-        Pattern =
-            {Str:StrLit} |
-            {CharRange:{ {{start:CharLit}?} ".." {{end:CharLit}?} }} |
-            {CharRangeInclusive:{ {{start:CharLit}?} "..=" {end:CharLit} }};
-    };
-
-    // Lexical fragment of the grammar.
-    grammar.extend(grammar! {
-        FileStart = "";
-        FileEnd = "";
-
-        Ident = IDENT;
-
-        // FIXME(eddyb) restrict literals, once `proc_macro` allows it.
-        StrLit = LITERAL;
-        CharLit = LITERAL;
-    });
-
-    grammar
-}
diff --git a/src/lyg.rs b/src/lyg.rs
new file mode 100644
index 0000000..531ca0f
--- /dev/null
+++ b/src/lyg.rs
@@ -0,0 +1,273 @@
+use crate::context::Context;
+use crate::forest::dynamic::handle;
+use crate::parser::ParseError;
+use crate::proc_macro::{FlatToken, Pat as PMPat, Span, TokenStream};
+use crate::rule;
+use crate::scannerless::Pat as SPat;
+use crate::Grammar;
+use std::hash::Hash;
+use std::ops::Bound;
+
+/// Construct a (meta-)grammar for parsing a `lyg` grammar.
+pub fn grammar<Pat: Eq + Hash + From<SPat>>(cx: &Context<Pat>) -> Grammar {
+    use crate::rule::*;
+
+    // HACK(eddyb) more explicit subset of the grammar, for bootstrapping.
+    macro_rules! rule {
+        ({ $start:tt ..= $end:tt }) => {
+            eat($start..=$end)
+        };
+        ({ ! $pat:tt }) => {
+            negative_lookahead($pat)
+        };
+        ({ ! $start:tt ..= $end:tt }) => {
+            negative_lookahead($start..=$end)
+        };
+        ($rule:ident) => {
+            call(stringify!($rule))
+        };
+        ({ $name:ident : $rule:tt }) => {
+            rule!($rule).field(stringify!($name))
+        };
+        ({ $rule:tt ? }) => {
+            rule!($rule).opt()
+        };
+        ({ $elem:tt * }) => {
+            rule!($elem).repeat_many()
+        };
+        ({ $elem:tt + }) => {
+            rule!($elem).repeat_more()
+        };
+        ({ $elem:tt + % $sep:tt }) => {
+            rule!($elem).repeat_more_sep(rule!($sep), SepKind::Simple)
+        };
+        ({ $rule0:tt $(| $rule:tt)+ }) => {
+            rule!($rule0) $(| rule!($rule))+
+        };
+        ({ $rule0:tt $($rule:tt)* }) => {
+            rule!($rule0) $(+ rule!($rule))*
+        };
+        ($pat:expr) => {
+            eat($pat)
+        };
+    }
+
+    macro_rules! grammar {
+        ($($rule_name:ident = $($rule:tt)|+;)*) => ({
+            let mut grammar = Grammar::new();
+            $(grammar.define(
+                cx.intern(stringify!($rule_name)),
+                rule!({ $($rule)|+ }).finish(cx),
+            );)*
+            grammar
+        })
+    }
+
+    // Main grammar.
+    let mut grammar = grammar! {
+        Grammar = { FileStart {rules:{RuleDef*}} FileEnd };
+        RuleDef = { {name:Ident} "=" {rule:Or} ";" };
+        Or = {{"|"?} {rules:{Concat+ % "|"}}};
+        Concat = {rules:{Rule+}};
+        Rule = { {{ {field:Ident} ":" }?} {rule:Primary} {{modifier:Modifier}?} };
+        Primary =
+            {Eat:Pattern} |
+            {Call:Ident} |
+            {Group:{ "{" {{or:Or}?} "}" }};
+        Modifier =
+            {Opt:"?"} |
+            {Repeat:{ {repeat:Repeat} {{ {kind:SepKind} {sep:Primary} }?} }};
+        Repeat =
+            {Many:"*"} |
+            {More:"+"};
+        SepKind =
+            {Simple:"%"} |
+            // HACK(eddyb) should be "%%", but `rustc`'s `proc_macro` server doesn't
+            // always preserve jointness, except within multi-character Rust operators.
+            {Trailing:{"%" "%"}};
+        Pattern =
+            {Str:StrLit} |
+            {CharRange:{ {{start:CharLit}?} ".." {{end:CharLit}?} }} |
+            {CharRangeInclusive:{ {{start:CharLit}?} "..=" {end:CharLit} }};
+    };
+
+    // Lexical fragment of the grammar.
+    grammar.extend(grammar! {
+        FileStart = "";
+        FileEnd = "";
+
+        Ident = IDENT;
+
+        // FIXME(eddyb) restrict literals, once `proc_macro` allows it.
+        StrLit = LITERAL;
+        CharLit = LITERAL;
+    });
+
+    grammar
+}
+
+type Handle<'a, 'b, 'i> = crate::forest::dynamic::Handle<'a, 'b, 'i, PMPat, TokenStream>;
+
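+// Bootstrap: parse the `lyg` source as a Rust token stream with the
+// brute-force parser and the meta-grammar above (extended with the
+// `proc_macro` builtins), then lower the resulting handles to a `Grammar`
+// using the `lower_*` visitors below.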
+pub fn parse<Pat: Eq + Hash + From<SPat>>(
+    cx: &Context<Pat>,
+    stream: TokenStream,
+) -> Result<Grammar, ParseError<Span, PMPat>> {
+    let lyg_cx = &crate::proc_macro::Context::new();
+    let mut lyg_grammar;
+
+    let g = {
+        let cx = lyg_cx;
+        lyg_grammar = crate::proc_macro::builtin(cx);
+        lyg_grammar.extend(grammar(cx));
+        crate::bruteforce::parse(cx, &lyg_grammar, cx.intern("Grammar"), stream.clone())
+    };
+
+    let mut grammar = Grammar::new();
+    g?.with(|g| {
+        handle!(let { rules } = g);
+        for rule_def in rules.as_list() {
+            handle!(let { name, rule } = rule_def.unwrap());
+            let name = match name.source() {
+                [FlatToken::Ident(ident)] => ident.to_string(),
+                _ => unreachable!(),
+            };
+            grammar.define(cx.intern(&name[..]), lower_or(rule, cx));
+        }
+    });
+    Ok(grammar)
+}
+
+fn lower_or<Pat: Eq + Hash + From<SPat>>(
+    this: Handle<'_, '_, '_>,
+    cx: &Context<Pat>,
+) -> rule::RuleWithFields {
+    handle!(let { rules } = this);
+    let mut rules = rules.as_list().map(|rule| rule.unwrap());
+    let first = lower_concat(rules.next().unwrap(), cx);
+    rules.fold(first, |a, b| (a | lower_concat(b, cx)).finish(cx))
+}
+
+fn lower_concat<Pat: Eq + Hash + From<SPat>>(
+    this: Handle<'_, '_, '_>,
+    cx: &Context<Pat>,
+) -> rule::RuleWithFields {
+    handle!(let { rules } = this);
+    rules
+        .as_list()
+        .map(|rule| rule.unwrap())
+        .fold(rule::empty().finish(cx), |a, b| {
+            (a + lower_rule(b, cx)).finish(cx)
+        })
+}
+
+fn lower_rule<Pat: Eq + Hash + From<SPat>>(
+    this: Handle<'_, '_, '_>,
+    cx: &Context<Pat>,
+) -> rule::RuleWithFields {
+    handle!(let { rule } = this);
+    let mut rule = lower_primary(rule, cx);
+    handle!(if let { modifier } = this {
+        rule = lower_modifier(modifier, cx, rule);
+    });
+    handle!(if let { field } = this {
+        let field = match field.source() {
+            [FlatToken::Ident(ident)] => ident.to_string(),
+            _ => unreachable!(),
+        };
+        rule = rule.field(&field).finish(cx);
+    });
+    rule
+}
+
+fn lower_primary<Pat: Eq + Hash + From<SPat>>(
+    this: Handle<'_, '_, '_>,
+    cx: &Context<Pat>,
+) -> rule::RuleWithFields {
+    handle!(match this {
+        {Eat:pat} => rule::eat(lower_pattern(pat)).finish(cx),
+        {Call:name} => {
+            let name = match name.source() {
+                [FlatToken::Ident(ident)] => ident.to_string(),
+                _ => unreachable!(),
+            };
+            rule::call(&name).finish(cx)
+        },
+        {Group:{ or }} => lower_or(or, cx),
+        {Group:_} => rule::empty().finish(cx),
+    })
+}
+
+fn lower_modifier<Pat: Eq + Hash + From<SPat>>(
+    this: Handle<'_, '_, '_>,
+    cx: &Context<Pat>,
+    rule: rule::RuleWithFields,
+) -> rule::RuleWithFields {
+    handle!(match this {
+        {Opt:_} => rule.opt().finish(cx),
+        {Repeat:{ repeat, sep, kind }} => {
+            let repeat = repeat;
+            let sep = lower_primary(sep, cx);
+            let kind = lower_sep_kind(kind);
+            handle!(match repeat {
+                {Many:_} => rule.repeat_many_sep(sep, kind).finish(cx),
+                {More:_} => rule.repeat_more_sep(sep, kind).finish(cx),
+            })
+        },
+        {Repeat:{ repeat }} => {
+            let repeat = repeat;
+            handle!(match repeat {
+                {Many:_} => rule.repeat_many().finish(cx),
+                {More:_} => rule.repeat_more().finish(cx),
+            })
+        }
+    })
+}
+
+fn lower_sep_kind(this: Handle<'_, '_, '_>) -> rule::SepKind {
+    handle!(match this {
+        {Simple:_} => rule::SepKind::Simple,
+        {Trailing:_} => rule::SepKind::Trailing,
+    })
+}
+
+fn lower_pattern(this: Handle<'_, '_, '_>) -> SPat {
+    fn unescape(handle: Handle<'_, '_, '_>) -> String {
+        let mut out = String::new();
+        let s = match handle.source() {
+            [FlatToken::Literal(lit)] => lit.to_string(),
+            _ => unreachable!(),
+        };
+        let mut chars = s[1..s.len() - 1].chars();
+        while let Some(c) = chars.next() {
+            let c = match c {
+                '\\' => match chars.next().unwrap() {
+                    't' => '\t',
+                    'n' => '\n',
+                    'r' => '\r',
+                    c => c,
+                },
+                _ => c,
+            };
+            out.push(c);
+        }
+        out
+    }
+    let unescape_char = |c| unescape(c).parse::<char>().unwrap();
+    handle!(match this {
+        {Str:s} => SPat::from(unescape(s)),
+        {CharRange:_} => SPat::from((
+            handle!(match this { {CharRange:{ start }} => Some(start), _ => None })
+                .map(unescape_char)
+                .map_or(Bound::Unbounded, Bound::Included),
+            handle!(match this { {CharRange:{ end }} => Some(end), _ => None })
+                .map(unescape_char)
+                .map_or(Bound::Unbounded, Bound::Excluded),
+        )),
+        {CharRangeInclusive:{ end }} => SPat::from((
+            handle!(match this { {CharRangeInclusive:{ start }} => Some(start), _ => None })
+                .map(unescape_char)
+                .map_or(Bound::Unbounded, Bound::Included),
+            Bound::Included(unescape_char(end)),
+        )),
+    })
+}
diff --git a/src/proc_macro_input.rs b/src/proc_macro_input.rs
index f7a0c77..dcb63aa 100644
--- a/src/proc_macro_input.rs
+++ b/src/proc_macro_input.rs
@@ -1,7 +1,7 @@
 use crate::input::{Input, InputMatch, Range};
-use crate::proc_macro::{flatten, FlatToken, FlatTokenPat, Span, TokenStream};
+use crate::proc_macro::{flatten, FlatToken, FlatTokenPat, Pat, Span, TokenStream};
 use indexing::{proof::Provable, Container, Index, Unknown};
-use std::ops;
+use std::ops::{self, Deref};
 
 impl Input for TokenStream {
     type Container = Vec<FlatToken>;
@@ -52,8 +52,20 @@ impl Input for TokenStream {
     }
 }
 
-impl InputMatch<[FlatTokenPat<&'_ str>]> for [FlatToken] {
-    fn match_left(&self, pat: &[FlatTokenPat<&str>]) -> Option<usize> {
+// FIXME(eddyb) can't use `Pats: AsRef<[FlatTokenPat<S>]>` as it doesn't constrain `S`.
+impl<S: AsRef<str>, Pats: Deref<Target = [FlatTokenPat<S>]>> InputMatch<Pat<Pats>>
+    for [FlatToken]
+{
+    fn match_left(&self, pat: &Pat<Pats>) -> Option<usize> {
+        self.match_left(&*pat.0)
+    }
+    fn match_right(&self, pat: &Pat<Pats>) -> Option<usize> {
+        self.match_right(&*pat.0)
+    }
+}
+
+impl<S: AsRef<str>> InputMatch<[FlatTokenPat<S>]> for [FlatToken] {
+    fn match_left(&self, pat: &[FlatTokenPat<S>]) -> Option<usize> {
         if self
             .iter()
             .zip(pat)
@@ -66,7 +78,7 @@ impl InputMatch<[FlatTokenPat<&'_ str>]> for [FlatToken] {
             None
         }
     }
-    fn match_right(&self, pat: &[FlatTokenPat<&str>]) -> Option<usize> {
+    fn match_right(&self, pat: &[FlatTokenPat<S>]) -> Option<usize> {
         if self
             .iter()
             .zip(pat)
diff --git a/src/rule.rs b/src/rule.rs
index aafb887..38885f1 100644
--- a/src/rule.rs
+++ b/src/rule.rs
@@ -418,21 +418,39 @@
             Rule::Concat([left, right]) => NodeShape::Split(left, right),
             Rule::Or(ref cases) => NodeShape::Choice(cases.len()),
             Rule::Opt(rule) => NodeShape::Opt(rule),
-            Rule::RepeatMany(elem, sep) => NodeShape::Opt(cx.intern(Rule::RepeatMore(elem, sep))),
-            Rule::RepeatMore(rule, None) => {
-                NodeShape::Split(rule, cx.intern(Rule::RepeatMany(rule, None)))
+            Rule::RepeatMany(..) | Rule::RepeatMore(..) => {
+                NodeShape::Alias(self.expand_repeats(cx))
             }
-            Rule::RepeatMore(elem, Some((sep, SepKind::Simple))) => NodeShape::Split(
-                elem,
-                cx.intern(Rule::Opt(cx.intern(Rule::Concat([sep, self])))),
-            ),
-            Rule::RepeatMore(elem, Some((sep, SepKind::Trailing))) => NodeShape::Split(
+        }
+    }
+
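+    /// Expand a repeat rule into its desugaring: `elem*` becomes `(elem+)?`;
+    /// `elem+` becomes `elem elem*` with no separator, `elem (sep elem+)?`
+    /// with a simple separator, or `elem (sep elem*)?` with a trailing one
+    /// (where the inner `elem*` keeps the trailing separator).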
+    pub fn expand_repeats<Pat: Eq + Hash>(self, cx: &Context<Pat>) -> Self {
+        match cx[self] {
+            Rule::Empty
+            | Rule::Eat(_)
+            | Rule::Call(_)
+            | Rule::Concat(_)
+            | Rule::Or(_)
+            | Rule::Opt(_) => self,
+
+            Rule::RepeatMany(elem, sep) => {
+                cx.intern(Rule::Opt(cx.intern(Rule::RepeatMore(elem, sep))))
+            }
+            Rule::RepeatMore(elem, sep) => cx.intern(Rule::Concat([
                 elem,
-                cx.intern(Rule::Opt(cx.intern(Rule::Concat([
-                    sep,
-                    cx.intern(Rule::RepeatMany(elem, Some((sep, SepKind::Trailing)))),
-                ])))),
-            ),
+                match sep {
+                    None => cx.intern(Rule::RepeatMany(elem, None)),
+                    Some((sep, kind)) => cx.intern(Rule::Opt(cx.intern(Rule::Concat([
+                        sep,
+                        match kind {
+                            SepKind::Simple => self,
+                            SepKind::Trailing => {
+                                cx.intern(Rule::RepeatMany(elem, Some((sep, SepKind::Trailing))))
+                            }
+                        },
+                    ])))),
+                },
+            ])),
         }
     }
diff --git a/src/scannerless.rs b/src/scannerless.rs
index 4139177..9410a08 100644
--- a/src/scannerless.rs
+++ b/src/scannerless.rs
@@ -1,3 +1,4 @@
+use crate::input::InputMatch;
 use crate::rule::{MatchesEmpty, MaybeKnown};
 use std::char;
 use std::fmt;
@@ -98,3 +99,30 @@ impl<S: AsRef<str>> MatchesEmpty for Pat<S> {
         })
     }
 }
+
+impl<S, C: Copy> InputMatch<Pat<S, C>> for str
+where
+    str: InputMatch<S> + InputMatch<std::ops::RangeInclusive<C>>,
+{
+    fn match_left(&self, pat: &Pat<S, C>) -> Option<usize> {
+        match pat {
+            Pat::String(s) => self.match_left(s),
+            &Pat::Range(start, end) => self.match_left(&(start..=end)),
+        }
+    }
+    fn match_right(&self, pat: &Pat<S, C>) -> Option<usize> {
+        match pat {
+            Pat::String(s) => self.match_right(s),
+            &Pat::Range(start, end) => self.match_right(&(start..=end)),
+        }
+    }
+}
+
+impl InputMatch<String> for str {
+    fn match_left(&self, pat: &String) -> Option<usize> {
+        self.match_left(&pat[..])
+    }
+    fn match_right(&self, pat: &String) -> Option<usize> {
+        self.match_right(&pat[..])
+    }
+}
diff --git a/tests/basic.rs b/tests/basic.rs
new file mode 100644
index 0000000..f89023f
--- /dev/null
+++ b/tests/basic.rs
@@ -0,0 +1,238 @@
+#![deny(rust_2018_idioms)]
+
+use std::fs::File;
+
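+// Each case parses its input with the brute-force parser, dumps the parse
+// forest as Graphviz under `target/` for inspection, and compares the
+// pretty-printed forest handle (or parse error) against `$expected`.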
+macro_rules! testcases {
+    ($($name:ident { $($grammar:tt)* }: $($rule:ident($input:expr) => $expected:expr),* ;)*) => {
+        $(#[test]
+        fn $name() {
+            let cx = &grammer::scannerless::Context::new();
+            let grammar = &grammer::lyg::parse(
+                cx,
+                stringify!($($grammar)*).parse::<grammer::proc_macro::TokenStream>().unwrap(),
+            ).unwrap();
+            grammar.check(cx);
+
+            $(
+                let rule = cx.intern(stringify!($rule));
+                let result = grammer::bruteforce::parse(cx, grammar, rule, $input);
+                if let Ok(result) = &result {
+                    result.with(|result| {
+                        result.forest
+                            .dump_graphviz(
+                                Some(result.node),
+                                &mut File::create(concat!(
+                                    env!("CARGO_MANIFEST_DIR"),
+                                    "/target/",
+                                    stringify!($name),
+                                    "-forest.dot"
+                                )).unwrap(),
+                            ).unwrap();
+                    });
+                }
+
+                let result = match &result {
+                    Ok(result) => format!("{:#?}", result),
+                    Err(grammer::parser::ParseError {
+                        at,
+                        expected,
+                    }) => {
+                        format!("{:?}: error: expected {:?}", at, expected)
+                    }
+                };
+
+                assert!(
+                    result == $expected,
+                    "mismatched output, expected:\n{}\n\nfound:\n{}",
+                    $expected,
+                    result
+                );
+            )*
+        })*
+    };
+}
+
+testcases![
+    gll10_g0 {
+        S = X:{ a:A s:S d:"d" } |
+            Y:{ b:B s:S } |
+            Z:{};
+
+        A = A:"a" |
+            C:"c";
+
+        B = A:"a" |
+            B:"b";
+    }:
+    S("aad") => "\
+1:1-1:4 => S {
+    X: 1:1-1:4 => {
+        a: 1:1-1:2 => A {
+            A: 1:1-1:2,
+        },
+        s: 1:2-1:3 => S {
+            Y: 1:2-1:3 => {
+                b: 1:2-1:3 => B {
+                    A: 1:2-1:3,
+                },
+                s: 1:3-1:3 => S {
+                    Z: 1:3-1:3,
+                },
+            },
+        },
+        d: 1:3-1:4,
+    },
+} | S {
+    Y: 1:1-1:4 => {
+        b: 1:1-1:2 => B {
+            A: 1:1-1:2,
+        },
+        s: 1:2-1:4 => S {
+            X: 1:2-1:4 => {
+                a: 1:2-1:3 => A {
+                    A: 1:2-1:3,
+                },
+                s: 1:3-1:3 => S {
+                    Z: 1:3-1:3,
+                },
+                d: 1:3-1:4,
+            },
+        },
+    },
+}",
+// FIXME(eddyb) replace the quotes with backticks and prettify the `expected` list.
+    S("aax") => r#"1:3: error: expected ["a", "b", "c", "d"]"#;
+
+    gll10_g0_opaque {
+        S = { a:A s:S "d" } |
+            { b:B s:S } |
+            {};
+        A = "a" | "c";
+        B = "a" | "b";
+    }:
+    S("aad") => "\
+1:1-1:4 => S {
+    a: 1:1-1:2 => A {},
+    s: 1:2-1:3 => S {
+        b: 1:2-1:3 => B {},
+        s: 1:3-1:3 => S {},
+    },
+} | S {
+    b: 1:1-1:2 => B {},
+    s: 1:2-1:4 => S {
+        a: 1:2-1:3 => A {},
+        s: 1:3-1:3 => S {},
+    },
+}",
+// FIXME(eddyb) replace the quotes with backticks and prettify the `expected` list.
+    S("aax") => r#"1:3: error: expected ["a", "b", "c", "d"]"#;
+
+    gll13_g1 {
+        S = X:{ a:"a" s:S b:"b" } |
+            Y:{ "d" } |
+            Z:{ a:"a" d:"d" b:"b" };
+    }:
+    S("adb") => "\
+1:1-1:4 => S {
+    X: 1:1-1:4 => {
+        a: 1:1-1:2,
+        s: 1:2-1:3 => S {
+            Y: 1:2-1:3,
+        },
+        b: 1:3-1:4,
+    },
+} | S {
+    Z: 1:1-1:4 => {
+        a: 1:1-1:2,
+        d: 1:2-1:3,
+        b: 1:3-1:4,
+    },
+}",
+// FIXME(eddyb) replace the quotes with backticks and prettify the `expected` list.
+    S("aax") => r#"1:3: error: expected ["a", "d"]"#;
+
+    gll15_g0 {
+        A = X:{ a:"a" x:A b:"b" } |
+            Y:{ a:"a" x:A c:"c" } |
+            Z:{ "a" };
+    }:
+    A("aac") => "\
+1:1-1:4 => A {
+    Y: 1:1-1:4 => {
+        a: 1:1-1:2,
+        x: 1:2-1:3 => A {
+            Z: 1:2-1:3,
+        },
+        c: 1:3-1:4,
+    },
+}",
+// FIXME(eddyb) replace the quotes with backticks and prettify the `expected` list.
+    A("aax") => r#"1:3: error: expected ["a", "b", "c"]"#;
+
+    gll15_g0_nested {
+        A = X:{ a:"a" { x:A b:"b" } } |
+            Y:{ a:"a" x:A c:"c" } |
+            Z:{ "a" "" };
+    }:
+    A("aab") => "\
+1:1-1:4 => A {
+    X: 1:1-1:4 => {
+        a: 1:1-1:2,
+        x: 1:2-1:3 => A {
+            Z: 1:2-1:3,
+        },
+        b: 1:3-1:4,
+    },
+}",
+// FIXME(eddyb) replace the quotes with backticks and prettify the `expected` list.
+    A("aax") => r#"1:3: error: expected ["a", "b", "c"]"#;
+
+    repeat_many_trailing {
+        A = elems:"a"* %% "b";
+    }:
+    A("abab") => "\
+1:1-1:5 => A {
+    elems: 1:1-1:5 => [
+        1:1-1:2,
+        1:3-1:4,
+    ],
+}",
+    A("aba") => "\
+1:1-1:4 => A {
+    elems: 1:1-1:4 => [
+        1:1-1:2,
+        1:3-1:4,
+    ],
+}",
+// FIXME(eddyb) replace the quotes with backticks and prettify the `expected` list.
+    A("b") => r#"1:1: error: expected ["a"]"#;
+
+    nested_or {
+        A = x:"x" { a:"a" | b:"b" };
+    }:
+    A("xa") => "\
+1:1-1:3 => A {
+    x: 1:1-1:2,
+    a: 1:2-1:3,
+}",
+// FIXME(eddyb) replace the quotes with backticks and prettify the `expected` list.
+    A("xy") => r#"1:2: error: expected ["a", "b"]"#;
+
+    split_ambiguity {
+        A = a:"x"? b:"x"? c:"x"?;
+    }:
+    A("xx") => "\
+1:1-1:3 => A {
+    ..: 1:1-1:2 => {
+        a: 1:1-1:1,
+        b: 1:1-1:2,
+    } | {
+        a: 1:1-1:2,
+        b: 1:2-1:2,
+    },
+    c: 1:2-1:3,
+} | A {
+    a: 1:1-1:2,
+    b: 1:2-1:3,
+    c: 1:3-1:3,
+}";
+];
diff --git a/tests/json.rs b/tests/json.rs
new file mode 100644
index 0000000..9a7489c
--- /dev/null
+++ b/tests/json.rs
@@ -0,0 +1,162 @@
+#![deny(rust_2018_idioms)]
+
+const GRAMMAR: &str = stringify!(
+    Value =
+        | Null:"null"
+        | False:"false"
+        | True:"true"
+        | Literal:LITERAL
+        | Array:{ "[" elems:Value* % "," "]" }
+        | Object:{ "{" fields:Field* % "," "}" }
+        | InterpolateRust:{ "(" TOKEN_TREE+ ")" }
+    ;
+    Field = name:IDENT ":" value:Value;
+);
+
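+// Shared harness: build the JSON-like grammar by parsing `GRAMMAR` with
+// `lyg` on top of the `proc_macro` builtins, parse `input` as a Rust token
+// stream, and compare the pretty-printed result (spans scrubbed to `?`,
+// since there's no source info here) against `expected`.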
+ A("aax") => r#"1:3: error: expected ["a", "b", "c"]"#; + + repeat_many_trailing { + A = elems:"a"* %% "b"; + }: + A("abab") => "\ +1:1-1:5 => A { + elems: 1:1-1:5 => [ + 1:1-1:2, + 1:3-1:4, + ], +}", + A("aba") => "\ +1:1-1:4 => A { + elems: 1:1-1:4 => [ + 1:1-1:2, + 1:3-1:4, + ], +}", +// FIXME(eddyb) get replace quotes with backticks and pretify the `expected` list. + A("b") => r#"1:1: error: expected ["a"]"#; + + nested_or { + A = x:"x" { a:"a" | b:"b" }; + }: + A("xa") => "\ +1:1-1:3 => A { + x: 1:1-1:2, + a: 1:2-1:3, +}", +// FIXME(eddyb) get replace quotes with backticks and pretify the `expected` list. + A("xy") => r#"1:2: error: expected ["a", "b"]"#; + + split_ambiguity { + A = a:"x"? b:"x"? c:"x"?; + }: + A("xx") => "\ +1:1-1:3 => A { + ..: 1:1-1:2 => { + a: 1:1-1:1, + b: 1:1-1:2, + } | { + a: 1:1-1:2, + b: 1:2-1:2, + }, + c: 1:2-1:3, +} | A { + a: 1:1-1:2, + b: 1:2-1:3, + c: 1:3-1:3, +}"; +]; diff --git a/tests/json.rs b/tests/json.rs new file mode 100644 index 0000000..9a7489c --- /dev/null +++ b/tests/json.rs @@ -0,0 +1,162 @@ +#![deny(rust_2018_idioms)] + +const GRAMMAR: &str = stringify!( + Value = + | Null:"null" + | False:"false" + | True:"true" + | Literal:LITERAL + | Array:{ "[" elems:Value* % "," "]" } + | Object:{ "{" fields:Field* % "," "}" } + | InterpolateRust:{ "(" TOKEN_TREE+ ")" } + ; + Field = name:IDENT ":" value:Value; +); + +fn json_like_testcase(input: &str, expected: &str) { + let cx = &grammer::proc_macro::Context::new(); + let mut grammar = grammer::proc_macro::builtin(cx); + grammar.extend( + grammer::lyg::parse( + cx, + GRAMMAR.parse::().unwrap(), + ) + .unwrap(), + ); + grammar.check(cx); + + let tokens = input.parse::().unwrap(); + + let rule = cx.intern("Value"); + let result = grammer::bruteforce::parse(cx, &grammar, rule, tokens); + let result = match &result { + Ok(result) => format!("{:#?}", result), + Err(grammer::parser::ParseError { at, expected }) => { + format!("{:?}: error: expected {:?}", at, expected) + } + }; + + // HACK(eddyb) clean up the result, as we have no span info. + let result = result + .replace("Span", "?") + .replace("?..? => ", "") + .replace("?..?", "?"); + + assert!( + result == expected, + "mismatched output, expected:\n{}\n\nfound:\n{}", + expected, + result + ); +} + +#[test] +fn json_like_success() { + let input = stringify! { + // Example from `serde_json`. + { + name: "John Doe", + age: 43, + address: { + street: "10 Downing Street", + city: "London" + }, + phones: [ + "+44 1234567", + "+44 2345678" + ], + + test: [null, false, true, (format!("{:?}", Some(1 + 2)))] + } + }; + + let expected = "\ +Value { + Object: { + fields: [ + Field { + name: IDENT {}, + value: Value { + Literal: LITERAL {}, + }, + }, + Field { + name: IDENT {}, + value: Value { + Literal: LITERAL {}, + }, + }, + Field { + name: IDENT {}, + value: Value { + Object: { + fields: [ + Field { + name: IDENT {}, + value: Value { + Literal: LITERAL {}, + }, + }, + Field { + name: IDENT {}, + value: Value { + Literal: LITERAL {}, + }, + }, + ], + }, + }, + }, + Field { + name: IDENT {}, + value: Value { + Array: { + elems: [ + Value { + Literal: LITERAL {}, + }, + Value { + Literal: LITERAL {}, + }, + ], + }, + }, + }, + Field { + name: IDENT {}, + value: Value { + Array: { + elems: [ + Value { + Null: ?, + }, + Value { + False: ?, + }, + Value { + True: ?, + }, + Value { + InterpolateRust: ?, + }, + ], + }, + }, + }, + ], + }, +}"; + + json_like_testcase(input, expected); +} + +#[test] +fn json_like_error() { + let input = stringify! 
+#[test]
+fn json_like_error() {
+    let input = stringify! {
+        stray_identifier
+    };
+
+    let expected = r#"?: error: expected ["(", "[", "{", "false", "null", "true", LITERAL]"#;
+
+    json_like_testcase(input, expected);
+}