title: Learn peg.js for fun and profit author: name: sudodoki email: [email protected] github: sudodoki twitter: sudodoki theme: sudodoki/reveal-cleaver-theme output: index.html
--
--
<style> .reveal code { // font-family: "Joystix"; font-size: 42px; line-height: 46px; } </style>--
--
Let user type SQL-like queries.
<COLUMN> <KEYWORD> <ARGS>
--
--
--
- theory: common ground & vocabulary
- PEG.js: not a silver bullet, but it works
--
--
- structured – Parsing
- unstructured – NLP
--
- Spec comes out in 1960
- Ned Irons publishes ALGOL parser, based on formal notation
--
--
↓
↓
--
- languages / dsl / data format / custom protocol
- data mining / parsing
- reading language specs --
- T – terminals
- N – non-terminals
- S – root
- R – production rules
--
Letter = 'A' / 'B' / … / 'Z'
Digit = [0-9]
Integer = Digit Integer / Digit
--
<Name> = [A-Z] [a-z]+
<Name> = [A-Z] <Letters>
<Name> = [A-Za-z] <Name>
- …
--
<COLUMN, "Id">
<KEYWORD, "=">
<ARG, 1>
--
--
--
- Bottom-up
- Top-down
- [Universal]
--
Start = Term
Digit = [0-9]
Sign = '-' / '+'
Term = Digit Sign Term / Digit
--
--
--
--
Vanilla JS Recursive descent~ish
--
let result = [];
let curIndex = 0;
const descend = (input) => {
term(input);
return result;
}
--
const term = (input) => {
result.push(['term']);
digit(input);
if (input[curIndex + 1]) {
sign(input);
term(input);
}
}
--
const sign = (input) => {
const curChar = input[curIndex];
if (isSign(curChar)) {
result.push(['sign', curChar])
curIndex += 1;
} else {
throw new
SyntaxError(`${curChar||'∅'}\
instead of sign`); } }
--
const isSign = (c) =>
['+', '-'].includes(c)
const isDigit = (c) =>
'0123456789'
.split('')
.includes(c);
--
const parsed = JSON.stringify(descend("2+3-4"))
console.log('parsed: ', parsed)
// parsed: [["term"],["digit","2"]
// ["sign","+"],["term"],["digit","3"]
// ["sign","-"],["term"],["digit","4"]]
--
--
--
- LL / LR / LALR / SLR
- SAX
- PEG & linear time => http://bford.info/packrat/
- Formal grammars and Automatons
--
--
by @dmajda
--
- when simple regexp is not good enough
- keeping some context / etc
- loose syntax / casing
--
npm i pegjs chokidar-cli
// in package.json scripts
chokidar 'parens.pegjs'
--initial
--silent
-c 'pegjs -o
parens-parser.js
parens.pegjs'
--
- Terms / Non-terms
- "literals" / 'literals'
.
- any char- [a-z] [0-9] [\u00C0-\u10FFFF] char ranges
*
/+
/?
/()
--
start = group
lpar = '('
rpar = ')'
nothing = ''
group =
lpar group rpar
/ group group
/ ''
--
--
--
--
group =
lpar group rpar
/ group group
/ ''
--
group = lpar rest
/ lpar group rpar
/ ''
rest = ')' group
/ ')'
--
start = FLOAT
SIGN = "-" / "+"
DIGIT = [0-9]
INTEGER =
d:DIGIT int:INTEGER
{ /* other side effect */
return d + int }
/ d:DIGIT
{ return d }
--
{
const notNull = (thing) =>
thing != null
}
--
parser.parse(string, options);
--
FLOAT = sign:SIGN?
whole:INTEGER? "."?
fraction:INTEGER? &{
return notNull(whole)
|| notNull(fraction)
}
{ return { sign,
whole,
fraction } }
--
- case sensitivity
- loose syntax
- distinguishing lexemes
- AST
--
- in terms of building parse tree
- in terms of deciding token's type
--
"ID" >= (20)
ID less than 20
--
// level2 * /
// level1 + -
start = level1
level1 =
level2 ([+-] level2)+
/ level2
level2 =
level0 ([*/] level0)+
/ level0
level0 = number / "(" level1 ")"
number = [0-9]+
--
- Sequential
- Non-sequential
--
{
column: { name, location, type }
keyword: { value, location }
argument: [ { value, location }]
}
--
{
complete: true,
sequence: [
{ column: { name, location, type }},
{ whitespace: true },
{ keyword: { value, location, type }},
{ whitespace: true },
{ argument: [{ value, location }]}
]
}
--
- offset
- line
- column
--
- Syntax error
- Semantic error
--
is a valid expression
--
--
- ditching peg.js
- using predicates and complete / non-complete attrs
- swallowing parse errors and trying to figure out how lethal it is
- incorporating 'incomplete' grammar
--
progressiveExpression =
column:column whitespace keyword:keyword whitespace* argument:inprogArgumentList
{ return { complete: false, sequence: [makeColumn(column), makeWhitespace(),
makeKeyword(keyword), makeWhitespace(), makeArgument(argument) ] }}
/ column:column whitespace keyword:keyword whitespace+
{ return { complete: false, sequence: [makeColumn(column), makeWhitespace(),
makeKeyword(keyword), makeWhitespace()] }}
… 3 redacted
/ column:column { return { complete: false, sequence: [makeColumn(column)] }}
/ inprogColumn
--
- manipulating AST
- stringifying
--
equalOp = ['equal']i 's'i?
--
--
--
- Compilers: Principles, Techniques, and Tools
- Nathan's University PL101: Parsing
- Parsing: a timeline
- Chomsky Hierarchy for Languages
- Compiler Design - Top-Down Parser
- Top-down vs bottom-up
- CYK Parsing algorithm
- You could have invented Parser Combinators
- Earley Parsing Explained
--