This is a simple example of LL(1) parsing, by hand, in O'Caml and C++. The O'Caml version is the original, and has better comments.
LL(1) parsing is a top-down (analyse descendante en Français) parsing technique, also known as recursive descent (indeed in our example there is no explicit stack -- instead the language's call stack is used). Some interesting online references about parsing in general, and LL(1) in particular are Pierre-Yves Schobbens's slides (in French), and the online book, Parsing Techniques - A Practical Guide (in English).
Download the O'Caml and C++ code. Underneath, the parsing part is shown. The lexer is not shown, look at the code if you want to see it (it is a very simple module with 3 public functions: create_from_string, peek, and next).
1. original (ambiguous) grammar, for elementary arithmetic expressions:
E -> ( E ) { $2 }
E -> E + E { $1 + $3 }
E -> E * E { $1 * $3 }
E -> int { $1 }
with the tokens: int = [0-9]+, (, ), +, *
2. the same without left recursion (we don't bother with actions at this point):
E -> ( E )
E -> int
E -> int E'
E' -> + E
E' -> * E
E' -> + E E'
E' -> * E E'
3. now with factored right parts (we also re-added actions):
E -> ( E ) { $2 }
E -> int F { let (op,v) = $2 in op $1 $2 }
F -> { ((+), 0) }
F -> E' { $1 }
E' -> + E F { let (op,v) = $3 in ((+), op $2 v) }
E' -> * E F { let (op,v) = $3 in ((+), op $2 v) }
First sets:
First(E) = { (, int }
First(F) = { +, * }
First(E') = { +, * }
Follow sets (with a pseudo-token, EOF = end of file (EOF is also often marked as $)):
Follow(E) = { ), +, *, EOF }
Follow(F) = { ), +, *, EOF }
Follow(E') = { ), +, *, EOF }
Nullables = { F }
open Token
open Ll1_common
let rec parse_E stream = match Tokenizer.peek stream with
(* E -> ( E ) *)
| LPar ->
consume stream LPar;
let e = parse_E stream in
consume stream RPar; e
(* E -> int F *)
| Int(i) -> consume stream (Int(i)); let (op,v) = parse_F stream in op i v
| _ as x -> raise (Parse_error (unexpected_msg x))
(* there is a LL(1) conflict here on Plus and Times, that we will resolve in favor of F -> E' *)
and parse_F stream = match Tokenizer.peek stream with
(* F -> *)
| RPar | EOF -> (+), 0
(* F -> E' *)
| Plus | Times -> parse_E' stream
| _ as x -> raise (Parse_error (unexpected_msg x))
and parse_E' stream = match Tokenizer.peek stream with
(* E' -> + E F *)
| Plus ->
consume stream Plus;
let e = parse_E stream in
let (op,v) = parse_F stream in
(+), (op e v)
(* E' -> * E F *)
| Times ->
consume stream Times;
let e = parse_E stream in
let (op,v) = parse_F stream in
( * ), (op e v)
| _ as x -> raise (Parse_error (unexpected_msg x))
(* allows to test the parser on a string *)
let test_parse s =
let t = Tokenizer.create_from_string s in
parse_E t
Now we have a problem: the previous grammar will evaluate
2 * 3 + 2 as 2 * ( 3 + 2 ) = 10 and
2 + 3 * 2 as 2 + ( 3 * 2 ) = 8
ie it always associates from the right, not taking precedences into account.
To fix this, we can rework 1 as (there are many possibilities):
1'. 1 with precedences fixed:
E -> Term { $1 + $3 }
Term -> Factor { $1 }
Term -> Term + Term { $1 + $3 }
Factor -> int { $1 }
Factor -> ( E ) { $2 }
Factor -> Factor * Factor { $1 * $3 }
(note: this is still ambiguous wrt 2 + 2 + 3 =? (2+2) + 3 or 2 + (2+3))
2'. without left recursion (and without actions at this point):
E -> Term
Term -> Factor
Term -> Factor Term'
Term' -> + Term
Term' -> + Term Term'
Factor -> int
Factor -> ( E )
Factor -> int Factor'
Factor -> ( E ) Factor'
Factor' -> * Factor
Factor' -> * Factor Factor'
3'. now with common right part prefixes factored out, and actions back in:
E -> Term { $1 }
Term -> Factor MaybeTerm' { let (op,v)=$2 in op $1 v) }
MaybeTerm' -> { (+), 0 }
MaybeTerm' -> Term' { $1 }
Term' -> + Term MaybeTerm' { let (op,v)=$3 in (+), (op $2 v) }
Factor -> int MaybeFactor' { let (op,v)=$2 in op $1 v }
Factor -> ( E ) MaybeFactor' { let (op,v)=$4 in op $2 v }
MaybeFactor' -> { ( * ), 1 }
MaybeFactor' -> Factor' { $1 }
Factor' -> * Factor MaybeFactor' { let (op,v)=$3 in ( * ),(op $2 v) }
First sets:
First(E) = { int, ( }
First(Term) = { int, ( }
First(Term') = { + }
First(MaybeTerm') = { + }
First(Factor) = { int, ( }
First(Factor') = { * }
First(MaybeFactor') = { * }
Follow sets (with a pseudo-token, EOF = end of file):
Follow(E) = { ), EOF }
Follow(Term) = { +, ), EOF }
Follow(Term') = { +, ), EOF }
Follow(MaybeTerm') = { +, ), EOF }
Follow(Factor) = { +, ), EOF }
Follow(Factor') = { +, ), EOF }
Follow(MaybeFactor') = { +, ), EOF }
Nullables = { MaybeTerm', MaybeFactor' }
open Token
open Ll1_common
(* a little sugar for this implementation, to aid debugging *)
let unexp nt tok =
raise (Parse_error (nt ^ ": " ^ (unexpected_msg tok)))
let parse name stream f =
Printf.printf "parsing: %s\n" name;
f (Tokenizer.peek stream)
let rec parse_E2 stream = parse "E2" stream (function
| LPar | Int(_) -> parse_Term stream
| _ as x -> unexp "E2" x)
and parse_Term stream = parse "Term" stream (function
| LPar | Int(_) ->
let f = parse_Factor stream in
let (op,v) = parse_MaybeTerm' stream in
op f v
| _ as x -> unexp "Term" x)
(* here we have an ambiguity on +, which is part of both Follow(MaybeTerm') and First(Term')
we choose to resolve it by using the "MaybeTerm' -> Term" rule
this means that this grammar is not LL(1) -- however, by making an educated choice, we cope without modifying it
(we could modify the grammar, but this is not worth the trouble)
*)
and parse_MaybeTerm' stream = parse "MaybeTerm'" stream (function
| RPar | EOF -> (+), 0
| Plus -> parse_Term' stream
| _ as x -> unexp "MaybeTerm'" x)
and parse_Term' stream = parse "Term'" stream (function
| Plus ->
consume stream Plus;
let t = parse_Term stream in
let (op,v) = parse_MaybeTerm' stream in
(+), (op t v)
| _ as x -> unexp "Term'" x)
and parse_Factor stream = parse "Factor" stream (function
| Int(i) ->
consume stream (Int(i));
let (op,v) = parse_MaybeFactor' stream in
op i v
| LPar ->
consume stream LPar;
let e = parse_E2 stream in
consume stream RPar;
let (op,v) = parse_MaybeFactor' stream in
op e v
| _ as x -> unexp "Factor" x)
and parse_MaybeFactor' stream = parse "MaybeFactor'" stream (function
| Plus | RPar | EOF -> ( * ), 1
| Times -> parse_Factor' stream
| _ as x -> unexp "MaybeFactor'" x)
and parse_Factor' stream = parse "Factor'" stream (function
| Times ->
consume stream Times;
let f = parse_Factor stream in
let (op,v) = parse_MaybeFactor' stream in
( * ), (op f v)
| _ as x -> unexp "Factor'" x)
let test_parse2 s =
let t = Tokenizer.create_from_string s in
parse_E2 t