7 Parsing
7.1 Lexing
Lexing is the process of converting a program’s textual input into a sequence of tokens. The component that performs this task is called a lexer, tokenizer, or scanner, and the process itself is often referred to as scanning. The tokens produced by the lexer correspond to the terminals of the parser’s context-free grammar (CFG), such as keywords, identifiers, numeric literals, and punctuation symbols. During this stage, the lexer typically ignores or discards whitespace and comments, as they are not needed for syntactic analysis.
We will build a lexer and a parser for a simple expression language. To start, consider a version that supports only single-digit integers and addition. First, we define the token type that represents every token in the alphabet.
type token =
  | Tok_Num of char (* represents a single-digit number *)
  | Tok_Add         (* represents + *)
  | Tok_END         (* represents the end of the token list *)
For example:
tokenize "1 + 2" = [Tok_Num '1'; Tok_Add; Tok_Num '2'; Tok_END]
Here is the full tokenizer code, extended with subtraction, multiplication, parentheses, and multi-digit numbers:
#require "str";;
let re_num = Str.regexp "[1-9][0-9]*";;
let re_add = Str.regexp "+";;
let re_sub = Str.regexp "-";;
let re_lparen = Str.regexp "(";;
let re_rparen = Str.regexp ")";;
let re_mult = Str.regexp "*";;
let re_space = Str.regexp " ";;
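(* Note (illustrative): Str.string_match re s pos tests whether re matches the
   text of s starting at position pos, and Str.matched_string s then returns
   the text that was matched. For example, with re_num above:
     Str.string_match re_num "342 + 1" 0    evaluates to true
     Str.matched_string "342 + 1"           evaluates to "342"
   The tokenizer below uses this pair of calls to find the next token and how
   many characters it spans. *)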
exception ParseError of string
type token =
  | Tok_Num of string (* represents a number *)
  | Tok_Add
  | Tok_Mult
  | Tok_Sub
  | Tok_lparen
  | Tok_rparen
  | Tok_END;;
let tokenize str =
  let rec tok pos s =
    if pos >= String.length s then [ Tok_END ]
    else if Str.string_match re_num s pos then
      let token = Str.matched_string s in
      let len = String.length token in
      Tok_Num token :: tok (pos + len) s
    else if Str.string_match re_space s pos then tok (pos + 1) s
    else if Str.string_match re_add s pos then Tok_Add :: tok (pos + 1) s
    else if Str.string_match re_mult s pos then Tok_Mult :: tok (pos + 1) s
    else if Str.string_match re_sub s pos then Tok_Sub :: tok (pos + 1) s
    else if Str.string_match re_lparen s pos then Tok_lparen :: tok (pos + 1) s
    else if Str.string_match re_rparen s pos then Tok_rparen :: tok (pos + 1) s
    else raise (ParseError "tokenize")
  in
  tok 0 str;;
tokenize "3 * 4 + 2";;
- : token list = [Tok_Num "3"; Tok_Mult; Tok_Num "4"; Tok_Add; Tok_Num "2"; Tok_END]
tokenize "3 *+++";;
- : token list = [Tok_Num "3"; Tok_Mult; Tok_Add; Tok_Add; Tok_Add; Tok_END]
"2+3*4" -> Lexer/Tokenizer/Scanner split the input string into tokens 2, +, 3, *, 4
int, main, (, void, ), { ..... } for3, for,_abc,$abc,
7.2 Parsing
A parser is a program that takes a list of tokens as input, verifies that they conform to a specified grammar or format, and transforms them into an abstract syntax tree (AST).
An AST captures the shape (structure) of a program. Once we have the AST, we can optimize it, evaluate it, or generate machine code from it. For example, consider the context-free grammar
S -> A + S | A
A -> n | (S)
where n stands for a number token. A parser for this grammar takes a token list such as [Tok_Num "3"; Tok_Add; Tok_Num "4"; Tok_Add; Tok_Num "2"; Tok_END] and produces an AST for the expression 3 + 4 + 2.
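To see how a string conforms to this grammar, here is a leftmost derivation of n + n + n (the shape of 3 + 4 + 2):
S -> A + S -> n + S -> n + A + S -> n + n + S -> n + n + A -> n + n + n
A recursive descent parser, shown below, follows exactly this kind of derivation while consuming tokens; for 3 + 4 + 2 it builds the AST Add (Num 3, Add (Num 4, Num 2)).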
7.2.1 Recursive Descent Parser
In practice, parsers are often created using tools known as parser generators. Examples include Yacc, Bison, ANTLR, and LALRPOP. In this lecture, we will focus on the Recursive Descent Parser, a simple type of parser that can be implemented manually.
A Recursive Descent Parser attempts to produce a leftmost derivation of the input. Starting from the start symbol (S) and the sequence of input tokens (t), it repeatedly rewrites S and consumes tokens from t using the grammar’s production rules, continuing until all tokens are matched or a parsing failure occurs.
type expr =
  | Num of int
  | Add of expr * expr
  | Mult of expr * expr

(* lookahead : token list -> token
   returns the head of the token list without consuming it *)
let lookahead tokens =
  match tokens with
  | [] -> raise (ParseError "no tokens")
  | h :: _ -> h

(* match_token : token list -> token -> token list
   consumes the expected token and returns the remaining tokens *)
let match_token (toks : token list) (tok : token) =
  match toks with
  | [] -> raise (ParseError "empty token list")
  | h :: t when h = tok -> t
  | _ :: _ -> raise (ParseError "unexpected token")
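Both helpers work on ordinary token lists, so they are easy to try in the toplevel; the results below are a sketch, assuming the token type from 7.1:
lookahead [Tok_Num "3"; Tok_Add; Tok_END];;
- : token = Tok_Num "3"
match_token [Tok_Add; Tok_Num "4"; Tok_END] Tok_Add;;
- : token list = [Tok_Num "4"; Tok_END]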
(*
   Arithmetic expression grammar:
     S -> A + S | A
     A -> n | (S)

   [Same grammar with tokens]
     S -> A Tok_Add S | A
     A -> Tok_Num | Tok_lparen S Tok_rparen
*)
let rec parse_S tokens =
  let e1, t1 = parse_A tokens in
  match lookahead t1 with
  | Tok_Add ->
      (* S -> A Tok_Add S *)
      let t2 = match_token t1 Tok_Add in
      let e2, t3 = parse_S t2 in
      (Add (e1, e2), t3)
  | _ -> (e1, t1) (* S -> A *)

and parse_A tokens =
  match lookahead tokens with
  (* A -> Tok_Num *)
  | Tok_Num c ->
      let t = match_token tokens (Tok_Num c) in
      (Num (int_of_string c), t)
  (* A -> Tok_lparen S Tok_rparen *)
  | Tok_lparen ->
      let t = match_token tokens Tok_lparen in
      let e2, t2 = parse_S t in
      if lookahead t2 = Tok_rparen then (e2, match_token t2 Tok_rparen)
      else raise (ParseError "parse_A: missing closing paren")
  | _ -> raise (ParseError "parse_A: unexpected token")
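As a quick sanity check, we can feed the output of tokenize from 7.1 into parse_S; the toplevel result below is a sketch of what to expect with the definitions above:
parse_S (tokenize "1 + (2 + 3)");;
- : expr * token list = (Add (Num 1, Add (Num 2, Num 3)), [Tok_END])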
7.3 Recursive Descent Parser Example
OCaml REPL
#require "str";; (* expr : user-defined variant datatype for arithmetic expressions *) type expr = Num of int | Add of expr * expr | Sub of expr * expr | Mult of expr * expr ;; (* function a_to_str : arith -> string converts arithmetic expression into a string *) let rec expr_to_str a = match a with | Num n -> string_of_int n (* from Pervasives *) | Add (a1, a2) -> "(" ^ expr_to_str a1 ^ " + " ^ expr_to_str a2 ^ ")" | Sub (a1, a2) -> "(" ^ expr_to_str a1 ^ " - " ^ expr_to_str a2 ^ ")" | Mult (a1, a2) -> "(" ^ expr_to_str a1 ^ " * " ^ expr_to_str a2 ^ ")" ;; (* finds value of arithmetic expression. always returns (Num n) *) let rec eval e = match e with | Num n -> n | Add (a1, a2) -> ( match (eval a1, eval a2) with n1, n2 -> n1 + n2) | Sub (a1, a2) -> ( match (eval a1, eval a2) with n1, n2 -> n1 - n2) | Mult (a1, a2) -> ( match (eval a1, eval a2) with n1, n2 -> n1 * n2) ;; exception IllegalExpression of string;; exception ParseError of string;; (* Scanner *) type token = | Tok_Num of string | Tok_Add | Tok_Sub | Tok_Mult | Tok_LParen | Tok_RParen | Tok_END ;; let re_num = Str.regexp "[0-9]+" (* single digit number *);; let re_add = Str.regexp "+";; let re_sub = Str.regexp "-";; let re_mult = Str.regexp "*";; let re_lparen = Str.regexp "(";; let re_rparen = Str.regexp ")";; let re_space = Str.regexp " ";; (*---------------------------------------------------------- function tokenize : string -> token list converts string into a list of tokens *) let tokenize str = let rec tok pos s = if pos >= String.length s then [ Tok_END ] else if Str.string_match re_num s pos then let token = Str.matched_string s in let len = String.length token in Tok_Num token :: tok (pos + len) s else if Str.string_match re_space s pos then tok (pos + 1) s else if Str.string_match re_add s pos then Tok_Add :: tok (pos + 1) s else if Str.string_match re_sub s pos then Tok_Sub :: tok (pos + 1) s else if Str.string_match re_mult s pos then Tok_Mult :: tok (pos + 1) s else if Str.string_match re_lparen s pos then Tok_LParen :: tok (pos + 1) s else if Str.string_match re_rparen s pos then Tok_RParen :: tok (pos + 1) s else raise (IllegalExpression "tokenize") in tok 0 str ;; (* function tok_to_str : token -> string converts token into a string *) let string1_of_token t = match t with | Tok_Num v -> "Tok_Num " ^ v | Tok_Add -> "Tok_Add" | Tok_Sub -> "Tok_Sub" | Tok_Mult -> "Tok_Mult" | Tok_LParen -> "Tok_LParen" | Tok_RParen -> "Tok_RParen" | Tok_END -> "END" ;; let string2_of_token t = match t with | Tok_Num v -> v (*(Char.escaped v)*) | Tok_Add -> "+" | Tok_Sub -> "-" | Tok_Mult -> "*" | Tok_LParen -> "(" | Tok_RParen -> ")" | Tok_END -> "END" ;; (* Parser *) (* function lookahead : token list -> (token * token list) Returns tuple of head of token list & tail of token list *) let lookahead tokens = match tokens with [] -> raise (ParseError "no tokens") | h :: t -> h let match_token (toks : token list) (tok : token) = match toks with | [] -> raise (ParseError (string1_of_token tok)) | h :: t when h = tok -> t | h :: _ -> raise (ParseError (string1_of_token tok)) ;; (* recursive descent parser Returns tuple of ast & token list for remainder of string Arithmetic expression grammar: S -> A + S | A - S| A A -> B * A | B B -> n | (S) [Basic grammar withtokens] S -> A Tok_Add S | A A A -> B Tok_Mult A | B B -> Tok_Num | Tok_LParen S Tok_RParen *) let rec parse_S tokens = let e1, t1 = parse_A tokens in match lookahead t1 with | Tok_Add -> (* S -> A Tok_Add E *) let t2 = match_token t1 Tok_Add in let e2, t3 = parse_S t2 in (Add 
(e1, e2), t3) | Tok_Sub -> (* S -> A Tok_Add E *) let t2 = match_token t1 Tok_Sub in let e2, t3 = parse_S t2 in (Sub (e1, e2), t3) | _ -> (e1, t1) (* S -> A *) and parse_A tokens = let e1, tokens = parse_B tokens in match lookahead tokens with | Tok_Mult -> (* A -> B Tok_Mult A *) let t2 = match_token tokens Tok_Mult in let e2, t3 = parse_A t2 in (Mult (e1, e2), t3) | _ -> (e1, tokens) and parse_B tokens = match lookahead tokens with (* B -> Tok_Num *) | Tok_Num c -> let t = match_token tokens (Tok_Num c) in (Num (int_of_string c), t) (* B -> Tok_LParen E Tok_RParen *) | Tok_LParen -> let t = match_token tokens Tok_LParen in let e2, t2 = parse_S t in let _ = Printf.printf "%s\n" (string1_of_token (lookahead tokens)) in if lookahead t2 = Tok_RParen then (e2, match_token t2 Tok_RParen) else raise (ParseError "parse_B 1") | _ -> raise (ParseError "parse_B 2") ;; (* function eval_str : given string, parse string, build AST, evaluate value of AST *) let pp = Printf.printf;; (* function eval_str : given string, parse string, build AST, evaluate value of AST *) let eval_str str = let tokens = tokenize str in pp "Input token list = ["; List.iter (fun x -> pp "%s;" (string1_of_token x)) tokens; pp "]\n"; let (a, t) = parse_S tokens in if t <> [ Tok_END ] then raise (IllegalExpression "parse_S"); pp "AST produced = %s\n" (expr_to_str a); let v = eval a in pp "Value of AST = %d\n" v; ;; eval_str "1*2*3-4*5*6";; eval_str "1+2+3*4*5+6";; eval_str "1+(2+3)*4*5+6";; eval_str "100 * (10 + 20)";; (* eval_str "(2^5)*2";; error *) (* eval_str "1++12" error *) # # type expr = Num of int | Add of expr * expr | Sub of expr * expr | Mult of expr * expr # val expr_to_str : expr -> string = <fun> # val eval : expr -> int = <fun> # exception IllegalExpression of string # exception ParseError of string # type token = Tok_Num of string | Tok_Add | Tok_Sub | Tok_Mult | Tok_LParen | Tok_RParen | Tok_END # val re_num : Str.regexp = <abstr> # val re_add : Str.regexp = <abstr> # val re_sub : Str.regexp = <abstr> # val re_mult : Str.regexp = <abstr> # val re_lparen : Str.regexp = <abstr> # val re_rparen : Str.regexp = <abstr> # val re_space : Str.regexp = <abstr> # val tokenize : string -> token list = <fun> # val string1_of_token : token -> string = <fun> # val string2_of_token : token -> string = <fun> # val lookahead : 'a list -> 'a = <fun> val match_token : token list -> token -> token list = <fun> # val parse_S : token list -> expr * token list = <fun> val parse_A : token list -> expr * token list = <fun> val parse_B : token list -> expr * token list = <fun> # val pp : ('a, out_channel, unit) format -> 'a = <fun> # val eval_str : string -> unit = <fun> # Input token list = [Tok_Num 1;Tok_Mult;Tok_Num 2;Tok_Mult;Tok_Num 3;Tok_Sub;Tok_Num 4;Tok_Mult;Tok_Num 5;Tok_Mult;Tok_Num 6;END;] AST produced = ((1 * (2 * 3)) - (4 * (5 * 6))) Value of AST = -114 - : unit = () # Input token list = [Tok_Num 1;Tok_Add;Tok_Num 2;Tok_Add;Tok_Num 3;Tok_Mult;Tok_Num 4;Tok_Mult;Tok_Num 5;Tok_Add;Tok_Num 6;END;] AST produced = (1 + (2 + ((3 * (4 * 5)) + 6))) Value of AST = 69 - : unit = () # Input token list = [Tok_Num 1;Tok_Add;Tok_LParen;Tok_Num 2;Tok_Add;Tok_Num 3;Tok_RParen;Tok_Mult;Tok_Num 4;Tok_Mult;Tok_Num 5;Tok_Add;Tok_Num 6;END;] Tok_LParen AST produced = (1 + (((2 + 3) * (4 * 5)) + 6)) Value of AST = 107 - : unit = () # Input token list = [Tok_Num 100;Tok_Mult;Tok_LParen;Tok_Num 10;Tok_Add;Tok_Num 20;Tok_RParen;END;] Tok_LParen AST produced = (100 * (10 + 20)) Value of AST = 3000 - : unit = () #
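Notice that the grammar's right-recursive productions (S -> A + S | A - S) make both addition and subtraction group to the right, so subtraction does not match standard arithmetic notation. A sketch of what eval_str would print for such an input, assuming the definitions above:
eval_str "1-2-3";;
Input token list = [Tok_Num 1;Tok_Sub;Tok_Num 2;Tok_Sub;Tok_Num 3;END;]
AST produced = (1 - (2 - 3))
Value of AST = 2
- : unit = ()
Conventional left-to-right evaluation of 1 - 2 - 3 would give (1 - 2) - 3 = -4; obtaining that grouping requires a left-associative grammar or a restructured parser.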