7 Parsing
7.1 Lexing
Lexing is the process of converting a program’s textual input into a sequence of tokens. The component that performs this task is called a lexer, tokenizer, or scanner, and the process itself is often referred to as scanning. The tokens produced by the lexer correspond to the terminals of the parser’s context-free grammar (CFG), such as keywords, identifiers, numeric literals, and punctuation symbols. During this stage, the lexer typically ignores or discards whitespace and comments, as they are not needed for syntactic analysis.
We will build a lexer and a parser for a simple expression language. To start, consider a version that supports only single-digit integers and addition. First, we define the token type that represents every token in the alphabet.
type token =
  | Tok_Num of char (* represents a single-digit number *)
  | Tok_Add         (* represents + *)
  | Tok_END         (* represents the end of the token list *)
For example:
tokenize "1 + 2" = [Tok_Num '1'; Tok_Add; Tok_Num '2'; Tok_END]
Here is the full tokenizer code, extended with subtraction, multiplication, parentheses, and multi-digit numbers:
#require "str";;
let re_num = Str.regexp "[1-9][0-9]*";;
let re_add = Str.regexp "+";;
let re_sub = Str.regexp "-";;
let re_lparen = Str.regexp "(";;
let re_rparen = Str.regexp ")";;
let re_mult = Str.regexp "*";;
let re_space = Str.regexp " ";;
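(* Note (illustrative): Str.string_match re s pos tests whether re matches the
   text of s starting at position pos, and Str.matched_string s then returns
   the text that was matched. For example, with re_num above:
     Str.string_match re_num "342 + 1" 0    evaluates to true
     Str.matched_string "342 + 1"           evaluates to "342"
   The tokenizer below uses this pair of calls to find the next token and how
   many characters it spans. *)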
exception ParseError of string
type token =
  | Tok_Num of string (* represents a number *)
  | Tok_Add
  | Tok_Mult
  | Tok_Sub
  | Tok_lparen
  | Tok_rparen
  | Tok_END;;
let tokenize str =
  let rec tok pos s =
    if pos >= String.length s then [ Tok_END ]
    else if Str.string_match re_num s pos then
      let token = Str.matched_string s in
      let len = String.length token in
      Tok_Num token :: tok (pos + len) s
    else if Str.string_match re_space s pos then tok (pos + 1) s
    else if Str.string_match re_add s pos then Tok_Add :: tok (pos + 1) s
    else if Str.string_match re_mult s pos then Tok_Mult :: tok (pos + 1) s
    else if Str.string_match re_sub s pos then Tok_Sub :: tok (pos + 1) s
    else if Str.string_match re_lparen s pos then Tok_lparen :: tok (pos + 1) s
    else if Str.string_match re_rparen s pos then Tok_rparen :: tok (pos + 1) s
    else raise (ParseError "tokenize")
  in
  tok 0 str;;
tokenize "3 * 4 + 2";;
- : token list = [Tok_Num "3"; Tok_Mult; Tok_Num "4"; Tok_Add; Tok_Num "2"; Tok_END]
tokenize "3 *+++";;
- : token list = [Tok_Num "3"; Tok_Mult; Tok_Add; Tok_Add; Tok_Add; Tok_END]
"2+3*4" -> Lexer/Tokenizer/Scanner split the input string into tokens 2, +, 3, *, 4
int, main, (, void, ), { ..... } for3, for,_abc,$abc,
7.2 Parsing
A parser is a program that takes a list of tokens as input, verifies that they conform to a specified grammar or format, and transforms them into an abstract syntax tree (AST).
An AST captures the shape (structure) of a program. Once we have the AST, we can optimize it, evaluate it, or generate machine code from it. For example, consider the context-free grammar
S -> A + S | A
A -> n | (S)
where n stands for a number token. A parser for this grammar takes a token list such as [Tok_Num "3"; Tok_Add; Tok_Num "4"; Tok_Add; Tok_Num "2"; Tok_END] and produces an AST for the expression 3 + 4 + 2.
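To see how a string conforms to this grammar, here is a leftmost derivation of n + n + n (the shape of 3 + 4 + 2):
S -> A + S -> n + S -> n + A + S -> n + n + S -> n + n + A -> n + n + n
A recursive descent parser, shown below, follows exactly this kind of derivation while consuming tokens; for 3 + 4 + 2 it builds the AST Add (Num 3, Add (Num 4, Num 2)).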
7.2.1 Recursive Descent Parser
In practice, parsers are often created using tools known as parser generators. Examples include Yacc, Bison, ANTLR, and LALRPOP. In this lecture, we will focus on the Recursive Descent Parser, a simple type of parser that can be implemented manually.
A Recursive Descent Parser attempts to produce a leftmost derivation of the input. Starting from the start symbol (S) and the sequence of input tokens (t), it repeatedly rewrites S and consumes tokens from t using the grammar’s production rules, continuing until all tokens are matched or a parsing failure occurs.
type expr =
  | Num of int
  | Add of expr * expr
  | Mult of expr * expr

(* lookahead : token list -> token
   returns the head of the token list without consuming it *)
let lookahead tokens =
  match tokens with
  | [] -> raise (ParseError "no tokens")
  | h :: _ -> h

(* match_token : token list -> token -> token list
   consumes the expected token and returns the remaining tokens *)
let match_token (toks : token list) (tok : token) =
  match toks with
  | [] -> raise (ParseError "empty token list")
  | h :: t when h = tok -> t
  | _ :: _ -> raise (ParseError "unexpected token")
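Both helpers work on ordinary token lists, so they are easy to try in the toplevel; the results below are a sketch, assuming the token type from 7.1:
lookahead [Tok_Num "3"; Tok_Add; Tok_END];;
- : token = Tok_Num "3"
match_token [Tok_Add; Tok_Num "4"; Tok_END] Tok_Add;;
- : token list = [Tok_Num "4"; Tok_END]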
(*
   Arithmetic expression grammar:
     S -> A + S | A
     A -> n | (S)

   [Same grammar with tokens]
     S -> A Tok_Add S | A
     A -> Tok_Num | Tok_lparen S Tok_rparen
*)
let rec parse_S tokens =
  let e1, t1 = parse_A tokens in
  match lookahead t1 with
  | Tok_Add ->
      (* S -> A Tok_Add S *)
      let t2 = match_token t1 Tok_Add in
      let e2, t3 = parse_S t2 in
      (Add (e1, e2), t3)
  | _ -> (e1, t1) (* S -> A *)

and parse_A tokens =
  match lookahead tokens with
  (* A -> Tok_Num *)
  | Tok_Num c ->
      let t = match_token tokens (Tok_Num c) in
      (Num (int_of_string c), t)
  (* A -> Tok_lparen S Tok_rparen *)
  | Tok_lparen ->
      let t = match_token tokens Tok_lparen in
      let e2, t2 = parse_S t in
      if lookahead t2 = Tok_rparen then (e2, match_token t2 Tok_rparen)
      else raise (ParseError "parse_A: missing closing paren")
  | _ -> raise (ParseError "parse_A: unexpected token")
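As a quick sanity check, we can feed the output of tokenize from 7.1 into parse_S; the toplevel result below is a sketch of what to expect with the definitions above:
parse_S (tokenize "1 + (2 + 3)");;
- : expr * token list = (Add (Num 1, Add (Num 2, Num 3)), [Tok_END])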
7.3 Recursive Descent Parser Example
OCaml REPL
#require "str";; (* expr : user-defined variant datatype for arithmetic expressions *) type expr = Num of int | Add of expr * expr | Sub of expr * expr | Mult of expr * expr ;; (* function a_to_str : arith -> string converts arithmetic expression into a string *) let rec expr_to_str a = match a with | Num n -> string_of_int n (* from Pervasives *) | Add (a1, a2) -> "(" ^ expr_to_str a1 ^ " + " ^ expr_to_str a2 ^ ")" | Sub (a1, a2) -> "(" ^ expr_to_str a1 ^ " - " ^ expr_to_str a2 ^ ")" | Mult (a1, a2) -> "(" ^ expr_to_str a1 ^ " * " ^ expr_to_str a2 ^ ")" ;; (* finds value of arithmetic expression. always returns (Num n) *) let rec eval e = match e with | Num n -> n | Add (a1, a2) -> ( match (eval a1, eval a2) with n1, n2 -> n1 + n2) | Sub (a1, a2) -> ( match (eval a1, eval a2) with n1, n2 -> n1 - n2) | Mult (a1, a2) -> ( match (eval a1, eval a2) with n1, n2 -> n1 * n2) ;; exception IllegalExpression of string;; exception ParseError of string;; (* Scanner *) type token = | Tok_Num of string | Tok_Add | Tok_Sub | Tok_Mult | Tok_LParen | Tok_RParen | Tok_END ;; let re_num = Str.regexp "[0-9]+" (* single digit number *);; let re_add = Str.regexp "+";; let re_sub = Str.regexp "-";; let re_mult = Str.regexp "*";; let re_lparen = Str.regexp "(";; let re_rparen = Str.regexp ")";; let re_space = Str.regexp " ";; (*---------------------------------------------------------- function tokenize : string -> token list converts string into a list of tokens *) let tokenize str = let rec tok pos s = if pos >= String.length s then [ Tok_END ] else if Str.string_match re_num s pos then let token = Str.matched_string s in let len = String.length token in Tok_Num token :: tok (pos + len) s else if Str.string_match re_space s pos then tok (pos + 1) s else if Str.string_match re_add s pos then Tok_Add :: tok (pos + 1) s else if Str.string_match re_sub s pos then Tok_Sub :: tok (pos + 1) s else if Str.string_match re_mult s pos then Tok_Mult :: tok (pos + 1) s else if Str.string_match re_lparen s pos then Tok_LParen :: tok (pos + 1) s else if Str.string_match re_rparen s pos then Tok_RParen :: tok (pos + 1) s else raise (IllegalExpression "tokenize") in tok 0 str ;; (* function tok_to_str : token -> string converts token into a string *) let string1_of_token t = match t with | Tok_Num v -> "Tok_Num " ^ v | Tok_Add -> "Tok_Add" | Tok_Sub -> "Tok_Sub" | Tok_Mult -> "Tok_Mult" | Tok_LParen -> "Tok_LParen" | Tok_RParen -> "Tok_RParen" | Tok_END -> "END" ;; let string2_of_token t = match t with | Tok_Num v -> v (*(Char.escaped v)*) | Tok_Add -> "+" | Tok_Sub -> "-" | Tok_Mult -> "*" | Tok_LParen -> "(" | Tok_RParen -> ")" | Tok_END -> "END" ;; (* Parser *) (* function lookahead : token list -> (token * token list) Returns tuple of head of token list & tail of token list *) let lookahead tokens = match tokens with [] -> raise (ParseError "no tokens") | h :: t -> h let match_token (toks : token list) (tok : token) = match toks with | [] -> raise (ParseError (string1_of_token tok)) | h :: t when h = tok -> t | h :: _ -> raise (ParseError (string1_of_token tok)) ;; (* recursive descent parser Returns tuple of ast & token list for remainder of string Arithmetic expression grammar: S -> A + S | A - S| A A -> B * A | B B -> n | (S) [Basic grammar withtokens] S -> A Tok_Add S | A A A -> B Tok_Mult A | B B -> Tok_Num | Tok_LParen S Tok_RParen *) let rec parse_S tokens = let e1, t1 = parse_A tokens in match lookahead t1 with | Tok_Add -> (* S -> A Tok_Add E *) let t2 = match_token t1 Tok_Add in let e2, t3 = parse_S t2 in (Add 
(e1, e2), t3) | Tok_Sub -> (* S -> A Tok_Add E *) let t2 = match_token t1 Tok_Sub in let e2, t3 = parse_S t2 in (Sub (e1, e2), t3) | _ -> (e1, t1) (* S -> A *) and parse_A tokens = let e1, tokens = parse_B tokens in match lookahead tokens with | Tok_Mult -> (* A -> B Tok_Mult A *) let t2 = match_token tokens Tok_Mult in let e2, t3 = parse_A t2 in (Mult (e1, e2), t3) | _ -> (e1, tokens) and parse_B tokens = match lookahead tokens with (* B -> Tok_Num *) | Tok_Num c -> let t = match_token tokens (Tok_Num c) in (Num (int_of_string c), t) (* B -> Tok_LParen E Tok_RParen *) | Tok_LParen -> let t = match_token tokens Tok_LParen in let e2, t2 = parse_S t in let _ = Printf.printf "%s\n" (string1_of_token (lookahead tokens)) in if lookahead t2 = Tok_RParen then (e2, match_token t2 Tok_RParen) else raise (ParseError "parse_B 1") | _ -> raise (ParseError "parse_B 2") ;; (* function eval_str : given string, parse string, build AST, evaluate value of AST *) let pp = Printf.printf;; (* function eval_str : given string, parse string, build AST, evaluate value of AST *) let eval_str str = let tokens = tokenize str in pp "Input token list = ["; List.iter (fun x -> pp "%s;" (string1_of_token x)) tokens; pp "]\n"; let (a, t) = parse_S tokens in if t <> [ Tok_END ] then raise (IllegalExpression "parse_S"); pp "AST produced = %s\n" (expr_to_str a); let v = eval a in pp "Value of AST = %d\n" v; ;; eval_str "1*2*3-4*5*6";; eval_str "1+2+3*4*5+6";; eval_str "1+(2+3)*4*5+6";; eval_str "100 * (10 + 20)";; (* eval_str "(2^5)*2";; error *) (* eval_str "1++12" error *) # # type expr = Num of int | Add of expr * expr | Sub of expr * expr | Mult of expr * expr # val expr_to_str : expr -> string = <fun> # val eval : expr -> int = <fun> # exception IllegalExpression of string # exception ParseError of string # type token = Tok_Num of string | Tok_Add | Tok_Sub | Tok_Mult | Tok_LParen | Tok_RParen | Tok_END # val re_num : Str.regexp = <abstr> # val re_add : Str.regexp = <abstr> # val re_sub : Str.regexp = <abstr> # val re_mult : Str.regexp = <abstr> # val re_lparen : Str.regexp = <abstr> # val re_rparen : Str.regexp = <abstr> # val re_space : Str.regexp = <abstr> # val tokenize : string -> token list = <fun> # val string1_of_token : token -> string = <fun> # val string2_of_token : token -> string = <fun> # val lookahead : 'a list -> 'a = <fun> val match_token : token list -> token -> token list = <fun> # val parse_S : token list -> expr * token list = <fun> val parse_A : token list -> expr * token list = <fun> val parse_B : token list -> expr * token list = <fun> # val pp : ('a, out_channel, unit) format -> 'a = <fun> # val eval_str : string -> unit = <fun> # Input token list = [Tok_Num 1;Tok_Mult;Tok_Num 2;Tok_Mult;Tok_Num 3;Tok_Sub;Tok_Num 4;Tok_Mult;Tok_Num 5;Tok_Mult;Tok_Num 6;END;] AST produced = ((1 * (2 * 3)) - (4 * (5 * 6))) Value of AST = -114 - : unit = () # Input token list = [Tok_Num 1;Tok_Add;Tok_Num 2;Tok_Add;Tok_Num 3;Tok_Mult;Tok_Num 4;Tok_Mult;Tok_Num 5;Tok_Add;Tok_Num 6;END;] AST produced = (1 + (2 + ((3 * (4 * 5)) + 6))) Value of AST = 69 - : unit = () # Input token list = [Tok_Num 1;Tok_Add;Tok_LParen;Tok_Num 2;Tok_Add;Tok_Num 3;Tok_RParen;Tok_Mult;Tok_Num 4;Tok_Mult;Tok_Num 5;Tok_Add;Tok_Num 6;END;] Tok_LParen AST produced = (1 + (((2 + 3) * (4 * 5)) + 6)) Value of AST = 107 - : unit = () # Input token list = [Tok_Num 100;Tok_Mult;Tok_LParen;Tok_Num 10;Tok_Add;Tok_Num 20;Tok_RParen;END;] Tok_LParen AST produced = (100 * (10 + 20)) Value of AST = 3000 - : unit = () #
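Notice that the grammar's right-recursive productions (S -> A + S | A - S) make both addition and subtraction group to the right, so subtraction does not match standard arithmetic notation. A sketch of what eval_str would print for such an input, assuming the definitions above:
eval_str "1-2-3";;
Input token list = [Tok_Num 1;Tok_Sub;Tok_Num 2;Tok_Sub;Tok_Num 3;END;]
AST produced = (1 - (2 - 3))
Value of AST = 2
- : unit = ()
Conventional left-to-right evaluation of 1 - 2 - 3 would give (1 - 2) - 3 = -4; obtaining that grouping requires a left-associative grammar or a restructured parser.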