package menhirCST

  1. Overview
  2. Docs
Legend:
Page
Library
Module
Module type
Parameter
Class
Class type
Source

Source file Settle.ml

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
(******************************************************************************)
(*                                                                            *)
(*                                    Menhir                                  *)
(*                                                                            *)
(*   Copyright Inria. All rights reserved. This file is distributed under     *)
(*   the terms of the GNU Library General Public License version 2, with a    *)
(*   special exception on linking, as described in the file LICENSE.          *)
(*                                                                            *)
(******************************************************************************)

(* This flags enables runtime assertion checking. *)
let debug =
  false

(* This flag controls backtracking. *) (* TODO *)
let backtracking =
  true

(* We need the types [state] and [production] to support equality and
   hashing, because we build a hash table whose keys are pairs of a state
   [s] and a production [prod]. We could use OCaml's generic equality and
   hashing functions. We prefer to explicitly require that both states and
   productions be convertible to integers. For productions, this is done
   via a subtyping relation [production :> int]. For states, this is done
   via a function [number : state -> int]. *)

module[@inline] Make (A : sig
  type terminal
  type nonterminal
  type production = int
  type state
  val number: state -> int
  type token
  val token2terminal: token -> terminal
  val lhs : production -> nonterminal
  val maybe_shift_t : state -> terminal -> state option
  val maybe_goto_nt : state -> nonterminal -> state option
  val may_reduce_prod : state -> terminal -> production -> bool
end) = struct
open A

(* -------------------------------------------------------------------------- *)

(* States. *)

module State = struct
  let[@inline] equal (s1 : state) (s2 : state) = number s1 = number s2
  let[@inline] hash (s : state) = number s
end

(* Productions. *)

module Production = struct
  let[@inline] equal (prod1 : production) (prod2 : production) =
    (prod1 :> int) = (prod2 :> int)
  let[@inline] hash (prod : production) = (prod :> int)
end

(* State-production pairs. *)

module SP = struct
  type t = state * production
  let hash (s, prod) =
    (State.hash s, Production.hash prod) |> Hashtbl.hash
  let equal (s1, prod1) (s2, prod2) =
    State.equal s1 s2 &&
    Production.equal prod1 prod2
end

(* -------------------------------------------------------------------------- *)

module CST = struct

type cst =
  | Terminal    of token
  | NonTerminal of production * cst array

let dummy : cst =
  NonTerminal ((0 : production), [||])

end

type cst =
  CST.cst

(* -------------------------------------------------------------------------- *)

module DCST = struct

type dcst =
  | Choice      of dcst * dcst
  | Terminal    of token
  | NonTerminal of production * dcst array

(* -------------------------------------------------------------------------- *)

(* The head symbol of a DCST. *)

(* We could give separate definitions of the type [symbol], of the function
   [head], which returns the head symbol of a DCST, and of the function
   [transition], which takes a symbol as an argument.

   Instead, we give a direct definition of the function [transition_head],
   which is the composition of [head] and [transition]. This removes the
   need to define the type [symbol] and saves one memory allocation. *)

let rec transition_head (s : state) (dcst : dcst) : state option =
  match dcst with
  | Choice (dcst1, _dcst2) ->
      transition_head s dcst1
  | Terminal tok ->
      maybe_shift_t s (token2terminal tok)
  | NonTerminal (prod, _) ->
      maybe_goto_nt s (lhs prod)

(* -------------------------------------------------------------------------- *)

(* Smart constructors for DCSTs. *)

(* The smart constructor [nonterminal] could check that [prod] is not a
   start production, that the number of subtrees is correct, and that the
   head symbols of the subtrees are correct. The smart constructor [choice]
   could check that both branches of a choice have the same head symbol.
   Fortunately, if a strongly-typed view of DCSTs and CSTs is offered to the
   end user, then these checks are unnecessary. Such a strongly-typed view
   is *not* offered by the API of this module (so, this API is unsafe). A
   safe API *can* be constructed a posteriori on top of this unsafe API. *)

(* The constructor of [terminal] DCSTs. *)

let[@inline] terminal tok =
  Terminal tok

(* The constructor of [nonterminal] DCSTs. *)

let[@inline] nonterminal prod dcsts =
  (* Omitted check:
  assert (not (is_start prod));
  let rhs = Production.rhs prod in
  assert (Array.length rhs = Array.length dcsts);
  assert (for i = 0 to Array.length rhs - 1 do
    let symbol, _, _ = rhs.(i) in
    assert (Symbol.equal symbol (head dcsts.(i)));
  done; true);
   *)
  NonTerminal (prod, dcsts)

(* The smart constructor of [choice] DCSTs. *)

(* This smart constructor guarantees that a choice is never nested inside the
   left branch of a choice. It does this by re-balancing choices on the fly so
   that they lean towards the right. The order of the branches is preserved. *)

(* This is potentially inefficient, as it is analogous to list concatenation.
   We do not expect this to be a problem in practice, as most choices should
   have few branches. *)

let rec choice dcst1 dcst2 =
  (* Omitted check:
  assert (Symbol.equal (head dcst1) (head dcst2));
   *)
  match dcst1 with
  | Choice (dcst1a, dcst1b) ->
      Choice (dcst1a, choice dcst1b dcst2)
  | Terminal _
  | NonTerminal _ ->
      Choice (dcst1, dcst2)

end (* DCST *)

type dcst =
  DCST.dcst

type dcsts =
  dcst array

(* -------------------------------------------------------------------------- *)

(* The following functions test whether a certain path in the automaton exists,
   and if so, what state this path leads to. *)

(* [follow_suffix s dcsts i] follows the path that begins at the state [s] and
   whose edges are labeled with the head symbols of the trees found at offset
   [i] in the array [dcsts]. It returns [None] if this path does not exist.
   It returns [Some s'] if this path exists and leads to the state [s']. *)

let rec follow_suffix (s : state) (dcsts : dcsts) (i : int) : state option =
  let n = Array.length dcsts in
  if debug then assert (0 <= i && i <= n);
  if i = n then
    Some s
  else
    match DCST.transition_head s dcsts.(i) with
    | None    -> None
    | Some s' -> follow_suffix s' dcsts (i+1)

(* [follow] specializes [follow_suffix] with the offset 0. *)

let[@inline] follow (s : state) (dcsts : dcsts) : state option =
  follow_suffix s dcsts 0

(* Under the assumption that the trees [dcsts] form a valid right-hand side
   for the production [prod], this new definition of [follow prod s dcsts]
   returns the same result as [follow s dcsts] above. It is memoized, using
   a hash table whose keys are pairs [(s, prod)]. *)

(* Intuitively, it may seem that the parameter [dcsts] should not be needed,
   as the sequence of the head symbols of [dcsts] is the right-hand side of
   the production [prod], so it is determined by [prod]. However, we do not
   have a table that maps a production to its right-hand side. This is why
   [dcsts] is needed here (and it is fortunately at hand). *)

let follow : state -> production -> dcsts -> state option =
  let module T = Hashtbl.Make(SP) in
  let table = T.create 1023 in
  fun s prod dcsts ->
    try
      T.find table (s, prod)
    with Not_found ->
      let o = follow s dcsts in
      T.add table (s, prod) o;
      o

(* -------------------------------------------------------------------------- *)

(* [apparently_viable prod dcsts s t'] assumes that the state [s] has an
   outgoing transition whose symbol is [lhs prod]. It also assumes that the
   head symbols of the trees [dcsts] correspond to the right-hand side of
   production [prod].

   It tests whether, with respect to state [s] and lookahead symbol [t'],
   the production [prod] is "apparently viable".

   By definition, this means that there exists a path that begins at state
   [s], follows a series of edges whose labels correspond to the right-hand
   side of production [prod], and ends at a state [s'] where reducing [prod],
   with lookahead symbol [t'], is permitted. *)

let[@inline] apparently_viable_production prod dcsts s t' : bool =
  if debug then assert (maybe_goto_nt s (lhs prod) <> None);
  (* Test whether this path exists and leads to a state [s'] . *)
  match follow s prod dcsts with
  | None ->
      false
  | Some s' ->
      may_reduce_prod s' t' prod

(* [apparently_viable dcst s t'] assumes that the state [s] has an outgoing
   transition labeled with the head symbol of [dcst]. It also assumes that
   the tree [dcst] is not [Choice _].

   It tests whether, with respect to state [s] and lookahead symbol [t'],
   the tree [dcst] is "apparently viable".

   If [dcst] is a terminal node, then it is apparently viable. If it is a
   nonterminal node [NonTerminal (prod, _)], then, by definition, it is
   apparently viable if, with respect to [s] and [t'], the production [prod]
   is apparently viable. *)

(* If [apparently_viable dcst s t'] is false then the tree [dcst] definitely
   cannot be settled (printed) in the context represented by the state [s]
   and by the lookahead symbol [t'].

   If [apparently_viable dcst s t'] is true, however, then we cannot know for
   sure whether [dcst] can be settled in this context. This tree looks good at
   depth 1, but perhaps, by inspecting it at a greater depth, a problem would
   be discovered. *)

let[@warning "-32"] apparently_viable dcst s t' : bool =
  if debug then assert (DCST.transition_head s dcst <> None);
  match dcst with
  | Choice _ ->
      assert false
  | Terminal _ ->
      true
  | NonTerminal (prod, dcsts) ->
      apparently_viable_production prod dcsts s t'

(* The function [apparently_viable] is in fact not used, because we manually
   inline it at its call site inside [settle]. *)

(* -------------------------------------------------------------------------- *)

(* [settle dcst s t'] assumes that the state [s] has an outgoing transition
   labeled with the head symbol of [dcst].

   It attempts to "settle" the tree [dcst], that is, to convert [dcst] into
   a concrete syntax tree (CST), in the context represented by the state [s]
   and by the lookahead symbol [t'].

   The state [s] encodes information about the left context (the text that
   appears to the left of the tree [dcst]), whereas the lookahead symbol [t']
   represents the right context: it is the first symbol that will appear to
   the right of the tree [dcst].

   If the tree [dcst] can be settled in this context then [settle] returns a
   pair [(t, cst)] where [cst] is a viable concrete syntax tree and [t], a
   terminal symbol, is the first symbol of the sequence [fringe t @ [t']].

   If the tree cannot be settled in this context then [settle] raises the
   exception [CouldNotSettle].

   [settle] is correct: if it succeeds, then [cst] is indeed a correct
   rendition of [dcst] in this context. That is to say, the parser, beginning
   in state [s] and fed with the input sequence [fringe cst ++ [t']], will
   recognize [cst], will leave [t'] unconsumed, and will follow one edge
   (labeled with the symbol [head dcst]) from state [s] to some state [s'].

   When [backtracking] is false, then [settle] is not complete: it can raise
   [CouldNotSettle] even though [dcst] can be settled. Indeed, at a choice
   node, if the left branch is apparently viable, then the algorithm commits
   to this branch. If this branch later turns out to be unsolvable then the
   algorithm fails; it does not come back to the choice point to examine the
   other branch.

   When [backtracking] is true, [settle] is complete: if [dcst] can be settled,
   then the algorithm must succeed, and cannot raise [CouldNotSettle]. This is
   achieved by backtracking at choice points.

   Setting [backtracking] to [true] has a certain cost in stack space: the
   exception handlers take up stack space, even if they are never used.
   Furthermore, when [backtracking] is true, the algorithm could have very
   bad (exponential?) complexity. If a subtree, deep down, cannot be settled,
   then the complexity is expected to be Omega(2^n) where [n] is the number
   of choice points above the problematic subtree. *)

(* [settle] is written in destination-passing style, so, instead of returning
   a pair [(t, cst)], as suggested above, it actually returns just [t] and
   writes [cst] into the array slot [csts.(i)]. The array [csts] and the
   offset [i] are extra parameters; together they form a "destination". This
   style reduces memory allocation ([settle] no longer needs to return a
   pair) and stack allocation (the call from [settle_seq_step] to [settle]
   becomes a tail call). *)

exception CouldNotSettle

let rec settle (dcst : dcst) (s : state) (t' : terminal) csts i : terminal =
  if debug then assert (DCST.transition_head s dcst <> None);
  match dcst with

  (* When [backtracking] is false, the case of [Choice] nodes can be
     written as follows:

  | DCST.Choice (dcst1, dcst2) ->
      if apparently_viable dcst1 s t' then
        settle dcst1 s t'
      else
        settle dcst2 s t'

     If [apparently_viable] returns [false] then we are happy: we are certain
     that the left branch cannot succeed, so we commit to the right branch.

     If [apparently_viable] returns [true] then we do not know for sure that
     the left branch will succeed. Nevertheless, we optimistically commit to
     this branch. This makes the algorithm very simple and very fast.

     When [backtracking] is true, the recursive call [settle dcst1 s t'] is
     wrapped in an exception handler, so if the left branch fails, we try
     the second branch.

     Since the left branch of a choice cannot be a choice, [dcst1] must be a
     terminal or nonterminal node. We expand these two subcases and inline away
     the call [settle dcst1 s t']. This allows us to avoid a redundant call to
     [apparently_viable]. We end up with the following cases: *)

  | DCST.Choice (DCST.Choice _, _) ->
      assert false

  | DCST.Choice (DCST.Terminal tok, _)
  | DCST.Terminal tok ->
      csts.(i) <- CST.Terminal tok;
      token2terminal tok

  | DCST.Choice (DCST.NonTerminal (prod, dcsts), dcst2) ->
      if apparently_viable_production prod dcsts s t' then
        (* The left branch is apparently viable. If [backtracking] is false,
           commit to this branch. If [backtracking] is true, try this branch;
           if it fails, try the second branch. *)
        if not backtracking then
          settle_production prod dcsts s t' csts i
        else
          try
            settle_production prod dcsts s t' csts i
          with CouldNotSettle ->
            settle dcst2 s t' csts i
      else
        (* The left branch is definitely not viable: commit to the other
           branch. *)
        settle dcst2 s t' csts i

  | DCST.NonTerminal (prod, dcsts) ->
      (* This case looks like the previous one. However, here, we are not in a
         choice node. If this production is not viable then we are definitely
         in a dead end: we must fail. (Either we mistakenly committed earlier
         to an unsolvable branch; or the user actually gave us an unsolvable
         tree, to begin with.) If this production is viable then all is well,
         for now. *)
      if apparently_viable_production prod dcsts s t' then
        settle_production prod dcsts s t' csts i
      else
        (* Since we know that a handler is installed, there is no point in
           constructing a backtrace, so we use [raise_notrace]. However a
           quick benchmark suggests that this is not more efficient in time
           or space than [raise]. *)
        raise_notrace CouldNotSettle

(* [settle_production] is the special case of [settle] where the tree has the
   form [NonTerminal (prod, dcsts)] and this tree is apparently viable. *)

and settle_production prod dcsts s t' csts i : terminal =
  if debug then assert (apparently_viable_production prod dcsts s t');
  let n = Array.length dcsts in
  (* We allocate an array [csts'] of length [n], initialize it with a dummy
     value (ouch), and pass it to [settle_seq] so that it writes its results
     into this array. *)
  let csts' = Array.make n CST.dummy in
  let t = settle_seq dcsts 0 n s t' csts' in
  (* Then, we write our own result into [csts.(i)]. *)
  csts.(i) <- CST.NonTerminal (prod, csts');
  t

(* [settle_seq dcsts i n s t' csts] settles the sequence of subtrees that begins
   at offset [i] in the array [dcsts]. [n] is the length of this array. The
   resulting trees are written to the array [csts], beginning at offset [i],
   so [settle_seq] returns just a terminal symbol.

   We assume that this subsequence is apparently viable; that is, a path from
   state [s], labeled with the head symbols of the tree sequence defined by
   the array [dcsts] and the offset [i], exists and leads to a state where
   reducing production [prod] with lookahead symbol [t'] is permitted.

   We do not write a runtime assertion to verify this precondition because
   that would be a little awkward; we would need to transport [prod]. *)

and settle_seq dcsts i n s t' csts : terminal =
  if debug then assert (Array.length dcsts = n);
  if debug then assert (0 <= i && i <= n);
  if i = n then
    (* This is the base case. As the sequence is empty, the lookahead symbol
       [t'] falls through. *)
    t'
  else
    settle_seq_step dcsts i n s t' csts

(* [settle_seq_step] is the step case of [settle_seq]. One subtree, [dcsts.(i)],
   is followed by a sequence of subtrees.

   We proceed as follows:

   1. Follow one edge in the automaton to find out what intermediate state
      [is] will be reached after we settle the subtree [dcsts.(i)].
      A key fact is that we [is] can be computed without traversing this
      subtree: the head symbol of this subtree suffices to determine [is].

   2. Based on [is] and on the lookahead symbol [t'], settle the remaining
      subtrees. As a result, we get an intermediate symbol [it], which is
      the first symbol of the remaining subtrees (followed with [t']).

   3. Settle the subtree [dcst.(i)], with [it] as the lookahead symbol.

   As a result of this organization, the start states are computed in a
   left-to-right sweep, while going down into the recursive calls to
   [settle_seq], while the lookahead symbols are computed in a right-to-left
   sweep, while coming out of the recursive calls. The subtrees are settled
   from right to left. *)

and settle_seq_step dcsts i n s t' csts : terminal =
  if debug then assert (Array.length dcsts = n);
  if debug then assert (0 <= i && i < n);
  let dcst = dcsts.(i) in
  (* 1. Per our precondition, an edge from state [s], labeled with the head
        symbol of [dcst], towards some state [is], must exist. *)
  let is = DCST.transition_head s dcst |> Option.get in
  (* 2. We are now in a position to settle the remaining subtrees. *)
  let it = settle_seq dcsts (i+1) n is t' csts in
  (* 3. This call yields the the lookahead symbol [it] that we can use to
        settle [dcst]. *)
  settle dcst s it csts i
  (* We are done. Because [settle] is written in destination-passing style,
     we are able to make a tail call to [settle]. [settle] will take care of
     writing the array slot [csts.(i)] and returning a terminal symbol. *)

(* -------------------------------------------------------------------------- *)

(* A public entry point. *)

let settle (dcst, s, t') : cst option =
  match DCST.transition_head s dcst with
  | None ->
      None
  | Some (_ : state) ->
      try
        (* A dummy array must be created to serve as a destination. *)
        let csts = Array.make 1 CST.dummy in
        let _ : terminal = settle dcst s t' csts 0 in
        Some csts.(0)
      with CouldNotSettle ->
        None

end
OCaml

Innovation. Community. Security.