package re

  1. Overview
  2. Docs

Source file compile.ml

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
open Import

let rec iter n f v = if Int.equal n 0 then v else iter (n - 1) f (f v)
let unknown = -2
let break = -3

type match_info =
  | Match of Group.t
  | Failed
  | Running of { no_match_starts_before : int }

type state_info =
  { idx : int
  ; (* Index of the current position in the position table.
       Not yet computed transitions point to a dummy state where
       [idx] is set to [unknown];
       If [idx] is set to [break] for states that either always
       succeed or always fail. *)
    real_idx : int
  ; (* The real index, in case [idx] is set to [break] *)
    mutable final : (Category.t * (Automata.idx * Automata.status)) list
  ; (* Mapping from the category of the next character to
       - the index where the next position should be saved
       - possibly, the list of marks (and the corresponding indices)
         corresponding to the best match *)
    desc : Automata.State.t (* Description of this state of the automata *)
  }

(* A state [t] is a pair composed of some information about the
   state [state_info] and a transition table [t array], indexed by
   color. For performance reason, to avoid an indirection, we manually
   unbox the transition table: we allocate a single array, with the
   state information at index 0, followed by the transitions. *)
module State : sig
  type t

  val make : ncol:int -> state_info -> t
  val get_info : t -> state_info
  val follow_transition : t -> color:Cset.c -> t
  val set_transition : t -> color:Cset.c -> t -> unit
end = struct
  type t = Table of t array [@@unboxed]

  let get_info (Table st) : state_info = Obj.magic (Array.unsafe_get st 0)
  [@@inline always]
  ;;

  let set_info (Table st) (info : state_info) = st.(0) <- Obj.magic info

  let follow_transition (Table st) ~color = Array.unsafe_get st (1 + Cset.to_int color)
  [@@inline always]
  ;;

  let set_transition (Table st) ~color st' = st.(1 + Cset.to_int color) <- st'
  let dummy (info : state_info) = Table [| Obj.magic info |]

  let unknown_state =
    dummy { idx = unknown; real_idx = 0; final = []; desc = Automata.State.dummy }
  ;;

  let make ~ncol state =
    let st = Table (Array.make (ncol + 1) unknown_state) in
    set_info st state;
    st
  ;;
end

(* Automata (compiled regular expression) *)
type re =
  { initial : Automata.expr
  ; (* The whole regular expression *)
    mutable initial_states : (Category.t * State.t) list
  ; (* Initial states, indexed by initial category *)
    colors : Color_map.Table.t
  ; (* Color table *)
    color_repr : Color_map.Repr.t
  ; (* Table from colors to one character of this color *)
    ncolor : int
  ; (* Number of colors. *)
    lnl : int
  ; (* Color of the last newline. -1 if unnecessary *)
    tbl : Automata.Working_area.t
  ; (* Temporary table used to compute the first available index
       when computing a new state *)
    states : State.t Automata.State.Table.t
  ; (* States of the deterministic automata *)
    group_names : (string * int) list
  ; (* Named groups in the regular expression *)
    group_count : int (* Number of groups in the regular expression *)
  }

let pp_re ch re = Automata.pp ch re.initial
let print_re = pp_re
let group_count re = re.group_count
let group_names re = re.group_names

(* Information used during matching *)
type info =
  { re : re
  ; (* The automata *)
    mutable positions : int array
  ; (* Array of mark positions
       The mark are off by one for performance reasons *)
    pos : int
  ; (* Position where the match is started *)
    last : int (* Position where the match should stop *)
  }

(****)

let category re ~color =
  if Cset.to_int color = -1
  then Category.inexistant (* Special category for the last newline *)
  else if Cset.to_int color = re.lnl
  then Category.(lastnewline ++ newline ++ not_letter)
  else Category.from_char (Color_map.Repr.repr re.color_repr color)
;;

(****)

let mk_state ncol desc =
  let break_state =
    match Automata.State.status desc with
    | Automata.Running -> false
    | Automata.Failed | Automata.Match _ -> true
  in
  let st =
    let real_idx = Automata.State.idx desc in
    { idx = (if break_state then break else real_idx); real_idx; final = []; desc }
  in
  State.make ~ncol:(if break_state then 0 else ncol) st
;;

let find_state re desc =
  try Automata.State.Table.find re.states desc with
  | Not_found ->
    let st = mk_state re.ncolor desc in
    Automata.State.Table.add re.states desc st;
    st
;;

(**** Match with marks ****)

let delta info cat ~color st =
  let desc = Automata.delta info.re.tbl cat color st.desc in
  let len = Array.length info.positions in
  if Automata.State.idx desc = len && len > 0
  then (
    let pos = info.positions in
    info.positions <- Array.make (2 * len) 0;
    Array.blit pos 0 info.positions 0 len);
  desc
;;

let validate info (s : string) ~pos st =
  let color = Color_map.Table.get info.re.colors s.[pos] in
  let st' =
    let desc' =
      let cat = category info.re ~color in
      delta info cat ~color (State.get_info st)
    in
    find_state info.re desc'
  in
  State.set_transition st ~color st'
;;

let next colors st s pos =
  State.follow_transition st ~color:(Color_map.Table.get colors (String.unsafe_get s pos))
;;

let rec loop info ~colors ~positions s ~pos ~last st0 st =
  if pos < last
  then (
    let st' = next colors st s pos in
    let state_info = State.get_info st' in
    let idx = state_info.idx in
    if idx >= 0
    then (
      Array.unsafe_set positions idx pos;
      loop info ~colors ~positions s ~pos:(pos + 1) ~last st' st')
    else if idx = break
    then (
      Array.unsafe_set positions state_info.real_idx pos;
      st')
    else (
      (* Unknown *)
      validate info s ~pos st0;
      loop info ~colors ~positions:info.positions s ~pos ~last st0 st0))
  else st
;;

let rec loop_no_mark info ~colors s ~pos ~last st0 st =
  if pos < last
  then (
    let st' = next colors st s pos in
    let state_info = State.get_info st' in
    let idx = state_info.idx in
    if idx >= 0
    then loop_no_mark info ~colors s ~pos:(pos + 1) ~last st' st'
    else if idx = break
    then st'
    else (
      (* Unknown *)
      validate info s ~pos st0;
      loop_no_mark info ~colors s ~pos ~last st0 st0))
  else st
;;

let final info st cat =
  try List.assq cat st.final with
  | Not_found ->
    let st' = delta info cat ~color:(Cset.of_int (-1)) st in
    let res = Automata.State.idx st', Automata.State.status st' in
    st.final <- (cat, res) :: st.final;
    res
;;

let find_initial_state re cat =
  try List.assq cat re.initial_states with
  | Not_found ->
    let st = find_state re (Automata.State.create cat re.initial) in
    re.initial_states <- (cat, st) :: re.initial_states;
    st
;;

let get_color re (s : string) pos =
  if pos < 0
  then Cset.of_int @@ -1
  else (
    let slen = String.length s in
    if pos >= slen
    then Cset.of_int (-1)
    else if pos = slen - 1 && re.lnl <> -1 && Char.equal s.[pos] '\n'
    then (* Special case for the last newline *)
      Cset.of_int re.lnl
    else Color_map.Table.get re.colors s.[pos])
;;

let rec handle_last_newline info ~pos st ~groups =
  let st' = State.follow_transition st ~color:(Cset.of_int info.re.lnl) in
  let info' = State.get_info st' in
  if info'.idx >= 0
  then (
    if groups then info.positions.(info'.idx) <- pos;
    st')
  else if info'.idx = break
  then (
    if groups then info.positions.(info'.real_idx) <- pos;
    st')
  else (
    (* Unknown *)
    let color = Cset.of_int info.re.lnl in
    let st' =
      let desc' =
        let cat = category info.re ~color in
        let real_c = Color_map.Table.get info.re.colors '\n' in
        delta info cat ~color:real_c (State.get_info st)
      in
      find_state info.re desc'
    in
    State.set_transition st ~color st';
    handle_last_newline info ~pos st ~groups)
;;

let rec scan_str info (s : string) initial_state ~groups =
  let pos = info.pos in
  let last = info.last in
  if last = String.length s
     && info.re.lnl <> -1
     && last > pos
     && Char.equal (String.get s (last - 1)) '\n'
  then (
    let info = { info with last = last - 1 } in
    let st = scan_str info s initial_state ~groups in
    if (State.get_info st).idx = break
    then st
    else handle_last_newline info ~pos:(last - 1) st ~groups)
  else if groups
  then
    loop
      info
      ~colors:info.re.colors
      ~positions:info.positions
      s
      ~pos
      ~last
      initial_state
      initial_state
  else loop_no_mark info ~colors:info.re.colors s ~pos ~last initial_state initial_state
;;

(* This function adds a final boundary check on the input.
   This is useful to indicate that the output failed because
   of insufficient input, or to verify that the output actually
   matches for regex that have boundary conditions with respect
   to the input string.
*)
let final_boundary_check ~last ~slen re s ~info ~st ~groups =
  let idx, res =
    let final_cat =
      Category.(
        search_boundary
        ++ if last = slen then inexistant else category re ~color:(get_color re s last))
    in
    final info (State.get_info st) final_cat
  in
  (match groups, res with
   | true, Match _ -> info.positions.(idx) <- last
   | _ -> ());
  res
;;

let match_str ~groups ~partial re s ~pos ~len =
  let slen = String.length s in
  let last = if len = -1 then slen else pos + len in
  let info =
    { re
    ; pos
    ; last
    ; positions =
        (if groups
         then (
           let n = Automata.Working_area.index_count re.tbl + 1 in
           if n <= 10 then [| 0; 0; 0; 0; 0; 0; 0; 0; 0; 0 |] else Array.make n 0)
         else [||])
    }
  in
  let st =
    let initial_state =
      let initial_cat =
        Category.(
          search_boundary
          ++ if pos = 0 then inexistant else category re ~color:(get_color re s (pos - 1)))
      in
      find_initial_state re initial_cat
    in
    scan_str info s initial_state ~groups
  in
  match
    let state_info = State.get_info st in
    if state_info.idx = break || (partial && not groups)
    then Automata.State.status state_info.desc
    else if partial && groups
    then (
      match Automata.State.status state_info.desc with
      | (Match _ | Failed) as status -> status
      | Running ->
        (* This could be because it's still not fully matched, or it
           could be that because we need to run special end of input
           checks. *)
        (match final_boundary_check ~last ~slen re s ~info ~st ~groups with
         | Match _ as status -> status
         | Failed | Running ->
           (* A failure here just means that we need more data, i.e.
              it's a partial match. *)
           Running))
    else final_boundary_check ~last ~slen re s ~info ~st ~groups
  with
  | Match (marks, pmarks) ->
    Match { s; marks; pmarks; gpos = info.positions; gcount = re.group_count }
  | Failed -> Failed
  | Running ->
    let no_match_starts_before = if groups then info.positions.(0) else 0 in
    Running { no_match_starts_before }
;;

let mk_re ~initial ~colors ~color_repr ~ncolor ~lnl ~group_names ~group_count =
  { initial
  ; initial_states = []
  ; colors
  ; color_repr
  ; ncolor
  ; lnl
  ; tbl = Automata.Working_area.create ()
  ; states = Automata.State.Table.create 97
  ; group_names
  ; group_count
  }
;;

(**** Compilation ****)

module A = Automata

let enforce_kind ids kind kind' cr =
  match kind, kind' with
  | `First, `First -> cr
  | `First, k -> A.seq ids k cr (A.eps ids)
  | _ -> cr
;;

type context =
  { ids : A.Ids.t
  ; kind : A.Sem.t
  ; ign_group : bool
  ; greedy : A.Rep_kind.t
  ; pos : A.Mark.t ref
  ; names : (string * int) list ref
  ; cache : Cset.t Cset.CSetMap.t ref
  ; colors : Color_map.Table.t
  }

let trans_set cache (cm : Color_map.Table.t) s =
  match Cset.one_char s with
  | Some i -> Cset.csingle (Color_map.Table.get_char cm i)
  | None ->
    let v = Cset.hash_rec s, s in
    (try Cset.CSetMap.find v !cache with
     | Not_found ->
       let l = Color_map.Table.translate_colors cm s in
       cache := Cset.CSetMap.add v l !cache;
       l)
;;

let make_repeater ids cr kind greedy =
  match greedy with
  | `Greedy -> fun rem -> A.alt ids [ A.seq ids kind (A.rename ids cr) rem; A.eps ids ]
  | `Non_greedy ->
    fun rem -> A.alt ids [ A.eps ids; A.seq ids kind (A.rename ids cr) rem ]
;;

(* XXX should probably compute a category mask *)
let rec translate
  ({ ids; kind; ign_group; greedy; pos; names; cache; colors } as ctx)
  (ast : Ast.no_case)
  =
  match ast with
  | Set s -> A.cst ids (trans_set cache colors s), kind
  | Sequence l -> trans_seq ctx l, kind
  | Ast (Alternative l) ->
    (match Ast.merge_sequences l with
     | [ r' ] ->
       let cr, kind' = translate ctx r' in
       enforce_kind ids kind kind' cr, kind
     | merged_sequences ->
       ( A.alt
           ids
           (List.map merged_sequences ~f:(fun r' ->
              let cr, kind' = translate ctx r' in
              enforce_kind ids kind kind' cr))
       , kind ))
  | Repeat (r', i, j) ->
    let cr, kind' = translate ctx r' in
    let rem =
      match j with
      | None -> A.rep ids greedy kind' cr
      | Some j ->
        let f = make_repeater ids cr kind' greedy in
        iter (j - i) f (A.eps ids)
    in
    iter i (fun rem -> A.seq ids kind' (A.rename ids cr) rem) rem, kind
  | Beg_of_line -> A.after ids Category.(inexistant ++ newline), kind
  | End_of_line -> A.before ids Category.(inexistant ++ newline), kind
  | Beg_of_word ->
    ( A.seq
        ids
        `First
        (A.after ids Category.(inexistant ++ not_letter))
        (A.before ids Category.letter)
    , kind )
  | End_of_word ->
    ( A.seq
        ids
        `First
        (A.after ids Category.letter)
        (A.before ids Category.(inexistant ++ not_letter))
    , kind )
  | Not_bound ->
    ( A.alt
        ids
        [ A.seq ids `First (A.after ids Category.letter) (A.before ids Category.letter)
        ; (let cat = Category.(inexistant ++ not_letter) in
           A.seq ids `First (A.after ids cat) (A.before ids cat))
        ]
    , kind )
  | Beg_of_str -> A.after ids Category.inexistant, kind
  | End_of_str -> A.before ids Category.inexistant, kind
  | Last_end_of_line -> A.before ids Category.(inexistant ++ lastnewline), kind
  | Start -> A.after ids Category.search_boundary, kind
  | Stop -> A.before ids Category.search_boundary, kind
  | Sem (kind', r') ->
    let cr, kind'' = translate { ctx with kind = kind' } r' in
    enforce_kind ids kind' kind'' cr, kind'
  | Sem_greedy (greedy', r') -> translate { ctx with greedy = greedy' } r'
  | Group (n, r') ->
    if ign_group
    then translate ctx r'
    else (
      let p = !pos in
      let () =
        match n with
        | Some name -> names := (name, A.Mark.group_count p) :: !names
        | None -> ()
      in
      pos := A.Mark.next2 !pos;
      let cr, kind' = translate ctx r' in
      ( A.seq ids `First (A.mark ids p) (A.seq ids `First cr (A.mark ids (A.Mark.next p)))
      , kind' ))
  | No_group r' -> translate { ctx with ign_group = true } r'
  | Nest r' ->
    let b = !pos in
    let cr, kind' = translate ctx r' in
    let e = A.Mark.prev !pos in
    if e < b then cr, kind' else A.seq ids `First (A.erase ids b e) cr, kind'
  | Pmark (i, r') ->
    let cr, kind' = translate ctx r' in
    A.seq ids `First (A.pmark ids i) cr, kind'

and trans_seq ({ ids; kind; _ } as ctx) = function
  | [] -> A.eps ids
  | [ r ] ->
    let cr', kind' = translate ctx r in
    enforce_kind ids kind kind' cr'
  | r :: rem ->
    let cr', kind' = translate ctx r in
    let cr'' = trans_seq ctx rem in
    if A.is_eps cr'' then cr' else if A.is_eps cr' then cr'' else A.seq ids kind' cr' cr''
;;

let compile_1 regexp =
  let regexp = Ast.handle_case false regexp in
  let color_map = Color_map.make () in
  let need_lnl = Ast.colorize color_map regexp in
  let colors, color_repr = Color_map.flatten color_map in
  let ncolor = Color_map.Repr.length color_repr in
  let lnl = if need_lnl then ncolor else -1 in
  let ncolor = if need_lnl then ncolor + 1 else ncolor in
  let ctx =
    { ids = A.Ids.create ()
    ; kind = `First
    ; ign_group = false
    ; greedy = `Greedy
    ; pos = ref A.Mark.start
    ; names = ref []
    ; cache = ref Cset.CSetMap.empty
    ; colors
    }
  in
  let r, kind = translate ctx regexp in
  let r = enforce_kind ctx.ids `First kind r in
  (*Format.eprintf "<%d %d>@." !ids ncol;*)
  mk_re
    ~initial:r
    ~colors
    ~color_repr
    ~ncolor
    ~lnl
    ~group_names:(List.rev !(ctx.names))
    ~group_count:(A.Mark.group_count !(ctx.pos))
;;

let compile r =
  let open Ast.Export in
  compile_1
    (if Ast.anchored r
     then group r
     else seq [ shortest (rep (Ast.cset Cset.cany)); group r ])
;;
OCaml

Innovation. Community. Security.