package pacomb

  1. Overview
  2. Docs

Source file word_list.ml

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
module Pacomb = struct
  module Lex = Lex
  module Grammar = Grammar
end

type 'a data =
  { mutable leafs : 'a list
  ; mutable next  : 'a data option array }

type ('a,'b) t =
  { data : 'b data
  ; uniq : bool
  ; map  : 'a -> 'a
  ; cs   : Charset.t
  ; finl : Input.buffer -> Input.idx -> bool }

let create_data () = { leafs = []; next = Array.make 256 None }

let idt x = x

let create ?(unique=true) ?(map=idt)
      ?(cs=Charset.full) ?(final_test=fun _ _ -> true) () =
  { data = create_data () ; uniq = unique; map; cs; finl = final_test }

let reset t = t.data.leafs <- []; t.data.next <- Array.make 256 None

let save t = { leafs = t.data.leafs; next = t.data.next }

let save_and_reset t = let s = save t in reset t; s

let restore t s = t.data.leafs <- s.leafs; t.data.next <- s.next

let size { data = {leafs; next}; _} =
  let res = ref 0 in
  let rec fn {leafs; next} =
    res := !res + List.length leafs;
    Array.iter gn next
  and gn = function
    | None -> ()
    | Some d -> fn d
  in
  res := !res + List.length leafs;
  Array.iter gn next;
  !res

exception Already_bound

let next tbl c = tbl.next.(Char.code c)

let advance : bool -> (char -> char) -> 'b data -> string -> 'b data =
  fun add map tbl s ->
  let r = ref tbl in
  for i = 0 to String.length s - 1 do
    let c = map s.[i] in
    match !r.next.(Char.code c) with
    | Some tbl -> r := tbl
    | None ->
       if add then
         let tbl = create_data () in
         !r.next.(Char.code c) <- Some tbl;
         r := tbl
       else raise Not_found
  done;
  !r

let add_ascii : (char,'b) t -> string -> 'b -> unit =
  fun { data; uniq; map; cs } s v ->
    if s = "" then invalid_arg "Word_list.add_ascii: empty word";
    if not (Charset.mem cs s.[0]) then
      invalid_arg "Word_list.add: charset mismatch";
    let data = advance true map data s in
    if uniq && data.leafs <> [] then raise Already_bound;
    data.leafs <- v :: data.leafs

let mem_ascii : (char,'b) t -> string -> bool =
  fun { data; map; _ } s ->
    try
      let data = advance false map data s in
      data.leafs <> []
    with
      Not_found -> false

let add_utf8 : (string, 'b) t -> string -> 'b -> unit =
  fun { data; map; uniq; cs  } s v ->
    if s = "" then invalid_arg "Word_list.add_utf8: empty word";
    if not (Charset.mem cs s.[0]) then
      invalid_arg "Word_list.add: charset mismatch";
    let fn data s = advance true (fun c -> c) data (map s) in
    let data = Utf8.fold_grapheme fn data s in
    if uniq && data.leafs <> [] then raise Already_bound;
    data.leafs <- v :: data.leafs

let mem_utf8 : (string, 'b) t -> string -> bool =
  fun { data; map; _ } s ->
    try
      let fn data s = advance false (fun c -> c) data (map s) in
      let data = Utf8.fold_grapheme fn data s in
      data.leafs <> []
    with
      Not_found -> false

let word : ?name:string -> (char, 'a) t -> 'a Grammar.t =
  fun ?name { data = tbl; map; cs; finl; uniq } ->
    let n = Lex.default "WORD" name in
    if uniq then
      let rec f tbl s0 n0 =
        let (c,s,n) = Input.read s0 n0 in
        let c = map c in
        match next tbl c with
        | Some t -> f t s n
        | None ->
           if finl s0 n0 && tbl.leafs <> [] then (List.hd tbl.leafs, s0, n0)
           else (raise Lex.NoParse)
      in
      let f = f tbl in
      let lex = Lex.{ n; f; a = Custom(f,Assoc.new_key ()); c = cs }
      in
      Grammar.term ?name lex
    else
      let rec f tbl s0 n0 =
        let (c,s,n) = Input.read s0 n0 in
        let c = map c in
        match next tbl c with
        | Some t -> f t s n
        | None ->
           if finl s0 n0 && tbl.leafs <> [] then (tbl.leafs, s0, n0)
           else raise Lex.NoParse
      in
      let f = f tbl in
      let lex = Lex.{ n; f; a = Custom(f,Assoc.new_key ()); c = cs }
      in
      Grammar.unmerge ?name (Grammar.term lex)

let utf8_word : ?name:string -> (string, 'a) t -> 'a Grammar.t =
  fun ?name { data = tbl; map; finl; cs; uniq } ->
    let n = Lex.default "UTF8_WORD" name in
    if uniq then
      let rec f tbl s n =
        try
          let (g,s,n) = Lex.((any_grapheme ()).f s n) in
          let g = map g in
          f (advance false (fun c -> c) tbl g) s n
        with
          Not_found ->
          if finl s n && tbl.leafs <> [] then(List.hd tbl.leafs, s, n)
          else raise Lex.NoParse
      in
      let f = f tbl in
      let lex = Lex.{ n; f; a = Custom(f,Assoc.new_key ()); c = cs }
      in
      Grammar.term ?name lex
    else
      let rec f tbl s n =
        try
          let (g,s,n) = Lex.((any_grapheme ()).f s n) in
          let g = map g in
          f (advance false (fun c -> c) tbl g) s n
        with
          Not_found ->
          if finl s n && tbl.leafs <> [] then (tbl.leafs, s, n)
          else raise Lex.NoParse
      in
      let f = f tbl in
      let lex = Lex.{ n; f; a = Custom(f,Assoc.new_key ()); c = cs }
      in
      Grammar.unmerge ?name (Grammar.term lex)
OCaml

Innovation. Community. Security.