package core_extended

  1. Overview
  2. Docs
Extra components that are not as closely vetted or as stable as Core

Install

Dune Dependency

Authors

Maintainers

Sources

core_extended-v0.14.0.tar.gz
sha256=a7bf672f617891b10e405f1edb1c2ddc1db9e5b3169bd278bbb75b84d57d23ce
md5=00eb9b3ed6b0b02f74a2cf01e4d5827f

doc/src/core_extended.delimited_kernel/parse_state.ml.html

Source file parse_state.ml

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
open Core_kernel
module Row_buffer = Append_only_buffer
open! Int.Replace_polymorphic_compare

exception Bad_csv_formatting of string list * string

module Step = struct
  type t =
    | Field_start
    | In_unquoted_field
    | In_quoted_field
    | In_quoted_field_after_quote
end

open Step

type 'a t =
  { acc : 'a
  ; sep : char
  ; quote : char
  ; use_quoting : bool
  ; lineno : int
  ; step : Step.t
  ; field : string
  ; current_row : string list
  ; emit_field : string Row_buffer.t -> Buffer.t -> unit
  ; f : int -> 'a -> string Row_buffer.t -> 'a
  ; fields_used : int array option
  ; current_field : int
  ; next_field_index : int
  }
[@@deriving fields]

let make_emit_field ~strip current_row field =
  Row_buffer.append
    current_row
    (if strip then Shared.strip_buffer field else Buffer.contents field);
  Buffer.clear field
;;

let emit_row f i acc current_row =
  let acc = f (i + 1) acc current_row in
  Row_buffer.lax_clear current_row;
  acc
;;

let set_acc t acc = { t with acc }

let create ?(strip = false) ?(sep = ',') ?(quote = `Using '"') ~fields_used ~init ~f () =
  let fields_used =
    match fields_used with
    | None -> None
    | Some fields_used as x
      when Array.is_sorted_strictly fields_used ~compare:Int.ascending -> x
    | Some fields_used ->
      Some
        (Array.of_list
           (List.dedup_and_sort (Array.to_list fields_used) ~compare:Int.ascending))
  in
  { acc = init
  ; sep
  ; quote =
      (match quote with
       | `Using char -> char
       | `No_quoting -> '"')
  ; use_quoting =
      (match quote with
       | `Using _ -> true
       | `No_quoting -> false)
  ; lineno = 1
  ; step = Field_start
  ; field = ""
  ; current_row = []
  ; emit_field = make_emit_field ~strip
  ; f
  ; fields_used
  ; current_field = 0
  ; next_field_index = 0
  }
;;

let mutable_of_t t =
  let field = Buffer.create (String.length t.field) in
  Buffer.add_string field t.field;
  let current_row = Row_buffer.of_list t.current_row in
  field, current_row
;;

(* To reduce the number of allocations, we keep an array [fields_used] of the field
   indexes we care about. [current_field] is the position of the parser within the
   input row, and [next_field_index] is an index into the [fields_used] array
   indicating the next field that we need to store.

   If [fields_used] is None, we need to store every field.
*)
let should_enqueue fields_used current_field next_field_index =
  match fields_used with
  | None -> true
  | Some array ->
    next_field_index < Array.length array && array.(next_field_index) = current_field
;;

let input_aux ~get_length ~get t ?(pos = 0) ?len input =
  let field, current_row = mutable_of_t t in
  let enqueue = ref (should_enqueue t.fields_used t.current_field t.next_field_index) in
  let current_field = ref t.current_field in
  let next_field_index = ref t.next_field_index in
  let increment_field () =
    current_field := !current_field + 1;
    next_field_index := if !enqueue then !next_field_index + 1 else !next_field_index;
    enqueue := should_enqueue t.fields_used !current_field !next_field_index
  in
  let reset_field () =
    current_field := 0;
    next_field_index := 0;
    enqueue := should_enqueue t.fields_used !current_field !next_field_index
  in
  let loop_bound =
    match len with
    | Some i -> i + pos
    | None -> get_length input
  in
  let rec loop i t step =
    if i >= loop_bound
    then
      { t with
        step
      ; current_field = !current_field
      ; next_field_index = !next_field_index
      }
    else
      let open Char.Replace_polymorphic_compare in
      let continue = loop (i + 1) in
      let c = get input i in
      if c = '\r'
      then continue t step
      else (
        match step with
        | Field_start ->
          if c = t.quote && t.use_quoting
          then continue t In_quoted_field
          else if c = t.sep
          then (
            if !enqueue then t.emit_field current_row field;
            increment_field ();
            continue t Field_start)
          else if c = '\n'
          then (
            if !enqueue then t.emit_field current_row field;
            reset_field ();
            continue
              { t with acc = emit_row t.f i t.acc current_row; lineno = t.lineno + 1 }
              Field_start)
          else (
            if !enqueue then Buffer.add_char field c;
            continue t In_unquoted_field)
        | In_unquoted_field ->
          if c = t.sep
          then (
            if !enqueue then t.emit_field current_row field;
            increment_field ();
            continue t Field_start)
          else if c = '\n'
          then (
            if !enqueue then t.emit_field current_row field;
            reset_field ();
            continue
              { t with acc = emit_row t.f i t.acc current_row; lineno = t.lineno + 1 }
              Field_start)
          else (
            if !enqueue then Buffer.add_char field c;
            continue t step)
        | In_quoted_field ->
          if c = t.quote
          then continue t In_quoted_field_after_quote
          else (
            if !enqueue then Buffer.add_char field c;
            continue t step)
        | In_quoted_field_after_quote ->
          (* We must be using quoting to be in this state. *)
          if c = t.quote
          then (
            (* doubled quote *)
            if !enqueue then Buffer.add_char field t.quote;
            continue t In_quoted_field)
          else if c = '0'
          then (
            if !enqueue then Buffer.add_char field '\000';
            continue t In_quoted_field)
          else if c = t.sep
          then (
            if !enqueue then t.emit_field current_row field;
            increment_field ();
            continue t Field_start)
          else if c = '\n'
          then (
            if !enqueue then t.emit_field current_row field;
            reset_field ();
            continue
              { t with acc = emit_row t.f i t.acc current_row; lineno = t.lineno + 1 }
              Field_start)
          else if Char.is_whitespace c
          then continue t step
          else
            failwithf
              "In_quoted_field_after_quote looking at '%c' (lineno=%d)"
              c
              t.lineno
              ())
  in
  let t' = loop pos t t.step in
  { t' with
    field = Buffer.contents field
  ; current_row = Row_buffer.to_list current_row
  ; current_field = !current_field
  ; next_field_index = !next_field_index
  }
;;

let input t ?pos ?len input =
  input_aux ~get_length:Bytes.length ~get:Bytes.get t ?pos ?len input
;;

let input_string t ?pos ?len input =
  input_aux ~get_length:String.length ~get:String.get t ?pos ?len input
;;

let finish t =
  let field, current_row = mutable_of_t t in
  let enqueue = should_enqueue t.fields_used t.current_field t.next_field_index in
  let acc =
    match t.step with
    | Field_start ->
      if Row_buffer.length current_row <> 0
      then (
        if enqueue then t.emit_field current_row field;
        emit_row t.f 0 t.acc current_row)
      else t.acc
    | In_unquoted_field | In_quoted_field_after_quote ->
      if enqueue then t.emit_field current_row field;
      emit_row t.f 0 t.acc current_row
    | In_quoted_field ->
      raise (Bad_csv_formatting (Row_buffer.to_list current_row, Buffer.contents field))
  in
  { t with
    field = Buffer.contents field
  ; current_row = Row_buffer.to_list current_row
  ; current_field = 0
  ; next_field_index = 0
  ; acc
  }
;;
OCaml

Innovation. Community. Security.