package SZXX

  1. Overview
  2. Docs

Source file xlsx.ml

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
open! Core
open Eio.Std

type location = {
  sheet_number: int;
  row_number: int;
  col_index: int;
}
[@@deriving sexp_of]

type 'a cell_parser = {
  string: location -> string -> 'a;
  formula: location -> formula:string -> string -> 'a;
  error: location -> formula:string -> string -> 'a;
  boolean: location -> string -> 'a;
  number: location -> string -> 'a;
  date: location -> string -> 'a;
  null: 'a;
}

type 'a row = {
  sheet_number: int;
  row_number: int;
  data: 'a list;
}
[@@deriving sexp_of]

let origin = Date.add_days (Date.create_exn ~y:1900 ~m:(Month.of_int_exn 1) ~d:1) (-2)

let parse_date f = Float.to_int f |> Date.add_days origin

let parse_datetime ~zone f =
  let parts = Float.modf f in
  let date = Float.Parts.integral parts |> Float.to_int |> Date.add_days origin in
  let frac = Float.(Parts.fractional parts * 86400000. |> round) |> Time_float.Span.of_ms in
  let ofday = Time_float.Ofday.of_span_since_start_of_day_exn frac in
  Time_float.of_date_ofday ~zone date ofday

let xml_parser_options =
  Xml.SAX.
    {
      accept_html_boolean_attributes = false;
      accept_unquoted_attributes = false;
      accept_single_quoted_attributes = false;
      batch_size = 20;
    }

let xml_parser = Xml.SAX.make_parser xml_parser_options

let fold_angstrom ~filter_path ~on_match () =
  let sax = ref Xml.SAX.Expert.Stream.init in
  let on_parse node = sax := Xml.SAX.Expert.Stream.folder ~filter_path ~on_match !sax node in
  Zip.Action.Parse_many { parser = xml_parser; on_parse }

let parse_string_cell el =
  let open Xml.DOM in
  match dot "t" el with
  | Some { text; _ } -> text
  | None -> filter_map "r" el ~f:(dot_text "t") |> String.concat

module SST = struct
  type t = string Lazy.t array

  let filter_path = [ "sst"; "si" ]

  let zip_entry_filename = "xl/sharedStrings.xml"

  let from_feed feed =
    Switch.run @@ fun sw ->
    let q = Queue.create () in
    let seen = ref false in
    Zip.stream_files ~sw ~feed (function
      | { filename = "xl/sharedStrings.xml"; _ } ->
        seen := true;
        let on_match el = Queue.enqueue q (lazy (parse_string_cell el)) in
        fold_angstrom ~filter_path ~on_match ()
      | _ when !seen -> Terminate
      | _ -> Fast_skip )
    |> Sequence.iter ~f:(function
         | Zip.{ filename; _ }, Zip.Data.Parse_many state -> (
           match Zip.Data.parser_state_to_result state with
           | Ok x -> x
           | Error msg -> failwithf "SZXX: File '%s' error: %s" filename msg () )
         | _ -> () );

    Queue.to_array q

  let from_entries file (entries : Zip.entry list) =
    match List.find entries ~f:(fun entry -> String.( = ) entry.filename zip_entry_filename) with
    | None -> [||]
    | Some entry -> (
      let q = Queue.create () in
      let on_match el = Queue.enqueue q (lazy (parse_string_cell el)) in
      let action = fold_angstrom ~filter_path ~on_match () in
      match Zip.extract_from_index file entry action with
      | Zip.Data.Parse_many state ->
        (match Zip.Data.parser_state_to_result state with
        | Ok () -> ()
        | Error msg -> failwithf "SZXX: File '%s' error: %s" entry.filename msg ());
        Queue.to_array q
      | _ -> assert false )

  let from_file file = Zip.index_entries file |> from_entries file

  let resolve_sst_index (sst : t) ~sst_index =
    try Some (force (Array.get sst (Int.of_string sst_index))) with
    | _ -> None
end

let index_of_column s =
  let key =
    String.take_while s ~f:(function
      | 'A' .. 'Z' -> true
      | _ -> false )
  in
  String.fold key ~init:0 ~f:(fun acc c -> (acc * 26) + Char.to_int c - 64) - 1

module Expert = struct
  module SST = SST

  type delayed_string = {
    location: location;
    sst_index: string;
  }
  [@@deriving sexp_of]

  type 'a status =
    | Available of 'a
    | Delayed of delayed_string
  [@@deriving sexp_of]

  let unwrap_status cell_parser (sst : SST.t) (row : 'a status row) =
    let data =
      List.map row.data ~f:(function
        | Available x -> x
        | Delayed { location; sst_index } -> (
          match SST.resolve_sst_index sst ~sst_index with
          | Some index -> cell_parser.string location index
          | None -> cell_parser.null ) )
    in
    { row with data }

  let extract_cell_sst, extract_cell_status =
    let open Xml.DOM in
    let extract ~null location extractor : element option -> 'a = function
      | None -> null
      | Some { text; _ } -> extractor location text
    in
    let extract_cell_base { string; formula; error; boolean; number; date; null } location el ty =
      match ty with
      | None
       |Some "n" ->
        dot "v" el |> extract ~null location number
      | Some "d" -> dot "v" el |> extract ~null location date
      | Some "str" -> (
        match dot_text "v" el with
        | None -> null
        | Some s -> formula location s ~formula:(dot_text "f" el |> Option.value ~default:"") )
      | Some "e" -> (
        match dot_text "v" el with
        | None -> null
        | Some s -> error location s ~formula:(dot_text "f" el |> Option.value ~default:"") )
      | Some "inlineStr" -> (
        match dot "is" el with
        | None -> null
        | Some el -> string location (parse_string_cell el) )
      | Some "b" -> dot "v" el |> extract ~null location boolean
      | Some t ->
        failwithf "Unknown data type: %s. Please report this bug. %s" t
          (sexp_of_element el |> Sexp.to_string)
          ()
    in
    let extract_cell_sst sst cell_parser location el =
      match Xml.DOM.get_attr el.attrs "t" with
      | Some "s" -> (
        match dot "v" el with
        | None -> cell_parser.null
        | Some { text = sst_index; _ } -> (
          match SST.resolve_sst_index sst ~sst_index with
          | None -> cell_parser.null
          | Some resolved -> cell_parser.string location resolved ) )
      | ty -> extract_cell_base cell_parser location el ty
    in
    let extract_cell_status cell_parser location el =
      match Xml.DOM.get_attr el.attrs "t" with
      | Some "s" -> (
        match dot "v" el with
        | None -> Available cell_parser.null
        | Some { text = sst_index; _ } -> Delayed { location; sst_index } )
      | ty -> Available (extract_cell_base cell_parser location el ty)
    in
    extract_cell_sst, extract_cell_status

  let parse_row_with_sst sst cell_parser ({ data; sheet_number; row_number } as row) =
    let open Xml.DOM in
    match data with
    | [] -> { row with data = [] }
    | _ ->
      let rec loop i acc = function
        | [] -> List.rev acc
        | el :: rest ->
          let col_index =
            Xml.DOM.get_attr el.attrs "r" |> Option.value_map ~default:i ~f:index_of_column
          in
          let v = extract_cell_sst sst cell_parser { col_index; sheet_number; row_number } el in
          let acc = Fn.apply_n_times ~n:(col_index - i) (List.cons cell_parser.null) acc in
          (loop [@tailcall]) (col_index + 1) (v :: acc) rest
      in
      { row with data = loop 0 [] data }

  let parse_row_without_sst cell_parser ({ data; sheet_number; row_number } as row) =
    let open Xml.DOM in
    match data with
    | [] -> { row with data = [] }
    | _ ->
      let rec loop i acc = function
        | [] -> List.rev acc
        | el :: rest ->
          let col_index =
            Xml.DOM.get_attr el.attrs "r" |> Option.value_map ~default:i ~f:index_of_column
          in
          let v = extract_cell_status cell_parser { col_index; sheet_number; row_number } el in
          let acc = Fn.apply_n_times ~n:(col_index - i) (List.cons (Available cell_parser.null)) acc in
          (loop [@tailcall]) (col_index + 1) (v :: acc) rest
      in
      { row with data = loop 0 [] data }
end

let parse_sheet ~sheet_number push =
  let num = ref 0 in
  let on_match (el : Xml.DOM.element) =
    (match Xml.DOM.get_attr el.attrs "r" with
    | None -> incr num
    | Some s -> (
      try
        let i = Int.of_string s in
        (* Insert blank rows *)
        for row_number = !num to i - 2 do
          push { sheet_number; row_number; data = [] }
        done;
        num := i
      with
      | _ -> incr num ));
    push { sheet_number; row_number = !num; data = el.children }
  in
  fold_angstrom ~filter_path:[ "worksheet"; "sheetData"; "row" ] ~on_match ()

let get_sheet_action ~filter_sheets (entry : Zip.entry) push =
  let open Option.Monad_infix in
  String.chop_prefix ~prefix:"xl/worksheets/sheet" entry.filename
  >>= String.chop_suffix ~suffix:".xml"
  >>= (fun s -> Option.try_with (fun () -> Int.of_string s))
  |> Option.filter ~f:(fun sheet_id ->
       Option.value_map filter_sheets ~default:true ~f:(fun f ->
         f ~sheet_id ~raw_size:(Byte_units.of_bytes_int64_exn entry.descriptor.uncompressed_size) ) )
  >>| fun sheet_number -> parse_sheet ~sheet_number push

let stream_rows_double_pass ?filter_sheets ~sw file cell_parser =
  let entries = Zip.index_entries file in
  let sst = SST.from_entries file entries in
  Sequence.of_seq
  @@ Fiber.fork_seq ~sw
  @@ fun yield ->
  let push x = yield (Expert.parse_row_with_sst sst cell_parser x) in

  List.iter entries ~f:(fun ({ filename; _ } as entry) ->
    get_sheet_action ~filter_sheets entry push
    |> Option.iter ~f:(fun action ->
         match Zip.extract_from_index file entry action with
         | Zip.Data.Parse_many state -> (
           match Zip.Data.parser_state_to_result state with
           | Ok () -> ()
           | Error msg -> failwithf "SZXX: File '%s': %s" filename msg () )
         | _ -> assert false ) )

let process_file ?filter_sheets ~sw ~feed (sst_p, sst_w) yield =
  let q = Queue.create () in

  Zip.stream_files ~sw ~feed (function
    | { filename = "xl/sharedStrings.xml"; _ } ->
      let on_match el = Queue.enqueue q (lazy (parse_string_cell el)) in
      fold_angstrom ~filter_path:SST.filter_path ~on_match ()
    | entry -> get_sheet_action ~filter_sheets entry yield |> Option.value ~default:Zip.Action.Fast_skip )
  |> Sequence.iter ~f:(function
       | Zip.{ filename = "xl/sharedStrings.xml"; _ }, Zip.Data.Parse_many state -> (
         match Zip.Data.parser_state_to_result state with
         | Ok () -> Promise.resolve sst_w (Queue.to_array q)
         | Error msg -> failwithf "SZXX: File '%s': %s" SST.zip_entry_filename msg () )
       | Zip.{ filename; _ }, Zip.Data.Parse_many state -> (
         match Zip.Data.parser_state_to_result state with
         | Ok () -> ()
         | Error msg -> failwithf "SZXX: File '%s': %s" filename msg () )
       | _ -> () );

  if not (Promise.is_resolved sst_p) then Promise.resolve sst_w (Queue.to_array q)

type status =
  | All_buffered
  | Overflowed
  | Got_SST of SST.t

let with_minimal_buffering ?max_buffering ?filter cell_parser sst_p yield raw_rows =
  let q = Queue.create ?capacity:max_buffering () in
  let highwater =
    match max_buffering with
    | None -> Int.max_value
    | Some x when Int.is_non_negative x -> x
    | Some x -> failwithf "SZXX: stream_rows_single_pass max_buffering: %d < 0" x ()
  in

  let status, raw_rows =
    let filter = Option.value filter ~default:(fun _ -> true) in
    let rec loop acc =
      match Promise.peek sst_p with
      | Some sst -> Got_SST sst, acc
      | None -> (
        match acc () with
        | Seq.Nil -> All_buffered, Seq.empty
        | Cons (_, acc) when Queue.length q = highwater -> Overflowed, acc
        | Cons (row, next) ->
          if filter row then Queue.enqueue q row;
          Fiber.yield ();
          (loop [@tailcall]) next )
    in
    loop raw_rows
  in

  match status with
  | All_buffered ->
    let sst = Promise.await sst_p in
    Queue.iter q ~f:(fun raw -> yield (Expert.parse_row_with_sst sst cell_parser raw))
  | Overflowed ->
    failwithf "SZXX: stream_rows_single_pass max_buffering exceeded %d."
      (Option.value max_buffering ~default:Int.max_value)
      ()
  | Got_SST sst -> (
    Queue.iter q ~f:(fun row -> yield (Expert.parse_row_with_sst sst cell_parser row));
    match filter with
    | None -> Seq.iter (fun raw -> yield (Expert.parse_row_with_sst sst cell_parser raw)) raw_rows
    | Some filter ->
      Seq.iter
        (fun raw -> if filter raw then yield (Expert.parse_row_with_sst sst cell_parser raw))
        raw_rows )

let stream_rows_single_pass ?max_buffering ?filter ?filter_sheets ~sw ~feed cell_parser =
  Sequence.of_seq
  @@ Fiber.fork_seq ~sw
  @@ fun yield ->
  let ((sst_p, _) as p) = Promise.create () in
  Fiber.fork_seq ~sw (process_file ?filter_sheets ~sw ~feed p)
  |> with_minimal_buffering ?max_buffering ?filter cell_parser sst_p yield

let unescape = Xml.DOM.unescape

let string_cell_parser : string cell_parser =
  {
    string = (fun _location s -> unescape s);
    formula = (fun _location ~formula:_ s -> unescape s);
    error = (fun _location ~formula s -> sprintf !"#ERROR# %{unescape} -> %{unescape}" formula s);
    boolean = (fun _location s -> if String.(s = "0") then "false" else "true");
    number = (fun _location s -> s);
    date = (fun _location s -> s);
    null = "";
  }

let yojson_cell_parser : [> `Bool of bool | `Float of float | `String of string | `Null ] cell_parser =
  {
    string = (fun _location s -> `String (unescape s));
    formula = (fun _location ~formula:_ s -> `String (unescape s));
    error =
      (fun _location ~formula s -> `String (sprintf !"#ERROR# %{unescape} -> %{unescape}" formula s));
    boolean = (fun _location s -> `Bool String.(s = "1"));
    number = (fun _location s -> `Float (Float.of_string s));
    date = (fun _location s -> `String s);
    null = `Null;
  }
OCaml

Innovation. Community. Security.