package shell

  1. Overview
  2. Docs

Source file string_extended.ml

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
open Core

(* Natural ordering like found in gnome nautilus, the mac finder etc...
   Refer to Mli for more documentation
*)
let collate s1 s2 =
  let pos1 = ref 0
  and pos2 = ref 0 in
  let next ~ok s pos =
    if !pos = String.length s
    then None
    else (
      let c = s.[!pos] in
      if ok c
      then (
        incr pos;
        Some c)
      else None)
  in
  let compare_non_numerical () =
    let ok c = not (Char.is_digit c) in
    let rec loop () =
      match next ~ok s1 pos1, next ~ok s2 pos2 with
      | Some _, None -> 1
      | None, Some _ -> -1
      | None, None -> 0
      | Some c1, Some c2 when Char.equal c1 c2 -> loop ()
      | Some c1, Some c2 -> Char.compare c1 c2
    in
    loop ()
  in
  let compare_numerical () =
    let rec consume0 s pos =
      match next ~ok:(Char.equal '0') s pos with
      | Some _ -> consume0 s pos
      | None -> ()
    in
    (* Our main loop works on string representation of ints where all the
       trailing zeros have been chopped of. Their magnitude is given by the
       length of their representation. If they have the same magnitude the
       lexical order is correct. Bias is used to save that information.
    *)
    let ok = Char.is_digit in
    let bias = ref 0 in
    let rec loop () =
      match next ~ok s1 pos1, next ~ok s2 pos2 with
      | Some _, None -> 1
      | None, Some _ -> -1
      | None, None when !bias <> 0 -> !bias
      | None, None ->
        (* Both ints have the same value, The one with the shortest
           representation (i.e. the least trailing zeroes) is
           considered to be the smallest*)
        !pos1 - !pos2
      | Some c1, Some c2 when !bias = 0 ->
        bias := Char.compare c1 c2;
        loop ()
      | Some _, Some _ -> loop ()
    in
    consume0 s1 pos1;
    consume0 s2 pos2;
    loop ()
  in
  let s1_length = String.length s1 in
  let s2_length = String.length s2 in
  let rec loop () =
    let r = compare_non_numerical () in
    let r' = compare_numerical () in
    match r, r' with
    | 0, 0 when !pos1 = s1_length && !pos2 = s2_length -> 0
    | 0, 0 -> loop ()
    | 0, i | i, _ -> i
  in
  loop ()
;;

(**
   Inverse operation of [String.escaped]
*)
exception Unescape_error of bool * int * string

(* The stdlib's escaped does a lot of fancy wazoo magic to avoid
   using a buffer:
   It works in two passes, the first one calculates the length of the string to
   allocate and the second one does the actual escaping.

   This would be more cumbersome to do here but might be worth the hassle if
   performance ever gets to be an issue *)
let unescaped' ?(strict = true) s =
  let len = String.length s in
  let pos = ref 0 in
  let error ?(fatal = false) message = raise (Unescape_error (fatal, !pos, message)) in
  let consume () =
    let i = !pos in
    if i = len then error "unexpectedly reached end of string";
    let c = s.[i] in
    pos := i + 1;
    c
  in
  let res = Buffer.create len in
  let emit c = Buffer.add_char res c in
  let emit_code code =
    match Char.of_int code with
    | Some c -> emit c
    | None -> error ~fatal:true (Printf.sprintf "got invalid escape code %d" code)
  in
  let rec loop () =
    if !pos < len
    then (
      let c = consume () in
      if Char.( <> ) c '\\'
      then emit c
      else (
        let mark = !pos in
        try
          let c = consume () in
          match c with
          | '\\' | '\"' -> emit c
          | 'b' -> emit '\b'
          | 'n' -> emit '\n'
          | 'r' -> emit '\r'
          | 't' -> emit '\t'
          | '\n' ->
            let rec consume_blank () =
              if !pos < len
              then (
                match consume () with
                | ' ' | '\t' -> consume_blank ()
                | _ -> decr pos)
            in
            consume_blank ()
          | 'x' ->
            let c2hex c =
              let open Char.O in
              if c >= 'A' && c <= 'F'
              then Char.to_int c + 10 - Char.to_int 'A'
              else if c >= 'a' && c <= 'f'
              then Char.to_int c + 10 - Char.to_int 'a'
              else if c >= '0' && c <= '9'
              then Char.to_int c - Char.to_int '0'
              else error (Printf.sprintf "expected hex digit, got: %c" c)
            in
            let c1 = consume () in
            let c2 = consume () in
            emit_code ((16 * c2hex c1) + c2hex c2)
          | c when Char.is_digit c ->
            let char_to_num c =
              match Char.get_digit c with
              | None -> error (Printf.sprintf "expected digit,got: %c" c)
              | Some i -> i
            in
            let i1 = char_to_num c in
            let i2 = char_to_num (consume ()) in
            let i3 = char_to_num (consume ()) in
            emit_code ((100 * i1) + (10 * i2) + i3)
          | c -> error (Printf.sprintf "got invalid escape character: %c" c)
        with
        | Unescape_error (false, _, _) when not strict ->
          emit '\\';
          pos := mark);
      loop ())
    else Buffer.contents res
  in
  loop ()
;;

let unescaped_exn ?strict s =
  try unescaped' ?strict s with
  | Unescape_error (_, pos, message) ->
    invalid_argf
      "String_extended.unescaped_exn error at position %d of %s: %s"
      pos
      s
      message
      ()
;;

let squeeze str =
  let len = String.length str in
  let buf = Buffer.create len in
  let rec skip_spaces i =
    if i >= len
    then Buffer.contents buf
    else (
      let c = str.[i] in
      if Char.O.(c = ' ' || c = '\n' || c = '\t' || c = '\r')
      then skip_spaces (i + 1)
      else (
        Buffer.add_char buf c;
        copy_chars (i + 1)))
  and copy_chars i =
    if i >= len
    then Buffer.contents buf
    else (
      let c = str.[i] in
      if Char.O.(c = ' ' || c = '\n' || c = '\t' || c = '\r')
      then (
        Buffer.add_char buf ' ';
        skip_spaces (i + 1))
      else (
        Buffer.add_char buf c;
        copy_chars (i + 1)))
  in
  copy_chars 0
;;

let line_break ~len s =
  let buf = Buffer.create len in
  let flush_buf () =
    let res = Buffer.contents buf in
    Buffer.reset buf;
    res
  in
  let rec loop acc = function
    | [] ->
      let acc =
        if Buffer.length buf <> 0
        then flush_buf () :: acc
        else if List.is_empty acc
        then [ "" ]
        else acc
      in
      List.rev acc
    | h :: t when Buffer.length buf = 0 ->
      Buffer.add_string buf h;
      loop acc t
    | h :: t when Buffer.length buf + 1 + String.length h < len ->
      Buffer.add_char buf ' ';
      Buffer.add_string buf h;
      loop acc t
    | l -> loop (flush_buf () :: acc) l
  in
  List.concat_map (String.split ~on:'\n' s) ~f:(fun s -> loop [] (String.split ~on:' ' s))
;;

(* Finds out where to break a given line; returns the len of the line to break
   and the staring position of the next line.*)
let rec word_wrap__break_one ~hard_limit ~soft_limit ~previous_match s ~pos ~len =
  if pos = String.length s
  then len, pos
  else if previous_match > 0 && len >= soft_limit
  then previous_match, pos - len + previous_match + 1
  else if len >= hard_limit
  then len, pos
  else (
    match s.[pos] with
    (* Detect \r\n as one newline and not two... *)
    | '\r' when pos < String.length s - 1 && Char.equal s.[pos + 1] '\n' -> len, pos + 2
    | '\r' | '\n' -> len, pos + 1
    | ' ' | '\t' ->
      word_wrap__break_one
        s
        ~hard_limit
        ~soft_limit
        ~previous_match:len
        ~pos:(pos + 1)
        ~len:(len + 1)
    | _ ->
      word_wrap__break_one
        s
        ~previous_match
        ~hard_limit
        ~soft_limit
        ~pos:(pos + 1)
        ~len:(len + 1))
;;

(* Returns an pos*length list of all the lines (as substrings of the argument
   passed in) *)
let rec word_wrap__find_substrings ~hard_limit ~soft_limit s acc pos =
  if pos < String.length s
  then (
    let len, new_pos =
      word_wrap__break_one s ~hard_limit ~soft_limit ~previous_match:0 ~pos ~len:0
    in
    word_wrap__find_substrings ~hard_limit ~soft_limit s ((pos, len) :: acc) new_pos)
  else acc
;;

let word_wrap
      ?(trailing_nl = false)
      ?(soft_limit = 80)
      ?(hard_limit = Int.max_value)
      ?(nl = "\n")
      s
  =
  let soft_limit = min soft_limit hard_limit in
  let lines = word_wrap__find_substrings ~soft_limit ~hard_limit s [] 0 in
  match lines with
  | [] | [ _ ] -> if trailing_nl then s ^ nl else s
  | (hpos, hlen) :: t ->
    let nl_len = String.length nl in
    let body_len = List.fold_left t ~f:(fun acc (_, len) -> acc + nl_len + len) ~init:0 in
    let res_len = if trailing_nl then body_len + hlen + nl_len else body_len + hlen in
    let res = Bytes.create res_len in
    if trailing_nl
    then
      Bytes.From_string.blit
        ~src:nl
        ~dst:res
        ~len:nl_len
        ~src_pos:0
        ~dst_pos:(body_len + hlen);
    Bytes.From_string.blit ~src:s ~dst:res ~len:hlen ~src_pos:hpos ~dst_pos:body_len;
    let rec blit_loop dst_end_pos = function
      | [] -> ()
      | (src_pos, len) :: rest ->
        let dst_pos = dst_end_pos - len - nl_len in
        Bytes.From_string.blit ~src:s ~dst:res ~len ~src_pos ~dst_pos;
        Bytes.From_string.blit
          ~src:nl
          ~dst:res
          ~len:nl_len
          ~src_pos:0
          ~dst_pos:(dst_pos + len);
        blit_loop dst_pos rest
    in
    blit_loop body_len t;
    Bytes.unsafe_to_string ~no_mutation_while_string_reachable:res
;;

let edit_distance_matrix ?transpose s1 s2 =
  let transpose = Option.is_some transpose in
  let l1, l2 = String.length s1, String.length s2 in
  let d = Array.make_matrix 0 ~dimx:(l1 + 1) ~dimy:(l2 + 1) in
  for x = 0 to l1 do
    d.(x).(0) <- x
  done;
  for y = 0 to l2 do
    d.(0).(y) <- y
  done;
  for y = 1 to l2 do
    for x = 1 to l1 do
      let min_d =
        if Char.equal s1.[x - 1] s2.[y - 1]
        then d.(x - 1).(y - 1)
        else
          List.reduce_exn
            ~f:min
            [ d.(x - 1).(y) + 1; d.(x).(y - 1) + 1; d.(x - 1).(y - 1) + 1 ]
      in
      let min_d =
        if transpose
        && x > 1
        && y > 1
        && Char.equal s1.[x - 1] s2.[y - 2]
        && Char.equal s1.[x - 2] s2.[y - 1]
        then min min_d (d.(x - 2).(y - 2) + 1)
        else min_d
      in
      d.(x).(y) <- min_d
    done
  done;
  d
;;

let edit_distance ?transpose s1 s2 =
  (edit_distance_matrix ?transpose s1 s2).(String.length s1).(String.length s2)
;;
OCaml

Innovation. Community. Security.