Source file uCol.ml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
# 1 "Camomile/public/uCol.ml"
(** Unicode collation algorithm *)
type variable_option =
[ `Blanked
| `Non_ignorable
| `Shifted
| `Shift_Trimmed ]
type precision = [ `Primary | `Secondary | `Tertiary | `Quaternary ]
module type Type =
sig
type text
type index
(** For locale, see {!Locale}.
If [locale] is omitted, the standard UCA order is used.
If [prec] is omitted, the maximum possible strength is used.
If [variable] is omitted, the default of the locale
(usually [`Shifted]) is used.
The meaning of the returned value is similar to Pervasives.compare *)
val compare :
?locale:string -> ?prec:precision -> ?variable:variable_option ->
text -> text -> int
(** Binary comparison of sort_key gives the same result as [compare].
i.e.
[compare t1 t2 = Pervasives.compare (sort_key t1) (sort_key t2)]
If the same texts are repeatedly compared,
pre-computation of sort_key gives better performance. *)
val sort_key :
?locale:string -> ?prec:precision -> ?variable:variable_option ->
text -> string
(** Comparison with the sort key. *)
val compare_with_key :
?locale: string -> ?prec:precision -> ?variable:variable_option ->
string -> text -> int
val search_with_key :
?locale: string -> ?prec:precision -> ?variable:variable_option ->
string -> text -> index -> (index * index)
val search :
?locale: string -> ?prec:precision -> ?variable:variable_option ->
text -> text -> index -> (index * index)
end
module Make (Config : ConfigInt.Type) (Text : UnicodeString.Type) = struct
module Unidata = Unidata.Make(Config)
module UCharInfo = UCharInfo.Make(Config)
let logical_order_exception_tbl =
UCharInfo.load_property_tbl `Logical_Order_Exception
let is_logical_order_exception u =
UCharTbl.Bool.get logical_order_exception_tbl u
let rec rearrange_aux x pos =
if pos > XString.length x - 2 then () else
let u = XString.get x pos in
if is_logical_order_exception u then begin
XString.set x pos (XString.get x (pos + 1));
XString.set x (pos + 1) u;
rearrange_aux x (pos + 2)
end else
rearrange_aux x (pos + 1)
let rearrange x = rearrange_aux x 0
let remove_ignorable ce_tbl x =
let rec loop0 i =
if XString.length x <= i then () else
let u = XString.get x i in
match Unidata.ce ce_tbl u with
[([], [ce])] when ce = Unidata.complete_ignorable ->
loop1 (i + 1) i
| _ ->
loop0 (i + 1)
and loop1 i k =
if XString.length x <= i then begin
XString.shrink x k;
end else
let u = XString.get x i in
match Unidata.ce ce_tbl u with
[([], [ce])] when ce = Unidata.complete_ignorable ->
loop1 (i + 1) k
| _ ->
XString.set x k u; loop1 (i + 1) (k + 1) in
loop0 0
let noncharacter_code_point_tbl =
UCharInfo.load_property_tbl `Noncharacter_Code_Point
let is_noncharacter_code_point u =
UCharTbl.Bool.get noncharacter_code_point_tbl u
let reverse s =
if Bytes.length s = 0 then () else
let last = Bytes.length s - 1 in
for i = 0 to last / 2 do
let c = Bytes.get s i in
Bytes.set s i (Bytes.get s (last - i));
Bytes.set s (last - i) c
done
let shiftright x i j =
for k = j downto i do XString.set x (k + 1) (XString.get x k) done
let rec remove_chars x i = function
[] -> i
| j :: rest ->
shiftright x i (j - 1);
remove_chars x (i + 1) rest
let trim start_regular key =
let rec loop i =
if i > 0 &&
(Char.code key.[i - 1]) lsl 8
lor (Char.code key.[i]) > start_regular
then loop (i - 2)
else String.sub key 0 (i + 1) in
loop (String.length key - 1)
let is_variable variable_top ce =
Unidata.primary ce <> 0 && Unidata.primary ce <= variable_top
let is_ignorable ce = Unidata.primary ce = 0
let add_i16 buf n =
Buffer.add_char buf (Char.unsafe_chr (n lsr 8));
Buffer.add_char buf (Char.unsafe_chr (n land 255))
let add_byte buf n =
Buffer.add_char buf (Char.unsafe_chr n)
type non_ignorable_keybuf =
{non_ignorable_col_info : Unidata.col_info;
non_ignorable_prec : precision;
non_ignorable_primary : Buffer.t;
non_ignorable_secondary : Buffer.t;
non_ignorable_tertiary : Buffer.t;
non_ignorable_quaternary : Buffer.t;
mutable non_ignorable_count : int}
let addce_non_ignorable keybuf ce =
let w1 = Unidata.primary ce in
if w1 <> 0 && w1 <> keybuf.non_ignorable_col_info.hiraganaQ_weight then
add_i16 keybuf.non_ignorable_primary w1;
match keybuf.non_ignorable_prec with `Primary -> () | _ ->
let w2 = Unidata.secondary ce in
if w2 <> 0 then add_byte keybuf.non_ignorable_secondary w2;
match keybuf.non_ignorable_prec with `Secondary -> () | _ ->
let w3 = Unidata.tertiary ce in
if w3 <> 0 then add_byte keybuf.non_ignorable_tertiary w3;
match keybuf.non_ignorable_prec with `Tertiary -> () | _ ->
if not keybuf.non_ignorable_col_info.hiraganaQ then () else
if w1 = keybuf.non_ignorable_col_info.hiraganaQ_weight then begin
if keybuf.non_ignorable_count > 0 then begin
add_i16
keybuf.non_ignorable_quaternary
(1 + keybuf.non_ignorable_count);
keybuf.non_ignorable_count <- 0;
end;
add_i16 keybuf.non_ignorable_quaternary 1;
end else begin
keybuf.non_ignorable_count <- keybuf.non_ignorable_count + 1;
if keybuf.non_ignorable_count = 0xffff - 1 then begin
add_i16 keybuf.non_ignorable_quaternary 0xffff;
keybuf.non_ignorable_count <- 0;
end
end
let terminate_non_ignorable keybuf =
let c = keybuf.non_ignorable_count in
if c > 0 then add_i16 keybuf.non_ignorable_quaternary (1 + c)
type blanked_keybuf =
{blanked_col_info : Unidata.col_info;
blanked_prec : precision;
blanked_primary : Buffer.t;
blanked_secondary : Buffer.t;
blanked_tertiary : Buffer.t;
blanked_quaternary : Buffer.t;
mutable blanked_after_variable : bool;
mutable blanked_count : int}
let addce_blanked keybuf ce =
if is_ignorable ce && keybuf.blanked_after_variable then () else
if is_variable keybuf.blanked_col_info.variable_top ce then
keybuf.blanked_after_variable <- true
else begin
keybuf.blanked_after_variable <- false;
let w1 = Unidata.primary ce in
if w1 <> 0 && w1 <> keybuf.blanked_col_info.hiraganaQ_weight then
add_i16 keybuf.blanked_primary w1;
match keybuf.blanked_prec with `Primary -> () | _ ->
let w2 = Unidata.secondary ce in
if w2 <> 0 then add_byte keybuf.blanked_secondary w2;
match keybuf.blanked_prec with `Secondary -> () | _ ->
let w3 = Unidata.tertiary ce in
if w3 <> 0 then add_byte keybuf.blanked_tertiary w3;
match keybuf.blanked_prec with `Tertiary -> () | _ ->
if not keybuf.blanked_col_info.hiraganaQ then () else
if w1 = keybuf.blanked_col_info.hiraganaQ_weight then begin
if keybuf.blanked_count > 0 then begin
add_i16 keybuf.blanked_quaternary (1 + keybuf.blanked_count);
keybuf.blanked_count <- 0
end;
add_i16 keybuf.blanked_quaternary 1;
end else begin
keybuf.blanked_count <- keybuf.blanked_count + 1;
if keybuf.blanked_count = 0xffff - 1 then begin
add_i16 keybuf.blanked_quaternary 0xffff;
keybuf.blanked_count <- 0
end
end
end
let terminate_blanked keybuf =
let c = keybuf.blanked_count in
if c > 0 then add_i16 keybuf.blanked_quaternary (1 + c)
type shifted_keybuf =
{shifted_col_info : Unidata.col_info;
shifted_prec : precision;
shifted_primary : Buffer.t;
shifted_secondary : Buffer.t;
shifted_tertiary : Buffer.t;
shifted_quaternary : Buffer.t;
mutable shifted_after_variable : bool;
mutable shifted_count : int}
let start_regular keybuf =
if keybuf.shifted_col_info.hiraganaQ then
keybuf.shifted_col_info.hiraganaQ_weight
else
keybuf.shifted_col_info.variable_top
let addce_shifted keybuf ce =
let start_regular = start_regular keybuf in
if is_ignorable ce && keybuf.shifted_after_variable then () else
if is_variable keybuf.shifted_col_info.variable_top ce then begin
keybuf.shifted_after_variable <- true;
match keybuf.shifted_prec with `Quaternary ->
if keybuf.shifted_count > 0 then begin
add_i16
keybuf.shifted_quaternary
(start_regular + keybuf.shifted_count);
keybuf.shifted_count <- 0
end;
add_i16 keybuf.shifted_quaternary (Unidata.primary ce);
| _ -> ()
end else begin
keybuf.shifted_after_variable <- false;
let w1 = Unidata.primary ce in
if w1 <> 0 && w1 <> keybuf.shifted_col_info.hiraganaQ_weight then
add_i16 keybuf.shifted_primary w1;
match keybuf.shifted_prec with `Primary -> () | _ ->
let w2 = Unidata.secondary ce in
if w2 <> 0 then add_byte keybuf.shifted_secondary w2;
match keybuf.shifted_prec with `Secondary -> () | _ ->
let w3 = Unidata.tertiary ce in
if w3 <> 0 then add_byte keybuf.shifted_tertiary w3;
match keybuf.shifted_prec with `Tertiary -> () | _ ->
if is_ignorable ce then () else
if w1 = keybuf.shifted_col_info.hiraganaQ_weight &&
keybuf.shifted_col_info.hiraganaQ then begin
if keybuf.shifted_count > 0 then begin
add_i16
keybuf.shifted_quaternary
(start_regular + keybuf.shifted_count);
keybuf.shifted_count <- 0
end;
add_i16 keybuf.shifted_quaternary w1
end else begin
keybuf.shifted_count <- keybuf.shifted_count + 1;
if keybuf.shifted_count = 0xffff - start_regular then begin
add_i16 keybuf.shifted_quaternary 0xffff;
keybuf.shifted_count <- 0
end
end
end
let terminate_shifted keybuf =
let c = keybuf.shifted_count in
if c > 0 then
add_i16
keybuf.shifted_quaternary
((start_regular keybuf) + c)
let terminate_shift_trimmed keybuf =
let k4 = Buffer.contents keybuf.shifted_quaternary in
let k4 = trim (start_regular keybuf) k4 in
Buffer.clear keybuf.shifted_quaternary;
Buffer.add_string keybuf.shifted_quaternary k4
type keybuf =
Non_ignorable of non_ignorable_keybuf
| Blanked of blanked_keybuf
| Shifted of shifted_keybuf
| Shift_Trimmed of shifted_keybuf
let create_keybuf prec col_info =
match col_info.Unidata.variable_option with
`Non_ignorable ->
Non_ignorable
{non_ignorable_col_info = col_info;
non_ignorable_prec = prec;
non_ignorable_primary = Buffer.create 0;
non_ignorable_secondary = Buffer.create 0;
non_ignorable_tertiary = Buffer.create 0;
non_ignorable_quaternary = Buffer.create 0;
non_ignorable_count = 0}
| `Blanked ->
Blanked
{blanked_col_info = col_info;
blanked_prec = prec;
blanked_primary = Buffer.create 0;
blanked_secondary = Buffer.create 0;
blanked_tertiary = Buffer.create 0;
blanked_quaternary = Buffer.create 0;
blanked_after_variable = false;
blanked_count = 0}
| `Shifted ->
Shifted
{shifted_col_info = col_info;
shifted_prec = prec;
shifted_primary = Buffer.create 0;
shifted_secondary = Buffer.create 0;
shifted_tertiary = Buffer.create 0;
shifted_quaternary = Buffer.create 0;
shifted_after_variable = false;
shifted_count = 0}
| `Shift_Trimmed ->
Shift_Trimmed
{shifted_col_info = col_info;
shifted_prec = prec;
shifted_primary = Buffer.create 0;
shifted_secondary = Buffer.create 0;
shifted_tertiary = Buffer.create 0;
shifted_quaternary = Buffer.create 0;
shifted_after_variable = false;
shifted_count = 0}
let col_info_of_keybuf = function
Non_ignorable b -> b.non_ignorable_col_info
| Blanked b -> b.blanked_col_info
| Shifted b | Shift_Trimmed b -> b.shifted_col_info
let precision_of_keybuf = function
Non_ignorable b -> b.non_ignorable_prec
| Blanked b -> b.blanked_prec
| Shifted b | Shift_Trimmed b -> b.shifted_prec
let primary_of_keybuf = function
Non_ignorable b -> b.non_ignorable_primary
| Blanked b -> b.blanked_primary
| Shifted b | Shift_Trimmed b -> b.shifted_primary
let secondary_of_keybuf = function
Non_ignorable b -> b.non_ignorable_secondary
| Blanked b -> b.blanked_secondary
| Shifted b | Shift_Trimmed b -> b.shifted_secondary
let tertiary_of_keybuf = function
Non_ignorable b -> b.non_ignorable_tertiary
| Blanked b -> b.blanked_tertiary
| Shifted b | Shift_Trimmed b -> b.shifted_tertiary
let quaternary_of_keybuf = function
Non_ignorable b -> b.non_ignorable_quaternary
| Blanked b -> b.blanked_quaternary
| Shifted b | Shift_Trimmed b -> b.shifted_quaternary
let addce keybuf ce =
match keybuf with
Non_ignorable keybuf -> addce_non_ignorable keybuf ce
| Blanked keybuf -> addce_blanked keybuf ce
| Shifted keybuf | Shift_Trimmed keybuf ->
addce_shifted keybuf ce
let terminate = function
Non_ignorable keybuf -> terminate_non_ignorable keybuf
| Blanked keybuf -> terminate_blanked keybuf
| Shifted keybuf ->
terminate_shifted keybuf
| Shift_Trimmed keybuf ->
terminate_shift_trimmed keybuf
let rec add_list keybuf = function
[] -> ()
| e :: rest -> addce keybuf e; add_list keybuf rest
let implicit_ce cebuf u =
let n = UChar.uint_code u in
if
n < 0 || n > 0x10ffff ||
(match UCharInfo.general_category u with `Cs -> true | _ -> false) ||
is_noncharacter_code_point u
then
addce cebuf Unidata.complete_ignorable
else
let base =
if n >= 0x4e00 && n <= 0x9fff then 0xfb40 else
if n >= 0x3400 && n <= 0x4dbf then 0xfb80 else
if n >= 0x20000 && n <= 0x2a6df then 0xfb80 else
0xfbc0
in
let a = base + n lsr 15 in
let b = (n land 0x7fff) lor 0x8000 in
addce cebuf (Unidata.compose_ce a 1 1);
addce cebuf (Unidata.compose_ce b 0 0)
let rec match_us2 x i c' = function
[] -> []
| (u :: rest) as us ->
if i >= XString.length x then raise Exit else
let u' = XString.get x i in
let c = UCharInfo.combined_class u' in
if c'= 0 || c = 0 || c' = c then raise Exit else
if UChar.eq u u' then i :: (match_us2 x (i + 1) c' rest) else
match_us2 x (i + 1) c us
let rec match_us1 x i = function
[] -> i
| (u :: rest) as us ->
if i >= XString.length x then raise Exit else
let u' = XString.get x i in
if UChar.eq u u' then match_us1 x (i + 1) rest else
let ps = match_us2 x (i + 1) (UCharInfo.combined_class u') us in
remove_chars x i ps
let rec longest_match ce_buf x i = function
[] -> assert false
| (us, ces) :: rest ->
try
let j = match_us1 x i us in
add_list ce_buf ces;
j
with Exit -> longest_match ce_buf x i rest
let getce keybuf x i =
let col_info = col_info_of_keybuf keybuf in
let hiraganaQ_mark = Unidata.compose_ce col_info.hiraganaQ_weight 0 0 in
let rec loop i =
if i >= XString.length x then () else
let u = XString.get x i in
(match UCharInfo.script u with `Hiragana when col_info.hiraganaQ ->
addce keybuf hiraganaQ_mark | _ -> ());
let i' = match Unidata.ce col_info.tbl u with
[] -> implicit_ce keybuf u; i + 1
| [([], [ce])] ->
addce keybuf ce; i + 1
| info -> longest_match keybuf x (i + 1) info in
loop i' in
loop i
let getkey keybuf =
let col_info = col_info_of_keybuf keybuf in
let prec = precision_of_keybuf keybuf in
terminate keybuf;
let buf1 = primary_of_keybuf keybuf in
(match prec with `Primary -> () | _ ->
add_i16 buf1 0;
let buf2 = secondary_of_keybuf keybuf in
if col_info.french_accent then
let key2 = Buffer.to_bytes buf2 in
reverse key2;
Buffer.add_bytes buf1 key2
else
Buffer.add_buffer buf1 buf2;
match prec with `Secondary -> () | _ ->
add_i16 buf1 0;
Buffer.add_buffer buf1 (tertiary_of_keybuf keybuf);
match prec with `Tertiary -> () | _ ->
add_i16 buf1 0;
Buffer.add_buffer buf1 (quaternary_of_keybuf keybuf));
Buffer.contents buf1
type text = Text.t
type index = Text.index
module NF = UNF.Make(Config)(Text)
let sort_key_aux col_info prec t =
let x = XString.make 0 (UChar.chr_of_uint 0) in
NF.put_nfd x t;
rearrange x;
remove_ignorable col_info.Unidata.tbl x;
let cebuf = create_keybuf prec col_info in
getce cebuf x 0;
getkey cebuf
let sort_key ?locale ?prec ?variable text =
let col_info =
let default = Unidata.get_col_info ?locale () in
match variable with
None -> default
| Some v -> {default with variable_option = v} in
let prec = match prec with
None -> (match col_info.variable_option with
`Shifted | `Shift_Trimmed -> `Quaternary
| _ -> `Tertiary)
| Some prec -> prec
in
sort_key_aux col_info prec text
let rec primaries_of_ces col_info = function
[] -> []
| ce :: rest ->
let w =
let w = Unidata.primary ce in
if w = col_info.Unidata.hiraganaQ_weight then 0 else
match col_info.variable_option with
`Non_ignorable -> w
| _ ->
if is_variable col_info.variable_top ce then 0 else
w
in
if w = 0 then primaries_of_ces col_info rest else
w :: primaries_of_ces col_info rest
let rec inc_end i =
`Inc ([], i, lazy (inc_end i))
let inc_prim col_info (`Inc(ces, i, f)) =
let rec loop i f ws =
let `Inc (ces, i', f) = Lazy.force f in
if ces = [] then `Inc (ws, i, lazy (inc_end i)) else
match primaries_of_ces col_info ces with
[] -> loop i' f ws
| ws' ->
`Inc (ws, i, lazy (loop i' f ws')) in
loop i f (primaries_of_ces col_info ces)
let implicit_ce_list u =
let n = UChar.uint_code u in
if
n < 0 || n > 0x10ffff ||
match UCharInfo.general_category u with `Cs -> true | _ -> false ||
is_noncharacter_code_point u
then
[Unidata.complete_ignorable]
else
let base =
if n >= 0x4e00 && n <= 0x9fff then 0xfb40 else
if n >= 0x3400 && n <= 0x4dbf then 0xfb80 else
if n >= 0x20000 && n <= 0x2a6df then 0xfb80 else
0xfbc0
in
let a = base + n lsr 15 in
let b = (n land 0x7fff) lor 0x8000 in
[Unidata.compose_ce a 1 1; Unidata.compose_ce b 0 0]
let rec inc_match_us2 i f us0 us1 c' = function
[] -> `Match (us0 @ us1, i, f)
| (u :: rest) as us ->
match us1 with
[] ->
let `Inc (us1, i, f) = Lazy.force f in
if us1 = [] then `Not_Match else
inc_match_us2 i f us0 us1 c' us
| u' :: r' ->
let c = UCharInfo.combined_class u' in
if c'= 0 || c = 0 || c' = c then `Not_Match else
if UChar.eq u u' then
inc_match_us2 i f us0 r' c' rest
else
inc_match_us2 i f (us0 @ [u']) r' c us
let rec inc_match_us1 i f us1 = function
[] -> `Match (us1, i, f)
| (u :: rest) as us ->
match us1 with
[] ->
let `Inc (us1, i, f) = Lazy.force f in
if us1 = [] then `Not_Match else
inc_match_us1 i f us1 us
| u' :: r' ->
if UChar.eq u u' then
inc_match_us1 i f r' rest
else
inc_match_us2 i f [u'] r' (UCharInfo.combined_class u') us
let rec inc_longest_match us i f = function
[] -> `Not_Match
| (us1, ces) :: rest ->
match inc_match_us1 i f us us1 with
`Match (us, i, f) -> `Match (ces, us, i, f)
| `Not_Match ->
inc_longest_match us i f rest
let get_next_ce col_info i f u us =
match Unidata.ce col_info.Unidata.tbl u with
[] -> (implicit_ce_list u, us, i, f)
| [([], ces)] -> (ces, us, i, f)
| info ->
match inc_longest_match us i f info with
`Not_Match -> (implicit_ce_list u, us, i, f)
| `Match (ces, us, i, f) -> (ces, us, i, f)
let get_ces col_info f t i =
let hiraganaQ_mark = Unidata.compose_ce col_info.Unidata.hiraganaQ_weight 0 0 in
let rec loop i f a = function
[] ->
(match Lazy.force f with
`Inc ([], i, _) -> `Inc (a, i, lazy (inc_end i))
| `Inc (us, i', f) ->
match a with
[] -> loop i' f a us
| _ -> `Inc (a, i, lazy (loop i' f [] us)))
| u :: us ->
let a =
match UCharInfo.script u with
`Hiragana when col_info.hiraganaQ ->
a @ [hiraganaQ_mark]
| _ -> a in
let ces, us, i, f = get_next_ce col_info i f u us in
loop i f (a @ ces) us in
let `Inc (us, i, f) = f t i in
loop i f [] us
let inc_prep col_info f t i =
let rec loop i f prev a = function
[] ->
(match a, prev with
[], _ | _, [_] ->
(match Lazy.force f with
`Inc ([], i, _) -> `Inc (a @ prev, i, lazy (inc_end i))
| `Inc (us, i, f) -> loop i f prev a us)
| _ -> `Inc (a, i, lazy (loop i f [] [] [])))
| u :: rest ->
match Unidata.ce col_info.Unidata.tbl u with
[([], [ce])] when ce = Unidata.complete_ignorable ->
loop i f prev a rest
| _ ->
match prev with
[] ->
if is_logical_order_exception u then
loop i f [u] a rest
else
loop i f [] (a @ [u]) rest
| [u0] ->
loop i f [] (a @ [u; u0]) rest
| _ -> assert false in
let `Inc (us, i, f) = f t i in
loop i f [] [] us
let inc_ce col_info t i =
get_ces col_info (inc_prep col_info NF.nfd_inc) t i
let key_of_inc prec col_info x =
let keybuf = create_keybuf prec col_info in
let rec loop (`Inc(ces, _, f)) =
add_list keybuf ces;
match ces with
[] -> ()
| _ -> loop (Lazy.force f) in
loop x;
getkey keybuf
let null_weight f =
match Lazy.force f with `Inc ([], _, _) -> true | _ -> false
let inc_compare prec col_info t1 t2 =
let rec loop f1 f2 ws1 ws2 =
match ws1, ws2 with
w1 :: rest1, w2 :: rest2 ->
let sgn = w1 - w2 in
if sgn = 0 then loop f1 f2 rest1 rest2 else sgn
| [], ws2 ->
let `Inc (ws1, _, f1) = Lazy.force f1 in
if ws1 = [] then
if ws2 = [] && null_weight f2 then 0 else ~-1
else
loop f1 f2 ws1 ws2
| ws1, [] ->
let `Inc (ws2, _, f2) = Lazy.force f2 in
if ws2 = [] then 1 else
loop f1 f2 ws1 ws2 in
let x1 = inc_ce col_info t1 (Text.nth t1 0) in
let x2 = inc_ce col_info t2 (Text.nth t1 0) in
let `Inc (ws1, _, g1) = inc_prim col_info x1 in
let `Inc (ws2, _, g2) = inc_prim col_info x2 in
let sgn = loop g1 g2 ws1 ws2 in
if sgn <> 0 then sgn else
match prec with
`Primary -> 0
| _ ->
let key1 = key_of_inc prec col_info x1 in
let key2 = key_of_inc prec col_info x2 in
Pervasives.compare key1 key2
let compare ?locale ?prec ?variable t1 t2 =
let col_info =
let default = Unidata.get_col_info ?locale () in
match variable with
None -> default
| Some v ->
{default with variable_option = v} in
let prec = match prec with
None -> (match col_info.variable_option with
`Shifted | `Shift_Trimmed -> `Quaternary
| _ -> `Tertiary)
| Some prec -> prec in
inc_compare prec col_info t1 t2
let get_weight k i =
(Char.code k.[i]) lsl 8 lor (Char.code k.[i + 1])
let rec primary_length k i =
if String.length k <= i || get_weight k i = 0 then i else
primary_length k (i + 2)
let inc_compare_key prec col_info k t =
let k_len = primary_length k 0 in
let rec loop f ws i =
match ws with
w :: rest ->
if k_len <= i then ~-1 else
let w' = get_weight k i in
let sgn = w' - w in
if sgn = 0 then loop f rest (i + 2) else sgn
| [] ->
let `Inc (ws, _, f) = Lazy.force f in
if ws = [] then
if k_len = i then 0 else
if k_len > i then 1 else
assert false
else
loop f ws i in
let x = inc_ce col_info t (Text.nth t 0) in
let `Inc (ws, _, g) = inc_prim col_info x in
let sgn = loop g ws 0 in
if sgn <> 0 then sgn else
match prec with
`Primary -> 0
| _ ->
let key = key_of_inc prec col_info x in
Pervasives.compare k key
let compare_with_key ?locale ?prec ?variable k t =
let col_info =
let default = Unidata.get_col_info ?locale () in
match variable with
None -> default
| Some v -> {default with variable_option = v} in
let prec = match prec with
None -> (match col_info.variable_option with
`Shifted | `Shift_Trimmed -> `Quaternary
| _ -> `Tertiary)
| Some prec -> prec
in
inc_compare_key prec col_info k t
let search_common col_info prec k t loc =
let k_len = primary_length k 0 in
let rec null i f =
let `Inc (ces, j, f) = Lazy.force f in
if primaries_of_ces col_info ces = [] then null j f else i in
let rec test_match i f j = function
w :: rest ->
if k_len <= j then raise Exit else
let w' = get_weight k j in
if w' = w then test_match i f (j + 2) rest else raise Exit
| [] ->
if k_len = j then (i, null i f) else
let `Inc (ces, i, f) = Lazy.force f in
if ces = [] then raise Exit else
test_match i f j (primaries_of_ces col_info ces) in
let keys loc f i j ces =
let keybuf = create_keybuf prec col_info in
add_list keybuf ces;
let rec loop f ks =
let `Inc (ces, loc, f) = Lazy.force f in
if Text.compare_index t loc j > 0 || ces = [] then ks else begin
add_list keybuf ces;
if Text.compare_index t loc i >= 0 then
loop f ((getkey keybuf, i) :: ks)
else
loop f ks
end in
if Text.compare_index t loc i >= 0 then
loop f [(getkey keybuf, i)]
else
loop f [] in
let rec scan loc f =
let `Inc (ces, i, f) = Lazy.force f in
if ces = [] then raise Not_found else
try
let (i, j) = test_match i f 0 (primaries_of_ces col_info ces) in
match prec with
`Primary -> (loc, j)
| _ ->
let ks = keys loc f i j ces in
try (loc, List.assoc k ks) with Not_found -> raise Exit
with Exit ->
scan i f in
scan loc (lazy (inc_ce col_info t loc))
let search_with_key ?locale ?prec ?variable k t loc =
let col_info =
let default = Unidata.get_col_info ?locale () in
match variable with
None -> default
| Some v -> {default with variable_option = v} in
let prec = match prec with
None -> (match col_info.variable_option with
`Shifted | `Shift_Trimmed -> `Quaternary
| _ -> `Tertiary)
| Some prec -> prec in
search_common col_info prec k t loc
let search ?locale ?prec ?variable t0 t loc =
let k = sort_key ?locale ?prec ?variable t0 in
search_with_key ?locale ?prec ?variable k t loc
end