Source file hierarchical_heavy_hitters.ml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
open! Core
module Make (X : Hashable.S_plain) = struct
module Node = struct
type t =
{ mutable count : int
; mutable delta : int
; mutable child_delta : int
; children : t X.Table.t
}
let create () =
let count = 0 in
let delta = 0 in
let child_delta = 0 in
let children = X.Table.create () in
{ count; delta; child_delta; children }
;;
let find_or_create_child t x =
match Hashtbl.find t.children x with
| Some t -> t
| None ->
let count = 0 in
let delta = t.child_delta in
let child_delta = delta in
let children = X.Table.create () in
let child = { count; delta; child_delta; children } in
Hashtbl.add_exn t.children ~key:x ~data:child;
child
;;
let rec insert t xs count =
match xs with
| [] -> t.count <- t.count + count
| x :: xs -> insert (find_or_create_child t x) xs count
;;
let compress t bucket =
let rec loop parent t =
Hashtbl.filter_map_inplace ~f:(fun child -> loop t child) t.children;
let empty = Hashtbl.length t.children = 0 in
let infrequent = t.count + t.delta <= bucket in
if empty && infrequent
then (
parent.count <- parent.count + t.count;
parent.child_delta <- max parent.child_delta (t.count + t.delta);
None)
else Some t
in
Hashtbl.filter_map_inplace ~f:(fun child -> loop t child) t.children
;;
let output t threshold =
let compare (_, (count1 : int), _) (_, (count2 : int), _) = compare count2 count1 in
let rec loop key t =
let child_count, light_child_count, child_results = loop_children t in
let results =
List.map
~f:(fun (rest, lower, upper) -> key :: rest, lower, upper)
child_results
in
let count = t.count + child_count in
if t.count + t.delta + light_child_count > threshold
then (
let results = List.merge ~compare [ [ key ], count, count + t.delta ] results in
count, 0, results)
else count, t.count + light_child_count, results
and loop_children t =
Hashtbl.fold
t.children
~f:(fun ~key ~data (c, f, r) ->
let c', f', r' = loop key data in
c' + c, f' + f, List.merge ~compare r' r)
~init:(0, 0, [])
in
let _, _, results = loop_children t in
results
;;
let children t = Hashtbl.to_alist t.children
let samples_excluding_children t = t.count
let delta t = t.delta
end
type t =
{ root : Node.t
; bucket_size : int
; mutable current_bucket : int
; mutable remaining : int
; mutable total : int
}
let create error =
let root = Node.create () in
let bucket_size = Float.to_int (Float.round_up (1.0 /. error)) in
let current_bucket = 0 in
let remaining = bucket_size in
let total = 0 in
{ root; bucket_size; current_bucket; remaining; total }
;;
let insert t xs count =
Node.insert t.root xs count;
let remaining = t.remaining - 1 in
if remaining > 0
then t.remaining <- remaining
else (
let current_bucket = t.current_bucket + 1 in
Node.compress t.root current_bucket;
t.current_bucket <- current_bucket;
t.remaining <- t.bucket_size);
t.total <- t.total + count
;;
let output t frequency =
let threshold = Float.to_int (Float.round_down (frequency *. Float.of_int t.total)) in
Node.output t.root threshold
;;
let roots t = Node.children t.root
let total t = t.total
end