Legend:
Page
Library
Module
Module type
Parameter
Class
Class type
Source
Page
Library
Module
Module type
Parameter
Class
Class type
Source
WMH.ml
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142
(* Weighted Minwise Hashing in (amortized) Constant Time Shrivastava, A. (2016). Simple and efficient weighted minwise hashing. In Advances in Neural Information Processing Systems (pp. 1498-1506). *) open Printf module A = BatArray module BA = Bigarray module BA1 = BA.Array1 module Fp = Fingerprint module L = BatList type dense = (int, BA.int8_unsigned_elt, BA.c_layout) BA1.t type hashed = int array let get_seeds k = let global_seed = 314159265 in let rng = Random.State.make [|global_seed|] in let bound = (BatInt.pow 2 30) - 1 in Array.init k (fun _ -> Random.State.int rng bound) (* convert the sparse Fp.t type into a dense array of small positive ints *) let to_dense (feat_id_bound: int) (fp: Fp.t): dense = let res = BA1.create BA.int8_unsigned BA.C_layout feat_id_bound in BA1.fill res 0; let n = BA1.dim fp in let i = ref 0 in while !i < n do let k = BA1.get fp !i in let v = BA1.get fp (!i + 1) in assert(k < feat_id_bound && v < 256); BA1.set res k v; i := !i + 2 done; res let string_of_dense x = let n = BA1.dim x in let buff = Buffer.create 80 in for i = 0 to n - 1 do let j = BA1.get x i in bprintf buff " %d:%d" i j done; Buffer.contents buff (* read the sparse fingerprints, update feat. val. bounds * if necessary *) let update_bounds (bounds: int array) (fp: Fp.t): unit = let n = BA1.dim fp in let i = ref 0 in while !i < n do let k = BA1.get fp !i in let v = BA1.get fp (!i + 1) in bounds.(k) <- max (bounds.(k)) v; i := !i + 2 done (* compute the max value for each feature. *) (* I.e. the columns' maximum if we put observations as rows * and features as columns in a data matrix *) let bounds (max_feat_id: int) (train: Fp.t array): int array = let bounds = A.make max_feat_id 0 in A.iter (update_bounds bounds) train; bounds (* create a lookup table from the bounds (max feature values) so that we can draw a single rand but still know which feature id. it corresponds to *) let lookup_table (bounds: int array): int array = let total = A.sum bounds in let res = A.create total 0 in let j = ref 0 in A.iteri (fun i bound -> for _ = 1 to bound do res.(!j) <- i; incr j done ) bounds; res let acc_bounds_table (bounds: int array): int array = let n = A.length bounds in let res = A.create n 0 in let acc = ref 0 in A.iteri (fun i bound -> res.(i) <- !acc; acc := !acc + bound ) bounds; res (* (\* in the paper, he defines is_green; but he samples until is_green becomes * * true. It is more natural to sample while is_red *\) * let is_red (arr: dense) (test_feat_id: int) (test_feat_val: int): bool = * test_feat_val >= (BA1.unsafe_get arr test_feat_id) *) (* pre-generate non-repeating random number sequences for later *) let gen_rands seeds rand_bound = A.map (fun seed -> let rng = Random.State.make [|seed|] in (* all ints we are interested into *) let ints = A.init rand_bound (fun i -> i) in A.shuffle ~state:rng ints; (* random permute them *) ints ) seeds (* compute k hashes *) let hash pregen_rands idx2feat feat2acc_bound (dense_fp: dense): hashed = let k = A.length pregen_rands in let res = A.make k 0 in for i = 0 to k - 1 do let misses = ref 0 in let j = ref 0 in let rands = pregen_rands.(i) in let rand' = rands.(!j) in let test_feat_id = ref (idx2feat.(rand')) in let test_feat_val = ref (rand' - feat2acc_bound.(!test_feat_id)) in (* while is_red ... *) while !test_feat_val >= (BA1.unsafe_get dense_fp !test_feat_id) do incr misses; (* in the paper: Hashes[i]++ *) incr j; let rand = rands.(!j) in test_feat_id := idx2feat.(rand); test_feat_val := rand - feat2acc_bound.(!test_feat_id) done; A.unsafe_set res i !misses done; res let estimate_jaccard (hash1: hashed) (hash2: hashed): float = let res = ref 0 in let k = A.length hash1 in for i = 0 to k - 1 do if (A.unsafe_get hash1 i) = (A.unsafe_get hash2 i) then incr res done; (float !res) /. (float k) let estimate_distance (h1: hashed) (h2: hashed): float = 1.0 -. (estimate_jaccard h1 h2)