Source file consensus.ml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
open Printf
module Log = Dolog.Log
module Fp = Fingerprint
module IntMap = BatMap.Int
module IntSet = MyIntSet
module StringSet = BatSet.Make(BatString)
module L = BatList
module Mol = Molecule
module Pol = Policy
type t =
| Single of Mol.t list * Fp.t list
| Opportunist of Mol.t list * Fp.t list
| Optimist_maccs of Mol.t list * Fp.t
| Realist_maccs of Mol.t list * float array
| Knowledgeable_maccs of Mol.t list * float array
| Optimist_mop2d of Mol.t list * Fp.t
| Realist_mop2d of Mol.t list * float IntMap.t
| Knowledgeable_mop2d of Mol.t list * float IntMap.t
| Optimist_ecfp4 of Mol.t list * Fp.t
| Realist_ecfp4 of Mol.t list * float array
| Knowledgeable_ecfp4 of Mol.t list * float array
let length: t -> int = function
| Single (_queries, fprints)
| Opportunist (_queries, fprints) -> List.length fprints
| Optimist_mop2d (_, _)
| Realist_mop2d (_, _)
| Knowledgeable_mop2d (_, _)
| Optimist_maccs (_, _)
| Optimist_ecfp4 (_, _)
| Realist_maccs (_, _)
| Realist_ecfp4 (_, _)
| Knowledgeable_maccs (_, _)
| Knowledgeable_ecfp4 (_, _) -> 1
let is_single: t -> bool = function
| Single _ -> true
| _ -> false
let to_policy: t -> Pol.t = function
| Single _ -> Pol.Single
| Opportunist _ -> Pol.Opportunist
| Optimist_maccs _
| Optimist_mop2d _
| Optimist_ecfp4 _ -> Pol.Optimist
| Realist_maccs _
| Realist_mop2d _
| Realist_ecfp4 _ -> Pol.Realist
| Knowledgeable_maccs _
| Knowledgeable_mop2d _
| Knowledgeable_ecfp4 _ -> Pol.Knowledgeable
let to_string (cons: t): string =
let pol_str = Pol.to_string (to_policy cons) in
let fp_str = match cons with
| Single (_mols, fprints)
| Opportunist (_mols, fprints) -> Fp.identify (L.hd fprints)
| Optimist_ecfp4 _
| Realist_ecfp4 _
| Knowledgeable_ecfp4 _ -> "ecfp4"
| Optimist_maccs _
| Realist_maccs _
| Knowledgeable_maccs _ -> "maccs"
| Optimist_mop2d _
| Realist_mop2d _
| Knowledgeable_mop2d _ -> "mop2d"
in
sprintf "%s_%s" pol_str fp_str
let get_queries: t -> Mol.t list = function
| Single (queries, _)
| Opportunist (queries, _)
| Optimist_mop2d (queries, _)
| Realist_mop2d (queries, _)
| Knowledgeable_mop2d (queries, _)
| Optimist_maccs (queries, _)
| Optimist_ecfp4 (queries, _)
| Realist_maccs (queries, _)
| Realist_ecfp4 (queries, _)
| Knowledgeable_maccs (queries, _)
| Knowledgeable_ecfp4 (queries, _) -> queries
let get_fprints: t -> Fp.t list = function
| Single (_queries, fprints)
| Opportunist (_queries, fprints) -> fprints
| Optimist_mop2d (_, _)
| Realist_mop2d (_, _)
| Knowledgeable_mop2d (_, _)
| Optimist_maccs (_, _)
| Optimist_ecfp4 (_, _)
| Realist_maccs (_, _)
| Realist_ecfp4 (_, _)
| Knowledgeable_maccs (_, _)
| Knowledgeable_ecfp4 (_, _) -> assert(false)
let get_fprint: t -> Fp.t = function
| Single (_, _)
| Opportunist (_, _) -> assert(false)
| Optimist_mop2d (_, fp)
| Optimist_maccs (_, fp)
| Optimist_ecfp4 (_, fp) -> fp
| Realist_mop2d (_, _)
| Realist_maccs (_, _)
| Realist_ecfp4 (_, _)
| Knowledgeable_mop2d (_, _)
| Knowledgeable_maccs (_, _)
| Knowledgeable_ecfp4 (_, _) -> assert(false)
let get_query_names (cons: t): StringSet.t =
let queries = get_queries cons in
let names = L.map Mol.get_name queries in
StringSet.of_list names
let cons_score (cons_q: t): (Fp.t -> float) =
match cons_q with
| Single _ | Opportunist _ -> assert(false)
| Optimist_maccs (_, cons)
| Optimist_mop2d (_, cons)
| Optimist_ecfp4 (_, cons) -> Score.get_fp_score !Flags.curr_score cons
| Realist_maccs (_, cons)
| Realist_ecfp4 (_, cons)
| Knowledgeable_maccs (_, cons)
| Knowledgeable_ecfp4 (_, cons) -> Score.get_score !Flags.curr_score cons
| Realist_mop2d (_, cons)
| Knowledgeable_mop2d (_, cons) ->
assert(!Flags.curr_score = Flags.Tanimoto);
Score.tanimoto_intmap cons
let size cons =
L.length (get_queries cons)
let bit_frequency (queries: Mol.t list): float array =
assert(queries <> []);
let size = Fp.size (Mol.get_fp (List.hd queries)) in
let frequencies = Array.make size 0.0 in
let nb_queries = float (L.length queries) in
L.iter (fun query ->
let query_bits = Mol.get_bits query in
Bitv.iteri (fun i bit ->
if bit then
let prev = Array.get frequencies i in
Array.set frequencies i (prev +. 1.0)
) query_bits
) queries;
Array.iteri (fun i count ->
Array.set frequencies i (count /. nb_queries)
) frequencies;
frequencies
let int_frequency (queries: Mol.t list): float IntMap.t =
assert(queries <> []);
let nb_queries = float (L.length queries) in
let counts =
L.fold_left (fun acc0 mol ->
let indexes = Fp.get_ints (Mol.get_fp mol) in
IntSet.fold (fun index acc1 ->
try
let prev_count = IntMap.find index acc1 in
IntMap.update index index (prev_count + 1) acc1
with Not_found ->
IntMap.add index 1 acc1
) indexes acc0
) IntMap.empty queries
in
IntMap.map (fun count -> (float count) /. nb_queries) counts
let get_pot_scale name_to_weight name =
try Hashtbl.find name_to_weight name
with Not_found -> failwith ("get_pot_scale: no IC50 for " ^ name)
let pot_scaled_int_frequency
(queries: Mol.t list)
(name_to_weight: (string, float) Hashtbl.t): float IntMap.t =
assert(queries <> []);
let sum_weights = ref 0.0 in
let counts =
L.fold_left (fun acc0 mol ->
let indexes = Fp.get_ints (Mol.get_fp mol) in
let name = Mol.get_name mol in
let weight = get_pot_scale name_to_weight name in
sum_weights := !sum_weights +. weight;
IntSet.fold (fun index acc1 ->
try
let prev_count = IntMap.find index acc1 in
IntMap.update index index (prev_count +. weight) acc1
with Not_found ->
IntMap.add index weight acc1
) indexes acc0
) IntMap.empty queries
in
IntMap.map (fun sum -> sum /. !sum_weights) counts
let pot_scaled_bit_frequency
(queries: Mol.t list)
(name_to_weight: (string, float) Hashtbl.t): float array =
assert(queries <> []);
let size = Fp.size (Mol.get_fp (List.hd queries)) in
let frequencies = Array.make size 0.0 in
let sum_weights = ref 0.0 in
L.iter (fun query ->
let query_bits = Mol.get_bits query in
let query_name = Mol.get_name query in
let weight = get_pot_scale name_to_weight query_name in
sum_weights := !sum_weights +. weight;
Bitv.iteri (fun i bit ->
if bit then
let prev = Array.get frequencies i in
Array.set frequencies i (prev +. weight)
) query_bits
) queries;
Array.iteri (fun i count ->
Array.set frequencies i (count /. !sum_weights)
) frequencies;
frequencies
let debug_bit_cons (cons: Bitv.t): unit =
Printf.printf "%s\n" (Bitv.M.to_string cons)
let debug_freq_cons (freqs: float array): unit =
let n = (Array.length freqs) - 1 in
for i = n downto 0 do
Printf.printf
(if i = n then "%.2f" else " %.2f")
freqs.(i)
done;
Printf.printf "\n"
let logical_consensus strat queries =
match queries with
| [] -> failwith "logical_consensus: no queries"
| q :: _ ->
let fp = Mol.get_fp q in
let fprints = L.map Mol.get_fp queries in
let opti_cons = L.reduce Fp.union fprints in
match strat, fp with
| Pol.Optimist, Fp.MACCS _ -> Optimist_maccs (queries, opti_cons)
| Pol.Optimist, Fp.ECFP4 _ -> Optimist_ecfp4 (queries, opti_cons)
| Pol.Optimist, Fp.MOP2D _ -> Optimist_mop2d (queries, opti_cons)
| _ -> failwith "Cons.logical_consensus: unsupported strat"
let create
?verbose:(verbose = false)
(strat: Pol.t)
(queries: Mol.t list): t =
if verbose then
(L.iter (fun m ->
let fp = Mol.get_fp m in
Printf.printf "%s\n" (Fp.to_string fp)
) queries;
Printf.printf "---\n");
match strat with
| Pol.Single
| Pol.Opportunist ->
begin match queries with
| [] -> assert(false)
| _ ->
let name_fprints =
L.map (fun mol ->
Mol.(get_name mol, get_fp mol)
) queries
in
let fprints = L.map snd name_fprints in
match strat with
| Pol.Single -> Single (queries, fprints)
| Pol.Opportunist -> Opportunist (queries, fprints)
| _ -> assert(false)
end
| Pol.Optimist ->
let cons = logical_consensus strat queries in
if verbose then debug_bit_cons (Fp.get_bits (get_fprint cons));
cons
| Pol.Realist ->
(match Mol.get_fp (List.hd queries) with
| Fp.MACCS _ ->
let a = bit_frequency queries in
if verbose then debug_freq_cons a;
Realist_maccs (queries, a)
| Fp.PUBCH _ ->
failwith "Consensus.create: realist not supported for PUBCH"
| Fp.MOP2D _ ->
let a = int_frequency queries in
Realist_mop2d (queries, a)
| Fp.ECFP4 _ ->
let a = bit_frequency queries in
if verbose then debug_freq_cons a;
Realist_ecfp4 (queries, a))
| Pol.Knowledgeable ->
let _, weights = Mol.potency_scale queries in
match Mol.get_fp (List.hd queries) with
| Fp.MOP2D _ ->
let a = pot_scaled_int_frequency queries weights in
Knowledgeable_mop2d (queries, a)
| _ ->
let a = pot_scaled_bit_frequency queries weights in
match Mol.get_fp (List.hd queries) with
| Fp.MOP2D _ -> assert(false)
| Fp.MACCS _ -> Knowledgeable_maccs (queries, a)
| Fp.ECFP4 _ -> Knowledgeable_ecfp4 (queries, a)
| Fp.PUBCH _ ->
failwith "Consensus.create: knowledgeable not supported for PUBCH"