package lbvs_consent

  1. Overview
  2. Docs

Source file consensus.ml

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
open Printf

module Log = Dolog.Log

module Fp = Fingerprint
module IntMap = BatMap.Int
module IntSet = MyIntSet
module StringSet = BatSet.Make(BatString)
module L = BatList
module Mol = Molecule
module Pol = Policy

type t =
  (* control *)
  | Single              of Mol.t list * Fp.t list
  | Opportunist         of Mol.t list * Fp.t list
  (* MACCS *)
  | Optimist_maccs      of Mol.t list * Fp.t
  | Realist_maccs       of Mol.t list * float array
  | Knowledgeable_maccs of Mol.t list * float array
  (* MOP2D *)
  | Optimist_mop2d      of Mol.t list * Fp.t
  | Realist_mop2d       of Mol.t list * float IntMap.t
  | Knowledgeable_mop2d of Mol.t list * float IntMap.t
  (* ECFP4 *)
  | Optimist_ecfp4      of Mol.t list * Fp.t
  | Realist_ecfp4       of Mol.t list * float array
  | Knowledgeable_ecfp4 of Mol.t list * float array

(* number of times we will have to screen *)
let length: t -> int = function
  | Single (_queries, fprints)
  | Opportunist (_queries, fprints) -> List.length fprints
  | Optimist_mop2d (_, _)
  | Realist_mop2d (_, _)
  | Knowledgeable_mop2d (_, _)
  | Optimist_maccs (_, _)
  | Optimist_ecfp4 (_, _)
  | Realist_maccs (_, _)
  | Realist_ecfp4 (_, _)
  | Knowledgeable_maccs (_, _)
  | Knowledgeable_ecfp4 (_, _) -> 1

let is_single: t -> bool = function
  | Single _ -> true
  | _ -> false

let to_policy: t -> Pol.t = function
  | Single _ -> Pol.Single
  | Opportunist _ -> Pol.Opportunist
  | Optimist_maccs _
  | Optimist_mop2d _
  | Optimist_ecfp4 _ -> Pol.Optimist
  | Realist_maccs _
  | Realist_mop2d _
  | Realist_ecfp4 _ -> Pol.Realist
  | Knowledgeable_maccs _
  | Knowledgeable_mop2d _
  | Knowledgeable_ecfp4 _ -> Pol.Knowledgeable

let to_string (cons: t): string =
  let pol_str = Pol.to_string (to_policy cons) in
  let fp_str = match cons with
    | Single (_mols, fprints)
    | Opportunist (_mols, fprints) -> Fp.identify (L.hd fprints)
    | Optimist_ecfp4 _
    | Realist_ecfp4 _
    | Knowledgeable_ecfp4 _ -> "ecfp4"
    | Optimist_maccs _
    | Realist_maccs _
    | Knowledgeable_maccs _ -> "maccs"
    | Optimist_mop2d _
    | Realist_mop2d _
    | Knowledgeable_mop2d _ -> "mop2d"
  in
  sprintf "%s_%s" pol_str fp_str

let get_queries: t -> Mol.t list = function
  | Single (queries, _)
  | Opportunist (queries, _)
  | Optimist_mop2d (queries, _)
  | Realist_mop2d (queries, _)
  | Knowledgeable_mop2d (queries, _)
  | Optimist_maccs (queries, _)
  | Optimist_ecfp4 (queries, _)
  | Realist_maccs (queries, _)
  | Realist_ecfp4 (queries, _)
  | Knowledgeable_maccs (queries, _)
  | Knowledgeable_ecfp4 (queries, _) -> queries

let get_fprints: t -> Fp.t list = function
  | Single (_queries, fprints)
  | Opportunist (_queries, fprints) -> fprints
  | Optimist_mop2d (_, _)
  | Realist_mop2d (_, _)
  | Knowledgeable_mop2d (_, _)
  | Optimist_maccs (_, _)
  | Optimist_ecfp4 (_, _)
  | Realist_maccs (_, _)
  | Realist_ecfp4 (_, _)
  | Knowledgeable_maccs (_, _)
  | Knowledgeable_ecfp4 (_, _) -> assert(false)

let get_fprint: t -> Fp.t = function
  | Single (_, _)
  | Opportunist (_, _) -> assert(false)
  | Optimist_mop2d (_, fp)
  | Optimist_maccs (_, fp)
  | Optimist_ecfp4 (_, fp) -> fp
  | Realist_mop2d (_, _)
  | Realist_maccs (_, _)
  | Realist_ecfp4 (_, _)
  | Knowledgeable_mop2d (_, _)
  | Knowledgeable_maccs (_, _)
  | Knowledgeable_ecfp4 (_, _) -> assert(false)

(* get the name of all query molecules used in the consensus *)
let get_query_names (cons: t): StringSet.t =
  let queries = get_queries cons in
  let names = L.map Mol.get_name queries in
  StringSet.of_list names

(* score using given consensus query; we just need a candidate to output its score *)
let cons_score (cons_q: t): (Fp.t -> float) =
  match cons_q with
  | Single _ | Opportunist _ -> assert(false)
  | Optimist_maccs (_, cons)
  | Optimist_mop2d (_, cons)
  | Optimist_ecfp4 (_, cons) -> Score.get_fp_score !Flags.curr_score cons
  | Realist_maccs (_, cons)
  | Realist_ecfp4 (_, cons)
  | Knowledgeable_maccs (_, cons)
  | Knowledgeable_ecfp4 (_, cons) -> Score.get_score !Flags.curr_score cons
  | Realist_mop2d (_, cons)
  | Knowledgeable_mop2d (_, cons) ->
    assert(!Flags.curr_score = Flags.Tanimoto);
    Score.tanimoto_intmap cons

(* how many actives were used to create the consensus *)
let size cons =
  L.length (get_queries cons)

(* compute the probability of being set for each bit *)
let bit_frequency (queries: Mol.t list): float array =
  assert(queries <> []);
  let size = Fp.size (Mol.get_fp (List.hd queries)) in
  let frequencies = Array.make size 0.0 in
  let nb_queries = float (L.length queries) in
  (* compute nb times each bit was set *)
  L.iter (fun query ->
      let query_bits = Mol.get_bits query in
      Bitv.iteri (fun i bit ->
          if bit then
            let prev = Array.get frequencies i in
            Array.set frequencies i (prev +. 1.0)
        ) query_bits
    ) queries;
  (* convert to a frequency *)
  Array.iteri (fun i count ->
      Array.set frequencies i (count /. nb_queries)
    ) frequencies;
  frequencies

(* for intset, unfolded FPs *)
let int_frequency (queries: Mol.t list): float IntMap.t =
  assert(queries <> []);
  let nb_queries = float (L.length queries) in
  (* accumulate *)
  let counts =
    L.fold_left (fun acc0 mol ->
        let indexes = Fp.get_ints (Mol.get_fp mol) in
        IntSet.fold (fun index acc1 ->
            try
              let prev_count = IntMap.find index acc1 in
              IntMap.update index index (prev_count + 1) acc1
            with Not_found ->
              IntMap.add index 1 acc1
          ) indexes acc0
      ) IntMap.empty queries
  in
  (* convert to frequencies *)
  IntMap.map (fun count -> (float count) /. nb_queries) counts

let get_pot_scale name_to_weight name =
  try Hashtbl.find name_to_weight name
  with Not_found -> failwith ("get_pot_scale: no IC50 for " ^ name)

(* for intset, unfolded FPs *)
let pot_scaled_int_frequency
    (queries: Mol.t list)
    (name_to_weight: (string, float) Hashtbl.t): float IntMap.t =
  assert(queries <> []);
  let sum_weights = ref 0.0 in
  (* accumulate *)
  let counts =
    L.fold_left (fun acc0 mol ->
        let indexes = Fp.get_ints (Mol.get_fp mol) in
        let name = Mol.get_name mol in
        let weight = get_pot_scale name_to_weight name in
        sum_weights := !sum_weights +. weight;
        IntSet.fold (fun index acc1 ->
            try
              let prev_count = IntMap.find index acc1 in
              IntMap.update index index (prev_count +. weight) acc1
            with Not_found ->
              IntMap.add index weight acc1
          ) indexes acc0
      ) IntMap.empty queries
  in
  (* convert to frequencies *)
  IntMap.map (fun sum -> sum /. !sum_weights) counts

(* same as bit_frequency except that each molecule's contribution
   is potency-scaled *)
let pot_scaled_bit_frequency
    (queries: Mol.t list)
    (name_to_weight: (string, float) Hashtbl.t): float array =
  assert(queries <> []);
  let size = Fp.size (Mol.get_fp (List.hd queries)) in
  let frequencies = Array.make size 0.0 in
  let sum_weights = ref 0.0 in
  (* compute nb times each bit was set and scale it *)
  L.iter (fun query ->
      let query_bits = Mol.get_bits query in
      let query_name = Mol.get_name query in
      let weight = get_pot_scale name_to_weight query_name in
      sum_weights := !sum_weights +. weight;
      Bitv.iteri (fun i bit ->
          if bit then
            let prev = Array.get frequencies i in
            Array.set frequencies i (prev +. weight)
        ) query_bits
    ) queries;
  (* convert to a frequency *)
  Array.iteri (fun i count ->
      Array.set frequencies i (count /. !sum_weights)
    ) frequencies;
  frequencies

let debug_bit_cons (cons: Bitv.t): unit =
  Printf.printf "%s\n" (Bitv.M.to_string cons)

let debug_freq_cons (freqs: float array): unit =
  let n = (Array.length freqs) - 1 in
  (* Bitv.iteri iterates from least to most significant bits:
     i.e. bit at index 0 is the least significant one.
     So, we walk the array from its end to output most
     significant bits first (on the left of the display, like
     regular numbers written by humans). *)
  for i = n downto 0 do
    Printf.printf
      (if i = n then "%.2f" else " %.2f")
      freqs.(i)
  done;
  Printf.printf "\n"

let logical_consensus strat queries =
  match queries with
  | [] -> failwith "logical_consensus: no queries"
  | q :: _ ->
    let fp = Mol.get_fp q in
    let fprints = L.map Mol.get_fp queries in
    let opti_cons = L.reduce Fp.union fprints in
    match strat, fp with
    | Pol.Optimist, Fp.MACCS _ -> Optimist_maccs (queries, opti_cons)
    | Pol.Optimist, Fp.ECFP4 _ -> Optimist_ecfp4 (queries, opti_cons)
    | Pol.Optimist, Fp.MOP2D _ -> Optimist_mop2d (queries, opti_cons)
    | _ -> failwith "Cons.logical_consensus: unsupported strat"

let create
    ?verbose:(verbose = false)
    (strat: Pol.t)
    (queries: Mol.t list): t =
  if verbose then
    (L.iter (fun m ->
         let fp = Mol.get_fp m in
         Printf.printf "%s\n" (Fp.to_string fp)
       ) queries;
     Printf.printf "---\n");
  match strat with
  | Pol.Single
  | Pol.Opportunist ->
    (* use query FP as is *)
    begin match queries with
      | [] -> assert(false) (* we need at least one molecule *)
      | _ ->
        let name_fprints =
          L.map (fun mol ->
              Mol.(get_name mol, get_fp mol)
            ) queries
        in
        let fprints = L.map snd name_fprints in
        match strat with
        | Pol.Single -> Single (queries, fprints)
        | Pol.Opportunist -> Opportunist (queries, fprints)
        | _ -> assert(false)
    end
  | Pol.Optimist -> (* logical OR *)
    let cons = logical_consensus strat queries in
    if verbose then debug_bit_cons (Fp.get_bits (get_fprint cons));
    cons
  | Pol.Realist -> (* probability per bit *)
    (match Mol.get_fp (List.hd queries) with
     | Fp.MACCS _ ->
       let a = bit_frequency queries in
       if verbose then debug_freq_cons a;
       Realist_maccs (queries, a)
     | Fp.PUBCH _ ->
       failwith "Consensus.create: realist not supported for PUBCH"
     | Fp.MOP2D _ ->
       let a = int_frequency queries in
       Realist_mop2d (queries, a)
     | Fp.ECFP4 _ ->
       let a = bit_frequency queries in
       if verbose then debug_freq_cons a;
       Realist_ecfp4 (queries, a))
  | Pol.Knowledgeable ->
    (* <=> realist + activity scaling of each molecule *)
    let _, weights = Mol.potency_scale queries in
    match Mol.get_fp (List.hd queries) with
    | Fp.MOP2D _ ->
      let a = pot_scaled_int_frequency queries weights in
      Knowledgeable_mop2d (queries, a)
    | _ ->
      let a = pot_scaled_bit_frequency queries weights in
      match Mol.get_fp (List.hd queries) with
      | Fp.MOP2D _ -> assert(false)
      | Fp.MACCS _ -> Knowledgeable_maccs (queries, a)
      | Fp.ECFP4 _ -> Knowledgeable_ecfp4 (queries, a)
      | Fp.PUBCH _ ->
        failwith "Consensus.create: knowledgeable not supported for PUBCH"
OCaml

Innovation. Community. Security.