Source file MakeROC.ml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
module type SCORE_LABEL = sig
type t
val get_score: t -> float
val get_label: t -> bool
end
module type ROC_FUNCTOR = functor (SL: SCORE_LABEL) ->
sig
(** sort score labels putting high scores first *)
val rank_order_by_score: SL.t list -> SL.t list
(** compute the cumulated actives curve given
an already sorted list of score labels *)
val cumulated_actives_curve: SL.t list -> int list
(** compute Area Under the ROC curve given an already sorted list of
score labels *)
val roc_curve: SL.t list -> (float * float) list
(** ROC curve (list of (FPR,TPR) values) corresponding to
those score labels *)
val fast_auc: SL.t list -> float
(** compute Area Under the ROC curve given an unsorted list
of score labels *)
val auc: SL.t list -> float
(** (early) enrichment factor at given threshold (database percentage)
given an unsorted list of score labels *)
val enrichment_factor: float -> SL.t list -> float
(** [initial_enhancement a score_labels] will compute
S = sum_over_i (exp (-rank(active_i) / a))
given an unsorted list of score labels.
Robust Initial Enhancement (RIE) = S/<S> where <S> is
the average S for randomly ordered score labels.
RIE = 1.0 <=> random performance. Cf. DOI:10.1021/ci0100144
for details. *)
val initial_enhancement: float -> SL.t list -> float
(** same as [initial_enhancement] but does not reorder the list
of score_labels *)
val fast_initial_enhancement: float -> SL.t list -> float
(** power metric at given threshold given an unsorted list of score labels *)
val power_metric: float -> SL.t list -> float
(** bedroc_auc at given alpha. Default alpha = 20.0. *)
val bedroc_auc: ?alpha:float -> SL.t list -> float
(** Matthews' Correlation Coefficient (MCC)
use: [mcc classif_threshold score_labels].
scores >= threshold are predicted as targets;
scores < threshold are predicted as non targets. *)
val mcc: float -> SL.t list -> float
end
module Make: ROC_FUNCTOR = functor (SL: SCORE_LABEL) ->
struct
module L = BatList
let trapezoid_surface x1 x2 y1 y2 =
let base = abs_float (x1 -. x2) in
let height = 0.5 *. (y1 +. y2) in
base *. height
let rank_order_by_score (score_labels: SL.t list) =
L.stable_sort (fun x y ->
BatFloat.compare (SL.get_score y) (SL.get_score x)
) score_labels
let cumulated_actives_curve (high_scores_first: SL.t list) =
let sum = ref 0 in
L.map (fun sl ->
if SL.get_label sl then incr sum;
!sum
) high_scores_first
let roc_curve (score_labels: SL.t list) =
let high_scores_first = rank_order_by_score score_labels in
let nacts = ref 0 in
let ndecs = ref 0 in
let nb_act_decs =
L.fold_left (fun acc x ->
if SL.get_label x then incr nacts
else incr ndecs;
(!nacts, !ndecs) :: acc
) [(0, 0)] high_scores_first in
let nb_actives = float !nacts in
let nb_decoys = float !ndecs in
L.rev_map (fun (na, nd) ->
let tpr = float na /. nb_actives in
let fpr = float nd /. nb_decoys in
(fpr, tpr)
) nb_act_decs
let fast_auc high_scores_first =
let fp, tp, fp_prev, tp_prev, a, _p_prev =
L.fold_left
(fun (fp, tp, fp_prev, tp_prev, a, p_prev) sl ->
let si = SL.get_score sl in
let li = SL.get_label sl in
let new_a, new_p_prev, new_fp_prev, new_tp_prev =
if si <> p_prev then
a +. trapezoid_surface fp fp_prev tp tp_prev,
si,
fp,
tp
else
a,
p_prev,
fp_prev,
tp_prev
in
let new_tp, new_fp =
if li then
tp +. 1., fp
else
tp, fp +. 1.
in
(new_fp, new_tp, new_fp_prev, new_tp_prev, new_a, new_p_prev)
)
(0., 0., 0., 0., 0., neg_infinity)
high_scores_first
in
(a +. trapezoid_surface fp fp_prev tp tp_prev) /. (fp *. tp)
let auc (score_labels: SL.t list) =
let high_scores_first = rank_order_by_score score_labels in
fast_auc high_scores_first
let actives_rate (score_labels: SL.t list) =
let tp_count, fp_count =
L.fold_left
(fun (tp_c, fp_c) sl ->
if SL.get_label sl then
(tp_c + 1, fp_c)
else
(tp_c, fp_c + 1)
) (0, 0) score_labels
in
let nb_molecules = tp_count + fp_count in
(nb_molecules, (float tp_count) /. (float nb_molecules))
let enrichment_factor (p: float) (score_labels: SL.t list) =
let nb_molecules, rand_actives_rate = actives_rate score_labels in
let top_n = BatFloat.round_to_int (p *. (float nb_molecules)) in
let top_p_percent_molecules =
L.take top_n (rank_order_by_score score_labels) in
let _, top_actives_rate = actives_rate top_p_percent_molecules in
let enr_rate = top_actives_rate /. rand_actives_rate in
enr_rate
let fast_initial_enhancement (a: float) (l: SL.t list) =
L.fold_lefti (fun acc i x ->
if SL.get_label x then
let rank = float i in
acc +. exp (-. rank /. a)
else
acc
) 0.0 l
let initial_enhancement (a: float) (l: SL.t list) =
fast_initial_enhancement a (rank_order_by_score l)
let nb_actives l =
float
(L.fold_left (fun acc x ->
if SL.get_label x then acc + 1
else acc
) 0 l)
let power_metric (cutoff: float) (scores_tot: SL.t list): float =
assert(cutoff > 0.0 && cutoff <= 1.0);
let size_tot = float (L.length scores_tot) in
let x = BatFloat.round (cutoff *. size_tot) in
let size_x = int_of_float x in
assert(size_x >= 1);
let sorted = rank_order_by_score scores_tot in
let scores_x = L.take size_x sorted in
let actives_x = nb_actives scores_x in
let actives_tot = nb_actives scores_tot in
let tpr_x = actives_x /. actives_tot in
let fpr_x = (x -. actives_x) /. (size_tot -. actives_tot) in
tpr_x /. (tpr_x +. fpr_x)
let bedroc_auc ?alpha:(alpha = 20.0) (score_labels: SL.t list): float =
let half_alpha = 0.5 *. alpha in
let n_tot = float (L.length score_labels) in
let n_act = nb_actives score_labels in
let sum =
let sorted = rank_order_by_score score_labels in
L.fold_lefti (fun acc rank x ->
if SL.get_label x then
acc +. exp (-.alpha *. (1.0 +. float rank) /. n_tot)
else
acc
) 0.0 sorted in
let r_a = n_act /. n_tot in
let factor1 = r_a *. sinh half_alpha /. (cosh half_alpha -. cosh (half_alpha -. r_a *. alpha)) in
let factor2 = 1.0 /. r_a *. (exp (alpha /. n_tot) -. 1.0) /. (1.0 -. exp (-.alpha)) in
let constant = 1.0 /. (1.0 -. exp (alpha *. ( 1.0 -. r_a))) in
sum *. factor1 *. factor2 +. constant
type mcc_accum = { tp: int ;
tn: int ;
fp: int ;
fn: int }
let mcc classif_threshold score_labels =
let acc =
L.fold_left (fun acc sl ->
let truth = SL.get_label sl in
let score = SL.get_score sl in
let prediction = score >= classif_threshold in
match (truth, prediction) with
| (true, true) -> { acc with tp = acc.tp + 1 }
| (false, false) -> { acc with tn = acc.tn + 1 }
| (true, false) -> { acc with fn = acc.fn + 1 }
| (false, true) -> { acc with fp = acc.fp + 1 }
) { tp = 0; tn = 0; fp = 0; fn = 0 } score_labels in
let tp = float acc.tp in
let tn = float acc.tn in
let fp = float acc.fp in
let fn = float acc.fn in
let denum' = (tp +. fp) *. (tp +. fn) *. (tn +. fp) *. (tn +. fn) in
if denum' = 0.0 then 0.0
else
let num = (tp *. tn) -. (fp *. fn) in
let denum = sqrt denum' in
num /. denum
end