package prbnmcn-ucb1

  1. Overview
  2. Docs
Legend:
Page
Library
Module
Module type
Parameter
Class
Class type
Source

Source file ucb1.ml

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
type awaiting_reward

type ready_to_move

module type Arm_sig = sig
  type t

  val compare : t -> t -> int

  val pp : Format.formatter -> t -> unit
end

type arm_statistics =
  { number_of_activations : float;
    cumulative_reward : float;
    empirical_reward : float
  }

let pp_stats fmtr { number_of_activations; cumulative_reward; empirical_reward }
    =
  Format.fprintf
    fmtr
    "@[<v 2>{ act. count: %f@;cumulative: %f@;empirical: %f }@]"
    number_of_activations
    cumulative_reward
    empirical_reward

let compute_expected_reward (round : float) (arm : arm_statistics) =
  if arm.number_of_activations = 0.0 then infinity
  else
    arm.empirical_reward +. sqrt (2.0 *. log round /. arm.number_of_activations)

module type S = sig
  type arm

  type 'state t

  val create : arm array -> ready_to_move t

  val next_action : ready_to_move t -> arm * awaiting_reward t

  val set_reward : awaiting_reward t -> float -> ready_to_move t

  val total_rewards : ready_to_move t -> float

  val find_best_arm : 'state t -> (arm_statistics -> float) -> arm * float

  val pp_stats : Format.formatter -> 'state t -> unit
end

module Make (Arm : Arm_sig) : S with type arm = Arm.t = struct
  type arm = Arm.t

  module M = Map.Make (Arm)

  type 'state t =
    { arms : Arm.t array;
      stats : arm_statistics M.t;
      round : int;
      last : Arm.t
    }

  let initial_statistics =
    { number_of_activations = 0.0;
      cumulative_reward = 0.0;
      empirical_reward = 0.0
    }

  let create (arms : Arm.t array) : ready_to_move t =
    if Array.length arms = 0 then invalid_arg "create: length 0 array" ;
    let stats =
      Array.fold_left (fun m arm -> M.add arm initial_statistics m) M.empty arms
    in
    { arms; stats; round = 1; last = arms.(0) }
  (* The initial value of [last] does not matter.*)

  let next_action : ready_to_move t -> Arm.t * awaiting_reward t =
   fun state ->
    let round = float state.round in
    let (selected_arm, _reward) =
      M.fold
        (fun arm stats ((_selected_arm, best_reward) as acc) ->
          let r = compute_expected_reward round stats in
          if best_reward < r then (arm, r) else acc)
        state.stats
        (state.arms.(0), 0.0)
    in
    let state = { state with last = selected_arm } in
    (selected_arm, (state :> awaiting_reward t))

  let set_reward : awaiting_reward t -> float -> ready_to_move t =
   fun state reward ->
    if reward < 0.0 || reward > 1. then invalid_arg "set_reward" ;
    let last_stats = M.find state.last state.stats in
    let cum_rew = last_stats.cumulative_reward +. reward in
    let num_act = last_stats.number_of_activations +. 1. in
    let emp_rew = cum_rew /. num_act in
    let updated_stats =
      { cumulative_reward = cum_rew;
        empirical_reward = emp_rew;
        number_of_activations = num_act
      }
    in
    let state =
      { state with
        round = state.round + 1;
        stats = M.add state.last updated_stats state.stats
      }
    in
    (state :> ready_to_move t)

  let total_rewards state =
    M.fold
      (fun _arm stats acc -> acc +. stats.cumulative_reward)
      state.stats
      0.0

  let find_best_arm { stats; _ } (f : arm_statistics -> float) =
    let (res_opt, value) =
      M.fold
        (fun arm stats ((_winner, value) as acc) ->
          let v = f stats in
          if v > value then (Some arm, v) else acc)
        stats
        (None, ~-.max_float)
    in
    match res_opt with None -> assert false | Some res -> (res, value)

  let pp_stats fmtr ({ stats; round; _ } as bandit) =
    let (most_played, _) =
      find_best_arm bandit (fun { number_of_activations; _ } ->
          number_of_activations)
    in
    let (most_cumulative_reward, _) =
      find_best_arm bandit (fun { cumulative_reward; _ } -> cumulative_reward)
    in
    let (most_empirical_reward, _) =
      find_best_arm bandit (fun { empirical_reward; _ } -> empirical_reward)
    in
    Format.fprintf
      fmtr
      "most played:@.@[<v 4>    arm = %a;@;stats = %a@]@."
      Arm.pp
      most_played
      pp_stats
      (M.find most_played stats) ;
    Format.fprintf
      fmtr
      "best cumulative reward:@.@[<v 4>    arm = %a;@;stats = %a@]@."
      Arm.pp
      most_cumulative_reward
      pp_stats
      (M.find most_cumulative_reward stats) ;
    Format.fprintf
      fmtr
      "best empirical reward:@.@[<v 4>    arm = %a;@;stats = %a@]@."
      Arm.pp
      most_empirical_reward
      pp_stats
      (M.find most_empirical_reward stats) ;
    let total_cumulative_rewards =
      M.fold (fun _arm stats acc -> acc +. stats.cumulative_reward) stats 0.0
    in
    Format.fprintf
      fmtr
      "Total cumulative reward in %d rounds: %f\n"
      (round - 1)
      total_cumulative_rewards
end
OCaml

Innovation. Community. Security.