Source file bistro_multinode.ml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
open Core_kernel
open Bistro_engine
open Lwt.Infix
module Unix = Caml_unix
type job =
| Plugin of {
workflow_id : string ;
f : unit -> unit ;
}
| Shell_command of {
workflow_id : string ;
cmd : Shell_command.t ;
}
type client_id = Client_id of string
type _ api_request =
| Subscript : { np : int ; mem : int } -> client_id api_request
| Get_job : { client_id : string } -> job option api_request
| Plugin_result : {
client_id : string ;
workflow_id : string ;
result : (unit, string) Result.t ;
} -> unit api_request
| Shell_command_result : {
client_id : string ;
workflow_id : string ;
result : (int * bool, string) Result.t ;
} -> unit api_request
module Client = struct
type t = {
np : int ;
mem : int ;
hostname : string ;
port : int ;
}
let with_connection { hostname ; port ; _ } ~f =
Lwt_io.with_connection Unix.(ADDR_INET (inet_addr_of_string hostname, port)) f
let send_request x (msg : 'a api_request) : 'a Lwt.t =
with_connection x ~f:(fun (ic, oc) ->
Lwt_io.write_value oc msg >>= fun () ->
Lwt_io.flush oc >>= fun () ->
Lwt_io.read_value ic
)
let main ~np ~mem ~hostname ~port () =
let mem = mem * 1024 in
let client = { np ; mem ; hostname ; port } in
let stop_var = Lwt_mvar.create_empty () in
send_request client (Subscript { np ; mem }) >>= fun (Client_id client_id) ->
let job_thread = function
| Plugin { workflow_id ; f } ->
Local_backend.eval () () f () >>= fun result ->
send_request client (Plugin_result { client_id ; workflow_id ; result })
| Shell_command { workflow_id ; cmd } ->
Lwt.catch
(fun () -> Shell_command.run cmd >|= Result.return)
(fun exn -> Lwt_result.fail (Exn.to_string exn))
>>= fun result ->
send_request client (Shell_command_result { client_id ; workflow_id ; result })
in
let rec loop () =
Lwt.pick [
(send_request client (Get_job { client_id }) >|= fun x -> `New_job x) ;
Lwt_mvar.take stop_var >|= fun () -> `Stop
]
>>= function
| `New_job None
| `Stop -> Lwt.return ()
| `New_job (Some job) ->
Lwt.async (fun () -> job_thread job) ;
loop ()
in
loop ()
let command =
let open Command.Let_syntax in
Command.basic ~summary:"Bistro client" [%map_open
let np = flag "--np" (required int) ~doc:"INT Number of available cores"
and mem = flag "--mem" (required int) ~doc:"INT Available memory (in GB)"
and hostname = flag "--hostname" (required string) ~doc:"ADDR Bistro server address"
and port = flag "--port" (required int) ~doc:"INT Bistro server port"
in
fun () ->
main ~np ~mem ~hostname ~port ()
|> Lwt_main.run
]
end
module Server = struct
module Backend = struct
type job_waiter =
| Waiting_shell_command of {
workflow_id : string ;
cmd : Shell_command.t ;
waiter : (int * bool, string) result Lwt.u ;
}
| Waiting_plugin of {
workflow_id : string ;
f : unit -> unit ;
waiter : (unit, string) result Lwt.u ;
}
type worker = Worker of {
id : string ;
np : int ;
mem : int ;
mutable available_resource : Allocator.resource ;
pending_jobs : job_waiter Lwt_queue.t ;
running_jobs : job_waiter String.Table.t ;
}
module Worker_allocator = struct
type t = {
mutable available : Allocator.resource String.Table.t ;
mutable waiters : ((int * int) * (string * Allocator.resource) Lwt.u) list ;
}
let create () = {
available = String.Table.create () ;
waiters = [] ;
}
let search (type s) (table : s String.Table.t) ~f =
let module M = struct exception Found of string * s end in
try
String.Table.fold table ~init:() ~f:(fun ~key ~data () -> if f ~key ~data then raise (M.Found (key, data))) ;
None
with M.Found (k, v) -> Some (k, v)
let allocation_pass pool =
let remaining_waiters =
List.filter_map pool.waiters ~f:(fun ((np, mem), u as elt) ->
let allocation_attempt =
search pool.available ~f:(fun ~key:_ ~data:(Resource curr) ->
curr.np >= np && curr.mem >= mem
)
in
match allocation_attempt with
| None -> Some elt
| Some (worker_id, (Resource curr)) ->
String.Table.set pool.available ~key:worker_id ~data:(Resource { np = curr.np - np ; mem = curr.mem - mem }) ;
Lwt.wakeup u (worker_id, Resource { np ; mem }) ;
None
)
in
pool.waiters <- remaining_waiters
let request pool (Allocator.Request { np ; mem }) =
let t, u = Lwt.wait () in
let waiters =
((np, mem), u) :: pool.waiters
|> List.sort ~compare:(fun (x, _) (y,_) -> Poly.compare y x)
in
pool.waiters <- waiters ;
allocation_pass pool ;
t
let add_worker pool (Worker { id ; np ; mem ; _ }) =
match String.Table.add pool.available ~key:id ~data:(Allocator.Resource { np ; mem }) with
| `Ok -> allocation_pass pool
| `Duplicate -> failwith "A worker has been added twice"
let release pool worker_id (Allocator.Resource { np ; mem }) =
String.Table.update pool.available worker_id ~f:(function
| None -> failwith "Tried to release resources of inexistent worker"
| Some (Resource r) -> Resource { np = r.np + np ; mem = r.mem + mem }
)
end
type token = {
worker_id : string ;
workflow_id : string ;
}
type state = {
workers : worker String.Table.t ;
alloc : Worker_allocator.t ;
}
type event = [
| `Stop
| `New_worker
]
type t = {
server : Lwt_io.server ;
state : state ;
events : event Lwt_react.event ;
send_event : event -> unit ;
stop_signal : unit Lwt_condition.t ;
server_stop : unit Lwt.t ;
logger : Logger.t ;
db : Db.t ;
}
let new_id =
let c = ref 0 in
fun () -> incr c ; sprintf "w%d" !c
let workflow_id_of_job_waiter = function
| Waiting_plugin wp -> wp.workflow_id
| Waiting_shell_command wsc -> wsc.workflow_id
let job_of_job_waiter = function
| Waiting_plugin { f ; workflow_id ; _ } ->
Plugin { f ; workflow_id }
| Waiting_shell_command { cmd ; workflow_id ; _ } ->
Shell_command { cmd ; workflow_id }
let create_worker ~np ~mem id =
Worker {
id ; np ; mem ;
available_resource = Allocator.Resource { np ; mem } ;
pending_jobs = Lwt_queue.create () ;
running_jobs = String.Table.create () ;
}
let create_state () = {
workers = String.Table.create () ;
alloc = Worker_allocator.create () ;
}
let server_api
: type s. (Logger.event -> unit) -> stop_signal:unit Lwt_condition.t -> state -> s api_request -> s Lwt.t
= fun log ~stop_signal state msg ->
match msg with
| Subscript { np ; mem } ->
let id = new_id () in
let w = create_worker ~np ~mem id in
String.Table.set state.workers ~key:id ~data:w ;
Worker_allocator.add_worker state.alloc w ;
log (Logger.Debug (sprintf "new worker %s" id)) ;
Lwt.return (Client_id id)
| Get_job { client_id } -> (
match String.Table.find state.workers client_id with
| None -> Lwt.return None
| Some (Worker worker) ->
Lwt.choose [
(Lwt_queue.pop worker.pending_jobs >|= fun x -> `Job x) ;
(Lwt_condition.wait stop_signal >|= fun () -> `Stop) ;
] >>= function
| `Job wp ->
let workflow_id = workflow_id_of_job_waiter wp in
String.Table.set worker.running_jobs ~key:workflow_id ~data:wp ;
Lwt.return (Some (job_of_job_waiter wp))
| `Stop -> Lwt.return None
)
| Plugin_result r ->
let Worker worker = String.Table.find_exn state.workers r.client_id in
Lwt.return (
match String.Table.find_exn worker.running_jobs r.workflow_id with
| Waiting_plugin wp -> Lwt.wakeup wp.waiter r.result
| Waiting_shell_command _ -> assert false
)
| Shell_command_result r ->
let Worker worker = String.Table.find_exn state.workers r.client_id in
Lwt.return (
match String.Table.find_exn worker.running_jobs r.workflow_id with
| Waiting_plugin _ -> assert false
| Waiting_shell_command wp -> Lwt.wakeup wp.waiter r.result
)
let server_handler log ~stop_signal state _ (ic, oc) =
Lwt_io.read_value ic >>= fun msg ->
server_api log ~stop_signal state msg >>= fun res ->
Lwt_io.write_value oc res ~flags:[Closures] >>= fun () ->
Lwt_io.flush oc >>= fun () ->
Lwt_io.close ic >>= fun () ->
Lwt_io.close oc
let create ?(loggers = []) ~port db =
Lwt_unix.gethostname () >>= fun hostname ->
Lwt_unix.gethostbyname hostname >>= fun h ->
let sockaddr = Unix.ADDR_INET (h.Unix.h_addr_list.(0), port) in
let state = create_state () in
let logger = Logger.tee loggers in
let log event = logger#event db (Unix.gettimeofday ()) event in
let stop_signal = Lwt_condition.create () in
Lwt_io.establish_server_with_client_address sockaddr (server_handler log ~stop_signal state) >>= fun server ->
let events, send_event = Lwt_react.E.create () in
let server_stop =
Lwt_condition.wait stop_signal >>= fun () -> Lwt_io.shutdown_server server
in
Lwt.return {
events ;
send_event ;
stop_signal ;
server_stop ;
server ;
state ;
logger = Logger.tee loggers ;
db ;
}
let log ?(time = Unix.gettimeofday ()) backend event =
backend.logger#event backend.db time event
let request_resource backend req =
Worker_allocator.request backend.state.alloc req >|= fun (worker_id, resource) ->
String.Table.find_exn backend.state.workers worker_id, resource
let release_resource backend worker_id res =
Worker_allocator.release backend.state.alloc worker_id res
let build_trace backend w requirement perform =
let ready = Unix.gettimeofday () in
log ~time:ready backend (Logger.Workflow_ready w) ;
request_resource backend requirement >>= fun (Worker worker, resource) ->
let open Eval_thread.Infix in
let start = Unix.gettimeofday () in
log ~time:start backend (Logger.Workflow_started (w, resource)) ;
let token = { worker_id = worker.id ; workflow_id = Bistro_internals.Workflow.id w } in
perform token resource >>= fun details ->
let _end_ = Unix.gettimeofday () in
log ~time:_end_ backend (Logger.Workflow_ended { details ; start ; _end_ }) ;
release_resource backend worker.id resource ;
Eval_thread.return (
Execution_trace.Run { ready ; start ; _end_ ; details }
)
let eval backend { worker_id ; workflow_id } f x =
let Worker worker = String.Table.find_exn backend.state.workers worker_id in
let f () = f x in
let t, u = Lwt.wait () in
let job_waiter = Waiting_plugin { waiter = u ; f ; workflow_id } in
Lwt_queue.push worker.pending_jobs job_waiter ;
t
let run_shell_command backend { worker_id ; workflow_id } cmd =
let Worker worker = String.Table.find_exn backend.state.workers worker_id in
let t, u = Lwt.wait () in
let job = Waiting_shell_command { waiter = u ; cmd ; workflow_id } in
Lwt_queue.push worker.pending_jobs job ;
t
let stop backend =
Lwt_condition.broadcast backend.stop_signal () ;
Lwt.return ()
end
module Scheduler = Scheduler.Make(Backend)
type t = Scheduler.t
let create ?allowed_containers ?loggers ?collect ?(port = 6666) db =
Backend.create ?loggers ~port db >|= fun backend ->
Scheduler.create ?allowed_containers ?loggers ?collect backend db
let start sched =
Scheduler.start sched
let stop sched =
Scheduler.stop sched
let eval sched w =
Scheduler.eval sched w
let simple_app ?allowed_containers ?loggers ?collect ?port ?(db = "_bistro") w =
let t =
create ?allowed_containers ?loggers ?collect ?port (Db.init_exn db) >>= fun server ->
start server ;
eval server w >|= (
function
| Ok _ -> ()
| Error e ->
print_endline @@ Scheduler.error_report server e
) >>= fun () ->
stop server
in
Lwt_main.run t
let simple_command ~summary w =
let open Command.Let_syntax in
Command.basic ~summary [%map_open
let port = flag "--port" (required int) ~doc:"INT Port"
and verbose = flag "--verbose" no_arg ~doc:" Display more info"
in
let loggers = if verbose then [ Bistro_utils.Console_logger.create () ] else [] in
fun () -> simple_app ~port ~loggers w
]
end