@@ -1,11 +1,11 @@
module Sch

using Distributed
- import MemPool: DRef
+ import MemPool: DRef, poolset

import ..Dagger
- import ..Dagger: Context, Processor, Thunk, ThunkFuture, ThunkFailedException, Chunk, OSProc, AnyScope
- import ..Dagger: order, free!, dependents, noffspring, istask, inputs, unwrap_weak, affinity, tochunk, @dbg, @logmsg, timespan_start, timespan_end, unrelease, procs, move, capacity, chunktype, default_enabled, get_processors, execute!, rmprocs!, addprocs!, thunk_processor, constrain, cputhreadtime
+ import ..Dagger: Context, Processor, Thunk, WeakThunk, ThunkFuture, ThunkFailedException, Chunk, OSProc, AnyScope
+ import ..Dagger: order, free!, dependents, noffspring, istask, inputs, unwrap_weak_checked, affinity, tochunk, @dbg, @logmsg, timespan_start, timespan_end, unrelease, procs, move, capacity, chunktype, default_enabled, get_processors, execute!, rmprocs!, addprocs!, thunk_processor, constrain, cputhreadtime

const OneToMany = Dict{Thunk, Set{Thunk}}
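The import changes are the heart of this diff: the scheduler stops holding strong references to `Thunk`s and instead goes through a `WeakThunk` wrapper, unwrapped with the checked variant `unwrap_weak_checked`. Neither definition appears in this file; as a rough sketch, assuming the obvious `WeakRef`-based design (Dagger's actual definitions may differ):

# Hedged sketch of the weak-reference wrapper this diff relies on.
struct WeakThunk
    x::WeakRef
    WeakThunk(t) = new(WeakRef(t))
end

# Returns the referenced thunk, or `nothing` if it was garbage-collected.
unwrap_weak(t::WeakThunk) = t.x.value

# Like `unwrap_weak`, but turns a silently-collected thunk into a loud error.
function unwrap_weak_checked(t::WeakThunk)
    t = unwrap_weak(t)
    t === nothing && error("Thunk was collected")
    return t
end
# Non-wrapped values pass through untouched.
unwrap_weak_checked(t) = t

Letting the dict hold weak references means a thunk whose result is no longer needed can be collected; the checked unwrap makes any access after collection fail fast instead of returning `nothing`.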
@@ -45,7 +45,7 @@ Fields:
- `cache::Dict{Thunk, Any}` - Maps from a finished `Thunk` to its cached result, often a DRef
- `running::Set{Thunk}` - The set of currently-running `Thunk`s
- `running_on::Dict{Thunk,OSProc}` - Map from `Thunk` to the OS process executing it
- - `thunk_dict::Dict{Int, Any}` - Maps from thunk IDs to a `Thunk`
+ - `thunk_dict::Dict{Int, WeakThunk}` - Maps from thunk IDs to a `Thunk`
- `node_order::Any` - Function that returns the order of a thunk
- `worker_pressure::Dict{Int,Dict{Type,UInt64}}` - Cache of worker pressure
- `worker_capacity::Dict{Int,Dict{Type,UInt64}}` - Maps from worker ID to capacity
@@ -67,7 +67,7 @@ struct ComputeState
    cache::WeakKeyDict{Thunk, Any}
    running::Set{Thunk}
    running_on::Dict{Thunk,OSProc}
-     thunk_dict::Dict{Int, Any}
+     thunk_dict::Dict{Int, WeakThunk}
    node_order::Any
    worker_pressure::Dict{Int,Dict{Type,UInt64}}
    worker_capacity::Dict{Int,Dict{Type,UInt64}}
@@ -90,7 +90,7 @@ function start_state(deps::Dict, node_order, chan)
        Dict{Thunk, Any}(),
        Set{Thunk}(),
        Dict{Thunk,OSProc}(),
-         Dict{Int, Thunk}(),
+         Dict{Int, WeakThunk}(),
        node_order,
        Dict{Int,Dict{Type,UInt64}}(),
        Dict{Int,Dict{Type,UInt64}}(),
@@ -333,15 +333,22 @@ function compute_dag(ctx, d::Thunk; options=SchedulerOptions())

    # setup thunk_dict mappings
    for node in filter(istask, keys(deps))
-         state.thunk_dict[node.id] = node
+         state.thunk_dict[node.id] = WeakThunk(node)
        for dep in deps[node]
-             state.thunk_dict[dep.id] = dep
+             state.thunk_dict[dep.id] = WeakThunk(dep)
        end
    end

    # Initialize procs, pressure, and capacity
    @sync for p in procs_to_use(ctx)
-         @async init_proc(state, p)
+         @async begin
+             try
+                 init_proc(state, p)
+             catch err
+                 @error "Error initializing worker $p" exception=(err,catch_backtrace())
+                 remove_dead_proc!(ctx, state, p)
+             end
+         end
    end

    # setup dynamic listeners
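Wrapping `init_proc` in a `try`/`catch` matters because an uncaught failure inside an `@async` task propagates out of the enclosing `@sync` block as a `CompositeException` and would abort `compute_dag` entirely; catching it lets the scheduler log the failure and prune just that worker. The same fault-isolation pattern in miniature, where `bring_up` and `drop!` are hypothetical stand-ins for `init_proc` and `remove_dead_proc!`:

function init_all(workers, bring_up, drop!)
    @sync for w in workers
        @async try
            bring_up(w)
        catch err
            # Isolate the failure: log it and drop the worker,
            # instead of letting @sync rethrow and kill the loop.
            @error "Error initializing worker $w" exception=(err, catch_backtrace())
            drop!(w)
        end
    end
end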
@@ -357,6 +364,7 @@ function compute_dag(ctx, d::Thunk; options=SchedulerOptions())

    # Loop while we still have thunks to execute
    while !isempty(state.ready) || !isempty(state.running)
+         procs_state = assign_new_procs!(ctx, state, procs_state)
        if !isempty(state.ready)
            # Nothing running, so schedule up to N thunks, 1 per N workers
            schedule!(ctx, state, procs_state)
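`assign_new_procs!` is not defined in this hunk; judging from its call site at the top of the scheduler loop, it plausibly compares the context's current worker set against the set seen on the previous iteration, initializes any newcomers, and returns the updated set. A hypothetical sketch of that contract (not the helper's actual body):

function assign_new_procs!(ctx, state, last_procs)
    current = procs_to_use(ctx)
    # Initialize only the workers added since the last scheduler iteration.
    for p in setdiff(current, last_procs)
        init_proc(state, p)
    end
    return current
end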
@@ -407,14 +415,14 @@ function compute_dag(ctx, d::Thunk; options=SchedulerOptions())
            end
            continue
        else
-             if ctx.options.allow_errors || state.thunk_dict[thunk_id].options.allow_errors
+             if ctx.options.allow_errors || unwrap_weak_checked(state.thunk_dict[thunk_id]).options.allow_errors
                thunk_failed = true
            else
                throw(res)
            end
        end
    end
-     node = state.thunk_dict[thunk_id]
+     node = unwrap_weak_checked(state.thunk_dict[thunk_id])
    if metadata !== nothing
        state.worker_pressure[pid][typeof(proc)] = metadata.pressure
        state.worker_loadavg[pid] = metadata.loadavg
@@ -572,7 +580,7 @@ function schedule!(ctx, state, procs=procs_to_use(ctx))

        # Calculate scope
        scope = AnyScope()
-         for input in unwrap_weak.(task.inputs)
+         for input in unwrap_weak_checked.(task.inputs)
            chunk = if istask(input)
                state.cache[input]
            elseif input isa Chunk
@@ -741,69 +749,23 @@ function finish_task!(ctx, state, node, thunk_failed)
    if node.cache
        node.cache_ref = state.cache[node]
    end
-     for dep in sort!(collect(get(()->Set{Thunk}(), state.waiting_data, node)), by=state.node_order)
-         dep_isready = false
-         if haskey(state.waiting, dep)
-             set = state.waiting[dep]
-             node in set && pop!(set, node)
-             dep_isready = isempty(set)
-             if dep_isready
-                 delete!(state.waiting, dep)
-             end
-         else
-             dep_isready = true
-         end
-         if dep_isready
-             if !thunk_failed
-                 push!(state.ready, dep)
-             end
-         end
-     end
-     if haskey(state.futures, node)
-         # Notify any listening thunks
-         for future in state.futures[node]
-             put!(future, state.cache[node]; error=thunk_failed)
-         end
-         delete!(state.futures, node)
-     end
+     schedule_dependents!(state, node, thunk_failed)
+     fill_registered_futures!(state, node, thunk_failed)

-     # Chunk clean-up
-     to_evict = Set{Chunk}()
-     for inp in filter(t->istask(t) || (t isa Chunk), unwrap_weak.(node.inputs))
-         if inp in keys(state.waiting_data)
-             w = state.waiting_data[inp]
-             if node in w
-                 pop!(w, node)
-             end
-             if isempty(w)
-                 delete!(state.waiting_data, inp)
-                 if istask(inp) && haskey(state.cache, inp)
-                     _node = state.cache[inp]
-                     if _node isa Chunk
-                         push!(to_evict, _node)
-                     end
-                     GC.@preserve inp begin
-                         pop!(state.cache, inp)
-                         if haskey(state.errored, inp)
-                             pop!(state.errored, inp)
-                         end
-                     end
-                 elseif inp isa Chunk
-                     push!(to_evict, inp)
-                 end
-             end
-         end
-     end
+     to_evict = cleanup_inputs!(state, node)
    if haskey(state.waiting_data, node) && isempty(state.waiting_data[node])
        delete!(state.waiting_data, node)
    end
+     evict_all_chunks!(ctx, to_evict)
+ end
+
+ function evict_all_chunks!(ctx, to_evict)
    if !isempty(to_evict)
        @sync for w in map(p->p.pid, procs_to_use(ctx))
            @async remote_do(evict_chunks!, w, to_evict)
        end
    end
end
-
function evict_chunks!(chunks::Set{Chunk})
    for chunk in chunks
        haskey(CHUNK_CACHE, chunk) && delete!(CHUNK_CACHE, chunk)
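The bodies of the other new helpers (`schedule_dependents!`, `fill_registered_futures!`, `cleanup_inputs!`) don't appear in this diff, but the deleted lines above strongly suggest they are straight extractions of the old inline logic. For instance, `schedule_dependents!` presumably carries the removed dependent-scheduling loop over more or less verbatim; a reconstruction from the deleted code, not the helper's actual definition:

function schedule_dependents!(state, node, thunk_failed)
    # Walk everything that was waiting on `node`, in scheduling order.
    for dep in sort!(collect(get(()->Set{Thunk}(), state.waiting_data, node)), by=state.node_order)
        dep_isready = false
        if haskey(state.waiting, dep)
            set = state.waiting[dep]
            node in set && pop!(set, node)
            dep_isready = isempty(set)
            dep_isready && delete!(state.waiting, dep)
        else
            dep_isready = true
        end
        # A dependent becomes ready only once its last input finishes,
        # and only if that input didn't fail.
        if dep_isready && !thunk_failed
            push!(state.ready, dep)
        end
    end
end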
@@ -852,19 +814,20 @@ function fire_tasks!(ctx, thunks::Vector{<:Tuple}, (gproc, proc), state)
        end

        ids = map(enumerate(thunk.inputs)) do (idx,x)
-             istask(x) ? unwrap_weak(x).id : -idx
+             istask(x) ? unwrap_weak_checked(x).id : -idx
        end

        data = map(thunk.inputs) do x
-             istask(x) ? state.cache[unwrap_weak(x)] : x
+             istask(x) ? state.cache[unwrap_weak_checked(x)] : x
        end
        toptions = thunk.options !== nothing ? thunk.options : ThunkOptions()
        options = merge(ctx.options, toptions)
        @assert (options.single == 0) || (gproc.pid == options.single)
-         sch_handle = SchedulerHandle(ThunkID(thunk.id), state.worker_chans[gproc.pid]...)
+         # TODO: Set `sch_handle.tid.ref` to the right `DRef`
+         sch_handle = SchedulerHandle(ThunkID(thunk.id, nothing), state.worker_chans[gproc.pid]...)
        state.worker_pressure[gproc.pid][typeof(proc)] += util

-         # FIXME: De-dup common fields (log_sink, uid, etc.)
+         # TODO: De-dup common fields (log_sink, uid, etc.)
        push!(to_send, (util, thunk.id, thunk.f, data, thunk.get_result,
                        thunk.persist, thunk.cache, thunk.meta, options, ids,
                        (log_sink=ctx.log_sink, profile=ctx.profile),
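`ThunkID` now takes a second constructor argument, and the new TODO talks about setting `sch_handle.tid.ref` to a `DRef` (note the newly imported `poolset` from MemPool at the top of the file). Taken together, that implies `ThunkID` has grown an optional remote-reference field, roughly like this sketch (the real struct is defined elsewhere in the scheduler code):

struct ThunkID
    id::Int
    # Remote handle that can keep the thunk alive across workers; `nothing`
    # until the scheduler fills in the right DRef (see the TODO above).
    ref::Union{DRef, Nothing}
end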
@@ -944,7 +907,7 @@ function do_task(to_proc, extra_util, thunk_id, f, data, send_result, persist, c
        unlock(TASK_SYNC)
    else
        # Under-subscribed, calculate extra utilization and execute thunk
-         @debug "($(myid())) ($thunk_id) Using available $to_proc: $extra_util | $(real_util[])/$cap"
+         @debug "($(myid())) $f ($thunk_id) Using available $to_proc: $extra_util | $(real_util[])/$cap"
        extra_util = if extra_util isa MaxUtilization
            count(c->typeof(c)===typeof(to_proc), children(from_proc))
        else
@@ -982,7 +945,7 @@ function do_task(to_proc, extra_util, thunk_id, f, data, send_result, persist, c
    lock(TASK_SYNC) do
        real_util[] -= extra_util
    end
-     @debug "($(myid())) ($thunk_id) Releasing $(typeof(to_proc)): $extra_util | $(real_util[])/$cap"
+     @debug "($(myid())) $f ($thunk_id) Releasing $(typeof(to_proc)): $extra_util | $(real_util[])/$cap"
    lock(TASK_SYNC) do
        notify(TASK_SYNC)
    end