294
294
" Process-local condition variable (and lock) indicating task completion."
295
295
const TASK_SYNC = Threads. Condition ()
296
296
297
+ " Process-local set of running task IDs."
298
+ const TASKS_RUNNING = Set {Int} ()
299
+
297
300
" Process-local dictionary tracking per-processor total utilization."
298
301
const PROC_UTILIZATION = Dict {UInt64,Dict{Type,Ref{UInt64}}} ()
299
302
@@ -833,7 +836,21 @@ function fire_tasks!(ctx, thunks::Vector{<:Tuple}, (gproc, proc), state)
833
836
(log_sink= ctx. log_sink, profile= ctx. profile),
834
837
sch_handle, state. uid))
835
838
end
836
- remote_do (do_tasks, gproc. pid, proc, state. chan, to_send)
839
+ try
840
+ remotecall_wait (do_tasks, gproc. pid, proc, state. chan, to_send)
841
+ catch
842
+ # We might get a deserialization error due to something not being
843
+ # defined on the worker; in this case, we re-fire one task at a time to
844
+ # determine which task failed
845
+ for ts in to_send
846
+ try
847
+ remotecall_wait (do_tasks, gproc. pid, proc, state. chan, [ts])
848
+ catch err
849
+ bt = catch_backtrace ()
850
+ put! (state. chan, (gproc. pid, proc, ts[2 ], (CapturedException (err, bt), nothing )))
851
+ end
852
+ end
853
+ end
837
854
end
838
855
839
856
"""
@@ -843,6 +860,16 @@ Executes a batch of tasks on `to_proc`.
843
860
"""
844
861
function do_tasks (to_proc, chan, tasks)
845
862
for task in tasks
863
+ should_launch = lock (TASK_SYNC) do
864
+ # Already running; don't try to re-launch
865
+ if ! (task[2 ] in TASKS_RUNNING)
866
+ push! (TASKS_RUNNING, task[2 ])
867
+ true
868
+ else
869
+ false
870
+ end
871
+ end
872
+ should_launch || continue
846
873
@async begin
847
874
try
848
875
result = do_task (to_proc, task... )
@@ -944,11 +971,10 @@ function do_task(to_proc, extra_util, thunk_id, f, data, send_result, persist, c
944
971
@dbg timespan_end (ctx, :compute , thunk_id, (f, to_proc))
945
972
lock (TASK_SYNC) do
946
973
real_util[] -= extra_util
947
- end
948
- @debug " ($(myid ()) ) $f ($thunk_id ) Releasing $(typeof (to_proc)) : $extra_util | $(real_util[]) /$cap "
949
- lock (TASK_SYNC) do
974
+ pop! (TASKS_RUNNING, thunk_id)
950
975
notify (TASK_SYNC)
951
976
end
977
+ @debug " ($(myid ()) ) $f ($thunk_id ) Releasing $(typeof (to_proc)) : $extra_util | $(real_util[]) /$cap "
952
978
metadata = (
953
979
pressure= real_util[],
954
980
loadavg= ((Sys. loadavg ()... ,) ./ Sys. CPU_THREADS),
0 commit comments