Skip to content

Commit 684d80c

Browse files
authored
Merge pull request #532 from JuliaParallel/dead-workers
Dead worker handling
2 parents 530805e + 4d123ac commit 684d80c

File tree

2 files changed

+25
-3
lines changed

2 files changed

+25
-3
lines changed

Diff for: src/sch/Sch.jl

+14-1
Original file line numberDiff line numberDiff line change
@@ -405,7 +405,20 @@ function cleanup_proc(state, p, log_sink)
405405
delete!(WORKER_MONITOR_CHANS[wid], state.uid)
406406
end
407407
end
408-
remote_do(_cleanup_proc, wid, state.uid, log_sink)
408+
409+
# If the worker process is still alive, clean it up
410+
if wid in workers()
411+
try
412+
remotecall_wait(_cleanup_proc, wid, state.uid, log_sink)
413+
catch ex
414+
# We allow ProcessExitedExceptions, which means that the worker
415+
# shut down halfway through cleanup.
416+
if !(ex isa ProcessExitedException)
417+
rethrow()
418+
end
419+
end
420+
end
421+
409422
timespan_finish(ctx, :cleanup_proc, (;worker=wid), nothing)
410423
end
411424

Diff for: src/sch/dynamic.jl

+11-2
Original file line numberDiff line numberDiff line change
@@ -33,9 +33,18 @@ function safepoint(state)
3333
if state.halt.set
3434
# Force dynamic thunks and listeners to terminate
3535
for (inp_chan,out_chan) in values(state.worker_chans)
36-
close(inp_chan)
37-
close(out_chan)
36+
# Closing these channels will fail if the worker died, which we
37+
# allow.
38+
try
39+
close(inp_chan)
40+
close(out_chan)
41+
catch ex
42+
if !(ex isa ProcessExitedException)
43+
rethrow()
44+
end
45+
end
3846
end
47+
3948
# Throw out of scheduler
4049
throw(SchedulerHaltedException())
4150
end

0 commit comments

Comments
 (0)