Skip to content

Commit 144c7cb

Browse files
committed
Add precompilation via PrecompileTools
Add errormonitor_tracked to track spawned tasks Improve scheduler cleanup and exit code
1 parent 8a6e3ca commit 144c7cb

File tree

8 files changed

+137
-33
lines changed

8 files changed

+137
-33
lines changed

Diff for: Manifest.toml

+18-1
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22

33
julia_version = "1.8.5"
44
manifest_format = "2.0"
5-
project_hash = "c7130869591b4f985941e8a5c1d2a572f2f59175"
5+
project_hash = "5333a6c200b6e6add81c46547527f66ddc0dc16c"
66

77
[[deps.Artifacts]]
88
uuid = "56f22d72-fd6d-98f1-02f0-08ddc0907c33"
@@ -137,6 +137,18 @@ git-tree-sha1 = "2e73fe17cac3c62ad1aebe70d44c963c3cfdc3e3"
137137
uuid = "bac558e1-5e72-5ebc-8fee-abe8a469f55d"
138138
version = "1.6.2"
139139

140+
[[deps.PrecompileTools]]
141+
deps = ["Preferences"]
142+
git-tree-sha1 = "03b4c25b43cb84cee5c90aa9b5ea0a78fd848d2f"
143+
uuid = "aea7be01-6a6a-4083-8856-8a6e6704d82a"
144+
version = "1.2.0"
145+
146+
[[deps.Preferences]]
147+
deps = ["TOML"]
148+
git-tree-sha1 = "00805cd429dcb4870060ff49ef443486c262e38e"
149+
uuid = "21216c6a-2e73-6563-6e65-726566657250"
150+
version = "1.4.1"
151+
140152
[[deps.Printf]]
141153
deps = ["Unicode"]
142154
uuid = "de0858da-6303-5e67-8744-51eddeeeb8d7"
@@ -201,6 +213,11 @@ git-tree-sha1 = "1d77abd07f617c4868c33d4f5b9e1dbb2643c9cf"
201213
uuid = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91"
202214
version = "0.34.2"
203215

216+
[[deps.TOML]]
217+
deps = ["Dates"]
218+
uuid = "fa267f1f-6049-4f14-aa54-33bafae1ed76"
219+
version = "1.0.0"
220+
204221
[[deps.Test]]
205222
deps = ["InteractiveUtils", "Logging", "Random", "Serialization"]
206223
uuid = "8dfed614-e22c-5e08-85e1-65c5234f0b40"

Diff for: Project.toml

+2
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ Distributed = "8ba89e20-285c-5b6f-9357-94700520ee1b"
88
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
99
MacroTools = "1914dd2f-81c6-5fcd-8719-6d5c9610ff09"
1010
MemPool = "f9f48841-c794-520a-933b-121f7ba6ed94"
11+
PrecompileTools = "aea7be01-6a6a-4083-8856-8a6e6704d82a"
1112
Profile = "9abbd945-dff8-562f-b5e8-e1ebf5ef1b79"
1213
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
1314
Requires = "ae029012-a4dd-5104-9daa-d747884805df"
@@ -24,6 +25,7 @@ UUIDs = "cf7118a7-6976-5b1a-9a39-7adc72f591a4"
2425
DataStructures = "0.18"
2526
MacroTools = "0.5"
2627
MemPool = "0.4.4"
28+
PrecompileTools = "1.2"
2729
Requires = "1"
2830
ScopedValues = "1.1"
2931
StatsBase = "0.28, 0.29, 0.30, 0.31, 0.32, 0.33, 0.34"

Diff for: src/Dagger.jl

+4
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,10 @@ include("ui/gantt-text.jl")
6464
# Logging
6565
include("lib/logging-events.jl")
6666

67+
# Precompilation
68+
using PrecompileTools
69+
include("precompile.jl")
70+
6771
function __init__()
6872
# Initialize system UUID
6973
system_uuid()

Diff for: src/precompile.jl

+32
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
@compile_workload begin
2+
system_uuid()
3+
add_processor_callback!("__cpu_thread_1__") do
4+
ThreadProc(1, 1)
5+
end
6+
t1 = @spawn 1+1
7+
t2 = spawn(+, 1, t1)
8+
fetch(t2)
9+
spawn() do
10+
Sch.halt!(sch_handle())
11+
end
12+
while Sch.EAGER_INIT[]
13+
sleep(0.1)
14+
end
15+
Sch.EAGER_CONTEXT[] = nothing
16+
GC.gc()
17+
yield()
18+
lock(Sch.ERRORMONITOR_TRACKED) do tracked
19+
if all(t->istaskdone(t) || istaskfailed(t), tracked)
20+
empty!(tracked)
21+
return
22+
end
23+
for t in tracked
24+
Base.throwto(t, InterruptException())
25+
end
26+
end
27+
MemPool.exit_hook()
28+
GC.gc()
29+
yield()
30+
@assert isempty(Sch.WORKER_MONITOR_CHANS)
31+
@assert isempty(Sch.WORKER_MONITOR_TASKS)
32+
end

Diff for: src/sch/Sch.jl

+42-20
Original file line numberDiff line numberDiff line change
@@ -339,30 +339,39 @@ function init_proc(state, p, log_sink)
339339

340340
state.worker_loadavg[p.pid] = (0.0, 0.0, 0.0)
341341
end
342-
lock(WORKER_MONITOR_LOCK) do
343-
wid = p.pid
344-
if !haskey(WORKER_MONITOR_TASKS, wid)
345-
t = @async begin
346-
try
347-
# Wait until this connection is terminated
348-
remotecall_fetch(sleep, wid, typemax(UInt64))
349-
catch err
350-
if err isa ProcessExitedException
342+
if p.pid != 1
343+
lock(WORKER_MONITOR_LOCK) do
344+
wid = p.pid
345+
if !haskey(WORKER_MONITOR_TASKS, wid)
346+
t = @async begin
347+
try
348+
# Wait until this connection is terminated
349+
remotecall_fetch(sleep, wid, typemax(UInt64))
350+
catch err
351+
# TODO: Report other kinds of errors? IOError, etc.
352+
#if !(err isa ProcessExitedException)
353+
#end
354+
finally
351355
lock(WORKER_MONITOR_LOCK) do
352356
d = WORKER_MONITOR_CHANS[wid]
353357
for uid in keys(d)
354-
put!(d[uid], (wid, OSProc(wid), nothing, (ProcessExitedException(wid), nothing)))
358+
try
359+
put!(d[uid], (wid, OSProc(wid), nothing, (ProcessExitedException(wid), nothing)))
360+
catch
361+
end
355362
end
356363
empty!(d)
357364
delete!(WORKER_MONITOR_CHANS, wid)
365+
delete!(WORKER_MONITOR_TASKS, wid)
358366
end
359367
end
360368
end
369+
errormonitor_tracked(t)
370+
WORKER_MONITOR_TASKS[wid] = t
371+
WORKER_MONITOR_CHANS[wid] = Dict{UInt64,RemoteChannel}()
361372
end
362-
WORKER_MONITOR_TASKS[wid] = t
363-
WORKER_MONITOR_CHANS[wid] = Dict{UInt64,RemoteChannel}()
373+
WORKER_MONITOR_CHANS[wid][state.uid] = state.chan
364374
end
365-
WORKER_MONITOR_CHANS[wid][state.uid] = state.chan
366375
end
367376

368377
# Setup worker-to-scheduler channels
@@ -379,18 +388,26 @@ function init_proc(state, p, log_sink)
379388
end
380389
function _cleanup_proc(uid, log_sink)
381390
empty!(CHUNK_CACHE) # FIXME: Should be keyed on uid!
391+
proc_states(uid) do states
392+
for (proc, state) in states
393+
istate = state.state
394+
istate.done[] = true
395+
notify(istate.reschedule)
396+
end
397+
empty!(states)
398+
end
382399
end
383400
function cleanup_proc(state, p, log_sink)
384401
ctx = Context(Int[]; log_sink)
385-
timespan_start(ctx, :cleanup_proc, p.pid, 0)
402+
wid = p.pid
403+
timespan_start(ctx, :cleanup_proc, wid, 0)
386404
lock(WORKER_MONITOR_LOCK) do
387-
wid = p.pid
388405
if haskey(WORKER_MONITOR_CHANS, wid)
389406
delete!(WORKER_MONITOR_CHANS[wid], state.uid)
390-
remote_do(_cleanup_proc, wid, state.uid, log_sink)
391407
end
392408
end
393-
timespan_finish(ctx, :cleanup_proc, p.pid, 0)
409+
remote_do(_cleanup_proc, wid, state.uid, log_sink)
410+
timespan_finish(ctx, :cleanup_proc, wid, 0)
394411
end
395412

396413
"Process-local condition variable (and lock) indicating task completion."
@@ -1096,6 +1113,7 @@ struct ProcessorInternalState
10961113
tasks::Dict{Int,Task}
10971114
proc_occupancy::Base.RefValue{UInt32}
10981115
time_pressure::Base.RefValue{UInt64}
1116+
done::Base.RefValue{Bool}
10991117
end
11001118
struct ProcessorState
11011119
state::ProcessorInternalState
@@ -1144,6 +1162,9 @@ function start_processor_runner!(istate::ProcessorInternalState, uid::UInt64, re
11441162
reset(istate.reschedule)
11451163
end
11461164
timespan_finish(ctx, :proc_run_wait, to_proc, nothing)
1165+
if istate.done[]
1166+
return
1167+
end
11471168
end
11481169

11491170
# Fetch a new task to execute
@@ -1270,7 +1291,7 @@ function start_processor_runner!(istate::ProcessorInternalState, uid::UInt64, re
12701291
else
12711292
t.sticky = false
12721293
end
1273-
tasks[thunk_id] = errormonitor(schedule(t))
1294+
tasks[thunk_id] = errormonitor_tracked(schedule(t))
12741295
proc_occupancy[] += task_occupancy
12751296
time_pressure[] += time_util
12761297
end
@@ -1283,7 +1304,7 @@ function start_processor_runner!(istate::ProcessorInternalState, uid::UInt64, re
12831304
else
12841305
proc_run_task.sticky = false
12851306
end
1286-
return errormonitor(schedule(proc_run_task))
1307+
return errormonitor_tracked(schedule(proc_run_task))
12871308
end
12881309

12891310
"""
@@ -1307,7 +1328,8 @@ function do_tasks(to_proc, return_queue, tasks)
13071328
istate = ProcessorInternalState(ctx, to_proc,
13081329
queue_locked, reschedule,
13091330
Dict{Int,Task}(),
1310-
Ref(UInt32(0)), Ref(UInt64(0)))
1331+
Ref(UInt32(0)), Ref(UInt64(0)),
1332+
Ref(false))
13111333
runner = start_processor_runner!(istate, uid, return_queue)
13121334
@static if VERSION < v"1.9"
13131335
reschedule.waiter = runner

Diff for: src/sch/dynamic.jl

+6-5
Original file line numberDiff line numberDiff line change
@@ -45,15 +45,15 @@ function dynamic_listener!(ctx, state, wid)
4545
task = current_task() # The scheduler's main task
4646
inp_chan, out_chan = state.worker_chans[wid]
4747
listener_task = @async begin
48-
while isopen(inp_chan) && !state.halt.set
48+
while !state.halt.set
4949
tid, f, data = try
5050
take!(inp_chan)
5151
catch err
5252
if !(unwrap_nested_exception(err) isa Union{SchedulerHaltedException,
5353
ProcessExitedException,
5454
InvalidStateException})
5555
iob = IOContext(IOBuffer(), :color=>true)
56-
println(iob, "Error in sending dynamic request:")
56+
println(iob, "Error in receiving dynamic request:")
5757
Base.showerror(iob, err)
5858
Base.show_backtrace(iob, catch_backtrace())
5959
println(iob)
@@ -87,12 +87,13 @@ function dynamic_listener!(ctx, state, wid)
8787
end
8888
end
8989
end
90-
@async begin
90+
errormonitor_tracked(listener_task)
91+
errormonitor_tracked(@async begin
9192
wait(state.halt)
9293
# TODO: Not sure why we need the @async here, but otherwise we
9394
# don't stop all the listener tasks
9495
@async Base.throwto(listener_task, SchedulerHaltedException())
95-
end
96+
end)
9697
end
9798

9899
## Worker-side methods for dynamic communication
@@ -113,7 +114,7 @@ end
113114
halt!(h::SchedulerHandle) = exec!(_halt, h, nothing)
114115
function _halt(ctx, state, task, tid, _)
115116
notify(state.halt)
116-
put!(state.chan, (1, nothing, SchedulerHaltedException(), nothing))
117+
put!(state.chan, (1, nothing, nothing, (SchedulerHaltedException(), nothing)))
117118
Base.throwto(task, SchedulerHaltedException())
118119
end
119120

Diff for: src/sch/eager.jl

+11-7
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,5 @@
11
const EAGER_INIT = Threads.Atomic{Bool}(false)
22
const EAGER_READY = Base.Event()
3-
const EAGER_FORCE_KILL = Ref{Bool}(false)
43
const EAGER_ID_MAP = LockedObject(Dict{UInt64,Int}())
54
const EAGER_CONTEXT = Ref{Union{Context,Nothing}}(nothing)
65
const EAGER_STATE = Ref{Union{ComputeState,Nothing}}(nothing)
@@ -21,13 +20,16 @@ function init_eager()
2120
return
2221
end
2322
ctx = eager_context()
24-
Threads.@spawn try
23+
errormonitor_tracked(Threads.@spawn try
2524
sopts = SchedulerOptions(;allow_errors=true)
2625
opts = Dagger.Options((;scope=Dagger.ExactScope(Dagger.ThreadProc(1, 1)),
2726
occupancy=Dict(Dagger.ThreadProc=>0)))
2827
Dagger.compute(ctx, Dagger.delayed(eager_thunk, opts)();
2928
options=sopts)
3029
catch err
30+
# Scheduler halting is considered normal
31+
err isa SchedulerHaltedException && return
32+
3133
iob = IOContext(IOBuffer(), :color=>true)
3234
println(iob, "Error in eager scheduler:")
3335
Base.showerror(iob, err)
@@ -37,9 +39,12 @@ function init_eager()
3739
write(stderr, iob)
3840
finally
3941
reset(EAGER_READY)
42+
EAGER_STATE[] = nothing
43+
lock(EAGER_ID_MAP) do id_map
44+
empty!(id_map)
45+
end
4046
Threads.atomic_xchg!(EAGER_INIT, false)
41-
EAGER_FORCE_KILL[] = true
42-
end
47+
end)
4348
wait(EAGER_READY)
4449
end
4550
function eager_thunk()
@@ -48,8 +53,7 @@ function eager_thunk()
4853
return
4954
end
5055
notify(EAGER_READY)
51-
sleep(typemax(UInt))
52-
error("eager_thunk exited")
56+
wait(Dagger.Sch.EAGER_STATE[].halt)
5357
end
5458

5559
"""
@@ -97,7 +101,7 @@ function thunk_yield(f)
97101
end
98102

99103
eager_cleanup(t::Dagger.EagerThunkFinalizer) =
100-
Threads.@spawn eager_cleanup(EAGER_STATE[], t.uid)
104+
errormonitor_tracked(Threads.@spawn eager_cleanup(EAGER_STATE[], t.uid))
101105
function eager_cleanup(state, uid)
102106
tid = nothing
103107
lock(EAGER_ID_MAP) do id_map

Diff for: src/sch/util.jl

+22
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,25 @@
1+
"Like `errormonitor`, but tracks how many outstanding tasks are running."
2+
function errormonitor_tracked(t::Task)
3+
errormonitor(t)
4+
lock(ERRORMONITOR_TRACKED) do tracked
5+
push!(tracked, t)
6+
end
7+
errormonitor(Threads.@spawn begin
8+
try
9+
wait(t)
10+
finally
11+
lock(ERRORMONITOR_TRACKED) do tracked
12+
idx = findfirst(o->o===t, tracked)
13+
# N.B. This may be nothing if precompile emptied these
14+
if idx !== nothing
15+
deleteat!(tracked, idx)
16+
end
17+
end
18+
end
19+
end)
20+
end
21+
const ERRORMONITOR_TRACKED = LockedObject(Task[])
22+
123
"""
224
unwrap_nested_exception(err::Exception) -> Bool
325

0 commit comments

Comments
 (0)