Skip to content

Commit 5ccbd87

Browse files
authored
Merge pull request #223 from JuliaParallel/jps/eager-mem-leak-fix
Fix memory leaks in eager API
2 parents 2ed91b7 + 922dfa2 commit 5ccbd87

File tree

11 files changed

+243
-146
lines changed

11 files changed

+243
-146
lines changed

Diff for: Project.toml

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
name = "Dagger"
22
uuid = "d58978e5-989f-55fb-8d15-ea34adc7bf54"
3-
version = "0.11.5"
3+
version = "0.11.6"
44

55
[deps]
66
Colors = "5ae59095-9a9b-59fe-a467-6f913c188581"

Diff for: src/Dagger.jl

+2
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,8 @@ module Dagger
22

33
using Distributed, SharedArrays
44

5+
using MemPool
6+
57
import Base: collect, adjoint, reduce
68
import Distributed: procs
79

Diff for: src/chunks.jl

-2
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,3 @@
1-
2-
using MemPool
31
using Serialization
42

53
export domain, UnitDomain, project, alignfirst, ArrayDomain

Diff for: src/sch/Sch.jl

+65-59
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ using Distributed
44
import MemPool: DRef
55

66
import ..Dagger
7-
import ..Dagger: Context, Processor, Thunk, ThunkFuture, ThunkFailedException, Chunk, OSProc, order, free!, dependents, noffspring, istask, inputs, affinity, tochunk, @dbg, @logmsg, timespan_start, timespan_end, unrelease, procs, move, capacity, chunktype, default_enabled, get_processors, execute!, rmprocs!, addprocs!, thunk_processor
7+
import ..Dagger: Context, Processor, Thunk, ThunkFuture, ThunkFailedException, Chunk, OSProc, order, free!, dependents, noffspring, istask, inputs, unwrap_weak, affinity, tochunk, @dbg, @logmsg, timespan_start, timespan_end, unrelease, procs, move, capacity, chunktype, default_enabled, get_processors, execute!, rmprocs!, addprocs!, thunk_processor
88

99
const OneToMany = Dict{Thunk, Set{Thunk}}
1010

@@ -44,8 +44,6 @@ The internal state-holding struct of the scheduler.
4444
4545
Fields:
4646
- `uid::UInt64` - Unique identifier for this scheduler instance
47-
- `dependents::Dict{Union{Thunk,Chunk},Set{Thunk}}` - The result of calling `dependents` on the DAG
48-
- `finished::Set{Thunk}` - The set of completed `Thunk`s
4947
- `waiting::OneToMany` - Map from downstream `Thunk` to upstream `Thunk`s that still need to execute
5048
- `waiting_data::Dict{Union{Thunk,Chunk},Set{Thunk}}` - Map from input `Chunk`/upstream `Thunk` to all unfinished downstream `Thunk`s, to retain caches
5149
- `ready::Vector{Thunk}` - The list of `Thunk`s that are ready to execute
@@ -60,20 +58,18 @@ Fields:
6058
- `worker_chans::Dict{Int, Tuple{RemoteChannel,RemoteChannel}}` - Communication channels between the scheduler and each worker
6159
- `procs_cache_list::Base.RefValue{Union{ProcessorCacheEntry,Nothing}}` - Cached linked list of processors ready to be used
6260
- `function_cost_cache::Dict{Type{<:Tuple},UInt}` - Cache of estimated CPU time required to compute the given signature
63-
- `halt::Base.RefValue{Bool}` - Flag indicating, when set, that the scheduler should halt immediately
64-
- `lock::ReentrantLock()` - Lock around operations which modify the state
61+
- `halt::Base.Event` - Event indicating that the scheduler is halting
62+
- `lock::ReentrantLock` - Lock around operations which modify the state
6563
- `futures::Dict{Thunk, Vector{ThunkFuture}}` - Futures registered for waiting on the result of a thunk.
66-
- `errored::Set{Thunk}` - Thunks that threw an error
67-
- `chan::RemoteChannel{Channel{Any}}` - Channel for receiving completed thunks
64+
- `errored::WeakKeyDict{Thunk,Bool}` - Indicates if a thunk's result is due to an error.
65+
- `chan::RemoteChannel{Channel{Any}}` - Channel for receiving completed thunks.
6866
"""
6967
struct ComputeState
7068
uid::UInt64
71-
dependents::Dict{Union{Thunk,Chunk},Set{Thunk}}
72-
finished::Set{Thunk}
7369
waiting::OneToMany
7470
waiting_data::Dict{Union{Thunk,Chunk},Set{Thunk}}
7571
ready::Vector{Thunk}
76-
cache::Dict{Thunk, Any}
72+
cache::WeakKeyDict{Thunk, Any}
7773
running::Set{Thunk}
7874
running_on::Dict{Thunk,OSProc}
7975
thunk_dict::Dict{Int, Any}
@@ -84,19 +80,17 @@ struct ComputeState
8480
worker_chans::Dict{Int, Tuple{RemoteChannel,RemoteChannel}}
8581
procs_cache_list::Base.RefValue{Union{ProcessorCacheEntry,Nothing}}
8682
function_cost_cache::Dict{Type{<:Tuple},UInt}
87-
halt::Base.RefValue{Bool}
83+
halt::Base.Event
8884
lock::ReentrantLock
8985
futures::Dict{Thunk, Vector{ThunkFuture}}
90-
errored::Set{Thunk}
86+
errored::WeakKeyDict{Thunk,Bool}
9187
chan::RemoteChannel{Channel{Any}}
9288
end
9389

9490
function start_state(deps::Dict, node_order, chan)
9591
state = ComputeState(rand(UInt64),
96-
deps,
97-
Set{Thunk}(),
9892
OneToMany(),
99-
Dict{Union{Thunk,Chunk},Set{Thunk}}(),
93+
deps,
10094
Vector{Thunk}(undef, 0),
10195
Dict{Thunk, Any}(),
10296
Set{Thunk}(),
@@ -109,18 +103,13 @@ function start_state(deps::Dict, node_order, chan)
109103
Dict{Int, Tuple{RemoteChannel,RemoteChannel}}(),
110104
Ref{Union{ProcessorCacheEntry,Nothing}}(nothing),
111105
Dict{Type{<:Tuple},UInt}(),
112-
Ref{Bool}(false),
106+
Base.Event(),
113107
ReentrantLock(),
114108
Dict{Thunk, Vector{ThunkFuture}}(),
115-
Set{Thunk}(),
109+
WeakKeyDict{Thunk,Bool}(),
116110
chan)
117111

118-
nodes = sort(collect(keys(deps)), by=node_order)
119-
# N.B. Using merge! here instead would modify deps
120-
for (key,val) in deps
121-
state.waiting_data[key] = copy(val)
122-
end
123-
for k in nodes
112+
for k in sort(collect(keys(deps)), by=node_order)
124113
if istask(k)
125114
waiting = Set{Thunk}(Iterators.filter(istask, inputs(k)))
126115
if isempty(waiting)
@@ -392,7 +381,11 @@ function compute_dag(ctx, d::Thunk; options=SchedulerOptions())
392381
end
393382

394383
isempty(state.running) && continue
395-
pid, proc, thunk_id, (res, metadata) = take!(chan) # get result of completed thunk
384+
chan_value = take!(chan) # get result of completed thunk
385+
if chan_value isa RescheduleSignal
386+
continue
387+
end
388+
pid, proc, thunk_id, (res, metadata) = chan_value
396389
gproc = OSProc(pid)
397390
lock(newtasks_lock) # This waits for any assign_new_procs! above to complete and then shuts down the task
398391
safepoint(state)
@@ -424,6 +417,7 @@ function compute_dag(ctx, d::Thunk; options=SchedulerOptions())
424417
state.function_cost_cache[sig] = (metadata.threadtime + get(state.function_cost_cache, sig, 0)) ÷ 2
425418
end
426419
state.cache[node] = res
420+
state.errored[node] = thunk_failed
427421
if node.options !== nothing && node.options.checkpoint !== nothing
428422
try
429423
node.options.checkpoint(node, res)
@@ -441,12 +435,14 @@ function compute_dag(ctx, d::Thunk; options=SchedulerOptions())
441435

442436
safepoint(state)
443437
end
444-
state.halt[] = true
438+
@assert !isready(state.chan)
439+
close(state.chan)
440+
notify(state.halt)
445441
@sync for p in procs_to_use(ctx)
446442
@async cleanup_proc(state, p)
447443
end
448444
value = state.cache[d] # TODO: move(OSProc(), state.cache[d])
449-
if d in state.errored
445+
if state.errored[d]
450446
throw(value)
451447
end
452448
if options.checkpoint !== nothing
@@ -556,6 +552,7 @@ function schedule!(ctx, state, procs=procs_to_use(ctx))
556552
while !isempty(state.ready)
557553
# Select a new task and get its options
558554
task = pop!(state.ready)
555+
@assert !haskey(state.cache, task)
559556
opts = merge(ctx.options, task.options)
560557
sig = signature(task, state)
561558

@@ -693,62 +690,69 @@ function pop_with_affinity!(ctx, tasks, proc)
693690
return nothing
694691
end
695692

696-
function finish_task!(ctx, state, node, thunk_failed; free=true)
693+
function finish_task!(ctx, state, node, thunk_failed)
697694
pop!(state.running, node)
698695
delete!(state.running_on, node)
699-
if !thunk_failed
700-
push!(state.finished, node)
701-
else
696+
if thunk_failed
702697
set_failed!(state, node)
703698
end
704-
if istask(node) && node.cache
699+
if node.cache
705700
node.cache_ref = state.cache[node]
706701
end
707-
if !thunk_failed
708-
for dep in sort!(collect(state.dependents[node]), by=state.node_order)
702+
for dep in sort!(collect(get(()->Set{Thunk}(), state.waiting_data, node)), by=state.node_order)
703+
dep_isready = false
704+
if haskey(state.waiting, dep)
709705
set = state.waiting[dep]
710706
node in set && pop!(set, node)
711-
if isempty(set)
712-
pop!(state.waiting, dep)
713-
push!(state.ready, dep)
707+
dep_isready = isempty(set)
708+
if dep_isready
709+
delete!(state.waiting, dep)
714710
end
715-
# todo: free data
711+
else
712+
dep_isready = true
716713
end
717-
if haskey(state.futures, node)
718-
# Notify any listening thunks
719-
for future in state.futures[node]
720-
if istask(node) && haskey(state.cache, node)
721-
put!(future, state.cache[node])
722-
else
723-
put!(future, nothing)
724-
end
714+
if dep_isready
715+
if !thunk_failed
716+
push!(state.ready, dep)
725717
end
726-
delete!(state.futures, node)
727718
end
728719
end
720+
if haskey(state.futures, node)
721+
# Notify any listening thunks
722+
for future in state.futures[node]
723+
put!(future, state.cache[node]; error=thunk_failed)
724+
end
725+
delete!(state.futures, node)
726+
end
729727

730728
# Chunk clean-up
731729
to_evict = Set{Chunk}()
732-
for inp in filter(t->istask(t) || (t isa Chunk), inputs(node))
730+
for inp in filter(t->istask(t) || (t isa Chunk), unwrap_weak.(node.inputs))
733731
if inp in keys(state.waiting_data)
734-
s = state.waiting_data[inp]
735-
if node in s
736-
pop!(s, node)
732+
w = state.waiting_data[inp]
733+
if node in w
734+
pop!(w, node)
737735
end
738-
if free && isempty(s)
736+
if isempty(w)
737+
delete!(state.waiting_data, inp)
739738
if istask(inp) && haskey(state.cache, inp)
740739
_node = state.cache[inp]
741740
if _node isa Chunk
742741
push!(to_evict, _node)
743742
end
744-
free!(_node, force=false, cache=(istask(inp) && inp.cache))
745-
pop!(state.cache, inp)
743+
GC.@preserve inp begin
744+
pop!(state.cache, inp)
745+
pop!(state.errored, inp)
746+
end
746747
elseif inp isa Chunk
747748
push!(to_evict, inp)
748749
end
749750
end
750751
end
751752
end
753+
if haskey(state.waiting_data, node) && isempty(state.waiting_data[node])
754+
delete!(state.waiting_data, node)
755+
end
752756
if !isempty(to_evict)
753757
@sync for w in map(p->p.pid, procs_to_use(ctx))
754758
@async remote_do(evict_chunks!, w, to_evict)
@@ -779,8 +783,8 @@ function fire_tasks!(ctx, thunks::Vector{<:Tuple}, (gproc, proc), state)
779783
if data !== nothing
780784
# cache hit
781785
state.cache[thunk] = data
782-
thunk_failed = thunk in state.errored
783-
finish_task!(ctx, state, thunk, thunk_failed; free=false)
786+
thunk_failed = state.errored[thunk]
787+
finish_task!(ctx, state, thunk, thunk_failed)
784788
continue
785789
else
786790
# cache miss
@@ -791,19 +795,20 @@ function fire_tasks!(ctx, thunks::Vector{<:Tuple}, (gproc, proc), state)
791795
try
792796
result = thunk.options.restore(thunk)
793797
state.cache[thunk] = result
794-
finish_task!(ctx, state, thunk, false; free=false)
798+
state.errored[thunk] = false
799+
finish_task!(ctx, state, thunk, false)
795800
continue
796801
catch err
797802
@error "Thunk restore failed" exception=(err,catch_backtrace())
798803
end
799804
end
800805

801806
ids = map(enumerate(thunk.inputs)) do (idx,x)
802-
istask(x) ? x.id : -idx
807+
istask(x) ? unwrap_weak(x).id : -idx
803808
end
804809

805810
data = map(thunk.inputs) do x
806-
istask(x) ? state.cache[x] : x
811+
istask(x) ? state.cache[unwrap_weak(x)] : x
807812
end
808813
toptions = thunk.options !== nothing ? thunk.options : ThunkOptions()
809814
options = merge(ctx.options, toptions)
@@ -829,7 +834,8 @@ function do_tasks(to_proc, chan, tasks)
829834
for task in tasks
830835
@async begin
831836
try
832-
put!(chan, (myid(), to_proc, task[2], do_task(to_proc, task...)))
837+
result = do_task(to_proc, task...)
838+
put!(chan, (myid(), to_proc, task[2], result))
833839
catch ex
834840
bt = catch_backtrace()
835841
put!(chan, (myid(), to_proc, task[2], (CapturedException(ex, bt), nothing)))

0 commit comments

Comments
 (0)