From 0295887c4c7be145b01fa0164c4f000b42335540 Mon Sep 17 00:00:00 2001
From: Julian P Samaroo
Date: Wed, 6 Dec 2023 12:41:14 -0700
Subject: [PATCH] Retry jl_set_task_tid on failure

---
Reviewer notes (free text between the `---` separator and the diffstat; not
part of any hunk).

What the patch does:
  * src/utils/tasks.jl (new file) adds `set_task_tid!(task, tid)`. It sets
    `task.sticky = true`, then calls `jl_set_task_tid` with `tid-1` in a loop:
    a return of 1 leaves the loop, a return of 0 calls `yield()` and tries
    again, and any other return raises an error. Once the counter exceeds 10
    it logs a warning and returns. After a successful call it asserts
    `Threads.threadid(task) == tid`.
  * src/Dagger.jl includes utils/tasks.jl directly after
    utils/locked-object.jl and before processor.jl.
  * src/processor.jl: `execute!(proc::ThreadProc, ...)` swaps its inline
    sticky/ccall/error/assert sequence for `set_task_tid!(task, proc.tid)`.
  * src/sch/Sch.jl: the two pinning sites shown in `start_processor_runner!`
    now call `Dagger.set_task_tid!`.

Behavior changes to be aware of:
  * processor.jl used to raise `error("jl_set_task_tid == 0")` on the first 0
    return. It now retries, and when the helper gives up it only warns, so
    `schedule(task)` still runs.
  * In the Sch.jl hunks the removed code assigned `ret` and no check of it is
    visible; those sites now get the retry loop, the warning and the assertion.
  * The counter is tested with `ctr > 10` after being incremented, so the
    ccall can run up to 11 times before the warning.
  * On the give-up path `task.sticky` stays `true`, and the helper yields no
    value on either path, so callers cannot tell success from give-up.

NOTE(review): this copy of the patch had lost its line breaks and leading
whitespace. The layout below is reconstructed: indentation of context lines
follows a 4-space convention and the blank context line in the Dagger.jl hunk
is inferred from the hunk's line counts. Confirm against the upstream commit,
or apply with whitespace-insensitive context matching.

 src/Dagger.jl      |  1 +
 src/processor.jl   |  7 +------
 src/sch/Sch.jl     |  6 ++----
 src/utils/tasks.jl | 20 ++++++++++++++++++++
 4 files changed, 24 insertions(+), 10 deletions(-)
 create mode 100644 src/utils/tasks.jl

diff --git a/src/Dagger.jl b/src/Dagger.jl
index 385799970..be6ee3075 100644
--- a/src/Dagger.jl
+++ b/src/Dagger.jl
@@ -25,6 +25,7 @@ include("utils/dagdebug.jl")
 
 # Distributed data
 include("utils/locked-object.jl")
+include("utils/tasks.jl")
 include("options.jl")
 include("processor.jl")
 include("scopes.jl")
diff --git a/src/processor.jl b/src/processor.jl
index e6a04ac59..c77453f62 100644
--- a/src/processor.jl
+++ b/src/processor.jl
@@ -166,12 +166,7 @@ function execute!(proc::ThreadProc, @nospecialize(f), @nospecialize(args...); @n
         TimespanLogging.prof_task_put!(tls.sch_handle.thunk_id.id)
         @invokelatest f(args...; kwargs...)
     end
-    task.sticky = true
-    ret = ccall(:jl_set_task_tid, Cint, (Any, Cint), task, proc.tid-1)
-    if ret == 0
-        error("jl_set_task_tid == 0")
-    end
-    @assert Threads.threadid(task) == proc.tid
+    set_task_tid!(task, proc.tid)
     schedule(task)
     try
         fetch(task)
diff --git a/src/sch/Sch.jl b/src/sch/Sch.jl
index 4c2abf205..6ba949d5c 100644
--- a/src/sch/Sch.jl
+++ b/src/sch/Sch.jl
@@ -1286,8 +1286,7 @@ function start_processor_runner!(istate::ProcessorInternalState, uid::UInt64, re
             lock(istate.queue) do _
                 tid = task_tid_for_processor(to_proc)
                 if tid !== nothing
-                    t.sticky = true
-                    ret = ccall(:jl_set_task_tid, Cint, (Any, Cint), t, tid-1)
+                    Dagger.set_task_tid!(t, tid)
                 else
                     t.sticky = false
                 end
@@ -1299,8 +1298,7 @@ function start_processor_runner!(istate::ProcessorInternalState, uid::UInt64, re
     end
     tid = task_tid_for_processor(to_proc)
     if tid !== nothing
-        proc_run_task.sticky = true
-        ret = ccall(:jl_set_task_tid, Cint, (Any, Cint), proc_run_task, tid-1)
+        Dagger.set_task_tid!(proc_run_task, tid)
     else
         proc_run_task.sticky = false
     end
diff --git a/src/utils/tasks.jl b/src/utils/tasks.jl
new file mode 100644
index 000000000..c2796cf21
--- /dev/null
+++ b/src/utils/tasks.jl
@@ -0,0 +1,20 @@
+function set_task_tid!(task::Task, tid::Integer)
+    task.sticky = true
+    ctr = 0
+    while true
+        ret = ccall(:jl_set_task_tid, Cint, (Any, Cint), task, tid-1)
+        if ret == 1
+            break
+        elseif ret == 0
+            yield()
+        else
+            error("Unexpected retcode from jl_set_task_tid: $ret")
+        end
+        ctr += 1
+        if ctr > 10
+            @warn "Setting task TID to $tid failed, giving up!"
+            return
+        end
+    end
+    @assert Threads.threadid(task) == tid "jl_set_task_tid failed!"
+end