@@ -318,7 +318,7 @@ const WORKER_MONITOR_TASKS = Dict{Int,Task}()
318
318
const WORKER_MONITOR_CHANS = Dict {Int,Dict{UInt64,RemoteChannel}} ()
319
319
function init_proc (state, p, log_sink)
320
320
ctx = Context (Int[]; log_sink)
321
- timespan_start (ctx, :init_proc , (;worker= p. pid), nothing )
321
+ timespan_start (ctx, :init_proc , (;uid = state . uid, worker= p. pid), nothing )
322
322
# Initialize pressure and capacity
323
323
gproc = OSProc (p. pid)
324
324
lock (state. lock) do
@@ -383,7 +383,7 @@ function init_proc(state, p, log_sink)
383
383
# Setup dynamic listener
384
384
dynamic_listener! (ctx, state, p. pid)
385
385
386
- timespan_finish (ctx, :init_proc , (;worker= p. pid), nothing )
386
+ timespan_finish (ctx, :init_proc , (;uid = state . uid, worker= p. pid), nothing )
387
387
end
388
388
function _cleanup_proc (uid, log_sink)
389
389
empty! (CHUNK_CACHE) # FIXME : Should be keyed on uid!
399
399
function cleanup_proc (state, p, log_sink)
400
400
ctx = Context (Int[]; log_sink)
401
401
wid = p. pid
402
- timespan_start (ctx, :cleanup_proc , (;worker= wid), nothing )
402
+ timespan_start (ctx, :cleanup_proc , (;uid = state . uid, worker= wid), nothing )
403
403
lock (WORKER_MONITOR_LOCK) do
404
404
if haskey (WORKER_MONITOR_CHANS, wid)
405
405
delete! (WORKER_MONITOR_CHANS[wid], state. uid)
@@ -419,7 +419,7 @@ function cleanup_proc(state, p, log_sink)
419
419
end
420
420
end
421
421
422
- timespan_finish (ctx, :cleanup_proc , (;worker= wid), nothing )
422
+ timespan_finish (ctx, :cleanup_proc , (;uid = state . uid, worker= wid), nothing )
423
423
end
424
424
425
425
" Process-local condition variable (and lock) indicating task completion."
@@ -467,24 +467,24 @@ function compute_dag(ctx, d::Thunk; options=SchedulerOptions())
467
467
468
468
master = OSProc (myid ())
469
469
470
- timespan_start (ctx, :scheduler_init , nothing , master)
470
+ timespan_start (ctx, :scheduler_init , (;uid = state . uid) , master)
471
471
try
472
472
scheduler_init (ctx, state, d, options, deps)
473
473
finally
474
- timespan_finish (ctx, :scheduler_init , nothing , master)
474
+ timespan_finish (ctx, :scheduler_init , (;uid = state . uid) , master)
475
475
end
476
476
477
477
value, errored = try
478
478
scheduler_run (ctx, state, d, options)
479
479
finally
480
480
# Always try to tear down the scheduler
481
- timespan_start (ctx, :scheduler_exit , nothing , master)
481
+ timespan_start (ctx, :scheduler_exit , (;uid = state . uid) , master)
482
482
try
483
483
scheduler_exit (ctx, state, options)
484
484
catch err
485
485
@error " Error when tearing down scheduler" exception= (err,catch_backtrace ())
486
486
finally
487
- timespan_finish (ctx, :scheduler_exit , nothing , master)
487
+ timespan_finish (ctx, :scheduler_exit , (;uid = state . uid) , master)
488
488
end
489
489
end
490
490
@@ -545,10 +545,10 @@ function scheduler_run(ctx, state::ComputeState, d::Thunk, options)
545
545
check_integrity (ctx)
546
546
547
547
isempty (state. running) && continue
548
- timespan_start (ctx, :take , nothing , nothing )
548
+ timespan_start (ctx, :take , (;uid = state . uid) , nothing )
549
549
@dagdebug nothing :take " Waiting for results"
550
550
chan_value = take! (state. chan) # get result of completed thunk
551
- timespan_finish (ctx, :take , nothing , nothing )
551
+ timespan_finish (ctx, :take , (;uid = state . uid) , nothing )
552
552
if chan_value isa RescheduleSignal
553
553
continue
554
554
end
@@ -563,13 +563,13 @@ function scheduler_run(ctx, state::ComputeState, d::Thunk, options)
563
563
@warn " Worker $(pid) died, rescheduling work"
564
564
565
565
# Remove dead worker from procs list
566
- timespan_start (ctx, :remove_procs , (;worker= pid), nothing )
566
+ timespan_start (ctx, :remove_procs , (;uid = state . uid, worker= pid), nothing )
567
567
remove_dead_proc! (ctx, state, gproc)
568
- timespan_finish (ctx, :remove_procs , (;worker= pid), nothing )
568
+ timespan_finish (ctx, :remove_procs , (;uid = state . uid, worker= pid), nothing )
569
569
570
- timespan_start (ctx, :handle_fault , (;worker= pid), nothing )
570
+ timespan_start (ctx, :handle_fault , (;uid = state . uid, worker= pid), nothing )
571
571
handle_fault (ctx, state, gproc)
572
- timespan_finish (ctx, :handle_fault , (;worker= pid), nothing )
572
+ timespan_finish (ctx, :handle_fault , (;uid = state . uid, worker= pid), nothing )
573
573
return # effectively `continue`
574
574
else
575
575
if something (ctx. options. allow_errors, false ) ||
@@ -604,9 +604,9 @@ function scheduler_run(ctx, state::ComputeState, d::Thunk, options)
604
604
end
605
605
end
606
606
607
- timespan_start (ctx, :finish , (;thunk_id), (;thunk_id, result= res))
607
+ timespan_start (ctx, :finish , (;uid = state . uid, thunk_id), (;thunk_id, result= res))
608
608
finish_task! (ctx, state, node, thunk_failed)
609
- timespan_finish (ctx, :finish , (;thunk_id), (;thunk_id, result= res))
609
+ timespan_finish (ctx, :finish , (;uid = state . uid, thunk_id), (;thunk_id, result= res))
610
610
611
611
delete_unused_tasks! (state)
612
612
end
@@ -691,13 +691,13 @@ function schedule!(ctx, state, procs=procs_to_use(ctx))
691
691
task = nothing
692
692
@label pop_task
693
693
if task != = nothing
694
- timespan_finish (ctx, :schedule , (;thunk_id= task. id), (;thunk_id= task. id))
694
+ timespan_finish (ctx, :schedule , (;uid = state . uid, thunk_id= task. id), (;thunk_id= task. id))
695
695
end
696
696
if isempty (state. ready)
697
697
@goto fire_tasks
698
698
end
699
699
task = pop! (state. ready)
700
- timespan_start (ctx, :schedule , (;thunk_id= task. id), (;thunk_id= task. id))
700
+ timespan_start (ctx, :schedule , (;uid = state . uid, thunk_id= task. id), (;thunk_id= task. id))
701
701
if haskey (state. cache, task)
702
702
if haskey (state. errored, task)
703
703
# An error was eagerly propagated to this task
@@ -887,7 +887,7 @@ function monitor_procs_changed!(ctx, state)
887
887
wait (ctx. proc_notify)
888
888
end
889
889
890
- timespan_start (ctx, :assign_procs , nothing , nothing )
890
+ timespan_start (ctx, :assign_procs , (;uid = state . uid) , nothing )
891
891
892
892
# Load new set of procs
893
893
new_ps = procs_to_use (ctx)
@@ -915,7 +915,7 @@ function monitor_procs_changed!(ctx, state)
915
915
end
916
916
end
917
917
918
- timespan_finish (ctx, :assign_procs , nothing , nothing )
918
+ timespan_finish (ctx, :assign_procs , (;uid = state . uid) , nothing )
919
919
old_ps = new_ps
920
920
end
921
921
end
@@ -1085,16 +1085,17 @@ function fire_tasks!(ctx, thunks::Vector{<:Tuple}, (gproc, proc), state)
1085
1085
# know which task failed.
1086
1086
tasks = Task[]
1087
1087
for ts in to_send
1088
+ # TODO : errormonitor
1088
1089
task = Threads. @spawn begin
1089
- timespan_start (ctx, :fire , (;worker= gproc. pid), nothing )
1090
+ timespan_start (ctx, :fire , (;uid = state . uid, worker= gproc. pid), nothing )
1090
1091
try
1091
1092
remotecall_wait (do_tasks, gproc. pid, proc, state. chan, [ts]);
1092
1093
catch err
1093
1094
bt = catch_backtrace ()
1094
1095
thunk_id = ts[1 ]
1095
1096
put! (state. chan, (gproc. pid, proc, thunk_id, (CapturedException (err, bt), nothing )))
1096
1097
finally
1097
- timespan_finish (ctx, :fire , (;worker= gproc. pid), nothing )
1098
+ timespan_finish (ctx, :fire , (;uid = state . uid, worker= gproc. pid), nothing )
1098
1099
end
1099
1100
end
1100
1101
end
@@ -1212,6 +1213,7 @@ proc_has_occupancy(proc_occupancy, task_occupancy) =
1212
1213
function start_processor_runner! (istate:: ProcessorInternalState , uid:: UInt64 , return_queue:: RemoteChannel )
1213
1214
to_proc = istate. proc
1214
1215
proc_run_task = @task begin
1216
+ # FIXME : Context changes aren't noticed over time
1215
1217
ctx = istate. ctx
1216
1218
tasks = istate. tasks
1217
1219
proc_occupancy = istate. proc_occupancy
@@ -1223,20 +1225,20 @@ function start_processor_runner!(istate::ProcessorInternalState, uid::UInt64, re
1223
1225
# Wait for new tasks
1224
1226
if ! work_to_do
1225
1227
@dagdebug nothing :processor " Waiting for tasks"
1226
- timespan_start (ctx, :proc_run_wait , (;worker= wid, processor= to_proc), nothing )
1228
+ timespan_start (ctx, :proc_run_wait , (;uid, worker= wid, processor= to_proc), nothing )
1227
1229
wait (istate. reschedule)
1228
1230
@static if VERSION >= v " 1.9"
1229
1231
reset (istate. reschedule)
1230
1232
end
1231
- timespan_finish (ctx, :proc_run_wait , (;worker= wid, processor= to_proc), nothing )
1233
+ timespan_finish (ctx, :proc_run_wait , (;uid, worker= wid, processor= to_proc), nothing )
1232
1234
if istate. done[]
1233
1235
return
1234
1236
end
1235
1237
end
1236
1238
1237
1239
# Fetch a new task to execute
1238
1240
@dagdebug nothing :processor " Trying to dequeue"
1239
- timespan_start (ctx, :proc_run_fetch , (;worker= wid, processor= to_proc), nothing )
1241
+ timespan_start (ctx, :proc_run_fetch , (;uid, worker= wid, processor= to_proc), nothing )
1240
1242
work_to_do = false
1241
1243
task_and_occupancy = lock (istate. queue) do queue
1242
1244
# Only steal if there are multiple queued tasks, to prevent
@@ -1255,7 +1257,7 @@ function start_processor_runner!(istate::ProcessorInternalState, uid::UInt64, re
1255
1257
return queue_result
1256
1258
end
1257
1259
if task_and_occupancy === nothing
1258
- timespan_finish (ctx, :proc_run_fetch , (;worker= wid, processor= to_proc), nothing )
1260
+ timespan_finish (ctx, :proc_run_fetch , (;uid, worker= wid, processor= to_proc), nothing )
1259
1261
1260
1262
@dagdebug nothing :processor " Failed to dequeue"
1261
1263
@@ -1270,7 +1272,7 @@ function start_processor_runner!(istate::ProcessorInternalState, uid::UInt64, re
1270
1272
@dagdebug nothing :processor " Trying to steal"
1271
1273
1272
1274
# Try to steal a task
1273
- timespan_start (ctx, :steal_local , (;worker= wid, processor= to_proc), nothing )
1275
+ timespan_start (ctx, :proc_steal_local , (;uid, worker= wid, processor= to_proc), nothing )
1274
1276
1275
1277
# Try to steal from local queues randomly
1276
1278
# TODO : Prioritize stealing from busiest processors
@@ -1305,12 +1307,12 @@ function start_processor_runner!(istate::ProcessorInternalState, uid::UInt64, re
1305
1307
from_proc = other_istate. proc
1306
1308
thunk_id = task[1 ]
1307
1309
@dagdebug thunk_id :processor " Stolen from $from_proc by $to_proc "
1308
- timespan_finish (ctx, :steal_local , (;worker= wid, processor= to_proc), (;from_proc, thunk_id))
1310
+ timespan_finish (ctx, :proc_steal_local , (;uid, worker= wid, processor= to_proc), (;from_proc, thunk_id))
1309
1311
# TODO : Keep stealing until we hit full occupancy?
1310
1312
@goto execute
1311
1313
end
1312
1314
end
1313
- timespan_finish (ctx, :steal_local , (;worker= wid, processor= to_proc), nothing )
1315
+ timespan_finish (ctx, :proc_steal_local , (;uid, worker= wid, processor= to_proc), nothing )
1314
1316
1315
1317
# TODO : Try to steal from remote queues
1316
1318
@@ -1322,7 +1324,7 @@ function start_processor_runner!(istate::ProcessorInternalState, uid::UInt64, re
1322
1324
task = task_spec[]
1323
1325
thunk_id = task[1 ]
1324
1326
time_util = task[2 ]
1325
- timespan_finish (ctx, :proc_run_fetch , (;worker= wid, processor= to_proc), (;thunk_id, proc_occupancy= proc_occupancy[], task_occupancy))
1327
+ timespan_finish (ctx, :proc_run_fetch , (;uid, worker= wid, processor= to_proc), (;thunk_id, proc_occupancy= proc_occupancy[], task_occupancy))
1326
1328
@dagdebug thunk_id :processor " Dequeued task"
1327
1329
1328
1330
# Execute the task and return its result
@@ -1423,7 +1425,7 @@ function do_tasks(to_proc, return_queue, tasks)
1423
1425
for task in tasks
1424
1426
thunk_id = task[1 ]
1425
1427
occupancy = task[4 ]
1426
- timespan_start (ctx, :enqueue , (;processor= to_proc, thunk_id), nothing )
1428
+ timespan_start (ctx, :enqueue , (;uid, processor= to_proc, thunk_id), nothing )
1427
1429
should_launch = lock (TASK_SYNC) do
1428
1430
# Already running; don't try to re-launch
1429
1431
if ! (thunk_id in TASKS_RUNNING)
@@ -1435,7 +1437,7 @@ function do_tasks(to_proc, return_queue, tasks)
1435
1437
end
1436
1438
should_launch || continue
1437
1439
enqueue! (queue, TaskSpecKey (task), occupancy)
1438
- timespan_finish (ctx, :enqueue , (;processor= to_proc, thunk_id), nothing )
1440
+ timespan_finish (ctx, :enqueue , (;uid, processor= to_proc, thunk_id), nothing )
1439
1441
@dagdebug thunk_id :processor " Enqueued task"
1440
1442
end
1441
1443
end
0 commit comments