@@ -870,6 +870,8 @@ const LPROC = LocalProcess()
870
870
const LPROCROLE = Ref {Symbol} (:master )
871
871
const HDR_VERSION_LEN= 16
872
872
const HDR_COOKIE_LEN= 16
873
# Most recent status recorded for each worker ID by `setstatus`. In the code
# shown here it is only written when `myid() == 1`.
+ const map_pid_statuses = Dict {Int, Any} ()
874
# Held around every access to `map_pid_statuses` in this section.
+ const map_pid_statuses_lock = ReentrantLock ()
873
875
const map_pid_wrkr = Dict {Int, Union{Worker, LocalProcess}} ()
874
876
const map_sock_wrkr = IdDict ()
875
877
const map_del_wrkr = Set {Int} ()
@@ -1010,15 +1012,16 @@ for any reason (i.e. not only because of [`rmprocs()`](@ref) but also the worker
1010
1012
segfaulting etc). Chooses and returns a unique key for the callback if `key` is
1011
1013
not specified.
1012
1014
1013
- The callback will be called with the worker ID and the final
1014
- `Distributed.WorkerState` of the worker, e.g. `f(w::Int, state)`. `state` is an
1015
+ The callback will be called with the worker ID, the final
1016
+ `Distributed.WorkerState` of the worker, and the last status of the worker as
1017
+ set by [`setstatus`](@ref), e.g. `f(w::Int, state, status)`. `state` is an
1015
1018
enum, a value of `WorkerState_terminated` means a graceful exit and a value of
1016
1019
`WorkerState_exterminated` means the worker died unexpectedly.
1017
1020
1018
1021
If the callback throws an exception it will be caught and printed.
1019
1022
"""
1020
1023
add_worker_exited_callback (f:: Base.Callable ; key= nothing ) = _add_callback (f, key, worker_exited_callbacks;
1021
- arg_types= Tuple{Int, WorkerState})
1024
+ arg_types= Tuple{Int, WorkerState, Any })
1022
1025
1023
1026
"""
1024
1027
remove_worker_exited_callback(key)
@@ -1206,6 +1209,59 @@ Identical to [`workers()`](@ref) except that the current worker is filtered out.
1206
1209
"""
1207
1210
function other_workers()
    # Resolve our own ID once, then drop it from the worker list.
    me = myid()
    return filter(w -> w != me, workers())
end
1208
1211
1212
+ """
1213
+ setstatus(x, pid::Int=myid())
1214
+
1215
+ Set the status for worker `pid` to `x`. `x` may be any serializable object but
1216
+ it's recommended to keep it small enough to cheaply send over a network. The
1217
+ status will be passed to the worker-exited callbacks (see
1218
+ [`add_worker_exited_callback`](@ref)) when the worker exits.
1219
+
1220
+ This can be handy if you want a way to know what a worker is doing at any given
1221
+ time, or (in combination with a worker-exited callback) for knowing what a
1222
+ worker was last doing before it died.
1223
+
1224
+ # Examples
1225
+ ```julia-repl
1226
+ julia> DistributedNext.setstatus("working on dataset 42")
1227
+ "working on dataset 42"
1228
+
1229
+ julia> DistributedNext.getstatus()
1230
+ "working on dataset 42"
1231
+ ```
1232
+ """
1233
function setstatus(x, pid::Int=myid())
    # Reject IDs that are not part of the current cluster.
    if pid ∉ procs()
        throw(ArgumentError(" Worker $(pid) does not exist, cannot set its status"))
    end

    if myid() == 1
        # Process 1 owns the status table; update it under the lock.
        @lock map_pid_statuses_lock map_pid_statuses[pid] = x
    else
        # Any other process asks process 1 to record the status. Forward the
        # requested `pid` (which defaults to `myid()`), so that an explicitly
        # passed worker ID is honoured instead of being replaced by the
        # caller's own ID.
        remotecall_fetch(setstatus, 1, x, pid)
    end
end
1244
+
1245
# Return the status stored for `pid`, or `nothing` when none is stored. This is
# a read-only lookup taken under the lock: it never inserts an entry for `pid`.
_getstatus(pid) = @lock map_pid_statuses_lock get(map_pid_statuses, pid, nothing)
1246
+
1247
+ """
1248
+ getstatus(pid::Int=myid())
1249
+
1250
+ Get the status for worker `pid`. If one was never explicitly set with
1251
+ [`setstatus`](@ref) this will return `nothing`.
1252
+ """
1253
function getstatus(pid::Int=myid())
    # Guard: only IDs currently listed by `procs()` can be queried.
    pid in procs() || throw(ArgumentError(" Worker $(pid) does not exist, cannot get its status"))

    # Process 1 answers from its own table; any other process forwards the
    # query to process 1 and returns whatever it replies.
    return myid() == 1 ? _getstatus(pid) : remotecall_fetch(getstatus, 1, pid)
end
1264
+
1209
1265
function cluster_mgmt_from_master_check ()
1210
1266
if myid () != 1
1211
1267
throw (ErrorException (" Only process 1 can add and remove workers" ))
@@ -1425,15 +1481,20 @@ function deregister_worker(pg, pid)
1425
1481
end
1426
1482
end
1427
1483
1428
- # Call callbacks on the master
1429
1484
if myid () == 1
1485
+ status = _getstatus (pid)
1486
+
1487
+ # Call callbacks on the master
1430
1488
for (name, callback) in worker_exited_callbacks
1431
1489
try
1432
- callback (pid, w. state)
1490
+ callback (pid, w. state, status )
1433
1491
catch ex
1434
1492
@error " Error when running worker-exited callback '$(name) '" exception= (ex, catch_backtrace ())
1435
1493
end
1436
1494
end
1495
+
1496
+ # Delete its status
1497
+ @lock map_pid_statuses_lock delete! (map_pid_statuses, pid)
1437
1498
end
1438
1499
1439
1500
return
0 commit comments