@@ -92,7 +92,15 @@ mutable struct WorkerConfig
92
92
end
93
93
end
94
94
95
- @enum WorkerState W_CREATED W_CONNECTED W_TERMINATING W_TERMINATED W_UNKNOWN_STATE
95
# Lifecycle states of a `Worker`. The numeric values are written out explicitly;
# they are the same 0-based values the declaration order would assign implicitly.
@enum WorkerState begin
    WorkerState_created = 0       # Initial state given to a freshly constructed Worker
    WorkerState_connected = 1     # Set once the Worker's manager/config/version are filled in
    WorkerState_terminating = 2   # rmprocs() has been called on the worker
    WorkerState_terminated = 3    # Worker was gracefully removed
    WorkerState_exterminated = 4  # Worker was forcefully removed (not by us)
    WorkerState_unknown = 5       # NOTE(review): no setter visible in this view; confirm usage
end
103
+
96
104
mutable struct Worker
97
105
id:: Int
98
106
msg_lock:: Threads.ReentrantLock # Lock for del_msgs, add_msgs, and gcflag
@@ -123,7 +131,7 @@ mutable struct Worker
123
131
w. manager = manager
124
132
w. config = config
125
133
w. version = version
126
- set_worker_state (w, W_CONNECTED )
134
+ set_worker_state (w, WorkerState_connected )
127
135
register_worker_streams (w)
128
136
w
129
137
end
@@ -134,7 +142,7 @@ mutable struct Worker
134
142
if haskey (map_pid_wrkr, id)
135
143
return map_pid_wrkr[id]
136
144
end
137
- w= new (id, Threads. ReentrantLock (), [], [], false , W_CREATED , Threads. Condition (), time (), conn_func)
145
+ w= new (id, Threads. ReentrantLock (), [], [], false , WorkerState_created , Threads. Condition (), time (), conn_func)
138
146
w. initialized = Event ()
139
147
register_worker (w)
140
148
w
@@ -150,8 +158,15 @@ function set_worker_state(w, state)
150
158
end
151
159
end
152
160
161
# Report whether worker `w` has been removed, i.e. whether its state is either
# `WorkerState_terminated` or `WorkerState_exterminated`. The state is read once
# with `@atomic`, so both comparisons see the same value. Prefer this helper
# over inspecting `Worker.state` by hand.
function is_worker_dead(w::Worker)
    current = @atomic w.state
    current === WorkerState_terminated && return true
    return current === WorkerState_exterminated
end
167
+
153
168
function check_worker_state (w:: Worker )
154
- if (@atomic w. state) === W_CREATED
169
+ if (@atomic w. state) === WorkerState_created
155
170
if ! isclusterlazy ()
156
171
if PGRP. topology === :all_to_all
157
172
# Since higher pids connect with lower pids, the remote worker
@@ -190,7 +205,7 @@ function exec_conn_func(w::Worker)
190
205
end
191
206
192
207
function wait_for_conn (w)
193
- if (@atomic w. state) === W_CREATED
208
+ if (@atomic w. state) === WorkerState_created
194
209
timeout = worker_timeout () - (time () - w. ct_time)
195
210
timeout <= 0 && error (" peer $(w. id) has not connected to $(myid ()) " )
196
211
@@ -203,7 +218,7 @@ function wait_for_conn(w)
203
218
errormonitor (T)
204
219
lock (w. c_state) do
205
220
wait (w. c_state)
206
- (@atomic w. state) === W_CREATED && error (" peer $(w. id) didn't connect to $(myid ()) within $timeout seconds" )
221
+ (@atomic w. state) === WorkerState_created && error (" peer $(w. id) didn't connect to $(myid ()) within $timeout seconds" )
207
222
end
208
223
end
209
224
nothing
@@ -666,7 +681,7 @@ function create_worker(manager, wconfig)
666
681
if (jw. id != 1 ) && (jw. id < w. id)
667
682
lock (jw. c_state) do
668
683
# wait for jw to join
669
- if (@atomic jw. state) === W_CREATED
684
+ if (@atomic jw. state) === WorkerState_created
670
685
wait (jw. c_state)
671
686
end
672
687
end
@@ -693,7 +708,7 @@ function create_worker(manager, wconfig)
693
708
694
709
for wl in wlist
695
710
lock (wl. c_state) do
696
- if (@atomic wl. state) === W_CREATED
711
+ if (@atomic wl. state) === WorkerState_created
697
712
# wait for wl to join
698
713
wait (wl. c_state)
699
714
end
@@ -900,7 +915,7 @@ function nprocs()
900
915
n = length (PGRP. workers)
901
916
# filter out workers that are in the process of being set up or shut down.
902
917
for jw in PGRP. workers
903
- if ! isa (jw, LocalProcess) && ((@atomic jw. state) != = W_CONNECTED )
918
+ if ! isa (jw, LocalProcess) && ((@atomic jw. state) != = WorkerState_connected )
904
919
n = n - 1
905
920
end
906
921
end
@@ -953,7 +968,7 @@ julia> procs()
953
968
function procs ()
954
969
if myid () == 1 || (PGRP. topology === :all_to_all && ! isclusterlazy ())
955
970
# filter out workers that are in the process of being set up or shut down.
956
- return Int[x. id for x in PGRP. workers if isa (x, LocalProcess) || ((@atomic x. state) === W_CONNECTED )]
971
+ return Int[x. id for x in PGRP. workers if isa (x, LocalProcess) || ((@atomic x. state) === WorkerState_connected )]
957
972
else
958
973
return Int[x. id for x in PGRP. workers]
959
974
end
@@ -970,7 +985,7 @@ other_procs() = filter(!=(myid()), procs())
970
985
function id_in_procs (id) # faster version of `id in procs()`
971
986
if myid () == 1 || (PGRP. topology === :all_to_all && ! isclusterlazy ())
972
987
for x in PGRP. workers
973
- if (x. id:: Int ) == id && (isa (x, LocalProcess) || (@atomic (x:: Worker ). state) === W_CONNECTED )
988
+ if (x. id:: Int ) == id && (isa (x, LocalProcess) || (@atomic (x:: Worker ). state) === WorkerState_connected )
974
989
return true
975
990
end
976
991
end
@@ -994,7 +1009,7 @@ See also [`other_procs()`](@ref).
994
1009
"""
995
1010
function procs (pid:: Integer )
996
1011
if myid () == 1
997
- all_workers = [x for x in PGRP. workers if isa (x, LocalProcess) || ((@atomic x. state) === W_CONNECTED )]
1012
+ all_workers = [x for x in PGRP. workers if isa (x, LocalProcess) || ((@atomic x. state) === WorkerState_connected )]
998
1013
if (pid == 1 ) || (isa (map_pid_wrkr[pid]. manager, LocalManager))
999
1014
Int[x. id for x in filter (w -> (w. id== 1 ) || (isa (w. manager, LocalManager)), all_workers)]
1000
1015
else
@@ -1103,7 +1118,7 @@ function _rmprocs(pids, waitfor)
1103
1118
else
1104
1119
if haskey (map_pid_wrkr, p)
1105
1120
w = map_pid_wrkr[p]
1106
- set_worker_state (w, W_TERMINATING )
1121
+ set_worker_state (w, WorkerState_terminating )
1107
1122
kill (w. manager, p, w. config)
1108
1123
push! (rmprocset, w)
1109
1124
end
@@ -1112,11 +1127,11 @@ function _rmprocs(pids, waitfor)
1112
1127
1113
1128
start = time_ns ()
1114
1129
while (time_ns () - start) < waitfor* 1e9
1115
- all (w -> ( @atomic w . state) === W_TERMINATED , rmprocset) && break
1130
+ all (is_worker_dead , rmprocset) && break
1116
1131
sleep (min (0.1 , waitfor - (time_ns () - start)/ 1e9 ))
1117
1132
end
1118
1133
1119
- unremoved = [wrkr. id for wrkr in filter (w -> ( @atomic w . state) != = W_TERMINATED , rmprocset)]
1134
+ unremoved = [wrkr. id for wrkr in filter (! is_worker_dead , rmprocset)]
1120
1135
if length (unremoved) > 0
1121
1136
estr = string (" rmprocs: pids " , unremoved, " not terminated after " , waitfor, " seconds." )
1122
1137
throw (ErrorException (estr))
0 commit comments