@@ -59,6 +59,7 @@ mutable struct Worker
59
59
state:: WorkerState
60
60
c_state:: Condition # wait for state changes
61
61
ct_time:: Float64 # creation time
62
+ conn_func:: Nullable{Function} # Used to setup connections lazily
62
63
63
64
r_stream:: IO
64
65
w_stream:: IO
@@ -82,12 +83,13 @@ mutable struct Worker
82
83
w
83
84
end
84
85
85
# Convenience constructor: a worker without a lazy connection callback
# (the callback slot is filled with a null `Nullable{Function}`).
Worker(id::Int) = Worker(id, Nullable{Function}())

# Return the worker already stored in `map_pid_wrkr` under `id`, or build,
# register and return a new one. `conn_func` is stored as the last field and
# is later used to set up the connection lazily.
function Worker(id::Int, conn_func)
    @assert id > 0

    # Reuse the existing entry rather than creating a duplicate for this id.
    haskey(map_pid_wrkr, id) && return map_pid_wrkr[id]

    # Trailing arguments: initial state, state-change condition, creation
    # time, and the lazy connection callback.
    wrkr = new(id, [], [], false, W_CREATED, Condition(), time(), conn_func)
    register_worker(wrkr)
    return wrkr
end
@@ -102,21 +104,56 @@ end
102
104
103
105
# Make sure worker `w` is past the `W_CREATED` state before it is used.
#
# Nothing happens unless `w.state == W_CREATED`. In that case:
#   * lazy cluster      - restart the timeout clock, kick off connection
#                         setup asynchronously, then wait for it;
#   * eager all-to-all  - just wait for the peer to connect;
#   * any other case    - raise an error, the peer is simply not connected.
# Returns `nothing`; throws if the peer cannot be (or is not) connected.
function check_worker_state(w::Worker)
    w.state == W_CREATED || return nothing

    if isclusterlazy()
        # The timeout used by `wait_for_conn` is measured from `ct_time`.
        w.ct_time = time()
        if myid() > w.id
            @schedule exec_conn_func(w)
        else
            # route request via node 1: process 1 asks `w.id` to run
            # `exec_conn_func` for this process' id.
            @schedule remotecall_fetch(
                (peer, target) -> remotecall_fetch(exec_conn_func, peer, target),
                1, w.id, myid())
        end
        return wait_for_conn(w)
    end

    if PGRP.topology != :all_to_all
        # NOTE(review): message text is kept exactly as it appears in the
        # source, odd spacing included.
        error(" peer $(w. id) is not connected to $(myid ()) . Topology : " * string(PGRP.topology))
    end

    # Since higher pids connect with lower pids, the remote worker
    # may not have connected to us yet. Wait for some time.
    return wait_for_conn(w)
end
119
127
128
# Look up the worker for `id` and run its pending connection callback.
function exec_conn_func(id::Int)
    return exec_conn_func(worker_from_id(id))
end

# Run the connection callback stored in `w.conn_func`, at most once.
#
# If the slot is already null, this call only waits for the connection.
# If the callback throws, the slot is refilled with a function that throws
# the same exception, so a later call fails the same way. Returns `nothing`.
function exec_conn_func(w::Worker)
    # Some other task may be trying to connect at the same time.
    isnull(w.conn_func) && return wait_for_conn(w)

    try
        connect_now = get(w.conn_func)
        # Clear the slot before running so that later callers see null and
        # take the wait path above.
        w.conn_func = Nullable{Function}()
        connect_now()
    catch err
        w.conn_func = () -> throw(err)
        rethrow(err)
    end
    return nothing
end
144
+
145
# Block until worker `w` leaves the `W_CREATED` state or the timeout expires.
#
# The time budget is `worker_timeout()` minus the time elapsed since
# `w.ct_time`. Throws if the budget is already used up, or if `w` is still in
# `W_CREATED` after the wait. Returns `nothing` otherwise.
# NOTE(review): message texts are kept exactly as they appear in the source,
# odd spacing included.
function wait_for_conn(w)
    w.state == W_CREATED || return nothing

    timeout = worker_timeout() - (time() - w.ct_time)
    if timeout <= 0
        error(" peer $(w. id) has not connected to $(myid ()) ")
    end

    # Timer task: wakes every waiter on `c_state` once `timeout` has elapsed,
    # so the `wait` below cannot block forever.
    @schedule begin
        sleep(timeout)
        notify(w.c_state; all=true)
    end
    wait(w.c_state)

    if w.state == W_CREATED
        error(" peer $(w. id) didn't connect to $(myid ()) within $timeout seconds")
    end
    return nothing
end
156
+
120
157
## process group creation ##
121
158
122
159
mutable struct LocalProcess
@@ -340,6 +377,17 @@ function addprocs_locked(manager::ClusterManager; kwargs...)
340
377
params = merge (default_addprocs_params (), AnyDict (kwargs))
341
378
topology (Symbol (params[:topology ]))
342
379
380
+ if PGRP. topology != :all_to_all
381
+ params[:lazy ] = false
382
+ end
383
+
384
+ if isnull (PGRP. lazy) || nprocs () == 1
385
+ PGRP. lazy = Nullable {Bool} (params[:lazy ])
386
+ elseif isclusterlazy () != params[:lazy ]
387
+ throw (ArgumentError (string (" Active workers with lazy=" , isclusterlazy (),
388
+ " . Cannot set lazy=" , params[:lazy ])))
389
+ end
390
+
343
391
# References to launched workers, filled when each worker is fully initialized and
344
392
# has connected to all nodes.
345
393
launched_q = Int[] # Asynchronously filled by the launch method
@@ -396,7 +444,8 @@ default_addprocs_params() = AnyDict(
396
444
:dir => pwd (),
397
445
:exename => joinpath (JULIA_HOME, julia_exename ()),
398
446
:exeflags => ` ` ,
399
- :enable_threaded_blas => false )
447
+ :enable_threaded_blas => false ,
448
+ :lazy => true )
400
449
401
450
402
451
function setup_launched_worker (manager, wconfig, launched_q)
@@ -517,7 +566,7 @@ function create_worker(manager, wconfig)
517
566
518
567
all_locs = map (x -> isa (x, Worker) ? (get (x. config. connect_at, ()), x. id) : ((), x. id, true ), join_list)
519
568
send_connection_hdr (w, true )
520
- join_message = JoinPGRPMsg (w. id, all_locs, PGRP. topology, get (wconfig. enable_threaded_blas, false ))
569
+ join_message = JoinPGRPMsg (w. id, all_locs, PGRP. topology, get (wconfig. enable_threaded_blas, false ), isclusterlazy () )
521
570
send_msg_now (w, MsgHeader (RRID (0 ,0 ), ntfy_oid), join_message)
522
571
523
572
@schedule manage (w. manager, w. id, w. config, :register )
@@ -619,8 +668,9 @@ mutable struct ProcessGroup
619
668
workers:: Array{Any,1}
620
669
refs:: Dict # global references
621
670
topology:: Symbol
671
+ lazy:: Nullable{Bool}
622
672
623
- ProcessGroup (w:: Array{Any,1} ) = new (" pg-default" , w, Dict (), :all_to_all )
673
+ ProcessGroup (w:: Array{Any,1} ) = new (" pg-default" , w, Dict (), :all_to_all , Nullable {Bool} () )
624
674
end
625
675
const PGRP = ProcessGroup ([])
626
676
@@ -634,6 +684,14 @@ function topology(t)
634
684
t
635
685
end
636
686
687
# Report whether the process group is flagged as lazy.
# An unset (null) `PGRP.lazy` counts as not lazy.
isclusterlazy() = isnull(PGRP.lazy) ? false : get(PGRP.lazy)
694
+
637
695
# Resolve `pid` through `worker_from_id` and dispatch on the result.
function get_bind_addr(pid::Integer)
    return get_bind_addr(worker_from_id(pid))
end

# For the local process the bind address is read from `LPROC`.
function get_bind_addr(w::LocalProcess)
    return LPROC.bind_addr
end
639
697
function get_bind_addr (w:: Worker )
@@ -667,7 +725,7 @@ myid() = LPROC.id
667
725
Get the number of available processes.
668
726
"""
669
727
function nprocs ()
670
- if myid () == 1 || PGRP. topology == :all_to_all
728
+ if myid () == 1 || ( PGRP. topology == :all_to_all && ! isclusterlazy ())
671
729
n = length (PGRP. workers)
672
730
# filter out workers in the process of being setup/shutdown.
673
731
for jw in PGRP. workers
698
756
Returns a list of all process identifiers.
699
757
"""
700
758
function procs ()
701
- if myid () == 1 || PGRP. topology == :all_to_all
759
+ if myid () == 1 || ( PGRP. topology == :all_to_all && ! isclusterlazy ())
702
760
# filter out workers in the process of being setup/shutdown.
703
761
return Int[x. id for x in PGRP. workers if isa (x, LocalProcess) || (x. state == W_CONNECTED)]
704
762
else
@@ -707,7 +765,7 @@ function procs()
707
765
end
708
766
709
767
function id_in_procs (id) # faster version of `id in procs()`
710
- if myid () == 1 || PGRP. topology == :all_to_all
768
+ if myid () == 1 || ( PGRP. topology == :all_to_all && ! isclusterlazy ())
711
769
for x in PGRP. workers
712
770
if (x. id:: Int ) == id && (isa (x, LocalProcess) || (x:: Worker ). state == W_CONNECTED)
713
771
return true
@@ -903,7 +961,7 @@ function deregister_worker(pg, pid)
903
961
if myid () == 1 && isdefined (w, :config )
904
962
# Notify the cluster manager of this workers death
905
963
manage (w. manager, w. id, w. config, :deregister )
906
- if PGRP. topology != :all_to_all
964
+ if PGRP. topology != :all_to_all || isclusterlazy ()
907
965
for rpid in workers ()
908
966
try
909
967
remote_do (deregister_worker, rpid, pid)
0 commit comments