Skip to content

Commit c012c1f

Browse files
committed
Add the "auto-detect the current allocation" feature
1 parent a4847ab commit c012c1f

File tree

3 files changed

+113
-1
lines changed

3 files changed

+113
-1
lines changed

Project.toml

+2
Original file line numberDiff line numberDiff line change
@@ -6,12 +6,14 @@ version = "0.4.9"
66
Distributed = "8ba89e20-285c-5b6f-9357-94700520ee1b"
77
Logging = "56ddb016-857b-54e1-b83d-db4d58db5568"
88
Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f"
9+
SlurmClusterManager = "c82cd089-7bf7-41d7-976b-6b5d413cbe0a"
910
Sockets = "6462fe0b-24de-5631-8697-dd941f90decc"
1011

1112
[compat]
1213
Distributed = "< 0.0.1, 1"
1314
Logging = "< 0.0.1, 1"
1415
Pkg = "< 0.0.1, 1"
16+
SlurmClusterManager = "0.1.3"
1517
Sockets = "< 0.0.1, 1"
1618
julia = "1.2"
1719

src/ClusterManagers.jl

+8-1
Original file line numberDiff line numberDiff line change
@@ -4,16 +4,23 @@ using Distributed
44
using Sockets
55
using Pkg
66

7+
import SlurmClusterManager
8+
79
export launch, manage, kill, init_worker, connect
810
import Distributed: launch, manage, kill, init_worker, connect
911

12+
# Bring some other names into scope, just for convenience:
13+
using Distributed: addprocs
14+
15+
1016
# Make sure Distributed's multi-process state is initialized, then return the
# cluster cookie that workers are expected to present.
function worker_cookie()
    Distributed.init_multi()
    return cluster_cookie()
end
1117
# Build the `--worker=<cookie>` command-line argument (as a `Cmd`) using the
# current cluster cookie.
function worker_arg()
    cookie = worker_cookie()
    return `--worker=$(cookie)`
end
1218

13-
1419
# PBS doesn't have the same semantics as SGE with respect to file accumulation,
1520
# a different solution will have to be found
1621
include("qsub.jl")
22+
23+
include("auto_detect.jl")
1724
include("scyld.jl")
1825
include("condor.jl")
1926
include("slurm.jl")

src/auto_detect.jl

+103
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,103 @@
1+
"""
    addprocs_autodetect_current_scheduler(; kwargs...)

Detect which cluster scheduler the current process is running under and add
worker processes using the matching launcher. All keyword arguments are
forwarded unchanged to the underlying `addprocs*` call.

- `:slurm`: calls `addprocs(SlurmClusterManager.SlurmManager(); kwargs...)`.
- `:sge`: calls `addprocs_sge(np; kwargs...)` with `np` from `_sge_get_numtasks()`.
- `:pbs`: calls `addprocs_pbs(np; kwargs...)` with `np` from `_torque_get_numtasks()`.

`addprocs_sge` and `addprocs_pbs` are expected to be defined elsewhere in the
package (they are not part of this file).

Throws an `ErrorException` when no supported scheduler is detected.
"""
function addprocs_autodetect_current_scheduler(; kwargs...)
    # `autodetect_current_scheduler` returns a Symbol (or `nothing`); the
    # individual `_autodetect_is_*` predicates only return a Bool and therefore
    # cannot be compared against the Symbols below.
    sched = autodetect_current_scheduler()
    if sched === :slurm
        return addprocs(SlurmClusterManager.SlurmManager(); kwargs...)
    elseif sched === :sge
        np = _sge_get_numtasks()
        return addprocs_sge(np; kwargs...)
    elseif sched === :pbs
        np = _torque_get_numtasks()
        return addprocs_pbs(np; kwargs...)
    end
    error("Unable to auto-detect cluster scheduler: $(sched)")
end
14+
15+
# Report which scheduler the current process appears to be running under.
#
# The predicates are tried in a fixed order (Slurm, then SGE, then PBS) and the
# first one that returns `true` decides the result: `:slurm`, `:sge` or `:pbs`.
# Returns `nothing` when none of them matches.
function autodetect_current_scheduler()
    _autodetect_is_slurm() && return :slurm
    _autodetect_is_sge() && return :sge
    _autodetect_is_pbs() && return :pbs
    return nothing
end
25+
26+
##### Slurm:
27+
28+
"""
    _autodetect_is_slurm()

Return `true` when either the `SLURM_JOB_ID` or the `SLURM_JOBID` environment
variable is defined with a non-empty value, and `false` otherwise.
"""
function _autodetect_is_slurm()
    # The non-empty check is done inline with `get(ENV, ...)` so that this
    # predicate does not depend on a helper that this file does not define.
    has_SLURM_JOB_ID = !isempty(get(ENV, "SLURM_JOB_ID", ""))
    has_SLURM_JOBID = !isempty(get(ENV, "SLURM_JOBID", ""))
    return has_SLURM_JOB_ID || has_SLURM_JOBID
end
34+
35+
##### SGE (Sun Grid Engine):
36+
37+
"""
    _autodetect_is_sge()

Return `true` when the `SGE_O_HOST` environment variable is defined with a
non-empty value, and `false` otherwise.
"""
function _autodetect_is_sge()
    # Important note:
    # The "job ID" environment variable in SGE is just named `JOB_ID`.
    # This is obviously too vague, because the variable name is not specific to SGE.
    # Therefore, we can't use that variable for our SGE auto-detection.
    #
    # https://docs.oracle.com/cd/E19957-01/820-0699/chp4-21/index.html
    #
    # The non-empty check is done inline with `get(ENV, ...)` so that this
    # predicate does not depend on a helper that this file does not define.
    return !isempty(get(ENV, "SGE_O_HOST", ""))
end
47+
48+
"""
    _sge_get_numtasks()

Return the integer value of the `NHOSTS` environment variable, which is used as
the number of tasks under Sun Grid Engine (SGE). A warning is logged because
this value is only a stand-in for the real task count.

Throws an `ErrorException` (from `_getenv_parse_int`) when `NHOSTS` is missing,
empty, or not an integer.
"""
function _sge_get_numtasks()
    # https://docs.oracle.com/cd/E19957-01/820-0699/chp4-21/index.html
    name = "NHOSTS"

    # Read the value first: the warning below interpolates it, so it must exist
    # before the message is built.
    np = _getenv_parse_int(name)

    msg = "Because this is Sun Grid Engine (SGE), ClusterManagers.jl is not able " *
        "to correctly auto-detect the number of tasks. " *
        "Therefore, ClusterManagers.jl will instead use the value of the " *
        "NHOSTS environment variable: $(np)"
    @warn msg

    return np
end
60+
61+
##### PBS and Torque:
62+
63+
"""
    _autodetect_is_pbs()

Return `true` when the `PBS_JOBID` environment variable is defined with a
non-empty value, and `false` otherwise.
"""
function _autodetect_is_pbs()
    # https://docs.adaptivecomputing.com/torque/2-5-12/help.htm#topics/2-jobs/exportedBatchEnvVar.htm
    #
    # The non-empty check is done inline with `get(ENV, ...)` so that this
    # predicate does not depend on a helper that this file does not define.
    return !isempty(get(ENV, "PBS_JOBID", ""))
end
68+
69+
"""
    _torque_get_numtasks()

Return the integer value of the `PBS_TASKNUM` environment variable and log it
at info level.

Throws an `ErrorException` (from `_getenv_parse_int`) when `PBS_TASKNUM` is
missing, empty, or not an integer.
"""
function _torque_get_numtasks()
    # https://docs.adaptivecomputing.com/torque/2-5-12/help.htm#topics/2-jobs/exportedBatchEnvVar.htm
    name = "PBS_TASKNUM"
    np = _getenv_parse_int(name)

    # Log before returning; placed after the `return` this line would never run.
    @info "Using auto-detected num_tasks: $(np)"
    return np
end
77+
78+
##### General utility functions:
79+
80+
"""
    _getenv_parse_int(name::AbstractString)

Read the environment variable `name`, strip surrounding whitespace, and return
its value parsed as an `Int`.

Throws an `ErrorException` when the variable is not defined, is empty, contains
only whitespace, or cannot be parsed as an integer.
"""
function _getenv_parse_int(name::AbstractString)
    if !haskey(ENV, name)
        msg = "Environment variable is not defined: $(name)"
        error(msg)
    end
    original_value = ENV[name]
    if isempty(original_value)
        msg = "Environment variable is defined, but is empty: $(name)"
        error(msg)
    end
    stripped_value_str = strip(original_value)
    # Check the stripped string itself to catch whitespace-only values.
    if isempty(stripped_value_str)
        msg = "Environment variable is defined, but contains only whitespace: $(name)"
        error(msg)
    end
    # `tryparse` returns `nothing` instead of throwing on malformed input.
    value_int = tryparse(Int, stripped_value_str)
    if value_int === nothing
        msg =
            "Environment variable \"$(name)\" is defined, " *
            "but its value \"$(stripped_value_str)\" could not be parsed as an integer."
        error(msg)
    end
    return value_int
end

0 commit comments

Comments
 (0)