From 2b84d29e2a2f6b1f66df5e58f691fa54a73a1729 Mon Sep 17 00:00:00 2001
From: Dilum Aluthge
Date: Mon, 20 Jan 2025 19:42:39 -0500
Subject: [PATCH 1/2] Add the "auto-detect the current allocation" feature

---
Review notes (placed below the `---` marker, outside the commit message):
- _getenv_parse_int: the whitespace-only check tested `stripped_value`, a name
  that is never bound; it now tests `stripped_value_str`.
- _sge_get_numtasks: the warning interpolated `np` before anything was bound to
  it; NHOSTS is now read first and its value is what the warning reports.
- _torque_get_numtasks: the `@info` sat after `return` (unreachable) and used
  an unbound `np`; it now runs before the return.
- _lsf_get_numtasks: dropped the unused `value_str = strip(ENV[name])`, which
  raised a bare KeyError before _getenv_parse_int could report the problem.
- The SGE branch called `_sge_get_number_of_tasks`; the helper added by this
  patch is named `_sge_get_numtasks`.
- Every edit keeps the number of added lines unchanged, so the hunk headers
  and the diffstat below are untouched. The `index` blob ids are the ones
  recorded for the unedited content.

 Project.toml           |   4 ++
 src/ClusterManagers.jl |   8 +++
 src/auto_detect.jl     | 141 +++++++++++++++++++++++++++++++++++++++++
 3 files changed, 153 insertions(+)
 create mode 100644 src/auto_detect.jl

diff --git a/Project.toml b/Project.toml
index e955313..ef2f338 100644
--- a/Project.toml
+++ b/Project.toml
@@ -4,14 +4,18 @@ version = "1.1.0"
 
 [deps]
 Distributed = "8ba89e20-285c-5b6f-9357-94700520ee1b"
+LSFClusterManager = "af02cf76-cbe3-4eeb-96a8-af9391005858"
 Logging = "56ddb016-857b-54e1-b83d-db4d58db5568"
 Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f"
+SlurmClusterManager = "c82cd089-7bf7-41d7-976b-6b5d413cbe0a"
 Sockets = "6462fe0b-24de-5631-8697-dd941f90decc"
 
 [compat]
 Distributed = "< 0.0.1, 1"
+LSFClusterManager = "1.0.0"
 Logging = "< 0.0.1, 1"
 Pkg = "< 0.0.1, 1"
+SlurmClusterManager = "0.1.3"
 Sockets = "< 0.0.1, 1"
 julia = "1.2"
 
diff --git a/src/ClusterManagers.jl b/src/ClusterManagers.jl
index ce91285..43e16cc 100755
--- a/src/ClusterManagers.jl
+++ b/src/ClusterManagers.jl
@@ -4,9 +4,15 @@ using Distributed
 using Sockets
 using Pkg
 
+import LSFClusterManager
+import SlurmClusterManager
+
 export launch, manage, kill, init_worker, connect
 import Distributed: launch, manage, kill, init_worker, connect
 
+# Bring some other names into scope, just for convenience:
+using Distributed: addprocs
+
 worker_cookie() = begin Distributed.init_multi(); cluster_cookie() end
 worker_arg() = `--worker=$(worker_cookie())`
 
@@ -14,6 +20,8 @@ worker_arg() = `--worker=$(worker_cookie())`
 # PBS doesn't have the same semantics as SGE wrt to file accumulate,
 # a different solution will have to be found
 include("qsub.jl")
+
+include("auto_detect.jl")
 include("scyld.jl")
 include("condor.jl")
 include("slurm.jl")
diff --git a/src/auto_detect.jl b/src/auto_detect.jl
new file mode 100644
index 0000000..c98fffd
--- /dev/null
+++ b/src/auto_detect.jl
@@ -0,0 +1,141 @@
+function addprocs_autodetect_current_scheduler(; kwargs...)
+    sched = autodetect_current_scheduler()
+
+    if sched == :slurm
+        res = Distributed.addprocs(SlurmClusterManager.SlurmManager(); kwargs...)
+
+    elseif sched == :lsf
+        np = _lsf_get_numtasks()
+        res = LSFClusterManager.addprocs_lsf(np; kwargs...)
+
+    elseif sched == :sge
+        np = _sge_get_numtasks()
+        res = addprocs_sge(np; kwargs...)
+
+    elseif sched == :pbs
+        np = _torque_get_numtasks()
+        res = addprocs_pbs(np; kwargs...)
+
+    else
+        error("Unable to auto-detect cluster scheduler: $(sched)")
+    end
+
+    return res
+end
+
+function autodetect_current_scheduler()
+    if _autodetect_is_slurm()
+        return :slurm
+    elseif _autodetect_is_lsf()
+        return :lsf
+    elseif _autodetect_is_sge()
+        return :sge
+    elseif _autodetect_is_pbs()
+        return :pbs
+    end
+    return nothing
+end
+
+##### Slurm:
+
+function _autodetect_is_slurm()
+    has_SLURM_JOB_ID = _has_env_nonempty("SLURM_JOB_ID")
+    has_SLURM_JOBID = _has_env_nonempty("SLURM_JOBID")
+    res = has_SLURM_JOB_ID || has_SLURM_JOBID
+    return res
+end
+
+##### LSF:
+
+function _autodetect_is_lsf()
+    # https://www.ibm.com/docs/en/spectrum-lsf/10.1.0?topic=variables-environment-set-job-execution
+    has_LSB_JOBNAME = _has_env_nonempty("LSB_JOBNAME")
+    return has_LSB_JOBNAME
+end
+
+function _lsf_get_numtasks()
+    # https://www.ibm.com/docs/en/spectrum-lsf/10.1.0?topic=variables-environment-variable-reference
+    #
+    # See also:
+    # https://portal.supercomputing.wales/index.php/index/slurm/lsf-to-slurm-ref/
+    name = "LSB_DJOB_NUMPROC"
+    # An unset, blank, or non-integer value is reported by `_getenv_parse_int`.
+    value_int = _getenv_parse_int(name)
+    return value_int
+end
+
+##### SGE (Sun Grid Engine):
+
+function _autodetect_is_sge()
+    # https://docs.oracle.com/cd/E19957-01/820-0699/chp4-21/index.html
+    has_SGE_O_HOST = _has_env_nonempty("SGE_O_HOST")
+    return has_SGE_O_HOST
+
+    # Important note:
+    # The "job ID" environment variable in SGE is just named `JOB_ID`.
+    # This is obviously too vague, because the variable name is not specific to SGE.
+    # Therefore, we can't use that variable for our SGE auto-detection.
+end
+
+function _sge_get_numtasks()
+    # https://docs.oracle.com/cd/E19957-01/820-0699/chp4-21/index.html
+    name = "NHOSTS"
+    np = _getenv_parse_int(name)
+
+    msg = "Because this is Sun Grid Engine (SGE), ClusterManagers.jl is not able " *
+        "to correctly auto-detect the number of tasks. " *
+        "Therefore, ClusterManagers.jl will instead use the value of the " *
+        "NHOSTS environment variable: $(np)"
+    @warn msg
+    return np
+end
+
+##### PBS and Torque:
+
+function _autodetect_is_pbs()
+    # https://docs.adaptivecomputing.com/torque/2-5-12/help.htm#topics/2-jobs/exportedBatchEnvVar.htm
+    has_PBS_JOBID = _has_env_nonempty("PBS_JOBID")
+    return has_PBS_JOBID
+end
+
+function _torque_get_numtasks()
+    # https://docs.adaptivecomputing.com/torque/2-5-12/help.htm#topics/2-jobs/exportedBatchEnvVar.htm
+    name = "PBS_TASKNUM"
+    np = _getenv_parse_int(name)
+
+    @info "Using auto-detected num_tasks: $(np)"
+    return np
+end
+
+##### General utility functions:
+
+function _has_env_nonempty(name::AbstractString)
+    stripped_value = strip(get(ENV, name, ""))
+    res_b = !isempty(stripped_value)
+    return res_b
+end
+
+function _getenv_parse_int(name::AbstractString)
+    if !haskey(ENV, name)
+        msg = "Environment variable is not defined: $(name)"
+        error(msg)
+    end
+    original_value = ENV[name]
+    if isempty(original_value)
+        msg = "Environment variable is defined, but is empty: $(name)"
+        error(msg)
+    end
+    stripped_value_str = strip(original_value)
+    if isempty(stripped_value_str)
+        msg = "Environment variable is defined, but contains only whitespace: $(name)"
+        error(msg)
+    end
+    value_int = tryparse(Int, stripped_value_str)
+    if !(value_int isa Int)
+        msg =
+            "Environment variable \"$(name)\" is defined, " *
+            "but its value \"$(stripped_value_str)\" could not be parsed as an integer."
+        error(msg)
+    end
+    return value_int
+end

From 2777104efe6447f07a6d94f5e0c655bd6b0694b7 Mon Sep 17 00:00:00 2001
From: Dilum Aluthge
Date: Sun, 9 Feb 2025 18:46:47 -0500
Subject: [PATCH 2/2] SGE and PBS aren't currently maintained

---
Review notes (placed below the `---` marker, outside the commit message):
- The removed and the commented-out SGE lines use `_sge_get_numtasks` so that
  they match the helper name and the corrected call in patch 1/2.

 src/auto_detect.jl | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/src/auto_detect.jl b/src/auto_detect.jl
index c98fffd..32c03f9 100644
--- a/src/auto_detect.jl
+++ b/src/auto_detect.jl
@@ -8,13 +8,15 @@ function addprocs_autodetect_current_scheduler(; kwargs...)
         np = _lsf_get_numtasks()
         res = LSFClusterManager.addprocs_lsf(np; kwargs...)
 
-    elseif sched == :sge
-        np = _sge_get_numtasks()
-        res = addprocs_sge(np; kwargs...)
-
-    elseif sched == :pbs
-        np = _torque_get_numtasks()
-        res = addprocs_pbs(np; kwargs...)
+    # elseif sched == :sge
+    #     # SGE is not currently maintained.
+    #     np = _sge_get_numtasks()
+    #     res = addprocs_sge(np; kwargs...)
+
+    # elseif sched == :pbs
+    #     # PBS is not currently maintained.
+    #     np = _torque_get_numtasks()
+    #     res = addprocs_pbs(np; kwargs...)
 
     else
         error("Unable to auto-detect cluster scheduler: $(sched)")