#!/bin/bash
# Script to launch a multi-node torch.distributed training run on UF HiPerGator's AI partition,
# a SLURM cluster that uses Singularity as its container runtime.
#
# This script uses `pt_multinode_helper_funcs.sh` and `run_on_node.sh`.
#
# If launching with torch.distributed.launch:
#     set #SBATCH --ntasks to the same value as --nodes
#     set #SBATCH --ntasks-per-node=1
#     set #SBATCH --gpus to the total number of processes to run across all nodes
#     set #SBATCH --gpus-per-task to --gpus divided by --ntasks
#     and modify `LAUNCH_CMD` in `run_on_node.sh` to launch with torch.distributed.launch
#     (a sketch of this variant follows below).
# If launching without torch.distributed.launch:
#     set #SBATCH --ntasks to the total number of processes to run across all nodes
#     set #SBATCH --ntasks-per-node to --ntasks divided by --nodes
#     set #SBATCH --gpus to the same value as --ntasks
#     set #SBATCH --gpus-per-task=1
#     and modify `LAUNCH_CMD` in `run_on_node.sh` to launch without torch.distributed.launch.
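#
# For example, with the 2-node, 16-GPU allocation requested below, the torch.distributed.launch
# variant would instead use (a sketch only; the settings actually applied below are for the
# "without torch.distributed.launch" case):
#     set #SBATCH --ntasks=2 (one per node)
#     set #SBATCH --ntasks-per-node=1
#     set #SBATCH --gpus=16
#     set #SBATCH --gpus-per-task=8
# and `LAUNCH_CMD` in `run_on_node.sh` would then take roughly this form (an assumption about
# its shape, not the file's actual contents; $THIS_NODE_RANK and the script name are placeholders):
#     python3 -m torch.distributed.launch \
#         --nproc_per_node=8 \
#         --nnodes=$SLURM_JOB_NUM_NODES \
#         --node_rank=$THIS_NODE_RANK \
#         --master_addr=$PRIMARY --master_port=$PRIMARY_PORT \
#         your_training_script.py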
# (c) 2021, Brian J. Stucky, UF Research Computing
# 2021/09, modified by Huiwen Ju, [email protected]

# Resource allocation.
#SBATCH --wait-all-nodes=1
#SBATCH --job-name=multinode_pytorch
#SBATCH --mail-type=ALL
#SBATCH --mail-user=<your email>
#SBATCH --nodes=2 # How many DGX nodes? Each has 8 A100 GPUs
#SBATCH --ntasks=16 # How many tasks? One per GPU
#SBATCH --ntasks-per-node=8 # Split 8 per node for the 8 GPUs
#SBATCH --gpus=16 # Total GPUs
#SBATCH --gpus-per-task=1 # 1 GPU per task
#SBATCH --cpus-per-task=4 # CPU cores per task; up to 16 when running 8 tasks per node
#SBATCH --mem=96gb # CPU memory per node, up to 1 TB (not GPU memory, which is 80 GB per A100)
#SBATCH --partition=hpg-ai # Specify the HPG AI partition
# Enable the following to limit the allocation to a single SU.
## SBATCH --constraint=su7
#SBATCH --time=48:00:00
#SBATCH --output=train.%j.out
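
# NCCL_DEBUG=INFO makes NCCL print its initialization and communication-setup messages
# to the job log, which helps diagnose multi-node/multi-GPU connectivity issues.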
export NCCL_DEBUG=INFO

# Training command specification: training_script [args].
# TRAINING_CMD="unet_training_ddp_slurm_torchlaunch.py" # See notes above on other changes needed
# TRAINING_CMD="unet_training_ddp_slurm.py"
TRAINING_CMD="unet_training_workflows_slurm.py"

# Python location (if not provided, system default will be used).
# Here we run inside a MONAI Singularity container based on the NGC PyTorch container;
# see `build_container.sh` for how to build it.
PYTHON_PATH="singularity exec --nv pyt21.07 python3"

# Location of the PyTorch launch utilities, i.e. `pt_multinode_helper_funcs.sh` & `run_on_node.sh`.
PT_LAUNCH_UTILS_PATH="./"
source "${PT_LAUNCH_UTILS_PATH}/pt_multinode_helper_funcs.sh"
init_node_info
pwd; hostname; date
echo "Primary node: $PRIMARY"
echo "Primary TCP port: $PRIMARY_PORT"
echo "Secondary nodes: $SECONDARIES"

PT_LAUNCH_SCRIPT="run_on_node.sh"
echo "Running \"$TRAINING_CMD\" on each node..."
srun --unbuffered "$PT_LAUNCH_SCRIPT" "${PT_LAUNCH_UTILS_PATH}" "$TRAINING_CMD" "$PYTHON_PATH"