launch.py
import os, sys, subprocess, torch
from fastai2.basics import *

@call_parse
def main(
    gpus:Param("The GPUs to use for distributed training", str)='all',
    script:Param("Script to run", str, opt=False)='',
    args:Param("Args to pass to script", nargs='...', opt=False)=''
):
    "PyTorch distributed training launch helper that spawns multiple distributed processes"
    # Loosely based on torch.distributed.launch
    current_env = os.environ.copy()
    # 'all' uses every visible GPU; otherwise each character of the string is a GPU id (e.g. '013')
    gpus = list(range(torch.cuda.device_count())) if gpus=='all' else list(gpus)
    current_env["WORLD_SIZE"] = str(len(gpus))    # total number of worker processes
    current_env["MASTER_ADDR"] = '127.0.0.1'      # single-node training: master is localhost
    current_env["MASTER_PORT"] = '29500'          # default port used by torch.distributed
    processes = []
    for i,gpu in enumerate(gpus):
        current_env["RANK"] = str(i)              # rank of this worker process
        # Run the script unbuffered, one process per GPU, forwarding any extra args
        cmd = [sys.executable, "-u", script, f"--gpu={gpu}"] + args
        process = subprocess.Popen(cmd, env=current_env)
        processes.append(process)
    for process in processes: process.wait()      # block until every worker has finished
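
For context, the helper above passes `--gpu=<id>` on the command line and sets RANK, WORLD_SIZE, MASTER_ADDR, and MASTER_PORT in each child's environment; the launched script is expected to pick those up and join the process group. Below is a minimal sketch of such a script, using torch.distributed directly. The file name `train.py`, the argparse handling, and the plain-PyTorch setup are illustrative assumptions, not part of this repo; a fastai2 script would normally use the library's own distributed helpers instead.

# train.py -- illustrative sketch only, not part of the repository
import os, argparse, torch
import torch.distributed as dist

def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--gpu", type=int, required=True)  # injected by launch.py as --gpu=<id>
    args = parser.parse_args()

    torch.cuda.set_device(args.gpu)
    # RANK, WORLD_SIZE, MASTER_ADDR and MASTER_PORT are already in the environment,
    # so the env:// rendezvous needs no further arguments here.
    dist.init_process_group(backend="nccl", init_method="env://")

    print(f"rank {dist.get_rank()}/{dist.get_world_size()} running on GPU {args.gpu}")
    # ... build the model and dataloaders, then train as usual ...

    dist.destroy_process_group()

if __name__ == "__main__": main()

Invocation would then look roughly like `python launch.py --gpus=01 train.py <script args>`, where each character of `--gpus` names one GPU id and everything after the script name is forwarded to it unchanged.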