Skip to content

Commit 80ae594

Browse files
Alexander Reshytko, Borda
Alexander Reshytko
authored and committed
Default values and warnings for DDP env variables
1 parent b71c141 commit 80ae594

File tree

2 files changed

+13
-0
lines changed

2 files changed

+13
-0
lines changed

pytorch_lightning/core/lightning.py

+12
Original file line numberDiff line numberDiff line change
@@ -927,6 +927,18 @@ def init_ddp_connection(
927927
if is_slurm_managing_tasks:
928928
self._init_slurm_connection()
929929

930+
if 'MASTER_ADDR' not in os.environ:
931+
log.warning("MASTER_ADDR environment variable is not defined. Set as localhost")
932+
os.environ['MASTER_ADDR'] = '127.0.0.1'
933+
934+
if 'MASTER_PORT' not in os.environ:
935+
log.warning("MASTER_PORT environment variable is not defined. Set as 12910")
936+
os.environ['MASTER_PORT'] = '12910'
937+
938+
if 'WORLD_SIZE' in os.environ and os.environ['WORLD_SIZE'] != world_size:
939+
log.warning("WORLD_SIZE environment variable is not equal to the computed "
940+
"world size. Ignored.")
941+
930942
torch_distrib.init_process_group('nccl', rank=proc_rank, world_size=world_size)
931943

932944
def configure_apex(

pytorch_lightning/trainer/distrib_data_parallel.py

+1
Original file line numberDiff line numberDiff line change
@@ -281,6 +281,7 @@ def ddp_train(self, gpu_idx, model):
281281
node_id = os.environ['SLURM_NODEID'] if self.is_slurm_managing_tasks else os.environ['RANK']
282282
self.node_rank = int(node_id)
283283
except Exception:
284+
log.warning("SLURM_NODEID or RANK environment variable is not defined. Set as 0.")
284285
self.node_rank = 0
285286

286287
# show progressbar only on progress_rank 0

0 commit comments

Comments
 (0)