Skip to content

Commit f90afa2

Browse files
authored
Fix disabling progress bar on non-zero ranks using Horovod backend (#1709)
* Fix Horovod backend to disable progress bar on all ranks except 0 * Add join barriers * Added changelog * Make protected and add verbosity * Refactor to disable progress bar callback in train * Removed verbose setting * Add cache check for Horovod * Test run again * Updated comment * Always skip cache for Horovod * Only reinstall when necessary * Added separate step * Fixed spacing * Skip Python 3.8
1 parent 1a9f1c8 commit f90afa2

File tree

5 files changed

+39
-2
lines changed

5 files changed

+39
-2
lines changed

.github/workflows/ci-testing.yml

+11
Original file line numberDiff line numberDiff line change
@@ -86,6 +86,17 @@ jobs:
8686
pip list
8787
shell: bash
8888

89+
- name: Reinstall Horovod if necessary
90+
if: runner.os != 'windows' && matrix.python-version != '3.8'
91+
run: |
92+
HOROVOD_BUILT=$(python -c "import horovod.torch; horovod.torch.nccl_built(); print('SUCCESS')")
93+
if [[ $HOROVOD_BUILT != "SUCCESS" ]]; then
94+
pip uninstall -y horovod
95+
HOROVOD_BUILD_ARCH_FLAGS="-mfma" pip install --no-cache-dir $(grep "horovod" requirements-extra.txt)
96+
fi
97+
horovodrun --check-build
98+
shell: bash
99+
89100
- name: Cache datasets
90101
uses: actions/cache@v1
91102
with:

CHANGELOG.md

+2
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
3434

3535
- Fixed wandb logger `global_step` affects other loggers ([#1492](https://github.com/PyTorchLightning/pytorch-lightning/issues/1492))
3636

37+
- Fixed disabling progress bar on non-zero ranks using Horovod backend ([#1709](https://github.com/PyTorchLightning/pytorch-lightning/pull/1709))
38+
3739
- Fixed bugs that prevent lr finder to be used together with early stopping and validation dataloaders ([#1676](https://github.com/PyTorchLightning/pytorch-lightning/pull/1676))
3840

3941
## [0.7.5] - 2020-04-27

pytorch_lightning/trainer/distrib_parts.py

+3-2
Original file line numberDiff line numberDiff line change
@@ -576,8 +576,9 @@ def horovod_train(self, model):
576576
torch.cuda.set_device(self.root_gpu)
577577
model.cuda(self.root_gpu)
578578

579-
# Only show progress bar from the first worker
580-
self.progress_bar_refresh_rate = self.progress_bar_refresh_rate if hvd.rank() == 0 else 0
579+
# avoid duplicating progress bar
580+
if hvd.rank() != 0 and self.progress_bar_callback is not None:
581+
self.progress_bar_callback.disable()
581582

582583
# CHOOSE OPTIMIZER
583584
# allow for lr schedulers as well

pytorch_lightning/trainer/trainer.py

+11
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,13 @@
5151
else:
5252
XLA_AVAILABLE = True
5353

54+
try:
55+
import horovod.torch as hvd
56+
except ImportError:
57+
HOROVOD_AVAILABLE = False
58+
else:
59+
HOROVOD_AVAILABLE = True
60+
5461

5562
class Trainer(
5663
TrainerIOMixin,
@@ -853,6 +860,10 @@ def run_pretrain_routine(self, model: LightningModule):
853860
# wait for all processes to catch up
854861
torch_xla.core.xla_model.rendezvous("pl.Trainer.run_pretrain_routine")
855862

863+
elif self.use_horovod:
864+
# wait for all processes to catch up
865+
hvd.join()
866+
856867
# register auto-resubmit when on SLURM
857868
self.register_slurm_signal_handlers()
858869

pytorch_lightning/trainer/training_io.py

+12
Original file line numberDiff line numberDiff line change
@@ -112,6 +112,13 @@
112112
else:
113113
XLA_AVAILABLE = True
114114

115+
try:
116+
import horovod.torch as hvd
117+
except ImportError:
118+
HOROVOD_AVAILABLE = False
119+
else:
120+
HOROVOD_AVAILABLE = True
121+
115122

116123
class TrainerIOMixin(ABC):
117124

@@ -123,6 +130,7 @@ class TrainerIOMixin(ABC):
123130
resume_from_checkpoint: ...
124131
use_ddp: bool
125132
use_ddp2: bool
133+
use_horovod: bool
126134
checkpoint_callback: ...
127135
proc_rank: int
128136
weights_save_path: str
@@ -175,6 +183,10 @@ def restore_weights(self, model: LightningModule):
175183
# wait for all processes to catch up
176184
torch_xla.core.xla_model.rendezvous("pl.TrainerIOMixin.restore_weights")
177185

186+
elif self.use_horovod:
187+
# wait for all processes to catch up
188+
hvd.join()
189+
178190
# clear cache after restore
179191
if self.on_gpu:
180192
torch.cuda.empty_cache()

0 commit comments

Comments
 (0)