Skip to content

Commit f90afa2

Browse files
authored
Fix disabling progress bar on non-zero ranks using Horovod backend (#1709)
* Fix Horovod backend to disable progress bar on all ranks except 0 * Add join barriers * Added changelog * Make protected and add verbosity * Refactor to disable progress bar callback in train * Removed verbose setting * Add cache check for Horovod * Test run again * Updated comment * Always skip cache for Horovod * Only reinstall when necessary * Added separate step * Fixed spacing * Skip Python 3.8
1 parent 1a9f1c8 commit f90afa2

File tree

5 files changed

+39
-2
lines changed

5 files changed

+39
-2
lines changed

.github/workflows/ci-testing.yml

+11
Original file line numberDiff line numberDiff line change
@@ -86,6 +86,17 @@ jobs:
8686
pip list
8787
shell: bash
8888

89+
- name: Reinstall Horovod if necessary
90+
if: runner.os != 'windows' && matrix.python-version != '3.8'
91+
run: |
92+
HOROVOD_BUILT=$(python -c "import horovod.torch; horovod.torch.nccl_built(); print('SUCCESS')")
93+
if [[ $HOROVOD_BUILT != "SUCCESS" ]]; then
94+
pip uninstall -y horovod
95+
HOROVOD_BUILD_ARCH_FLAGS="-mfma" pip install --no-cache-dir $(grep "horovod" requirements-extra.txt)
96+
fi
97+
horovodrun --check-build
98+
shell: bash
99+
89100
- name: Cache datasets
90101
uses: actions/cache@v1
91102
with:

CHANGELOG.md

+2
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
3434

3535
- Fixed wandb logger `global_step` affects other loggers ([#1492](https://github.com/PyTorchLightning/pytorch-lightning/issues/1492))
3636

37+
- Fixed disabling progress bar on non-zero ranks using Horovod backend ([#1709](https://github.com/PyTorchLightning/pytorch-lightning/pull/1709))
38+
3739
- Fixed bugs that prevent lr finder to be used together with early stopping and validation dataloaders ([#1676](https://github.com/PyTorchLightning/pytorch-lightning/pull/1676))
3840

3941
## [0.7.5] - 2020-04-27

pytorch_lightning/trainer/distrib_parts.py

+3-2
Original file line numberDiff line numberDiff line change
@@ -576,8 +576,9 @@ def horovod_train(self, model):
576576
torch.cuda.set_device(self.root_gpu)
577577
model.cuda(self.root_gpu)
578578

579-
# Only show progress bar from the first worker
580-
self.progress_bar_refresh_rate = self.progress_bar_refresh_rate if hvd.rank() == 0 else 0
579+
# avoid duplicating progress bar
580+
if hvd.rank() != 0 and self.progress_bar_callback is not None:
581+
self.progress_bar_callback.disable()
581582

582583
# CHOOSE OPTIMIZER
583584
# allow for lr schedulers as well

pytorch_lightning/trainer/trainer.py

+11
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,13 @@
5151
else:
5252
XLA_AVAILABLE = True
5353

54+
try:
55+
import horovod.torch as hvd
56+
except ImportError:
57+
HOROVOD_AVAILABLE = False
58+
else:
59+
HOROVOD_AVAILABLE = True
60+
5461

5562
class Trainer(
5663
TrainerIOMixin,
@@ -853,6 +860,10 @@ def run_pretrain_routine(self, model: LightningModule):
853860
# wait for all processes to catch up
854861
torch_xla.core.xla_model.rendezvous("pl.Trainer.run_pretrain_routine")
855862

863+
elif self.use_horovod:
864+
# wait for all processes to catch up
865+
hvd.join()
866+
856867
# register auto-resubmit when on SLURM
857868
self.register_slurm_signal_handlers()
858869

pytorch_lightning/trainer/training_io.py

+12
Original file line numberDiff line numberDiff line change
@@ -112,6 +112,13 @@
112112
else:
113113
XLA_AVAILABLE = True
114114

115+
try:
116+
import horovod.torch as hvd
117+
except ImportError:
118+
HOROVOD_AVAILABLE = False
119+
else:
120+
HOROVOD_AVAILABLE = True
121+
115122

116123
class TrainerIOMixin(ABC):
117124

@@ -123,6 +130,7 @@ class TrainerIOMixin(ABC):
123130
resume_from_checkpoint: ...
124131
use_ddp: bool
125132
use_ddp2: bool
133+
use_horovod: bool
126134
checkpoint_callback: ...
127135
proc_rank: int
128136
weights_save_path: str
@@ -175,6 +183,10 @@ def restore_weights(self, model: LightningModule):
175183
# wait for all processes to catch up
176184
torch_xla.core.xla_model.rendezvous("pl.TrainerIOMixin.restore_weights")
177185

186+
elif self.use_horovod:
187+
# wait for all processes to catch up
188+
hvd.join()
189+
178190
# clear cache after restore
179191
if self.on_gpu:
180192
torch.cuda.empty_cache()

0 commit comments

Comments
 (0)