Commit f35337a

Fixes .test() for ddp (#2570)
* enable none checkpoint
1 parent b738126 commit f35337a

7 files changed, +87 -66 lines

pytorch_lightning/trainer/distrib_data_parallel.py  (+14, -8)

@@ -122,16 +122,15 @@ def train_fx(trial_hparams, cluster_manager, _):
 from time import sleep
 import numpy as np
 from os.path import abspath
-from torch import distributed as dist
-import queue

 import torch
 from pytorch_lightning import _logger as log
-from pytorch_lightning.callbacks import ModelCheckpoint
 from pytorch_lightning.loggers import LightningLoggerBase
 from pytorch_lightning.utilities import NATIVE_AMP_AVALAIBLE
 from pytorch_lightning.utilities.exceptions import MisconfigurationException
 from pytorch_lightning.utilities.distributed import rank_zero_only, rank_zero_warn, rank_zero_info
+from pytorch_lightning.core.lightning import LightningModule
+

 try:
     from apex import amp
@@ -230,6 +229,10 @@ def save_checkpoint(self, *args):
     def setup(self, *args) -> None:
         """Warning: this is just empty shell for code implemented in other class."""

+    @abstractmethod
+    def get_model(self) -> LightningModule:
+        """Warning: this is just empty shell for code implemented in other class."""
+
     @abstractmethod
     def is_function_implemented(self, *args) -> bool:
         """Warning: this is just empty shell for code implemented in other class."""
@@ -556,17 +559,20 @@ def ddp_train(self, process_idx, q, model, is_master=False, proc_offset=0):
         # continue training routine
         results = self.run_pretrain_routine(model)

+        # get original model
+        model = self.get_model()
+
         # persist info in ddp_spawn
-        self.__transfer_ddp_spawn_state_on_fit_end(model, q, results)
+        self.transfer_ddp_spawn_state_on_fit_end(model, q, results)

         # clean up memory
         torch.cuda.empty_cache()

         if self.global_rank == 0 and self.distributed_backend not in ['ddp_spawn', 'ddp_cpu']:
             return results

-    def __transfer_ddp_spawn_state_on_fit_end(self, model, q, results):
-        if not self.distributed_backend in ['ddp_spawn', 'ddp_cpu']:
+    def transfer_ddp_spawn_state_on_fit_end(self, model, q, results):
+        if self.distributed_backend not in ['ddp_spawn', 'ddp_cpu', 'tpu']:
             return

         # track the best model path
@@ -581,8 +587,8 @@ def __transfer_ddp_spawn_state_on_fit_end(self, model, q, results):

         # save the last weights
         last_path = None
-        if not self.testing:
-            last_path = os.path.join(self.default_root_dir, '__temp_weight_ddp_end.ckpt')
+        if not self.testing and best_model_path is not None and len(best_model_path) > 0:
+            last_path = re.sub('.ckpt', '.tmp_end.ckpt', best_model_path)
         torch.save(model.state_dict(), last_path)
         q.put(last_path)
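Note: get_model() is introduced above only as an abstract shell; ddp_train relies on it to recover the original LightningModule once the model has been wrapped for distributed training. The helper below is a hypothetical standalone sketch of that unwrapping logic, not the mixin's actual implementation:

    from torch.nn import DataParallel
    from torch.nn.parallel import DistributedDataParallel

    def unwrap_model(model):
        # DataParallel / DistributedDataParallel keep the user's module under `.module`;
        # returning the inner module means checkpoint saving and state transfer always
        # see the plain LightningModule that was passed to .fit()
        if isinstance(model, (DataParallel, DistributedDataParallel)):
            return model.module
        return model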

pytorch_lightning/trainer/distrib_parts.py  (+1, -1)

@@ -222,7 +222,7 @@ def tpu_train(self, tpu_core_idx, model):
         self.run_pretrain_routine(model)

         # when training ends on these platforms dump weights to get out of the main process
-        if self.on_colab_kaggle and not self.testing:
+        if self.on_colab_kaggle:
             rank_zero_warn('cleaning up... please do not interrupt')
             self.save_spawn_weights(model)

pytorch_lightning/trainer/trainer.py  (+62, -33)

@@ -396,6 +396,9 @@ def __init__(
         self.test_dataloaders = None
         self.val_dataloaders = None

+        # when .test() is called, it sets this
+        self.tested_ckpt_path = None
+
         # training state
         self.model = None
         self.testing = False
@@ -965,6 +968,10 @@ def fit(

                 self.ddp_train(process_idx=task, q=None, model=model)
         elif self.use_ddp:
+
+            # set testing if set in environ
+            self.testing = os.environ.get('PL_TESTING_MODE', self.testing)
+
             if self.is_slurm_managing_tasks:
                 task = int(os.environ['SLURM_LOCALID'])
                 self.ddp_train(process_idx=task, q=None, model=model)
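Note: the testing flag has to survive a process boundary here, because under ddp the trainer re-launches or spawns worker processes; the commit carries the flag across via the PL_TESTING_MODE environment variable. A minimal standalone sketch of the same pattern (function names are illustrative, not Lightning API; environment values always come back as strings):

    import os

    def mark_testing():
        # parent process: record test mode before the ddp workers start
        os.environ['PL_TESTING_MODE'] = '1'

    def is_testing(default=False):
        # child processes inherit the environment, but the value arrives as a string,
        # so convert explicitly rather than truth-testing the raw value
        value = os.environ.get('PL_TESTING_MODE')
        return default if value is None else value == '1'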
@@ -1058,7 +1065,7 @@ def __run_ddp_spawn(self, model, nprocs):
         smp = mp.get_context('spawn')
         q = smp.SimpleQueue()

-        mp.spawn(self.ddp_train, nprocs=nprocs, args=(q, model,))
+        mp.spawn(self.ddp_train, nprocs=nprocs, args=(q, model, ))

         # restore main state with best weights
         best_path = q.get()
@@ -1070,7 +1077,8 @@ def __run_ddp_spawn(self, model, nprocs):

         # load last weights
         if last_path is not None and not self.testing:
-            torch.load(last_path, map_location=lambda storage, loc: storage)
+            ckpt = torch.load(last_path, map_location=lambda storage, loc: storage)
+            model.load_state_dict(ckpt)

         self.model = model
         return results
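Note: before this change the parent process called torch.load on the temporary checkpoint written by the spawned workers but discarded the result, so it kept its original weights; the fix binds the loaded state dict and applies it to the model. The same pattern in isolation (a hedged sketch; the model and path are placeholders):

    import torch

    def restore_last_weights(model, last_path):
        # map_location keeps every tensor on CPU, so the parent process can restore
        # the spawned worker's weights without owning a GPU
        state_dict = torch.load(last_path, map_location=lambda storage, loc: storage)
        model.load_state_dict(state_dict)
        return model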
@@ -1262,62 +1270,83 @@ def test(
         # --------------------
         # SETUP HOOK
         # --------------------
+        if self.global_rank != 0:
+            return
+
         self.setup('test')
-        model_ref = self.model if model is None else model
-        if self.is_function_implemented('setup', model_ref):
-            model_ref.setup('test')
+
+        if model is not None:
+            results = self.__test_given_model(model, test_dataloaders)
+        else:
+            results = self.__test_using_best_weights(ckpt_path, test_dataloaders)
+
+        self.teardown('test')
+
+        return results
+
+    def __test_using_best_weights(self, ckpt_path, test_dataloaders):
+        model = self.get_model()
+        if self.is_function_implemented('setup', model):
+            model.setup('test')

         # if user requests the best checkpoint but we don't have it, error
-        if model is None and ckpt_path == 'best' and self.checkpoint_callback.save_top_k <= 0:
+        if ckpt_path == 'best' and self.checkpoint_callback.save_top_k <= 0:
             raise MisconfigurationException(
                 'ckpt_path is "best", but ModelCheckpoint is not configured to save the best model.')

-        # --------------------
-        # AUTO-LOAD BEST CKPT
-        # --------------------
-        # load the best checkpoint automatically unless model is given
-        # in which case we use that one
-        if model is None and ckpt_path is not None:
+        # load best weights
+        if ckpt_path is not None:
             # ckpt_path is 'best' so load the best model
             if ckpt_path == 'best':
                 ckpt_path = self.checkpoint_callback.best_model_path
-            model = self.get_model().load_from_checkpoint(ckpt_path)

-        # ----------------------------------------------------
-        # AUTO-LOAD BEST CKPT with the model trained in .fit()
-        # ----------------------------------------------------
-        elif model is None and ckpt_path is None:
-            model = model_ref
+            ckpt = torch.load(ckpt_path, map_location=lambda storage, loc: storage)
+            model.load_state_dict(ckpt['state_dict'])

-        # --------------------
-        # LOAD DATA
-        # --------------------
+        # attach dataloaders
         if test_dataloaders is not None:
-            if model:
-                self.__attach_dataloaders(model, test_dataloaders=test_dataloaders)
-            else:
-                self.__attach_dataloaders(self.model, test_dataloaders=test_dataloaders)
+            self.__attach_dataloaders(model, test_dataloaders=test_dataloaders)

-        # --------------------
-        # RUN TEST SET
-        # --------------------
-        # sets up testing so we short circuit to eval
+        # run tests
+        self.tested_ckpt_path = ckpt_path
         self.set_random_port(force=True)
         self.testing = True
+        os.environ['PL_TESTING_MODE'] = '1'
         self.model = model
         results = self.fit(model)
         self.testing = False
+        del os.environ['PL_TESTING_MODE']

-        # --------------------
-        # TEAR DOWN HOOK
-        # --------------------
-        self.teardown('test')
+        # teardown
         if self.is_function_implemented('teardown'):
             model_ref = self.get_model()
             model_ref.teardown('test')

         return results

+    def __test_given_model(self, model, test_dataloaders):
+        # setup hook
+        if self.is_function_implemented('setup', model):
+            model.setup('test')
+
+        # attach data
+        if test_dataloaders is not None:
+            self.__attach_dataloaders(model, test_dataloaders=test_dataloaders)
+
+        # run test
+        # sets up testing so we short circuit to eval
+        self.set_random_port(force=True)
+        self.testing = True
+        self.model = model
+        results = self.fit(model)
+        self.testing = False
+
+        # teardown
+        if self.is_function_implemented('teardown'):
+            model.teardown('test')
+
+        return results
+
     def check_model_configuration(self, model: LightningModule):
         r"""
         Checks that the model is configured correctly before training or testing is started.
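Note: after this refactor, test() dispatches to __test_given_model when a model is passed in and to __test_using_best_weights otherwise, and it records the restored checkpoint in trainer.tested_ckpt_path. A hedged usage sketch, assuming the repo's EvalModelTemplate test helper and a completed .fit() run:

    from pytorch_lightning import Trainer
    from tests.base import EvalModelTemplate

    model = EvalModelTemplate()
    trainer = Trainer(max_epochs=1)
    trainer.fit(model)

    trainer.test(ckpt_path='best')   # restores checkpoint_callback.best_model_path, then tests
    print(trainer.tested_ckpt_path)  # path of the checkpoint that was restored

    trainer.test(ckpt_path=None)     # tests with the weights left in memory after .fit()
    trainer.test(model)              # tests the given model directly via __test_given_model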

tests/loggers/test_wandb.py  (+1, -1; whitespace-only change)

@@ -25,7 +25,7 @@ def test_wandb_logger(wandb):
         {'test': 'None', 'nested/a': 1, 'b': [2, 3, 4]},
         allow_val_change=True,
     )
-
+
     logger.watch('model', 'log', 10)
     wandb.init().watch.assert_called_once_with('model', log='log', log_freq=10)

tests/models/data/horovod/train_default_model.py  (-1)

@@ -88,7 +88,6 @@ def run_test_from_config(trainer_options):
     assert trainer.root_gpu == hvd.local_rank()


-
 if __name__ == "__main__":
     args = parser.parse_args()
     run_test_from_config(json.loads(args.trainer_options))

tests/models/test_tpu.py  (+5, -9)

@@ -141,11 +141,7 @@ def long_train_loader():


 @pytest.mark.skipif(not TPU_AVAILABLE, reason="test requires TPU machine")
-@pytest.mark.parametrize(['tpu_cores', 'expected_device'], [
-    pytest.param([1], 'xla:1'),
-    pytest.param([8], 'xla:8'),
-])
-def test_early_stop_checkpoints_on_tpu(tmpdir, tpu_cores, expected_device):
+def test_early_stop_checkpoints_on_tpu(tmpdir):
     """Test if single TPU core training works"""
     model = EvalModelTemplate()
     trainer = Trainer(
@@ -155,10 +151,10 @@ def test_early_stop_checkpoints_on_tpu(tmpdir, tpu_cores, expected_device):
         max_epochs=50,
         limit_train_batches=10,
         limit_val_batches=10,
-        tpu_cores=tpu_cores,
+        tpu_cores=[1],
     )
     trainer.fit(model)
-    assert torch_xla._XLAC._xla_get_default_device() == expected_device
+    assert torch_xla._XLAC._xla_get_default_device() == 'xla:1'


 @pytest.mark.skipif(not TPU_AVAILABLE, reason="test requires TPU machine")
@@ -172,10 +168,10 @@ def test_early_stop_checkpoints_on_tpu(tmpdir):
         max_epochs=50,
         limit_train_batches=10,
         limit_val_batches=10,
-        tpu_cores=1,
+        tpu_cores=[8],
     )
     trainer.fit(model)
-    assert torch_xla._XLAC._xla_get_default_device() == 'xla:1'
+    assert torch_xla._XLAC._xla_get_default_device() == 'xla:8'


 @pytest.mark.skipif(not TPU_AVAILABLE, reason="test requires TPU machine")

tests/trainer/test_trainer.py  (+4, -13)

@@ -562,16 +562,7 @@ def test_testpass_overrides(tmpdir):
 def test_test_checkpoint_path(tmpdir, ckpt_path, save_top_k):
     hparams = EvalModelTemplate.get_default_hparams()

-    loaded_checkpoint_path = ''
-
-    class TestBestModel(EvalModelTemplate):
-        @classmethod
-        def load_from_checkpoint(cls, checkpoint_path, *args, **kwargs):
-            nonlocal loaded_checkpoint_path
-            loaded_checkpoint_path = checkpoint_path
-            return super().load_from_checkpoint(checkpoint_path, *args, **kwargs)
-
-    model = TestBestModel(**hparams)
+    model = EvalModelTemplate(**hparams)
     trainer = Trainer(
         max_epochs=2,
         progress_bar_refresh_rate=0,
@@ -586,12 +577,12 @@ def load_from_checkpoint(cls, checkpoint_path, *args, **kwargs):
                 trainer.test(ckpt_path=ckpt_path)
         else:
             trainer.test(ckpt_path=ckpt_path)
-            assert loaded_checkpoint_path == trainer.checkpoint_callback.best_model_path
+            assert trainer.tested_ckpt_path == trainer.checkpoint_callback.best_model_path
     elif ckpt_path is None:
         # ckpt_path is None, meaning we don't load any checkpoints and
         # use the weights from the end of training
         trainer.test(ckpt_path=ckpt_path)
-        assert loaded_checkpoint_path == ''
+        assert trainer.tested_ckpt_path is None
     else:
         # specific checkpoint, pick one from saved ones
         if save_top_k == 0:
@@ -600,7 +591,7 @@ def load_from_checkpoint(cls, checkpoint_path, *args, **kwargs):
         else:
             ckpt_path = str(list((Path(tmpdir) / 'lightning_logs/version_0/checkpoints').iterdir())[0].absolute())
             trainer.test(ckpt_path=ckpt_path)
-            assert loaded_checkpoint_path == ckpt_path
+            assert trainer.tested_ckpt_path == ckpt_path


 def test_disabled_validation(tmpdir):
