
Commit bb36623

tarepan authored and Borda committed
Add non-existing resume_from_checkpoint acceptance for auto-resubmit (#4402)
* Add empty resume_from_checkpoint acceptance #4366
* Fix general error catch with focused file check
* Add fsspec HTTP extras
  Add fsspec's HTTPFileSystem support through the http extras. pl has supported remote http files (e.g. #2925), so this commit does not add new functionality.
* Fix potential too much logging in DDP
* Add PR changelog
* Add well-written argument explanation
  Co-authored-by: Adrian Wälchli <[email protected]>
* Fix DDP-compatible restore logging
  Notify from where the states are restored. This feature was temporarily deleted as a result of PR review; with a succeeding review, it was added back with DDP compatibility.
* Fix utility import paths
* Refactor load step commentaries
* Refactor hpc ckpt suffix acquisition
* Refactor restore/hpc_load match
* Refactor hpc load trial
* Refactor checkpoint dir check
* Refactor unneeded function nest
* Refactor nested if
* Refactor duplicated cache clear
* Refactor attempt flow with if/elif
* Fix pep8
* Refactor hook commentary
  Co-authored-by: chaton <[email protected]>
* Fix pep8
* Refactor hpc load checkpoint path acquisition
* Fix pep8
* Fix typo
  Co-authored-by: Adrian Wälchli <[email protected]>
* Fix typo
  Co-authored-by: Adrian Wälchli <[email protected]>
* Fix doc
  Co-authored-by: Adrian Wälchli <[email protected]>
* Refactor None Union type with Optional
* Fix build-doc CI failure debugged in #5329
* Fix fsspec import during build-doc #5329
* Fix test epoch
  Co-authored-by: Adrian Wälchli <[email protected]>
* Fix test with latest test models
* .

Co-authored-by: Adrian Wälchli <[email protected]>
Co-authored-by: chaton <[email protected]>
Co-authored-by: Jirka Borovec <[email protected]>
Co-authored-by: Sean Naren <[email protected]>
Co-authored-by: Roger Shieh <[email protected]>

(cherry picked from commit b0051e8)
1 parent cc607d5 commit bb36623

File tree

8 files changed: +43 -10 lines

CHANGELOG.md (+2)

@@ -60,6 +60,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 
 ### Added
 
+- Added `resume_from_checkpoint` accept non-existing file path ([#4402](https://github.com/PyTorchLightning/pytorch-lightning/pull/4402))
+
 
 ### Removed
 

docs/source/conf.py (+5 -1)

@@ -293,10 +293,14 @@ def setup(app):
 # Ignoring Third-party packages
 # https://stackoverflow.com/questions/15889621/sphinx-how-to-exclude-imports-in-automodule
 def package_list_from_file(file):
+    """List up package name (not containing version and extras) from a package list file
+    """
     mocked_packages = []
     with open(file, 'r') as fp:
         for ln in fp.readlines():
-            found = [ln.index(ch) for ch in list(',=<>#') if ch in ln]
+            # Example: `tqdm>=4.41.0` => `tqdm`
+            # `[` is for package with extras
+            found = [ln.index(ch) for ch in list(',=<>#[') if ch in ln]
             pkg = ln[:min(found)] if found else ln
             if pkg.rstrip():
                 mocked_packages.append(pkg.rstrip())
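For reference, a standalone sketch of the parsing rule after this change (the helper name below is hypothetical, not part of the repository): adding `[` to the delimiter set means requirement entries with extras are mocked under their base package name.

```python
# Standalone reproduction of the updated rule in package_list_from_file:
# the package name ends at the first of ',=<>#[' found in the line,
# so an entry with extras such as `fsspec[http]>=0.8.1` now mocks as `fsspec`.
def _package_name(ln: str) -> str:
    found = [ln.index(ch) for ch in list(',=<>#[') if ch in ln]
    return (ln[:min(found)] if found else ln).rstrip()

assert _package_name("tqdm>=4.41.0") == "tqdm"
assert _package_name("fsspec[http]>=0.8.1") == "fsspec"  # was "fsspec[http]" before the `[` delimiter
```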

environment.yml (+1 -1)

@@ -30,7 +30,7 @@ dependencies:
   - future>=0.17.1
   - PyYAML>=5.1
   - tqdm>=4.41.0
-  - fsspec>=0.8.0
+  - fsspec[http]>=0.8.1
   #- tensorboard>=2.2.0  # not needed, already included in pytorch
 
   # Optional
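A minimal sketch of what the `fsspec[http]` extra provides (the URL below is a placeholder): with the extra's dependencies installed, fsspec can resolve the "http" protocol, so the same existence check and file read used for local checkpoints also work for remote http(s) paths.

```python
import fsspec

# The "http" protocol is only available when fsspec's http extra is installed.
fs = fsspec.filesystem("http")
checkpoint_path = "https://example.com/checkpoints/last.ckpt"  # placeholder URL
print(fs.exists(checkpoint_path))  # False for the placeholder; True for a reachable checkpoint file
```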

pytorch_lightning/trainer/connectors/checkpoint_connector.py (+10 -2)

@@ -42,7 +42,7 @@ def __init__(self, trainer):
         # used to validate checkpointing logic
         self.has_trained = False
 
-    def restore_weights(self, model: LightningModule):
+    def restore_weights(self, model: LightningModule) -> None:
         """
         Attempt to restore a checkpoint (e.g. weights) in this priority:
         1. from HPC weights
@@ -72,11 +72,16 @@ def restore_weights(self, model: LightningModule):
         if self.trainer.on_gpu:
             torch.cuda.empty_cache()
 
-    def restore(self, checkpoint_path: str, on_gpu: bool):
+    def restore(self, checkpoint_path: str, on_gpu: bool) -> bool:
         """
         Load model/training states from a 'PyTorch-Lightning checkpoint' file through file-read and state-restore.
         All restored states are listed in return value description of `dump_checkpoint`.
         """
+        # Try to read the checkpoint file at `checkpoint_path`. If not exist, do not restore checkpoint.
+        fs = get_filesystem(checkpoint_path)
+        if not fs.exists(checkpoint_path):
+            rank_zero_warn("No checkpoint file exists at `resume_from_checkpoint`. Start from scratch")
+            return False
 
         # read a checkpoint dictionary object from the 'PyTorch-Lightning checkpoint' file at `checkpoint_path`
         checkpoint = pl_load(checkpoint_path, map_location=lambda storage, loc: storage)
@@ -93,6 +98,9 @@ def restore(self, checkpoint_path: str, on_gpu: bool):
         # restore training state
         self.restore_training_state(checkpoint)
 
+        rank_zero_info(f"Restored states from the checkpoint file at {checkpoint_path}")
+        return True
+
     def restore_model_state(self, model: LightningModule, checkpoint) -> None:
         """
         Restore model states from a 'PyTorch-Lightning checkpoint' dictionary object
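To illustrate the new contract, a hypothetical caller could branch on the boolean that `restore` now returns; this sketch is illustrative only and is not the connector's actual call site.

```python
# Hypothetical wrapper around CheckpointConnector.restore (illustration only):
# `restore` now reports whether a checkpoint was actually loaded instead of
# assuming the file exists, so resuming logic can fall back to a fresh start.
def maybe_resume(connector, checkpoint_path: str, on_gpu: bool) -> bool:
    restored = connector.restore(checkpoint_path, on_gpu)
    if not restored:
        # e.g. the first run of an auto-resubmitted job, before any checkpoint
        # has been written: training simply starts from scratch.
        print(f"no checkpoint at {checkpoint_path}, starting fresh")
    return restored
```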

pytorch_lightning/trainer/trainer.py (+3 -3)

@@ -252,9 +252,9 @@ def __init__(
                 train sampler and ``shuffle=False`` for val/test sampler. If you want to customize it,
                 you can set ``replace_sampler_ddp=False`` and add your own distributed sampler.
 
-            resume_from_checkpoint: To resume training from a specific checkpoint pass in the path here.
-                This can be a URL. If resuming from mid-epoch checkpoint, training will start from
-                the beginning of the next epoch.
+            resume_from_checkpoint: Path/URL of the checkpoint from which training is resumed. If there is
+                no checkpoint file at the path, start from scratch. If resuming from mid-epoch checkpoint,
+                training will start from the beginning of the next epoch.
 
             sync_batchnorm: Synchronize batch norm layers between process groups/whole world.
 
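A minimal usage sketch of the documented behaviour; the path and model here are placeholders, not part of the diff. On the first run the checkpoint does not exist yet, so training starts from scratch with a warning; after an auto-resubmit the same path points at a real file and training resumes from it.

```python
from pytorch_lightning import Trainer

trainer = Trainer(
    max_epochs=10,
    resume_from_checkpoint="some/dir/last.ckpt",  # may or may not exist yet
)
# trainer.fit(model)  # `model` is any LightningModule
```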

requirements.txt (+1 -1)

@@ -6,5 +6,5 @@ future>=0.17.1  # required for builtins in setup.py
 # pyyaml>=3.13
 PyYAML>=5.1  # OmegaConf requirement >=5.1
 tqdm>=4.41.0
-fsspec>=0.8.0
+fsspec[http]>=0.8.1
 tensorboard>=2.2.0

requirements/docs.txt (+1 -1)

@@ -11,4 +11,4 @@ https://github.com/PyTorchLightning/lightning_sphinx_theme/archive/master.zip#eg
 sphinx-autodoc-typehints
 sphinx-paramlinks<0.4.0
 sphinx-togglebutton
-sphinx-copybutton
+sphinx-copybutton

tests/models/test_restore.py (+20 -1)

@@ -25,7 +25,7 @@
 import tests.base.develop_utils as tutils
 from pytorch_lightning import Callback, Trainer
 from pytorch_lightning.callbacks import ModelCheckpoint
-from tests.base import EvalModelTemplate, GenericEvalModelTemplate
+from tests.base import BoringModel, EvalModelTemplate, GenericEvalModelTemplate
 
 
 class ModelTrainerPropertyParity(Callback):
@@ -71,6 +71,25 @@ def test_model_properties_resume_from_checkpoint(enable_pl_optimizer, tmpdir):
     trainer.fit(model)
 
 
+def test_try_resume_from_non_existing_checkpoint(tmpdir):
+    """Test that trying to resume from a non-existing `resume_from_checkpoint` fails without error."""
+    model = BoringModel()
+    checkpoint_cb = ModelCheckpoint(dirpath=tmpdir, monitor="early_stop_on", save_last=True)
+    trainer = Trainer(
+        default_root_dir=tmpdir,
+        max_epochs=1,
+        logger=False,
+        callbacks=[checkpoint_cb],
+        limit_train_batches=0.1,
+        limit_val_batches=0.1,
+    )
+    # Generate checkpoint `last.ckpt` with BoringModel
+    trainer.fit(model)
+    # `restore` returns `True` if the restore succeeds, else `False`
+    assert trainer.checkpoint_connector.restore(str(tmpdir / "last.ckpt"), trainer.on_gpu)
+    assert not trainer.checkpoint_connector.restore(str(tmpdir / "last_non_existing.ckpt"), trainer.on_gpu)
+
+
 class CaptureCallbacksBeforeTraining(Callback):
     callbacks = []
 
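To try the new test in isolation, one option (assuming a source checkout with the test dependencies installed) is to invoke pytest on the test's node id from Python:

```python
import pytest

# Runs only the newly added test; equivalent to the same node id on the pytest CLI.
pytest.main(["-q", "tests/models/test_restore.py::test_try_resume_from_non_existing_checkpoint"])
```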
