Commit ec7fc97

borisdayma, vanpelt, and williamFalcon committed
Feature: wandb logger (#627)
* Basic wandb support
* refactor(wandb): remove unused variables and document logger
* docs(wandb): explain how to use WandbLogger
* test(wandb): add tests for WandbLogger
* feat(wandb): add save_dir
* fix(wandb): allow pickle of logger
* fix(wandb): save logs in custom directory
* test(wandb): test import
* docs(wandb): simplify docstring and use doctest
* test: increase number of epochs for satisfactory accuracy
* test(test_load_model_from_checkpoint): ensure we load last checkpoint

Co-authored-by: Chris Van Pelt <[email protected]>
Co-authored-by: William Falcon <[email protected]>
1 parent f7db44e commit ec7fc97

File tree

8 files changed, +139 -6 lines changed

.run_local_tests.sh (+1)

@@ -3,6 +3,7 @@ rm -rf _ckpt_*
 rm -rf tests/save_dir*
 rm -rf tests/mlruns_*
 rm -rf tests/cometruns*
+rm -rf tests/wandb*
 rm -rf tests/tests/*
 rm -rf lightning_logs
 coverage run --source pytorch_lightning -m py.test pytorch_lightning tests pl_examples -v --doctest-modules

README.md (+1)

@@ -306,6 +306,7 @@ Lightning also adds a text column with all the hyperparameters for this experiment
 - [Save a snapshot of all hyperparameters](https://williamfalcon.github.io/pytorch-lightning/Trainer/Logging/#save-a-snapshot-of-all-hyperparameters)
 - [Snapshot code for a training run](https://williamfalcon.github.io/pytorch-lightning/Trainer/Logging/#snapshot-code-for-a-training-run)
 - [Write logs file to csv every k batches](https://williamfalcon.github.io/pytorch-lightning/Trainer/Logging/#write-logs-file-to-csv-every-k-batches)
+- [Logging on W&B](https://williamfalcon.github.io/pytorch-lightning/Trainer/Logging/#w&b)
 - [Logging experiment data to Neptune](https://williamfalcon.github.io/pytorch-lightning/Trainer/Logging/#neptune-support)
 
 #### Training loop

docs/source/conf.py (+1 -1)

@@ -294,7 +294,7 @@ def setup(app):
     MOCK_REQUIRE_PACKAGES.append(pkg.rstrip())
 
 # TODO: better parse from package since the import name and package name may differ
-MOCK_MANUAL_PACKAGES = ['torch', 'torchvision', 'sklearn', 'test_tube', 'mlflow', 'comet_ml', 'neptune']
+MOCK_MANUAL_PACKAGES = ['torch', 'torchvision', 'sklearn', 'test_tube', 'mlflow', 'comet_ml', 'wandb', 'neptune']
 autodoc_mock_imports = MOCK_REQUIRE_PACKAGES + MOCK_MANUAL_PACKAGES
 # for mod_name in MOCK_REQUIRE_PACKAGES:
 #     sys.modules[mod_name] = mock.Mock()

pytorch_lightning/logging/__init__.py (+4)

@@ -180,6 +180,10 @@ def __init__(self, hparams):
 except ImportError:
     pass
 
+try:
+    from .wandb import WandbLogger
+except ImportError:
+    pass
 try:
     # needed to prevent ImportError and duplicated logs.
     environ["COMET_DISABLE_AUTO_LOGGING"] = "1"

pytorch_lightning/logging/wandb.py (new file, +108)

@@ -0,0 +1,108 @@
+"""
+Log using `W&B <https://www.wandb.com>`_
+
+.. code-block:: python
+
+    >>> from pytorch_lightning.logging import WandbLogger
+    >>> from pytorch_lightning import Trainer
+    >>> wandb_logger = WandbLogger()
+    >>> trainer = Trainer(logger=wandb_logger)
+
+
+Use the logger anywhere in your LightningModule as follows:
+
+.. code-block:: python
+
+    def train_step(...):
+        # example
+        self.logger.experiment.whatever_wandb_supports(...)
+
+    def any_lightning_module_function_or_hook(...):
+        self.logger.experiment.whatever_wandb_supports(...)
+
+"""
+
+import os
+
+try:
+    import wandb
+except ImportError:
+    raise ImportError('Missing wandb package.')
+
+from .base import LightningLoggerBase, rank_zero_only
+
+
+class WandbLogger(LightningLoggerBase):
+    """
+    Logger for W&B.
+
+    Args:
+        name (str): display name for the run.
+        save_dir (str): path where data is saved.
+        offline (bool): run offline (data can be streamed later to wandb servers).
+        id or version (str): sets the version, mainly used to resume a previous run.
+        anonymous (bool): enables or explicitly disables anonymous logging.
+        project (str): the name of the project to which this run will belong.
+        tags (list of str): tags associated with this run.
+    """
+
+    def __init__(self, name=None, save_dir=None, offline=False, id=None, anonymous=False,
+                 version=None, project=None, tags=None):
+        super().__init__()
+        self._name = name
+        self._save_dir = save_dir
+        self._anonymous = "allow" if anonymous else None
+        self._id = version or id
+        self._tags = tags
+        self._project = project
+        self._experiment = None
+        self._offline = offline
+
+    def __getstate__(self):
+        state = self.__dict__.copy()
+        # cannot be pickled
+        state['_experiment'] = None
+        # args needed to reload correct experiment
+        state['_id'] = self.experiment.id
+        return state
+
+    @property
+    def experiment(self):
+        if self._experiment is None:
+            if self._offline:
+                os.environ["WANDB_MODE"] = "dryrun"
+            self._experiment = wandb.init(
+                name=self._name, dir=self._save_dir, project=self._project, anonymous=self._anonymous,
+                id=self._id, resume="allow", tags=self._tags)
+        return self._experiment
+
+    def watch(self, model, log="gradients", log_freq=100):
+        wandb.watch(model, log, log_freq)
+
+    @rank_zero_only
+    def log_hyperparams(self, params):
+        self.experiment.config.update(params)
+
+    @rank_zero_only
+    def log_metrics(self, metrics, step=None):
+        metrics["global_step"] = step
+        self.experiment.history.add(metrics)
+
+    def save(self):
+        pass
+
+    @rank_zero_only
+    def finalize(self, status='success'):
+        try:
+            exit_code = 0 if status == 'success' else 1
+            wandb.join(exit_code)
+        except TypeError:
+            wandb.join()
+
+    @property
+    def name(self):
+        return self.experiment.project_name()
+
+    @property
+    def version(self):
+        return self.experiment.id
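
Taken together, a minimal end-to-end sketch of the API this file adds. MyLightningModule is a hypothetical model name, and offline=True exercises the WANDB_MODE="dryrun" branch of `experiment` above, so nothing is streamed to wandb servers during the run:

from pytorch_lightning import Trainer
from pytorch_lightning.logging import WandbLogger

model = MyLightningModule()  # hypothetical LightningModule

# offline=True takes the WANDB_MODE="dryrun" path in `experiment`,
# keeping the run local; it can be synced to wandb servers later.
wandb_logger = WandbLogger(name='baseline', save_dir='wandb_logs',
                           offline=True, project='my-project', tags=['demo'])

# optional: have wandb track gradients of the model's parameters
wandb_logger.watch(model, log='gradients', log_freq=100)

trainer = Trainer(logger=wandb_logger)
trainer.fit(model)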

tests/requirements.txt (+1)

@@ -8,6 +8,7 @@ check-manifest
 # test_tube # already installed in main req.
 mlflow
 comet_ml
+wandb
 neptune-client
 twine==1.13.0
 pillow<7.0.0

tests/test_logging.py (+16 -1)

@@ -192,6 +192,14 @@ def test_comet_pickle(tmpdir, monkeypatch):
     trainer2 = pickle.loads(pkl_bytes)
     trainer2.logger.log_metrics({"acc": 1.0})
 
+def test_wandb_logger(tmpdir):
+    """Verify that basic functionality of wandb logger works."""
+    tutils.reset_seed()
+
+    from pytorch_lightning.logging import WandbLogger
+
+    wandb_dir = os.path.join(tmpdir, "wandb")
+    logger = WandbLogger(save_dir=wandb_dir, anonymous=True)
 
 def test_neptune_logger(tmpdir):
     """Verify that basic functionality of neptune logger works."""

@@ -201,7 +209,6 @@ def test_neptune_logger(tmpdir):
 
     hparams = tutils.get_hparams()
     model = LightningTestModel(hparams)
-
     logger = NeptuneLogger(offline_mode=True)
 
     trainer_options = dict(

@@ -216,6 +223,13 @@ def test_neptune_logger(tmpdir):
     print('result finished')
     assert result == 1, "Training failed"
 
+def test_wandb_pickle(tmpdir):
+    """Verify that pickling trainer with wandb logger works."""
+    tutils.reset_seed()
+
+    from pytorch_lightning.logging import WandbLogger
+    wandb_dir = str(tmpdir)
+    logger = WandbLogger(save_dir=wandb_dir, anonymous=True)
 
 def test_neptune_pickle(tmpdir):
     """Verify that pickling trainer with neptune logger works."""

@@ -227,6 +241,7 @@ def test_neptune_pickle(tmpdir):
     # model = LightningTestModel(hparams)
 
     logger = NeptuneLogger(offline_mode=True)
+
     trainer_options = dict(
         default_save_path=tmpdir,
         max_epochs=1,
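
Note that test_wandb_pickle above only constructs the logger. A hedged sketch of how it could be extended to actually round-trip a Trainer through pickle, following the test_comet_pickle pattern in the same file; offline=True is an assumption here, added to keep the lazy re-init in __getstate__ off the network:

import pickle

from pytorch_lightning import Trainer
from pytorch_lightning.logging import WandbLogger

def test_wandb_pickle_roundtrip(tmpdir):
    """Sketch (not in this commit): pickle a trainer holding a WandbLogger."""
    logger = WandbLogger(save_dir=str(tmpdir), anonymous=True, offline=True)
    trainer = Trainer(default_save_path=tmpdir, max_epochs=1, logger=logger)

    # WandbLogger.__getstate__ drops the live run and keeps only its id,
    # so the unpickled logger can lazily re-create the same experiment.
    pkl_bytes = pickle.dumps(trainer)
    trainer2 = pickle.loads(pkl_bytes)
    trainer2.logger.log_metrics({"acc": 1.0})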

tests/test_restore_models.py (+7 -4)

@@ -106,7 +106,7 @@ def test_load_model_from_checkpoint(tmpdir):
 
     trainer_options = dict(
         show_progress_bar=False,
-        max_epochs=5,
+        max_epochs=2,
         train_percent_check=0.4,
         val_percent_check=0.2,
         checkpoint_callback=ModelCheckpoint(tmpdir, save_top_k=-1),

@@ -120,9 +120,12 @@ def test_load_model_from_checkpoint(tmpdir):
 
     # correct result and ok accuracy
     assert result == 1, 'training failed to complete'
-    pretrained_model = LightningTestModel.load_from_checkpoint(
-        os.path.join(trainer.checkpoint_callback.filepath, "_ckpt_epoch_4.ckpt")
-    )
+
+    # load last checkpoint
+    last_checkpoint = os.path.join(trainer.checkpoint_callback.filepath, "_ckpt_epoch_1.ckpt")
+    if not os.path.isfile(last_checkpoint):
+        last_checkpoint = os.path.join(trainer.checkpoint_callback.filepath, "_ckpt_epoch_0.ckpt")
+    pretrained_model = LightningTestModel.load_from_checkpoint(last_checkpoint)
 
     # test that hparams loaded correctly
     for k, v in vars(hparams).items():
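
The fallback above hard-codes _ckpt_epoch_1 / _ckpt_epoch_0, which tracks max_epochs=2 but would break if that number changes again. A more general sketch (hypothetical helper, not part of this commit) picks the newest checkpoint by parsing the epoch out of the filename:

import glob
import os

def find_last_checkpoint(ckpt_dir):
    """Hypothetical helper: return the _ckpt_epoch_*.ckpt with the highest epoch."""
    paths = glob.glob(os.path.join(ckpt_dir, "_ckpt_epoch_*.ckpt"))
    if not paths:
        raise FileNotFoundError("no checkpoints found in %s" % ckpt_dir)

    def epoch_of(path):
        # filenames look like _ckpt_epoch_3.ckpt
        stem = os.path.basename(path)
        return int(stem.replace("_ckpt_epoch_", "").replace(".ckpt", ""))

    return max(paths, key=epoch_of)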
