
PyTorch Tutorial 5 - MNIST classification - Jupyter and Markdown #91


Merged: 8 commits, Oct 21, 2021
309 changes: 286 additions & 23 deletions simple_applications/pytorch/mnist/README.md
# PyTorch(PopTorch) MNIST Training Demo

This example demonstrates how to train a network on the MNIST dataset using
PopTorch. To learn more about PopTorch, see our [PyTorch for the IPU: User Guide](https://docs.graphcore.ai/projects/poptorch-user-guide/en/latest/index.html).

## How to use this demo

1) Prepare the environment.

Install the Poplar SDK following the instructions in the [Getting Started](https://docs.graphcore.ai/en/latest/getting-started.html)
guide for your IPU system. Make sure to run the `enable.sh` scripts for Poplar
and PopART and activate a Python 3 virtualenv with PopTorch installed.

Then install the package requirements:

```bash
pip install -r requirements.txt
```

2) Run the program. Note that the PopTorch Python API only supports Python 3.
Data will be automatically downloaded using torchvision utils.

```bash
python3 mnist_poptorch.py
```

Select your hyperparameters in this cell. If you wish to modify them, re-run
all cells below it. For further reading on hyperparameters, see [Hyperparameters (machine learning)](https://en.wikipedia.org/wiki/Hyperparameter_(machine_learning)).

Set up the parameters for training:

```python
# Batch size for training
batch_size = 8

# Device iteration - batches per step
device_iterations = 50

# Batch size for testing
test_batch_size = 80

# Number of epochs to train
epochs = 10

# Learning rate
learning_rate = 0.05
```
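
As a quick sanity check (an illustrative calculation, not part of the original script), these values mean that each host-side training step consumes `batch_size * device_iterations` samples:

```python
# Samples consumed per host step: 8 * 50 = 400
samples_per_step = batch_size * device_iterations
print(samples_per_step)  # 400
```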

Import required libraries:


```python
from tqdm.auto import tqdm
import torch
import torch.nn as nn
import torchvision
import poptorch
import torch.optim as optim
```

Download the MNIST dataset - a database of handwritten digits.
Source: [The MNIST Database](http://yann.lecun.com/exdb/mnist/)


```python
local_dataset_path = '~/.torch/datasets'

transform_mnist = torchvision.transforms.Compose(
    [
        torchvision.transforms.ToTensor(),
        torchvision.transforms.Normalize((0.1307, ), (0.3081, ))
    ]
)

training_dataset = torchvision.datasets.MNIST(
    local_dataset_path,
    train=True,
    download=True,
    transform=transform_mnist
)

training_data = torch.utils.data.DataLoader(
    training_dataset,
    batch_size=batch_size * device_iterations,
    shuffle=True,
    drop_last=True
)

test_dataset = torchvision.datasets.MNIST(
    local_dataset_path,
    train=False,
    download=True,
    transform=transform_mnist
)

test_data = torch.utils.data.DataLoader(
    test_dataset,
    batch_size=test_batch_size,
    shuffle=True,
    drop_last=True
)
```
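
To see what one iteration of the training `DataLoader` yields, a quick illustrative check (the variable names here are throwaway) could look like this:

```python
# One training batch packs batch_size * device_iterations samples;
# MNIST images are 1x28x28 after ToTensor()
data, labels = next(iter(training_data))
print(data.shape)    # torch.Size([400, 1, 28, 28])
print(labels.shape)  # torch.Size([400])
```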

Let's define the elements of our neural network. We first define a `Block`
module consisting of a 2D convolutional layer with pooling, followed by
a ReLU activation.


```python
class Block(nn.Module):
    def __init__(self, in_channels, num_filters, kernel_size, pool_size):
        super(Block, self).__init__()
        self.conv = nn.Conv2d(in_channels,
                              num_filters,
                              kernel_size=kernel_size)
        self.pool = nn.MaxPool2d(kernel_size=pool_size)
        self.relu = nn.ReLU()

    def forward(self, x):
        x = self.conv(x)
        x = self.pool(x)
        x = self.relu(x)
        return x
```

Now, let's construct our neural network.


```python
class Network(nn.Module):
    def __init__(self):
        super(Network, self).__init__()
        self.layer1 = Block(1, 32, 3, 2)
        self.layer2 = Block(32, 64, 3, 2)
        self.layer3 = nn.Linear(1600, 128)
        self.layer3_act = nn.ReLU()
        self.layer3_dropout = torch.nn.Dropout(0.5)
        self.layer4 = nn.Linear(128, 10)
        self.softmax = nn.Softmax(1)

    def forward(self, x):
        x = self.layer1(x)
        x = self.layer2(x)
        # Flatten layer
        x = x.view(-1, 1600)
        x = self.layer3_act(self.layer3(x))
        x = self.layer4(self.layer3_dropout(x))
        return x
```
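
The value 1600 in `layer3` follows from the input size: a 28x28 image becomes 26x26 after the first 3x3 convolution, 13x13 after 2x2 pooling, 11x11 after the second convolution, and 5x5 after the final pooling, giving 64 * 5 * 5 = 1600 flattened features. A small sketch to verify this, reusing the `Block` class above:

```python
# Trace the spatial dimensions through the two blocks
x = torch.randn(1, 1, 28, 28)  # one dummy MNIST-shaped image
y = Block(32, 64, 3, 2)(Block(1, 32, 3, 2)(x))
print(y.shape)  # torch.Size([1, 64, 5, 5]) -> 64*5*5 = 1600
```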

Here we define a thin wrapper around `torch.nn.Module` that applies the
cross-entropy loss function - see [cross-entropy loss](https://en.wikipedia.org/wiki/Cross_entropy#Cross-entropy_loss_function_and_logistic_regression) for background.

This class composes the neural network and the cross-entropy module into
one object. Under the hood, calling it invokes the `__call__` function of
`nn.Module` and consequently the `forward` method, like this:
```python
prediction, losses = TrainingModelWithLoss(Network())(data, labels)
```


```python
class TrainingModelWithLoss(torch.nn.Module):
    def __init__(self, model):
        super().__init__()
        self.model = model
        self.loss = torch.nn.CrossEntropyLoss()

    def forward(self, args, loss_inputs=None):
        output = self.model(args)
        loss = self.loss(output, loss_inputs)
        return output, loss
```

Let's initialise the neural network from our defined classes.


```python
model = Network()
model_with_loss = TrainingModelWithLoss(model)
model_opts = poptorch.Options().deviceIterations(device_iterations)
```
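
`poptorch.Options()` controls how the program runs on the IPU, and `deviceIterations(n)` makes the device loop over `n` batches per host call. Other options from the PopTorch user guide can be chained in the same way - a hedged sketch, with arbitrary example values rather than settings from this demo:

```python
# Hypothetical alternative configuration (not used in this demo):
# 50 device iterations with a single model replica
alt_opts = poptorch.Options().deviceIterations(50).replicationFactor(1)
```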

We can check that the model is assembled correctly by printing its string
representation:


```python
print(model_with_loss)
```

TrainingModelWithLoss(
  (model): Network(
    (layer1): Block(
      (conv): Conv2d(1, 32, kernel_size=(3, 3), stride=(1, 1))
      (pool): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
      (relu): ReLU()
    )
    (layer2): Block(
      (conv): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1))
      (pool): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
      (relu): ReLU()
    )
    (layer3): Linear(in_features=1600, out_features=128, bias=True)
    (layer3_act): ReLU()
    (layer3_dropout): Dropout(p=0.5, inplace=False)
    (layer4): Linear(in_features=128, out_features=10, bias=True)
    (softmax): Softmax(dim=1)
  )
  (loss): CrossEntropyLoss()
)


Now we apply the model wrapping function, which will perform a shallow copy
of the PyTorch model. To train the model, we will use Stochastic Gradient
Descent with no momentum: [SGD](https://docs.graphcore.ai/projects/poptorch-user-guide/en/latest/reference.html#poptorch.optim.SGD).


```python
training_model = poptorch.trainingModel(
    model_with_loss,
    model_opts,
    optimizer=optim.SGD(model.parameters(), lr=learning_rate)
)
```
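
The linked reference is PopTorch's own `SGD` implementation; a hedged sketch of swapping it in for the stock `torch.optim.SGD` (the demo itself uses the latter) would be:

```python
# Hypothetical: use PopTorch's SGD instead of torch.optim.SGD
alt_optimizer = poptorch.optim.SGD(model.parameters(), lr=learning_rate)
```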

We are ready to start training. However, to track the accuracy while training
we need to define one more helper function. During training, not every
sample's prediction is returned (for efficiency reasons), so this helper
function computes accuracy only for the labels whose predictions are available.


```python
def accuracy(predictions, labels):
    _, ind = torch.max(predictions, 1)
    labels = labels[-predictions.size()[0]:]
    accuracy = \
        torch.sum(torch.eq(ind, labels)).item() / labels.size()[0] * 100.0
    return accuracy
```
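
For instance (a made-up toy check, not part of the tutorial), with four labels but only two returned predictions, the helper scores against the last two labels:

```python
# Toy example: 2 predictions vs. the last 2 of 4 labels
preds = torch.tensor([[0.1, 0.9], [0.8, 0.2]])  # argmax -> [1, 0]
labels = torch.tensor([0, 0, 1, 0])             # last two labels: [1, 0]
print(accuracy(preds, labels))                  # 100.0
```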

This code will run the requested number of epochs and batches on the
configured Graphcore IPUs.


```python
nr_batches = len(training_data)

for epoch in tqdm(range(1, epochs+1), leave=True, desc="Epochs", total=epochs):
    with tqdm(training_data, total=nr_batches, leave=False) as bar:
        for data, labels in bar:
            preds, losses = training_model(data, labels)

            mean_loss = torch.mean(losses).item()

            acc = accuracy(preds, labels)
            bar.set_description(
                "Loss: {:0.4f} | Accuracy: {:05.2F}% ".format(mean_loss, acc)
            )
```

Release resources:


```python
training_model.detachFromDevice()
```

Let's check the validation loss on IPU using the trained model. The weights
in `model.parameters()` will be copied from the IPU to the host. The trained
model will be reused to compile the new inference model.


```python
inference_model = poptorch.inferenceModel(model)
```
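
PopTorch compiles the inference model on its first call and implicitly copies the host weights across at that point. If the weights in `model.parameters()` change afterwards (for example, after further training), they would need to be pushed to the IPU explicitly - a sketch based on the PopTorch API:

```python
# Only needed if the host-side weights change after the first call:
# inference_model.copyWeightsToDevice()
```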

Perform validation:


```python
nr_batches = len(test_data)
sum_acc = 0.0
with tqdm(test_data, total=nr_batches, leave=False) as bar:
    for data, labels in bar:
        output = inference_model(data)
        sum_acc += accuracy(output, labels)
```

Finally, the accuracy on the test set is:

```python
print("Accuracy on test set: {:0.2f}%".format(sum_acc / len(test_data)))
```

Accuracy on test set: 99.07%

Release resources:

```python
inference_model.detachFromDevice()
```
498 changes: 498 additions & 0 deletions simple_applications/pytorch/mnist/mnist_poptorch.ipynb
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "235c25bc",
"metadata": {},
"outputs": [],
"source": [
"# Copyright (c) 2020 Graphcore Ltd. All rights reserved."
]
},
{
"cell_type": "markdown",
"id": "54adddc9",
"metadata": {},
"source": [
"# PyTorch(PopTorch) MNIST Training Demo\n",
"\n",
"This example demonstrates how to train a network on the MNIST dataset using\n",
"PopTorch. To learn more about PopTorch, see our [PyTorch for the IPU: User Guide](https://docs.graphcore.ai/projects/poptorch-user-guide/en/latest/index.html)."
]
},
{
"cell_type": "markdown",
"id": "cb1fbe56",
"metadata": {},
"source": [
"## How to use this demo\n",
"\n",
"1) Prepare the environment.\n",
"\n",
"Install the Poplar SDK following the instructions in the [Getting Started](https://docs.graphcore.ai/en/latest/getting-started.html)\n",
"guide for your IPU system. Make sure to run the `enable.sh` scripts for Poplar \n",
"and PopART and activate a Python3 virtualenv with PopTorch installed.\n",
"\n",
"Then install the package requirements:\n",
"```bash\n",
"pip install -r requirements.txt\n",
"```\n",
"\n",
"2) Run the program. Note that the PopTorch Python API only supports Python 3.\n",
"Data will be automatically downloaded using torchvision utils.\n",
"\n",
"```bash\n",
"python3 mnist_poptorch.py\n",
"```"
]
},
{
"cell_type": "markdown",
"id": "81f3de09",
"metadata": {},
"source": [
"Select your hyperparameters in this cell. If you wish to modify them, re-run\n",
"all cells below it. For further reading on hyperparameters, see [Hyperparameters (machine learning)](https://en.wikipedia.org/wiki/Hyperparameter_(machine_learning))\n",
"Set up parameters for training:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c8801566",
"metadata": {},
"outputs": [],
"source": [
"# Batch size for training\n",
"batch_size = 8\n",
"\n",
"# Device iteration - batches per step\n",
"device_iterations = 50\n",
"\n",
"# Batch size for testing\n",
"test_batch_size = 80\n",
"\n",
"# Number of epochs to train\n",
"epochs = 10\n",
"\n",
"# Learning rate\n",
"learning_rate = 0.05"
]
},
{
"cell_type": "markdown",
"id": "3b9bf21b",
"metadata": {},
"source": [
"Import required libraries:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7defa8a7",
"metadata": {},
"outputs": [],
"source": [
"from tqdm.auto import tqdm\n",
"import torch\n",
"import torch.nn as nn\n",
"import torchvision\n",
"import poptorch\n",
"import torch.optim as optim"
]
},
{
"cell_type": "markdown",
"id": "8538bcc9",
"metadata": {},
"source": [
"Download the datasets for MNIST - database for handwritten digits.\n",
"Source: [The MNIST Database](http://yann.lecun.com/exdb/mnist/)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "8ecfc36b",
"metadata": {
"tags": [
"sst_hide_output"
]
},
"outputs": [],
"source": [
"local_dataset_path = '~/.torch/datasets'\n",
"\n",
"transform_mnist = torchvision.transforms.Compose(\n",
" [\n",
" torchvision.transforms.ToTensor(),\n",
" torchvision.transforms.Normalize((0.1307, ), (0.3081, ))\n",
" ]\n",
")\n",
"\n",
"training_dataset = torchvision.datasets.MNIST(\n",
" local_dataset_path,\n",
" train=True,\n",
" download=True,\n",
" transform=transform_mnist\n",
")\n",
"\n",
"training_data = torch.utils.data.DataLoader(\n",
" training_dataset,\n",
" batch_size=batch_size * device_iterations,\n",
" shuffle=True,\n",
" drop_last=True\n",
")\n",
"\n",
"test_dataset = torchvision.datasets.MNIST(\n",
" local_dataset_path,\n",
" train=False,\n",
" download=True,\n",
" transform=transform_mnist\n",
")\n",
"\n",
"test_data = torch.utils.data.DataLoader(\n",
" test_dataset,\n",
" batch_size=test_batch_size,\n",
" shuffle=True,\n",
" drop_last=True\n",
")"
]
},
{
"cell_type": "markdown",
"id": "5d0a6f14",
"metadata": {},
"source": [
"Let's define the elements of our neural network. We first create a `Block`\n",
"instance consisting of a 2D convolutional layer with pooling, followed by\n",
"a ReLU activation."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e7643317",
"metadata": {},
"outputs": [],
"source": [
"class Block(nn.Module):\n",
" def __init__(self, in_channels, num_filters, kernel_size, pool_size):\n",
" super(Block, self).__init__()\n",
" self.conv = nn.Conv2d(in_channels,\n",
" num_filters,\n",
" kernel_size=kernel_size)\n",
" self.pool = nn.MaxPool2d(kernel_size=pool_size)\n",
" self.relu = nn.ReLU()\n",
"\n",
" def forward(self, x):\n",
" x = self.conv(x)\n",
" x = self.pool(x)\n",
" x = self.relu(x)\n",
" return x"
]
},
{
"cell_type": "markdown",
"id": "e5ff931b",
"metadata": {},
"source": [
"Now, let's construct our neural network."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "6bce35c5",
"metadata": {},
"outputs": [],
"source": [
"class Network(nn.Module):\n",
" def __init__(self):\n",
" super(Network, self).__init__()\n",
" self.layer1 = Block(1, 32, 3, 2)\n",
" self.layer2 = Block(32, 64, 3, 2)\n",
" self.layer3 = nn.Linear(1600, 128)\n",
" self.layer3_act = nn.ReLU()\n",
" self.layer3_dropout = torch.nn.Dropout(0.5)\n",
" self.layer4 = nn.Linear(128, 10)\n",
" self.softmax = nn.Softmax(1)\n",
"\n",
" def forward(self, x):\n",
" x = self.layer1(x)\n",
" x = self.layer2(x)\n",
" # Flatten layer\n",
" x = x.view(-1, 1600)\n",
" x = self.layer3_act(self.layer3(x))\n",
" x = self.layer4(self.layer3_dropout(x))\n",
" return x"
]
},
{
"cell_type": "markdown",
"id": "5d8f37cc",
"metadata": {},
"source": [
"Here we define a thin wrapper around the `torch.nn.Module` that will use\n",
"cross-entropy loss function - see more [here](https://en.wikipedia.org/wiki/Cross_entropy#Cross-entropy_loss_function_and_logistic_regression)\n",
"\n",
"This class is creating a custom module to compose the Neural Network and \n",
"the Cross Entropy module into one object, which under the hood will invoke \n",
"the `__call__` function on `nn.Module` and consequently the `forward` method \n",
"when called like this:\n",
"```python\n",
"prediction, losses = TrainingModelWithLoss(Network())(data, labels)\n",
"```"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1e1a3985",
"metadata": {},
"outputs": [],
"source": [
"class TrainingModelWithLoss(torch.nn.Module):\n",
" def __init__(self, model):\n",
" super().__init__()\n",
" self.model = model\n",
" self.loss = torch.nn.CrossEntropyLoss()\n",
"\n",
" def forward(self, args, loss_inputs=None):\n",
" output = self.model(args)\n",
" loss = self.loss(output, loss_inputs)\n",
" return output, loss"
]
},
{
"cell_type": "markdown",
"id": "12c2971b",
"metadata": {},
"source": [
"Let's initialise the neural network from our defined classes."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "72b43efd",
"metadata": {},
"outputs": [],
"source": [
"model = Network()\n",
"model_with_loss = TrainingModelWithLoss(model)\n",
"model_opts = poptorch.Options().deviceIterations(device_iterations)"
]
},
{
"cell_type": "markdown",
"id": "86759745",
"metadata": {},
"source": [
"We can check if the model is assembled correctly by printing the string \n",
"representation of the model object"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "0ce8abbc",
"metadata": {},
"outputs": [],
"source": [
"print(model_with_loss)"
]
},
{
"cell_type": "markdown",
"id": "daec48d0",
"metadata": {},
"source": [
"Now we apply the model wrapping function, which will perform a shallow copy\n",
"of the PyTorch model. To train the model, we also will use the Stochastic \n",
"Gradient Descent with no momentum [SGD](https://docs.graphcore.ai/projects/poptorch-user-guide/en/latest/reference.html#poptorch.optim.SGD)."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "3a16aebf",
"metadata": {},
"outputs": [],
"source": [
"training_model = poptorch.trainingModel(\n",
" model_with_loss,\n",
" model_opts,\n",
" optimizer=optim.SGD(model.parameters(), lr=learning_rate)\n",
")"
]
},
{
"cell_type": "markdown",
"id": "d75fb0e2",
"metadata": {},
"source": [
"We are ready to start training. However to track the accuracy while training\n",
"we need to define one more helper function. During the training, not every \n",
"samples prediction is returned for efficiency reasons, so this helper function\n",
"will check accuracy for labels where prediction is available."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "4dd721c1",
"metadata": {},
"outputs": [],
"source": [
"def accuracy(predictions, labels):\n",
" _, ind = torch.max(predictions, 1)\n",
" labels = labels[-predictions.size()[0]:]\n",
" accuracy = \\\n",
" torch.sum(torch.eq(ind, labels)).item() / labels.size()[0] * 100.0\n",
" return accuracy"
]
},
{
"cell_type": "markdown",
"id": "f52aa835",
"metadata": {},
"source": [
"This code will perform the requested amount of epochs and batches using the\n",
"configured Graphcore IPUs."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "35b6e928",
"metadata": {
"tags": [
"sst_hide_output"
]
},
"outputs": [],
"source": [
"nr_batches = len(training_data)\n",
"\n",
"for epoch in tqdm(range(1, epochs+1), leave=True, desc=\"Epochs\", total=epochs):\n",
" with tqdm(training_data, total=nr_batches, leave=False) as bar:\n",
" for data, labels in bar:\n",
" preds, losses = training_model(data, labels)\n",
"\n",
" mean_loss = torch.mean(losses).item()\n",
"\n",
" acc = accuracy(preds, labels)\n",
" bar.set_description(\n",
" \"Loss: {:0.4f} | Accuracy: {:05.2F}% \".format(mean_loss, acc)\n",
" )"
]
},
{
"cell_type": "markdown",
"id": "199c900c",
"metadata": {},
"source": [
"Release resources:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "aa0ff8b5",
"metadata": {},
"outputs": [],
"source": [
"training_model.detachFromDevice()"
]
},
{
"cell_type": "markdown",
"id": "b0916d57",
"metadata": {},
"source": [
"Let's check the validation loss on IPU using the trained model. The weights \n",
"in `model.parameters()` will be copied from the IPU to the host. The trained\n",
"model will be reused to compile the new inference model."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a5081ea4",
"metadata": {},
"outputs": [],
"source": [
"inference_model = poptorch.inferenceModel(model)"
]
},
{
"cell_type": "markdown",
"id": "8e834da8",
"metadata": {},
"source": [
"Perform validation"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "cd3bb8fd",
"metadata": {
"tags": [
"sst_hide_output"
]
},
"outputs": [],
"source": [
"nr_batches = len(test_data)\n",
"sum_acc = 0.0\n",
"with tqdm(test_data, total=nr_batches, leave=False) as bar:\n",
" for data, labels in bar:\n",
" output = inference_model(data)\n",
" sum_acc += accuracy(output, labels)"
]
},
{
"cell_type": "markdown",
"id": "7b84b437",
"metadata": {},
"source": [
"Finally the accuracy on the test set is:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "77454a3a",
"metadata": {},
"outputs": [],
"source": [
"print(\"Accuracy on test set: {:0.2f}%\".format(sum_acc / len(test_data)))"
]
},
{
"cell_type": "markdown",
"id": "1281f9a0",
"metadata": {},
"source": [
"Release resources:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e75cc42c",
"metadata": {},
"outputs": [],
"source": [
"inference_model.detachFromDevice()"
]
}
],
"metadata": {},
"nbformat": 4,
"nbformat_minor": 5
}
276 changes: 186 additions & 90 deletions simple_applications/pytorch/mnist/mnist_poptorch.py
#!/usr/bin/env python3
# Copyright (c) 2020 Graphcore Ltd. All rights reserved.
"""
# PyTorch(PopTorch) MNIST Training Demo
This example demonstrates how to train a network on the MNIST dataset using
PopTorch. To learn more about PopTorch, see our [PyTorch for the IPU: User Guide](https://docs.graphcore.ai/projects/poptorch-user-guide/en/latest/index.html).
"""
"""
## How to use this demo
1) Prepare the environment.
Install the Poplar SDK following the instructions in the [Getting Started](https://docs.graphcore.ai/en/latest/getting-started.html)
guide for your IPU system. Make sure to run the `enable.sh` scripts for Poplar
and PopART and activate a Python3 virtualenv with PopTorch installed.
Then install the package requirements:
```bash
pip install -r requirements.txt
```
2) Run the program. Note that the PopTorch Python API only supports Python 3.
Data will be automatically downloaded using torchvision utils.
```bash
python3 mnist_poptorch.py
```
"""
"""
Select your hyperparameters in this cell. If you wish to modify them, re-run
all cells below it. For further reading on hyperparameters, see [Hyperparameters (machine learning)](https://en.wikipedia.org/wiki/Hyperparameter_(machine_learning)).

Set up the parameters for training:
"""
# Batch size for training
batch_size = 8

# Device iteration - batches per step
device_iterations = 50

# Batch size for testing
test_batch_size = 80

# Number of epochs to train
epochs = 10

# Learning rate
learning_rate = 0.05
"""
Import required libraries:
"""
from tqdm.auto import tqdm
import torch
import torch.nn as nn
import torchvision
import poptorch
import torch.optim as optim

"""
Download the MNIST dataset - a database of handwritten digits.
Source: [The MNIST Database](http://yann.lecun.com/exdb/mnist/)
"""
local_dataset_path = '~/.torch/datasets'

transform_mnist = torchvision.transforms.Compose(
    [
        torchvision.transforms.ToTensor(),
        torchvision.transforms.Normalize((0.1307, ), (0.3081, ))
    ]
)

training_dataset = torchvision.datasets.MNIST(
    local_dataset_path,
    train=True,
    download=True,
    transform=transform_mnist
)

training_data = torch.utils.data.DataLoader(
    training_dataset,
    batch_size=batch_size * device_iterations,
    shuffle=True,
    drop_last=True
)

test_dataset = torchvision.datasets.MNIST(
    local_dataset_path,
    train=False,
    download=True,
    transform=transform_mnist
)

test_data = torch.utils.data.DataLoader(
    test_dataset,
    batch_size=test_batch_size,
    shuffle=True,
    drop_last=True
)
# sst_hide_output
"""
Let's define the elements of our neural network. We first define a `Block`
module consisting of a 2D convolutional layer with pooling, followed by
a ReLU activation.
"""
class Block(nn.Module):
    def __init__(self, in_channels, num_filters, kernel_size, pool_size):
        super(Block, self).__init__()
        self.conv = nn.Conv2d(in_channels,
                              num_filters,
                              kernel_size=kernel_size)
        self.pool = nn.MaxPool2d(kernel_size=pool_size)
        self.relu = nn.ReLU()

    def forward(self, x):
        x = self.conv(x)
        x = self.pool(x)
        x = self.relu(x)
        return x


"""
Now, let's construct our neural network.
"""
class Network(nn.Module):
    def __init__(self):
        super(Network, self).__init__()
        self.layer1 = Block(1, 32, 3, 2)
        self.layer2 = Block(32, 64, 3, 2)
        self.layer3 = nn.Linear(1600, 128)
        self.layer3_act = nn.ReLU()
        self.layer3_dropout = torch.nn.Dropout(0.5)
        self.layer4 = nn.Linear(128, 10)
        self.softmax = nn.Softmax(1)

    def forward(self, x):
        x = self.layer1(x)
        x = self.layer2(x)
        # Flatten layer
        x = x.view(-1, 1600)
        x = self.layer3_act(self.layer3(x))
        x = self.layer4(self.layer3_dropout(x))
        return x


"""
Here we define a thin wrapper around the `torch.nn.Module` that will use
cross-entropy loss function - see more [here](https://en.wikipedia.org/wiki/Cross_entropy#Cross-entropy_loss_function_and_logistic_regression)
This class is creating a custom module to compose the Neural Network and
the Cross Entropy module into one object, which under the hood will invoke
the `__call__` function on `nn.Module` and consequently the `forward` method
when called like this:
```python
prediction, losses = TrainingModelWithLoss(Network())(data, labels)
```
"""
class TrainingModelWithLoss(torch.nn.Module):
def __init__(self, model):
super().__init__()
@@ -78,68 +158,84 @@ def __init__(self, model):

def forward(self, args, loss_inputs=None):
output = self.model(args)
if loss_inputs is None:
return output
else:
loss = self.loss(output, loss_inputs)
return output, loss


loss = self.loss(output, loss_inputs)
return output, loss
"""
Let's initialise the neural network from our defined classes.
"""
model = Network()
model_with_loss = TrainingModelWithLoss(model)
model_opts = poptorch.Options().deviceIterations(device_iterations)
"""
We can check that the model is assembled correctly by printing its string
representation:
"""
print(model_with_loss)
"""
Now we apply the model wrapping function, which will perform a shallow copy
of the PyTorch model. To train the model, we will use Stochastic Gradient
Descent with no momentum: [SGD](https://docs.graphcore.ai/projects/poptorch-user-guide/en/latest/reference.html#poptorch.optim.SGD).
"""
training_model = poptorch.trainingModel(
    model_with_loss,
    model_opts,
    optimizer=optim.SGD(model.parameters(), lr=learning_rate)
)
"""
We are ready to start training. However, to track the accuracy while training
we need to define one more helper function. During training, not every
sample's prediction is returned (for efficiency reasons), so this helper
function computes accuracy only for the labels whose predictions are available.
"""
def accuracy(predictions, labels):
    _, ind = torch.max(predictions, 1)
    labels = labels[-predictions.size()[0]:]
    accuracy = \
        torch.sum(torch.eq(ind, labels)).item() / labels.size()[0] * 100.0
    return accuracy


"""
This code will run the requested number of epochs and batches on the
configured Graphcore IPUs.
"""
nr_batches = len(training_data)

for epoch in tqdm(range(1, epochs+1), leave=True, desc="Epochs", total=epochs):
    with tqdm(training_data, total=nr_batches, leave=False) as bar:
        for data, labels in bar:
            preds, losses = training_model(data, labels)

            mean_loss = torch.mean(losses).item()

            acc = accuracy(preds, labels)
            bar.set_description(
                "Loss: {:0.4f} | Accuracy: {:05.2F}% ".format(mean_loss, acc)
            )
# sst_hide_output
"""
Release resources:
"""
training_model.detachFromDevice()
"""
Let's check the validation loss on IPU using the trained model. The weights
in `model.parameters()` will be copied from the IPU to the host. The trained
model will be reused to compile the new inference model.
"""
inference_model = poptorch.inferenceModel(model)
"""
Perform validation:
"""
nr_batches = len(test_data)
sum_acc = 0.0
with tqdm(test_data, total=nr_batches, leave=False) as bar:
    for data, labels in bar:
        output = inference_model(data)
        sum_acc += accuracy(output, labels)
# sst_hide_output
"""
Finally, the accuracy on the test set is:
"""
print("Accuracy on test set: {:0.2f}%".format(sum_acc / len(test_data)))
"""
Release resources:
"""
inference_model.detachFromDevice()
154 changes: 154 additions & 0 deletions simple_applications/pytorch/mnist/mnist_poptorch_code_only.py
# Copyright (c) 2020 Graphcore Ltd. All rights reserved.

# Batch size for training
batch_size = 8

# Device iteration - batches per step
device_iterations = 50

# Batch size for testing
test_batch_size = 80

# Number of epochs to train
epochs = 10

# Learning rate
learning_rate = 0.05

from tqdm.auto import tqdm
import torch
import torch.nn as nn
import torchvision
import poptorch
import torch.optim as optim

local_dataset_path = '~/.torch/datasets'

transform_mnist = torchvision.transforms.Compose(
    [
        torchvision.transforms.ToTensor(),
        torchvision.transforms.Normalize((0.1307, ), (0.3081, ))
    ]
)

training_dataset = torchvision.datasets.MNIST(
    local_dataset_path,
    train=True,
    download=True,
    transform=transform_mnist
)

training_data = torch.utils.data.DataLoader(
    training_dataset,
    batch_size=batch_size * device_iterations,
    shuffle=True,
    drop_last=True
)

test_dataset = torchvision.datasets.MNIST(
    local_dataset_path,
    train=False,
    download=True,
    transform=transform_mnist
)

test_data = torch.utils.data.DataLoader(
    test_dataset,
    batch_size=test_batch_size,
    shuffle=True,
    drop_last=True
)

class Block(nn.Module):
    def __init__(self, in_channels, num_filters, kernel_size, pool_size):
        super(Block, self).__init__()
        self.conv = nn.Conv2d(in_channels,
                              num_filters,
                              kernel_size=kernel_size)
        self.pool = nn.MaxPool2d(kernel_size=pool_size)
        self.relu = nn.ReLU()

    def forward(self, x):
        x = self.conv(x)
        x = self.pool(x)
        x = self.relu(x)
        return x

class Network(nn.Module):
    def __init__(self):
        super(Network, self).__init__()
        self.layer1 = Block(1, 32, 3, 2)
        self.layer2 = Block(32, 64, 3, 2)
        self.layer3 = nn.Linear(1600, 128)
        self.layer3_act = nn.ReLU()
        self.layer3_dropout = torch.nn.Dropout(0.5)
        self.layer4 = nn.Linear(128, 10)
        self.softmax = nn.Softmax(1)

    def forward(self, x):
        x = self.layer1(x)
        x = self.layer2(x)
        # Flatten layer
        x = x.view(-1, 1600)
        x = self.layer3_act(self.layer3(x))
        x = self.layer4(self.layer3_dropout(x))
        return x

class TrainingModelWithLoss(torch.nn.Module):
    def __init__(self, model):
        super().__init__()
        self.model = model
        self.loss = torch.nn.CrossEntropyLoss()

    def forward(self, args, loss_inputs=None):
        output = self.model(args)
        loss = self.loss(output, loss_inputs)
        return output, loss

model = Network()
model_with_loss = TrainingModelWithLoss(model)
model_opts = poptorch.Options().deviceIterations(device_iterations)

print(model_with_loss)

training_model = poptorch.trainingModel(
    model_with_loss,
    model_opts,
    optimizer=optim.SGD(model.parameters(), lr=learning_rate)
)

def accuracy(predictions, labels):
    _, ind = torch.max(predictions, 1)
    labels = labels[-predictions.size()[0]:]
    accuracy = \
        torch.sum(torch.eq(ind, labels)).item() / labels.size()[0] * 100.0
    return accuracy

nr_batches = len(training_data)

for epoch in tqdm(range(1, epochs+1), leave=True, desc="Epochs", total=epochs):
    with tqdm(training_data, total=nr_batches, leave=False) as bar:
        for data, labels in bar:
            preds, losses = training_model(data, labels)

            mean_loss = torch.mean(losses).item()

            acc = accuracy(preds, labels)
            bar.set_description(
                "Loss: {:0.4f} | Accuracy: {:05.2F}% ".format(mean_loss, acc)
            )

training_model.detachFromDevice()

inference_model = poptorch.inferenceModel(model)

nr_batches = len(test_data)
sum_acc = 0.0
with tqdm(test_data, total=nr_batches, leave=False) as bar:
    for data, labels in bar:
        output = inference_model(data)
        sum_acc += accuracy(output, labels)

print("Accuracy on test set: {:0.2f}%".format(sum_acc / len(test_data)))

inference_model.detachFromDevice()