diff --git a/simple_applications/pytorch/mnist/README.md b/simple_applications/pytorch/mnist/README.md
index 6d348fbd..9f4acb9d 100644
--- a/simple_applications/pytorch/mnist/README.md
+++ b/simple_applications/pytorch/mnist/README.md
@@ -1,42 +1,319 @@
-# Graphcore
+# PyTorch (PopTorch) MNIST Training Demo
 
----
-## PyTorch(PopTorch) MNIST Training Demo
+This example demonstrates how to train a network on the MNIST dataset using
+PopTorch. To learn more about PopTorch, see our [PyTorch for the IPU: User Guide](https://docs.graphcore.ai/projects/poptorch-user-guide/en/latest/index.html).
 
-This example demonstrates how to train a network on the MNIST dataset using PopTorch.
+## How to use this demo
 
-### File structure
+### Environment preparation
 
-* `mnist_poptorch.py` The main file.
-* `README.md` This file.
+Install the Poplar SDK following the instructions in the [Getting Started](https://docs.graphcore.ai/en/latest/getting-started.html)
+guide for your IPU system. Make sure to run the `enable.sh` scripts for Poplar
+and PopART and activate a Python3 virtualenv with PopTorch installed.
 
-### How to use this demo
+Then install the package requirements:
+```bash
+pip install -r requirements.txt
+```
 
-1) Prepare the environment.
+### Setting hyperparameters
+Set the hyperparameters for this demo. If you're running this example in
+a Jupyter notebook and wish to modify them, re-run all the cells below.
 
-   Install the Poplar SDK following the instructions in the Getting Started guide for your IPU system. Make sure to run the `enable.sh` scripts for Poplar and PopART and activate a Python virtualenv with PopTorch installed.
-   Then install the package requirements:
+```python
+# Learning rate.
+learning_rate = 0.03
 
-   pip install -r requirements.txt
+# Number of epochs to train.
+epochs = 10
 
+# Batch size for training.
+batch_size = 8
 
-2) Run the program. Note that the PopTorch Python API only supports Python 3.
-Data will be automatically downloaded using torch vision utils.
+# Batch size for testing.
+test_batch_size = 80
 
-   python3 mnist_poptorch.py
+# Device iteration - batches per step. Number of iterations the device should
+# run over the data before returning to the user.
+# This is equivalent to running the IPU in a loop over the specified
+# number of iterations, with a new batch of data each time. However, increasing
+# deviceIterations is more efficient because the loop runs on the IPU directly.
+device_iterations = 50
+```
 
-#### Options
-The program has a few command-line options:
+## Training a PopTorch model for MNIST classification
 
-`-h` Show usage information.
+Import required libraries:
 
-`--batch-size` Sets the batch size for training.
-`--batches-per-step` Number on mini-batches to perform on the device before returning to the host.
+```python
+from tqdm.auto import tqdm
+import torch
+import torch.nn as nn
+import torchvision
+import poptorch
+import torch.optim as optim
+```
 
-`--test-batch-size` Sets the batch size for inference.
+Download the datasets for MNIST and set up data loaders.
+Source: [The MNIST Database](http://yann.lecun.com/exdb/mnist/).
 
-`--epochs` Number of epoch to train for.
-`--lr` Learning rate of the optimizer.
\ No newline at end of file
+```python
+local_dataset_path = '~/.torch/datasets'
+
+transform_mnist = torchvision.transforms.Compose(
+    [
+        torchvision.transforms.ToTensor(),
+        torchvision.transforms.Normalize((0.1307, ), (0.3081, ))
+    ]
+)
+
+training_dataset = torchvision.datasets.MNIST(
+    local_dataset_path,
+    train=True,
+    download=True,
+    transform=transform_mnist
+)
+
+training_data = torch.utils.data.DataLoader(
+    training_dataset,
+    batch_size=batch_size * device_iterations,
+    shuffle=True,
+    drop_last=True
+)
+
+test_dataset = torchvision.datasets.MNIST(
+    local_dataset_path,
+    train=False,
+    download=True,
+    transform=transform_mnist
+)
+
+test_data = torch.utils.data.DataLoader(
+    test_dataset,
+    batch_size=test_batch_size,
+    shuffle=True,
+    drop_last=True
+)
+```
+
+Let's define the elements of our neural network. We first create a `Block`
+instance consisting of a 2D convolutional layer with pooling, followed by
+a ReLU activation.
+
+
+```python
+class Block(nn.Module):
+    def __init__(self, in_channels, num_filters, kernel_size, pool_size):
+        super(Block, self).__init__()
+        self.conv = nn.Conv2d(in_channels,
+                              num_filters,
+                              kernel_size=kernel_size)
+        self.pool = nn.MaxPool2d(kernel_size=pool_size)
+        self.relu = nn.ReLU()
+
+    def forward(self, x):
+        x = self.conv(x)
+        x = self.pool(x)
+        x = self.relu(x)
+        return x
+```
+
+Now, let's construct our neural network.
+
+
+```python
+class Network(nn.Module):
+    def __init__(self):
+        super(Network, self).__init__()
+        self.layer1 = Block(1, 32, 3, 2)
+        self.layer2 = Block(32, 64, 3, 2)
+        self.layer3 = nn.Linear(1600, 128)
+        self.layer3_act = nn.ReLU()
+        self.layer3_dropout = torch.nn.Dropout(0.5)
+        self.layer4 = nn.Linear(128, 10)
+
+    def forward(self, x):
+        x = self.layer1(x)
+        x = self.layer2(x)
+        # Flatten layer
+        x = x.view(-1, 1600)
+        x = self.layer3_act(self.layer3(x))
+        x = self.layer4(self.layer3_dropout(x))
+        return x
+```
+
+Next we define a thin wrapper around the `torch.nn.Module` that will use
+the cross-entropy loss function.
+
+This class creates a custom module to compose the Neural Network and
+the Cross Entropy module into one object, which under the hood will invoke
+the `__call__` function on `nn.Module` and consequently the `forward` method.
+
+
+```python
+class TrainingModelWithLoss(torch.nn.Module):
+    def __init__(self, model):
+        super().__init__()
+        self.model = model
+        self.loss = torch.nn.CrossEntropyLoss()
+
+    def forward(self, args, loss_inputs=None):
+        output = self.model(args)
+        loss = self.loss(output, loss_inputs)
+        return output, loss
+```
+
+Let's initialise the neural network from our defined classes.
+
+
+```python
+model = Network()
+model_with_loss = TrainingModelWithLoss(model)
+model_opts = poptorch.Options().deviceIterations(device_iterations)
+```
+
+Next we will set the `AnchorMode` for our training. By default, PopTorch will
+return only a limited set of information to the host machine, for performance
+reasons. This is represented by the default `AnchorMode.Final`, which
+means that only the final batch of the internal loop is returned to the host.
+When inspecting the training performance as it is executing, values like
+accuracy or losses will then be calculated only for that last batch,
+that is, for `batch_size` samples out of the whole step of
+`batch_size*device_iterations`.
+We can set this to `AnchorMode.All` to return the full information.
+This has an impact on the speed of training, due to the overhead of transferring
+more data to the host machine.
+To learn about all values for `AnchorMode`, please see the [PopTorch API documentation](https://docs.graphcore.ai/projects/poptorch-user-guide/en/latest/reference.html?highlight=anchorMode#poptorch.Options.anchorMode).
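+
+For instance, with the defaults above (`batch_size = 8`,
+`device_iterations = 50`), the predictions returned by one training step
+would roughly have the shapes sketched below (an illustration added here,
+not part of the original example):
+
+```python
+# AnchorMode.Final: only the last batch of the step is returned.
+#   preds.shape -> (8, 10)    # batch_size x classes
+# AnchorMode.All: every batch of the step is returned.
+#   preds.shape -> (400, 10)  # batch_size * device_iterations x classes
+```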
+
+
+```python
+model_opts = model_opts.anchorMode(poptorch.AnchorMode.All)
+```
+
+We can check if the model is assembled correctly by printing the string
+representation of the model object.
+
+
+```python
+print(model_with_loss)
+```
+
+    TrainingModelWithLoss(
+      (model): Network(
+        (layer1): Block(
+          (conv): Conv2d(1, 32, kernel_size=(3, 3), stride=(1, 1))
+          (pool): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
+          (relu): ReLU()
+        )
+        (layer2): Block(
+          (conv): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1))
+          (pool): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
+          (relu): ReLU()
+        )
+        (layer3): Linear(in_features=1600, out_features=128, bias=True)
+        (layer3_act): ReLU()
+        (layer3_dropout): Dropout(p=0.5, inplace=False)
+        (layer4): Linear(in_features=128, out_features=10, bias=True)
+      )
+      (loss): CrossEntropyLoss()
+    )
+
+
+Now we apply the model wrapping function, which will perform a shallow copy
+of the PyTorch model. To train the model, we will use [SGD](https://docs.graphcore.ai/projects/poptorch-user-guide/en/latest/reference.html#poptorch.optim.SGD),
+stochastic gradient descent with no momentum.
+
+
+```python
+training_model = poptorch.trainingModel(
+    model_with_loss,
+    model_opts,
+    optimizer=optim.SGD(model.parameters(), lr=learning_rate)
+)
+```
+
+We are ready to start training. However, to track the accuracy while training,
+we need to define one more helper function. During training, not every
+sample's prediction is returned, for efficiency reasons, so this helper function
+will check accuracy for the labels where predictions are available. This behavior
+is controlled by setting `AnchorMode` in `poptorch.Options()`.
+
+
+```python
+def accuracy(predictions, labels):
+    _, ind = torch.max(predictions, 1)
+    labels = labels[-predictions.size()[0]:]
+    accuracy = \
+        torch.sum(torch.eq(ind, labels)).item() / labels.size()[0] * 100.0
+    return accuracy
+```
+
+This code will perform the training over the requested number of epochs
+and batches using the configured Graphcore IPUs.
+
+
+```python
+nr_batches = len(training_data)
+
+for epoch in tqdm(range(1, epochs+1), leave=True, desc="Epochs", total=epochs):
+    with tqdm(training_data, total=nr_batches, leave=False) as bar:
+        for data, labels in bar:
+            preds, losses = training_model(data, labels)
+
+            mean_loss = torch.mean(losses).item()
+
+            acc = accuracy(preds, labels)
+            bar.set_description(
+                "Loss: {:0.4f} | Accuracy: {:05.2F}% ".format(mean_loss, acc)
+            )
+```
+
+Release resources:
+
+
+```python
+training_model.detachFromDevice()
+```
+
+## Evaluating the trained model
+
+Let's check the validation accuracy on the IPU using the trained model. The weights
+in `model.parameters()` will be copied from the IPU to the host. The weights
+from the trained model will be reused to compile the new inference model.
+
+
+```python
+inference_model = poptorch.inferenceModel(model)
+```
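+
+Note that PopTorch compiles the inference model on its first call, and the
+trained weights in `model.parameters()` are copied to the device implicitly
+at that point. If the weights changed afterwards, they would need to be
+re-synchronised explicitly, as sketched below:
+
+```python
+# Only needed if the host-side weights change after the first call.
+# inference_model.copyWeightsToDevice()
+```
+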
+Perform validation.
+
+
+```python
+nr_batches = len(test_data)
+sum_acc = 0.0
+with tqdm(test_data, total=nr_batches, leave=False) as bar:
+    for data, labels in bar:
+        output = inference_model(data)
+        sum_acc += accuracy(output, labels)
+```
+
+Finally, the accuracy on the test set is:
+
+
+```python
+print("Accuracy on test set: {:0.2f}%".format(sum_acc / len(test_data)))
+```
+
+    Accuracy on test set: 99.29%
+
+
+Release resources:
+
+
+```python
+inference_model.detachFromDevice()
+```
diff --git a/simple_applications/pytorch/mnist/mnist_poptorch.ipynb b/simple_applications/pytorch/mnist/mnist_poptorch.ipynb
new file mode 100644
index 00000000..5579d766
--- /dev/null
+++ b/simple_applications/pytorch/mnist/mnist_poptorch.ipynb
@@ -0,0 +1,528 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "5ea9dcbf",
+   "metadata": {},
+   "source": [
+    "Copyright (c) 2020 Graphcore Ltd. All rights reserved."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "fe8a4121",
+   "metadata": {},
+   "source": [
+    "# PyTorch (PopTorch) MNIST Training Demo\n",
+    "\n",
+    "This example demonstrates how to train a network on the MNIST dataset using\n",
+    "PopTorch. To learn more about PopTorch, see our [PyTorch for the IPU: User Guide](https://docs.graphcore.ai/projects/poptorch-user-guide/en/latest/index.html)."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "68a89b62",
+   "metadata": {},
+   "source": [
+    "## How to use this demo\n",
+    "\n",
+    "### Environment preparation\n",
+    "\n",
+    "Install the Poplar SDK following the instructions in the [Getting Started](https://docs.graphcore.ai/en/latest/getting-started.html)\n",
+    "guide for your IPU system. Make sure to run the `enable.sh` scripts for Poplar \n",
+    "and PopART and activate a Python3 virtualenv with PopTorch installed.\n",
+    "\n",
+    "Then install the package requirements:\n",
+    "```bash\n",
+    "pip install -r requirements.txt\n",
+    "```"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "b3a01177",
+   "metadata": {},
+   "source": [
+    "### Setting hyperparameters\n",
+    "Set the hyperparameters for this demo. If you're running this example in \n",
+    "a Jupyter notebook and wish to modify them, re-run all the cells below."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "9b30ddb4",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Learning rate.\n",
+    "learning_rate = 0.03\n",
+    "\n",
+    "# Number of epochs to train.\n",
+    "epochs = 10\n",
+    "\n",
+    "# Batch size for training.\n",
+    "batch_size = 8\n",
+    "\n",
+    "# Batch size for testing.\n",
+    "test_batch_size = 80\n",
+    "\n",
+    "# Device iteration - batches per step. Number of iterations the device should\n",
+    "# run over the data before returning to the user.\n",
+    "# This is equivalent to running the IPU in a loop over the specified\n",
+    "# number of iterations, with a new batch of data each time. 
However, increasing\n", + "# deviceIterations is more efficient because the loop runs on the IPU directly.\n", + "device_iterations = 50" + ] + }, + { + "cell_type": "markdown", + "id": "adb536d0", + "metadata": {}, + "source": [ + "## Training a PopTorch model for MNIST classification" + ] + }, + { + "cell_type": "markdown", + "id": "f45d49d5", + "metadata": {}, + "source": [ + "Import required libraries:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4880cdc6", + "metadata": {}, + "outputs": [], + "source": [ + "from tqdm.auto import tqdm\n", + "import torch\n", + "import torch.nn as nn\n", + "import torchvision\n", + "import poptorch\n", + "import torch.optim as optim" + ] + }, + { + "cell_type": "markdown", + "id": "d7799dce", + "metadata": {}, + "source": [ + "Download the datasets for MNIST and set up data loaders.\n", + "Source: [The MNIST Database](http://yann.lecun.com/exdb/mnist/)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6d1c14c4", + "metadata": { + "tags": [ + "sst_hide_output" + ] + }, + "outputs": [], + "source": [ + "local_dataset_path = '~/.torch/datasets'\n", + "\n", + "transform_mnist = torchvision.transforms.Compose(\n", + " [\n", + " torchvision.transforms.ToTensor(),\n", + " torchvision.transforms.Normalize((0.1307, ), (0.3081, ))\n", + " ]\n", + ")\n", + "\n", + "training_dataset = torchvision.datasets.MNIST(\n", + " local_dataset_path,\n", + " train=True,\n", + " download=True,\n", + " transform=transform_mnist\n", + ")\n", + "\n", + "training_data = torch.utils.data.DataLoader(\n", + " training_dataset,\n", + " batch_size=batch_size * device_iterations,\n", + " shuffle=True,\n", + " drop_last=True\n", + ")\n", + "\n", + "test_dataset = torchvision.datasets.MNIST(\n", + " local_dataset_path,\n", + " train=False,\n", + " download=True,\n", + " transform=transform_mnist\n", + ")\n", + "\n", + "test_data = torch.utils.data.DataLoader(\n", + " test_dataset,\n", + " batch_size=test_batch_size,\n", + " shuffle=True,\n", + " drop_last=True\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "737749d5", + "metadata": {}, + "source": [ + "Let's define the elements of our neural network. We first create a `Block`\n", + "instance consisting of a 2D convolutional layer with pooling, followed by\n", + "a ReLU activation." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d6aec255", + "metadata": {}, + "outputs": [], + "source": [ + "class Block(nn.Module):\n", + " def __init__(self, in_channels, num_filters, kernel_size, pool_size):\n", + " super(Block, self).__init__()\n", + " self.conv = nn.Conv2d(in_channels,\n", + " num_filters,\n", + " kernel_size=kernel_size)\n", + " self.pool = nn.MaxPool2d(kernel_size=pool_size)\n", + " self.relu = nn.ReLU()\n", + "\n", + " def forward(self, x):\n", + " x = self.conv(x)\n", + " x = self.pool(x)\n", + " x = self.relu(x)\n", + " return x" + ] + }, + { + "cell_type": "markdown", + "id": "79e798c3", + "metadata": {}, + "source": [ + "Now, let's construct our neural network." 
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "ec965bda",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "class Network(nn.Module):\n",
+    "    def __init__(self):\n",
+    "        super(Network, self).__init__()\n",
+    "        self.layer1 = Block(1, 32, 3, 2)\n",
+    "        self.layer2 = Block(32, 64, 3, 2)\n",
+    "        self.layer3 = nn.Linear(1600, 128)\n",
+    "        self.layer3_act = nn.ReLU()\n",
+    "        self.layer3_dropout = torch.nn.Dropout(0.5)\n",
+    "        self.layer4 = nn.Linear(128, 10)\n",
+    "\n",
+    "    def forward(self, x):\n",
+    "        x = self.layer1(x)\n",
+    "        x = self.layer2(x)\n",
+    "        # Flatten layer\n",
+    "        x = x.view(-1, 1600)\n",
+    "        x = self.layer3_act(self.layer3(x))\n",
+    "        x = self.layer4(self.layer3_dropout(x))\n",
+    "        return x"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "7ffdad87",
+   "metadata": {},
+   "source": [
+    "Next we define a thin wrapper around the `torch.nn.Module` that will use\n",
+    "the cross-entropy loss function.\n",
+    "\n",
+    "This class creates a custom module to compose the Neural Network and \n",
+    "the Cross Entropy module into one object, which under the hood will invoke \n",
+    "the `__call__` function on `nn.Module` and consequently the `forward` method."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "54c2c3aa",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "class TrainingModelWithLoss(torch.nn.Module):\n",
+    "    def __init__(self, model):\n",
+    "        super().__init__()\n",
+    "        self.model = model\n",
+    "        self.loss = torch.nn.CrossEntropyLoss()\n",
+    "\n",
+    "    def forward(self, args, loss_inputs=None):\n",
+    "        output = self.model(args)\n",
+    "        loss = self.loss(output, loss_inputs)\n",
+    "        return output, loss"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "08c28dd5",
+   "metadata": {},
+   "source": [
+    "Let's initialise the neural network from our defined classes."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a6a0efcf",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "model = Network()\n",
+    "model_with_loss = TrainingModelWithLoss(model)\n",
+    "model_opts = poptorch.Options().deviceIterations(device_iterations)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "0b5b8690",
+   "metadata": {},
+   "source": [
+    "Next we will set the `AnchorMode` for our training. By default, PopTorch will\n",
+    "return only a limited set of information to the host machine, for performance\n",
+    "reasons. This is represented by the default `AnchorMode.Final`, which\n",
+    "means that only the final batch of the internal loop is returned to the host.\n",
+    "When inspecting the training performance as it is executing, values like \n",
+    "accuracy or losses will then be calculated only for that last batch, \n",
+    "that is, for `batch_size` samples out of the whole step of \n",
+    "`batch_size*device_iterations`.\n",
+    "We can set this to `AnchorMode.All` to return the full information.\n",
+    "This has an impact on the speed of training, due to the overhead of transferring\n",
+    "more data to the host machine.\n",
+    "To learn about all values for `AnchorMode`, please see the [PopTorch API documentation](https://docs.graphcore.ai/projects/poptorch-user-guide/en/latest/reference.html?highlight=anchorMode#poptorch.Options.anchorMode)."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c7f68254",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "model_opts = model_opts.anchorMode(poptorch.AnchorMode.All)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "0f9091f0",
+   "metadata": {},
+   "source": [
+    "We can check if the model is assembled correctly by printing the string \n",
+    "representation of the model object."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "d78ec229",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "print(model_with_loss)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "3899a3a8",
+   "metadata": {},
+   "source": [
+    "Now we apply the model wrapping function, which will perform a shallow copy\n",
+    "of the PyTorch model. To train the model, we will use [SGD](https://docs.graphcore.ai/projects/poptorch-user-guide/en/latest/reference.html#poptorch.optim.SGD),\n",
+    "stochastic gradient descent with no momentum."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "6543bb8b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "training_model = poptorch.trainingModel(\n",
+    "    model_with_loss,\n",
+    "    model_opts,\n",
+    "    optimizer=optim.SGD(model.parameters(), lr=learning_rate)\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "8547b538",
+   "metadata": {},
+   "source": [
+    "We are ready to start training. However, to track the accuracy while training,\n",
+    "we need to define one more helper function. During training, not every \n",
+    "sample's prediction is returned, for efficiency reasons, so this helper function\n",
+    "will check accuracy for the labels where predictions are available. This behavior\n",
+    "is controlled by setting `AnchorMode` in `poptorch.Options()`."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "83a377a3",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def accuracy(predictions, labels):\n",
+    "    _, ind = torch.max(predictions, 1)\n",
+    "    labels = labels[-predictions.size()[0]:]\n",
+    "    accuracy = \\\n",
+    "        torch.sum(torch.eq(ind, labels)).item() / labels.size()[0] * 100.0\n",
+    "    return accuracy"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "517bed89",
+   "metadata": {},
+   "source": [
+    "This code will perform the training over the requested number of epochs\n",
+    "and batches using the configured Graphcore IPUs."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "03698d31",
+   "metadata": {
+    "tags": [
+     "sst_hide_output"
+    ]
+   },
+   "outputs": [],
+   "source": [
+    "nr_batches = len(training_data)\n",
+    "\n",
+    "for epoch in tqdm(range(1, epochs+1), leave=True, desc=\"Epochs\", total=epochs):\n",
+    "    with tqdm(training_data, total=nr_batches, leave=False) as bar:\n",
+    "        for data, labels in bar:\n",
+    "            preds, losses = training_model(data, labels)\n",
+    "\n",
+    "            mean_loss = torch.mean(losses).item()\n",
+    "\n",
+    "            acc = accuracy(preds, labels)\n",
+    "            bar.set_description(\n",
+    "                \"Loss: {:0.4f} | Accuracy: {:05.2F}% \".format(mean_loss, acc)\n",
+    "            )"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "58449f5e",
+   "metadata": {},
+   "source": [
+    "Release resources:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "58350390",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "training_model.detachFromDevice()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "3473427d",
+   "metadata": {},
+   "source": [
+    "## Evaluating the trained model\n",
+    "\n",
+    "Let's check the validation accuracy on the IPU using the trained model. The weights \n",
+    "in `model.parameters()` will be copied from the IPU to the host. The weights\n",
+    "from the trained model will be reused to compile the new inference model."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a113faad",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "inference_model = poptorch.inferenceModel(model)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "58a38c10",
+   "metadata": {},
+   "source": [
+    "Perform validation."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "88b45015",
+   "metadata": {
+    "tags": [
+     "sst_hide_output"
+    ]
+   },
+   "outputs": [],
+   "source": [
+    "nr_batches = len(test_data)\n",
+    "sum_acc = 0.0\n",
+    "with tqdm(test_data, total=nr_batches, leave=False) as bar:\n",
+    "    for data, labels in bar:\n",
+    "        output = inference_model(data)\n",
+    "        sum_acc += accuracy(output, labels)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "17879607",
+   "metadata": {},
+   "source": [
+    "Finally, the accuracy on the test set is:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "d9879831",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "print(\"Accuracy on test set: {:0.2f}%\".format(sum_acc / len(test_data)))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "d8fa3c59",
+   "metadata": {},
+   "source": [
+    "Release resources:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "934ffd0a",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "inference_model.detachFromDevice()"
+   ]
+  }
+ ],
+ "metadata": {},
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/simple_applications/pytorch/mnist/mnist_poptorch.py b/simple_applications/pytorch/mnist/mnist_poptorch.py
index df5b56f8..c5ca9e48 100644
--- a/simple_applications/pytorch/mnist/mnist_poptorch.py
+++ b/simple_applications/pytorch/mnist/mnist_poptorch.py
@@ -1,37 +1,108 @@
 #!/usr/bin/env python3
-# Copyright (c) 2020 Graphcore Ltd. All rights reserved.
-import argparse
-from tqdm import tqdm
+"""
+Copyright (c) 2020 Graphcore Ltd. All rights reserved.
+"""
+"""
+# PyTorch (PopTorch) MNIST Training Demo
+
+This example demonstrates how to train a network on the MNIST dataset using
+PopTorch. To learn more about PopTorch, see our [PyTorch for the IPU: User Guide](https://docs.graphcore.ai/projects/poptorch-user-guide/en/latest/index.html).
+"""
+"""
+## How to use this demo
+
+### Environment preparation
+
+Install the Poplar SDK following the instructions in the [Getting Started](https://docs.graphcore.ai/en/latest/getting-started.html)
+guide for your IPU system. Make sure to run the `enable.sh` scripts for Poplar
+and PopART and activate a Python3 virtualenv with PopTorch installed.
+
+Then install the package requirements:
+```bash
+pip install -r requirements.txt
+```
+"""
+"""
+### Setting hyperparameters
+Set the hyperparameters for this demo. If you're running this example in
+a Jupyter notebook and wish to modify them, re-run all the cells below.
+"""
+# Learning rate.
+learning_rate = 0.03
+
+# Number of epochs to train.
+epochs = 10
+
+# Batch size for training.
+batch_size = 8
+
+# Batch size for testing.
+test_batch_size = 80
+
+# Device iteration - batches per step. Number of iterations the device should
+# run over the data before returning to the user.
+# This is equivalent to running the IPU in a loop over the specified
+# number of iterations, with a new batch of data each time. However, increasing
+# deviceIterations is more efficient because the loop runs on the IPU directly.
+device_iterations = 50
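+"""
+Conceptually, `deviceIterations(n)` behaves like the host-side loop sketched
+below, except that the loop itself runs on the IPU, so there is no host
+round-trip between iterations (illustrative pseudocode only; `batches` and
+`run_on_ipu` are hypothetical names, not PopTorch API):
+
+```python
+# for _ in range(device_iterations):
+#     batch = next(batches)   # a fresh batch each iteration
+#     run_on_ipu(batch)       # executes on the device
+```
+"""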
+"""
+## Training a PopTorch model for MNIST classification
+"""
+"""
+Import required libraries:
+"""
+from tqdm.auto import tqdm
 import torch
 import torch.nn as nn
 import torchvision
 import poptorch
 import torch.optim as optim
+"""
+Download the datasets for MNIST and set up data loaders.
+Source: [The MNIST Database](http://yann.lecun.com/exdb/mnist/).
+"""
+local_dataset_path = '~/.torch/datasets'
 
-# The following is a workaround for pytorch issue #1938
-from six.moves import urllib
-opener = urllib.request.build_opener()
-opener.addheaders = [("User-agent", "Mozilla/5.0")]
-urllib.request.install_opener(opener)
-
+transform_mnist = torchvision.transforms.Compose(
+    [
+        torchvision.transforms.ToTensor(),
+        torchvision.transforms.Normalize((0.1307, ), (0.3081, ))
+    ]
+)
 
-def get_mnist_data(opts):
-    training_data = torch.utils.data.DataLoader(
-        torchvision.datasets.MNIST('mnist_data/', train=True, download=True,
-                                   transform=torchvision.transforms.Compose([
-                                       torchvision.transforms.ToTensor(),
-                                       torchvision.transforms.Normalize((0.1307, ), (0.3081, ))])),
-        batch_size=opts.batch_size * opts.batches_per_step, shuffle=True, drop_last=True)
+training_dataset = torchvision.datasets.MNIST(
+    local_dataset_path,
+    train=True,
+    download=True,
+    transform=transform_mnist
+)
 
-    validation_data = torch.utils.data.DataLoader(
-        torchvision.datasets.MNIST('mnist_data/', train=False, download=True,
-                                   transform=torchvision.transforms.Compose([
-                                       torchvision.transforms.ToTensor(),
-                                       torchvision.transforms.Normalize((0.1307, ), (0.3081, ))])),
-        batch_size=opts.test_batch_size, shuffle=True, drop_last=True)
-    return training_data, validation_data
+training_data = torch.utils.data.DataLoader(
+    training_dataset,
+    batch_size=batch_size * device_iterations,
+    shuffle=True,
+    drop_last=True
+)
 
+test_dataset = torchvision.datasets.MNIST(
+    local_dataset_path,
+    train=False,
+    download=True,
+    transform=transform_mnist
+)
 
+test_data = torch.utils.data.DataLoader(
+    test_dataset,
+    batch_size=test_batch_size,
+    shuffle=True,
+    drop_last=True
+)
+# sst_hide_output
+"""
+Let's define the elements of our neural network. We first create a `Block`
+instance consisting of a 2D convolutional layer with pooling, followed by
+a ReLU activation.
+"""
 class Block(nn.Module):
     def __init__(self, in_channels, num_filters, kernel_size, pool_size):
         super(Block, self).__init__()
@@ -46,8 +117,9 @@ def forward(self, x):
         x = self.pool(x)
         x = self.relu(x)
         return x
-
-
+"""
+Now, let's construct our neural network.
+"""
 class Network(nn.Module):
     def __init__(self):
         super(Network, self).__init__()
@@ -57,7 +129,6 @@ def __init__(self):
         self.layer3_act = nn.ReLU()
         self.layer3_dropout = torch.nn.Dropout(0.5)
         self.layer4 = nn.Linear(128, 10)
-        self.softmax = nn.Softmax(1)
 
     def forward(self, x):
         x = self.layer1(x)
@@ -66,10 +137,16 @@ def forward(self, x):
         x = x.view(-1, 1600)
         x = self.layer3_act(self.layer3(x))
         x = self.layer4(self.layer3_dropout(x))
-        x = self.softmax(x)
         return x
 
+"""
+Next we define a thin wrapper around the `torch.nn.Module` that will use
+the cross-entropy loss function.
+
+This class creates a custom module to compose the Neural Network and
+the Cross Entropy module into one object, which under the hood will invoke
+the `__call__` function on `nn.Module` and consequently the `forward` method.
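+
+For illustration, once `model_with_loss` is created below, a call like the
+following dispatches through `nn.Module.__call__` to `forward`, returning
+both the predictions and the loss (a sketch with hypothetical placeholder
+tensors, kept commented out):
+
+```python
+# x = torch.randn(8, 1, 28, 28)    # dummy MNIST-shaped batch
+# y = torch.randint(0, 10, (8,))   # dummy labels
+# output, loss = model_with_loss(x, y)
+```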
+""" class TrainingModelWithLoss(torch.nn.Module): def __init__(self, model): super().__init__() @@ -78,68 +155,102 @@ def __init__(self, model): def forward(self, args, loss_inputs=None): output = self.model(args) - if loss_inputs is None: - return output - else: - loss = self.loss(output, loss_inputs) - return output, loss - - + loss = self.loss(output, loss_inputs) + return output, loss +""" +Let's initialise the neural network from our defined classes. +""" +model = Network() +model_with_loss = TrainingModelWithLoss(model) +model_opts = poptorch.Options().deviceIterations(device_iterations) +""" +Next we will set the `AnchorMode` for our training. By default, PopTorch will +return to the host machine only a limited set of information for performance +reasons. This is represented by having `AnchorMode.Final` as the default, which +means that only the final batch of the internal loop is returned to the host. +When inspecting the training performance as it is executing, values like +accuracy or losses will then be calculated only for that last batch, +specifically the `batch_size` out of the whole step which is +`batch_size*device_iterations`. +We can set this to `AnchorMode.All` to be able to present the full information. +This has an impact on the speed of training, due to overhead of transferring +more data to the host machine. +To learn about all values for `AnchorMode`, please see the [PopTorch API documentation](https://docs.graphcore.ai/projects/poptorch-user-guide/en/latest/reference.html?highlight=anchorMode#poptorch.Options.anchorMode). +""" +model_opts = model_opts.anchorMode(poptorch.AnchorMode.All) +""" +We can check if the model is assembled correctly by printing the string +representation of the model object. +""" +print(model_with_loss) +""" +Now we apply the model wrapping function, which will perform a shallow copy +of the PyTorch model. To train the model, we will use [SGD](https://docs.graphcore.ai/projects/poptorch-user-guide/en/latest/reference.html#poptorch.optim.SGD), +the Stochastic Gradient Descent with no momentum. +""" +training_model = poptorch.trainingModel( + model_with_loss, + model_opts, + optimizer=optim.SGD(model.parameters(), lr=learning_rate) +) +""" +We are ready to start training. However to track the accuracy while training +we need to define one more helper function. During the training, not every +samples prediction is returned for efficiency reasons, so this helper function +will check accuracy for labels where prediction is available. This behavior +is controlled by setting `AnchorMode` in `poptorch.Options()`. +""" def accuracy(predictions, labels): _, ind = torch.max(predictions, 1) - # provide labels only for samples, where prediction is available (during the training, not every samples prediction is returned for efficiency reasons) labels = labels[-predictions.size()[0]:] - accuracy = torch.sum(torch.eq(ind, labels)).item() / labels.size()[0] * 100.0 + accuracy = \ + torch.sum(torch.eq(ind, labels)).item() / labels.size()[0] * 100.0 return accuracy +""" +This code will perform the training over the requested amount of epochs +and batches using the configured Graphcore IPUs. 
+""" +nr_batches = len(training_data) - -def train(training_model, training_data, opts): - nr_batches = len(training_data) - for epoch in range(1, opts.epochs+1): - print("Epoch {0}/{1}".format(epoch, opts.epochs)) - bar = tqdm(training_data, total=nr_batches) +for epoch in tqdm(range(1, epochs+1), leave=True, desc="Epochs", total=epochs): + with tqdm(training_data, total=nr_batches, leave=False) as bar: for data, labels in bar: preds, losses = training_model(data, labels) - with torch.no_grad(): - mean_loss = torch.mean(losses).item() - acc = accuracy(preds, labels) - bar.set_description("Loss:{:0.4f} | Accuracy:{:0.2f}%".format(mean_loss, acc)) - - -def test(inference_model, test_data): - nr_batches = len(test_data) - sum_acc = 0.0 - with torch.no_grad(): - for data, labels in tqdm(test_data, total=nr_batches): - output = inference_model(data) - sum_acc += accuracy(output, labels) - print("Accuracy on test set: {:0.2f}%".format(sum_acc / len(test_data))) - - -if __name__ == '__main__': - parser = argparse.ArgumentParser(description='MNIST training in PopTorch') - parser.add_argument('--batch-size', type=int, default=8, help='batch size for training (default: 8)') - parser.add_argument('--batches-per-step', type=int, default=50, help='device iteration (default:50)') - parser.add_argument('--test-batch-size', type=int, default=80, help='batch size for testing (default: 80)') - parser.add_argument('--epochs', type=int, default=10, help='number of epochs to train (default: 10)') - parser.add_argument('--lr', type=float, default=0.05, help='learning rate (default: 0.05)') - opts = parser.parse_args() - - training_data, test_data = get_mnist_data(opts) - model = Network() - model_with_loss = TrainingModelWithLoss(model) - model_opts = poptorch.Options().deviceIterations(opts.batches_per_step) - training_model = poptorch.trainingModel(model_with_loss, model_opts, optimizer=optim.SGD(model.parameters(), lr=opts.lr)) - - inference_model = poptorch.inferenceModel(model) - - # run training, on IPU - train(training_model, training_data, opts) - - # Update the weights in model by copying from the training IPU. This updates (model.parameters()) - training_model.copyWeightsToHost() - - # Check validation loss on IPU once trained. Because PopTorch will be compiled on first call the - # weights in model.parameters() will be copied implicitly. Subsequent calls will need to call - # inference_model.copyWeightsToDevice() - test(inference_model, test_data) + + mean_loss = torch.mean(losses).item() + + acc = accuracy(preds, labels) + bar.set_description( + "Loss: {:0.4f} | Accuracy: {:05.2F}% ".format(mean_loss, acc) + ) +# sst_hide_output +""" +Release resources: +""" +training_model.detachFromDevice() +""" +## Evaluating the trained model + +Let's check the validation loss on IPU using the trained model. The weights +in `model.parameters()` will be copied from the IPU to the host. The weights +from the trained model will be reused to compile the new inference model. +""" +inference_model = poptorch.inferenceModel(model) +""" +Perform validation. 
+""" +nr_batches = len(test_data) +sum_acc = 0.0 +with tqdm(test_data, total=nr_batches, leave=False) as bar: + for data, labels in bar: + output = inference_model(data) + sum_acc += accuracy(output, labels) +# sst_hide_output +""" +Finally the accuracy on the test set is: +""" +print("Accuracy on test set: {:0.2f}%".format(sum_acc / len(test_data))) +""" +Release resources: +""" +inference_model.detachFromDevice() diff --git a/simple_applications/pytorch/mnist/mnist_poptorch_code_only.py b/simple_applications/pytorch/mnist/mnist_poptorch_code_only.py new file mode 100644 index 00000000..4f0c80c8 --- /dev/null +++ b/simple_applications/pytorch/mnist/mnist_poptorch_code_only.py @@ -0,0 +1,158 @@ +# Copyright (c) 2020 Graphcore Ltd. All rights reserved. +# Learning rate. +learning_rate = 0.03 + +# Number of epochs to train. +epochs = 10 + +# Batch size for training. +batch_size = 8 + +# Batch size for testing. +test_batch_size = 80 + +# Device iteration - batches per step. Number of iterations the device should +# run over the data before returning to the user. +# This is equivalent to running the IPU in a loop over that the specified +# number of iterations, with a new batch of data each time. However, increasing +# deviceIterations is more efficient because the loop runs on the IPU directly. +device_iterations = 50 + +from tqdm.auto import tqdm +import torch +import torch.nn as nn +import torchvision +import poptorch +import torch.optim as optim + +local_dataset_path = '~/.torch/datasets' + +transform_mnist = torchvision.transforms.Compose( + [ + torchvision.transforms.ToTensor(), + torchvision.transforms.Normalize((0.1307, ), (0.3081, )) + ] +) + +training_dataset = torchvision.datasets.MNIST( + local_dataset_path, + train=True, + download=True, + transform=transform_mnist +) + +training_data = torch.utils.data.DataLoader( + training_dataset, + batch_size=batch_size * device_iterations, + shuffle=True, + drop_last=True +) + +test_dataset = torchvision.datasets.MNIST( + local_dataset_path, + train=False, + download=True, + transform=transform_mnist +) + +test_data = torch.utils.data.DataLoader( + test_dataset, + batch_size=test_batch_size, + shuffle=True, + drop_last=True +) + +class Block(nn.Module): + def __init__(self, in_channels, num_filters, kernel_size, pool_size): + super(Block, self).__init__() + self.conv = nn.Conv2d(in_channels, + num_filters, + kernel_size=kernel_size) + self.pool = nn.MaxPool2d(kernel_size=pool_size) + self.relu = nn.ReLU() + + def forward(self, x): + x = self.conv(x) + x = self.pool(x) + x = self.relu(x) + return x + +class Network(nn.Module): + def __init__(self): + super(Network, self).__init__() + self.layer1 = Block(1, 32, 3, 2) + self.layer2 = Block(32, 64, 3, 2) + self.layer3 = nn.Linear(1600, 128) + self.layer3_act = nn.ReLU() + self.layer3_dropout = torch.nn.Dropout(0.5) + self.layer4 = nn.Linear(128, 10) + + def forward(self, x): + x = self.layer1(x) + x = self.layer2(x) + # Flatten layer + x = x.view(-1, 1600) + x = self.layer3_act(self.layer3(x)) + x = self.layer4(self.layer3_dropout(x)) + return x + +class TrainingModelWithLoss(torch.nn.Module): + def __init__(self, model): + super().__init__() + self.model = model + self.loss = torch.nn.CrossEntropyLoss() + + def forward(self, args, loss_inputs=None): + output = self.model(args) + loss = self.loss(output, loss_inputs) + return output, loss + +model = Network() +model_with_loss = TrainingModelWithLoss(model) +model_opts = poptorch.Options().deviceIterations(device_iterations) + +model_opts = 
model_opts.anchorMode(poptorch.AnchorMode.All) + +print(model_with_loss) + +training_model = poptorch.trainingModel( + model_with_loss, + model_opts, + optimizer=optim.SGD(model.parameters(), lr=learning_rate) +) + +def accuracy(predictions, labels): + _, ind = torch.max(predictions, 1) + labels = labels[-predictions.size()[0]:] + accuracy = \ + torch.sum(torch.eq(ind, labels)).item() / labels.size()[0] * 100.0 + return accuracy + +nr_batches = len(training_data) + +for epoch in tqdm(range(1, epochs+1), leave=True, desc="Epochs", total=epochs): + with tqdm(training_data, total=nr_batches, leave=False) as bar: + for data, labels in bar: + preds, losses = training_model(data, labels) + + mean_loss = torch.mean(losses).item() + + acc = accuracy(preds, labels) + bar.set_description( + "Loss: {:0.4f} | Accuracy: {:05.2F}% ".format(mean_loss, acc) + ) + +training_model.detachFromDevice() + +inference_model = poptorch.inferenceModel(model) + +nr_batches = len(test_data) +sum_acc = 0.0 +with tqdm(test_data, total=nr_batches, leave=False) as bar: + for data, labels in bar: + output = inference_model(data) + sum_acc += accuracy(output, labels) + +print("Accuracy on test set: {:0.2f}%".format(sum_acc / len(test_data))) + +inference_model.detachFromDevice()