diff --git a/simple_applications/pytorch/mnist/README.md b/simple_applications/pytorch/mnist/README.md
index 6d348fbd..9f4acb9d 100644
--- a/simple_applications/pytorch/mnist/README.md
+++ b/simple_applications/pytorch/mnist/README.md
@@ -1,42 +1,319 @@
-# Graphcore
+# PyTorch (PopTorch) MNIST Training Demo
 
----
-## PyTorch(PopTorch) MNIST Training Demo
+This example demonstrates how to train a network on the MNIST dataset using
+PopTorch. To learn more about PopTorch, see our [PyTorch for the IPU: User Guide](https://docs.graphcore.ai/projects/poptorch-user-guide/en/latest/index.html).
 
-This example demonstrates how to train a network on the MNIST dataset using PopTorch.
+## How to use this demo
 
-### File structure
+### Environment preparation
 
-* `mnist_poptorch.py` The main file.
-* `README.md` This file.
+Install the Poplar SDK following the instructions in the [Getting Started](https://docs.graphcore.ai/en/latest/getting-started.html)
+guide for your IPU system. Make sure to run the `enable.sh` scripts for Poplar
+and PopART and activate a Python3 virtualenv with PopTorch installed.
 
-### How to use this demo
+Then install the package requirements:
+```bash
+pip install -r requirements.txt
+```
 
-1) Prepare the environment.
+### Setting hyperparameters
+Set the hyperparameters for this demo. If you're running this example in
+a Jupyter notebook and wish to modify them, re-run all the cells below.
 
-   Install the Poplar SDK following the instructions in the Getting Started guide for your IPU system. Make sure to run the `enable.sh` scripts for Poplar and PopART and activate a Python virtualenv with PopTorch installed.
-   Then install the package requirements:
+```python
+# Learning rate.
+learning_rate = 0.03
 
-   pip install -r requirements.txt
+# Number of epochs to train.
+epochs = 10
 
+# Batch size for training.
+batch_size = 8
 
-2) Run the program. Note that the PopTorch Python API only supports Python 3.
-Data will be automatically downloaded using torch vision utils.
+# Batch size for testing.
+test_batch_size = 80
 
-   python3 mnist_poptorch.py
+# Device iteration - batches per step. Number of iterations the device should
+# run over the data before returning to the user.
+# This is equivalent to running the IPU in a loop over the specified
+# number of iterations, with a new batch of data each time. However, increasing
+# deviceIterations is more efficient because the loop runs on the IPU directly.
+device_iterations = 50
+```
 
-#### Options
-The program has a few command-line options:
+## Training a PopTorch model for MNIST classification
 
-`-h` Show usage information.
+Import required libraries:
 
-`--batch-size` Sets the batch size for training.
-`--batches-per-step` Number on mini-batches to perform on the device before returning to the host.
+```python
+from tqdm.auto import tqdm
+import torch
+import torch.nn as nn
+import torchvision
+import poptorch
+import torch.optim as optim
+```
 
-`--test-batch-size` Sets the batch size for inference.
+Download the datasets for MNIST and set up data loaders.
+Source: [The MNIST Database](http://yann.lecun.com/exdb/mnist/).
 
-`--epochs` Number of epoch to train for.
-`--lr` Learning rate of the optimizer.
\ No newline at end of file
+```python
+local_dataset_path = '~/.torch/datasets'
+
+transform_mnist = torchvision.transforms.Compose(
+    [
+        torchvision.transforms.ToTensor(),
+        torchvision.transforms.Normalize((0.1307, ), (0.3081, ))
+    ]
+)
+
+training_dataset = torchvision.datasets.MNIST(
+    local_dataset_path,
+    train=True,
+    download=True,
+    transform=transform_mnist
+)
+
+training_data = torch.utils.data.DataLoader(
+    training_dataset,
+    batch_size=batch_size * device_iterations,
+    shuffle=True,
+    drop_last=True
+)
+
+test_dataset = torchvision.datasets.MNIST(
+    local_dataset_path,
+    train=False,
+    download=True,
+    transform=transform_mnist
+)
+
+test_data = torch.utils.data.DataLoader(
+    test_dataset,
+    batch_size=test_batch_size,
+    shuffle=True,
+    drop_last=True
+)
+```
+
+Let's define the elements of our neural network. We first create a `Block`
+instance consisting of a 2D convolutional layer with pooling, followed by
+a ReLU activation.
+
+
+```python
+class Block(nn.Module):
+    def __init__(self, in_channels, num_filters, kernel_size, pool_size):
+        super(Block, self).__init__()
+        self.conv = nn.Conv2d(in_channels,
+                              num_filters,
+                              kernel_size=kernel_size)
+        self.pool = nn.MaxPool2d(kernel_size=pool_size)
+        self.relu = nn.ReLU()
+
+    def forward(self, x):
+        x = self.conv(x)
+        x = self.pool(x)
+        x = self.relu(x)
+        return x
+```
+
+Now, let's construct our neural network.
+
+
+```python
+class Network(nn.Module):
+    def __init__(self):
+        super(Network, self).__init__()
+        self.layer1 = Block(1, 32, 3, 2)
+        self.layer2 = Block(32, 64, 3, 2)
+        self.layer3 = nn.Linear(1600, 128)
+        self.layer3_act = nn.ReLU()
+        self.layer3_dropout = torch.nn.Dropout(0.5)
+        self.layer4 = nn.Linear(128, 10)
+
+    def forward(self, x):
+        x = self.layer1(x)
+        x = self.layer2(x)
+        # Flatten layer
+        x = x.view(-1, 1600)
+        x = self.layer3_act(self.layer3(x))
+        x = self.layer4(self.layer3_dropout(x))
+        return x
+```
+
+Next we define a thin wrapper around the `torch.nn.Module` that will use
+the cross-entropy loss function.
+
+This class creates a custom module to compose the Neural Network and
+the Cross Entropy module into one object, which under the hood will invoke
+the `__call__` function on `nn.Module` and consequently the `forward` method.
+
+
+```python
+class TrainingModelWithLoss(torch.nn.Module):
+    def __init__(self, model):
+        super().__init__()
+        self.model = model
+        self.loss = torch.nn.CrossEntropyLoss()
+
+    def forward(self, args, loss_inputs=None):
+        output = self.model(args)
+        loss = self.loss(output, loss_inputs)
+        return output, loss
+```
+
+Let's initialise the neural network from our defined classes.
+
+
+```python
+model = Network()
+model_with_loss = TrainingModelWithLoss(model)
+model_opts = poptorch.Options().deviceIterations(device_iterations)
+```
+
+Next we will set the `AnchorMode` for our training. By default, PopTorch will
+return only a limited set of information to the host machine, for performance
+reasons. This is represented by the default `AnchorMode.Final`, which
+means that only the final batch of the internal loop is returned to the host.
+When inspecting the training performance as it is executing, values like
+accuracy or losses will then be calculated only for that last batch,
+that is, for `batch_size` samples out of the whole step of
+`batch_size*device_iterations`.
+We can set this to `AnchorMode.All` to return the full information.
+This has an impact on the speed of training, due to the overhead of transferring
+more data to the host machine.
+To learn about all values for `AnchorMode`, please see the [PopTorch API documentation](https://docs.graphcore.ai/projects/poptorch-user-guide/en/latest/reference.html?highlight=anchorMode#poptorch.Options.anchorMode).
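+
+For instance, with the defaults above (`batch_size = 8`,
+`device_iterations = 50`), the predictions returned by one training step
+would roughly have the shapes sketched below (an illustration added here,
+not part of the original example):
+
+```python
+# AnchorMode.Final: only the last batch of the step is returned.
+#   preds.shape -> (8, 10)    # batch_size x classes
+# AnchorMode.All: every batch of the step is returned.
+#   preds.shape -> (400, 10)  # batch_size * device_iterations x classes
+```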
+
+
+```python
+model_opts = model_opts.anchorMode(poptorch.AnchorMode.All)
+```
+
+We can check if the model is assembled correctly by printing the string
+representation of the model object.
+
+
+```python
+print(model_with_loss)
+```
+
+    TrainingModelWithLoss(
+      (model): Network(
+        (layer1): Block(
+          (conv): Conv2d(1, 32, kernel_size=(3, 3), stride=(1, 1))
+          (pool): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
+          (relu): ReLU()
+        )
+        (layer2): Block(
+          (conv): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1))
+          (pool): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
+          (relu): ReLU()
+        )
+        (layer3): Linear(in_features=1600, out_features=128, bias=True)
+        (layer3_act): ReLU()
+        (layer3_dropout): Dropout(p=0.5, inplace=False)
+        (layer4): Linear(in_features=128, out_features=10, bias=True)
+      )
+      (loss): CrossEntropyLoss()
+    )
+
+
+Now we apply the model wrapping function, which will perform a shallow copy
+of the PyTorch model. To train the model, we will use [SGD](https://docs.graphcore.ai/projects/poptorch-user-guide/en/latest/reference.html#poptorch.optim.SGD),
+stochastic gradient descent with no momentum.
+
+
+```python
+training_model = poptorch.trainingModel(
+    model_with_loss,
+    model_opts,
+    optimizer=optim.SGD(model.parameters(), lr=learning_rate)
+)
+```
+
+We are ready to start training. However, to track the accuracy while training,
+we need to define one more helper function. During training, not every
+sample's prediction is returned, for efficiency reasons, so this helper function
+will check accuracy for the labels where predictions are available. This behavior
+is controlled by setting `AnchorMode` in `poptorch.Options()`.
+
+
+```python
+def accuracy(predictions, labels):
+    _, ind = torch.max(predictions, 1)
+    labels = labels[-predictions.size()[0]:]
+    accuracy = \
+        torch.sum(torch.eq(ind, labels)).item() / labels.size()[0] * 100.0
+    return accuracy
+```
+
+This code will perform the training over the requested number of epochs
+and batches using the configured Graphcore IPUs.
+
+
+```python
+nr_batches = len(training_data)
+
+for epoch in tqdm(range(1, epochs+1), leave=True, desc="Epochs", total=epochs):
+    with tqdm(training_data, total=nr_batches, leave=False) as bar:
+        for data, labels in bar:
+            preds, losses = training_model(data, labels)
+
+            mean_loss = torch.mean(losses).item()
+
+            acc = accuracy(preds, labels)
+            bar.set_description(
+                "Loss: {:0.4f} | Accuracy: {:05.2F}% ".format(mean_loss, acc)
+            )
+```
+
+Release resources:
+
+
+```python
+training_model.detachFromDevice()
+```
+
+## Evaluating the trained model
+
+Let's check the validation accuracy on the IPU using the trained model. The weights
+in `model.parameters()` will be copied from the IPU to the host. The weights
+from the trained model will be reused to compile the new inference model.
+
+
+```python
+inference_model = poptorch.inferenceModel(model)
+```
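+
+Note that PopTorch compiles the inference model on its first call, and the
+trained weights in `model.parameters()` are copied to the device implicitly
+at that point. If the weights changed afterwards, they would need to be
+re-synchronised explicitly, as sketched below:
+
+```python
+# Only needed if the host-side weights change after the first call.
+# inference_model.copyWeightsToDevice()
+```
+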
+Perform validation.
+
+
+```python
+nr_batches = len(test_data)
+sum_acc = 0.0
+with tqdm(test_data, total=nr_batches, leave=False) as bar:
+    for data, labels in bar:
+        output = inference_model(data)
+        sum_acc += accuracy(output, labels)
+```
+
+Finally, the accuracy on the test set is:
+
+
+```python
+print("Accuracy on test set: {:0.2f}%".format(sum_acc / len(test_data)))
+```
+
+    Accuracy on test set: 99.29%
+
+
+Release resources:
+
+
+```python
+inference_model.detachFromDevice()
+```
diff --git a/simple_applications/pytorch/mnist/mnist_poptorch.ipynb b/simple_applications/pytorch/mnist/mnist_poptorch.ipynb
new file mode 100644
index 00000000..5579d766
--- /dev/null
+++ b/simple_applications/pytorch/mnist/mnist_poptorch.ipynb
@@ -0,0 +1,528 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "5ea9dcbf",
+   "metadata": {},
+   "source": [
+    "Copyright (c) 2020 Graphcore Ltd. All rights reserved."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "fe8a4121",
+   "metadata": {},
+   "source": [
+    "# PyTorch (PopTorch) MNIST Training Demo\n",
+    "\n",
+    "This example demonstrates how to train a network on the MNIST dataset using\n",
+    "PopTorch. To learn more about PopTorch, see our [PyTorch for the IPU: User Guide](https://docs.graphcore.ai/projects/poptorch-user-guide/en/latest/index.html)."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "68a89b62",
+   "metadata": {},
+   "source": [
+    "## How to use this demo\n",
+    "\n",
+    "### Environment preparation\n",
+    "\n",
+    "Install the Poplar SDK following the instructions in the [Getting Started](https://docs.graphcore.ai/en/latest/getting-started.html)\n",
+    "guide for your IPU system. Make sure to run the `enable.sh` scripts for Poplar \n",
+    "and PopART and activate a Python3 virtualenv with PopTorch installed.\n",
+    "\n",
+    "Then install the package requirements:\n",
+    "```bash\n",
+    "pip install -r requirements.txt\n",
+    "```"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "b3a01177",
+   "metadata": {},
+   "source": [
+    "### Setting hyperparameters\n",
+    "Set the hyperparameters for this demo. If you're running this example in \n",
+    "a Jupyter notebook and wish to modify them, re-run all the cells below."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "9b30ddb4",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Learning rate.\n",
+    "learning_rate = 0.03\n",
+    "\n",
+    "# Number of epochs to train.\n",
+    "epochs = 10\n",
+    "\n",
+    "# Batch size for training.\n",
+    "batch_size = 8\n",
+    "\n",
+    "# Batch size for testing.\n",
+    "test_batch_size = 80\n",
+    "\n",
+    "# Device iteration - batches per step. Number of iterations the device should\n",
+    "# run over the data before returning to the user.\n",
+    "# This is equivalent to running the IPU in a loop over the specified\n",
+    "# number of iterations, with a new batch of data each time. 
However, increasing\n", + "# deviceIterations is more efficient because the loop runs on the IPU directly.\n", + "device_iterations = 50" + ] + }, + { + "cell_type": "markdown", + "id": "adb536d0", + "metadata": {}, + "source": [ + "## Training a PopTorch model for MNIST classification" + ] + }, + { + "cell_type": "markdown", + "id": "f45d49d5", + "metadata": {}, + "source": [ + "Import required libraries:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4880cdc6", + "metadata": {}, + "outputs": [], + "source": [ + "from tqdm.auto import tqdm\n", + "import torch\n", + "import torch.nn as nn\n", + "import torchvision\n", + "import poptorch\n", + "import torch.optim as optim" + ] + }, + { + "cell_type": "markdown", + "id": "d7799dce", + "metadata": {}, + "source": [ + "Download the datasets for MNIST and set up data loaders.\n", + "Source: [The MNIST Database](http://yann.lecun.com/exdb/mnist/)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6d1c14c4", + "metadata": { + "tags": [ + "sst_hide_output" + ] + }, + "outputs": [], + "source": [ + "local_dataset_path = '~/.torch/datasets'\n", + "\n", + "transform_mnist = torchvision.transforms.Compose(\n", + " [\n", + " torchvision.transforms.ToTensor(),\n", + " torchvision.transforms.Normalize((0.1307, ), (0.3081, ))\n", + " ]\n", + ")\n", + "\n", + "training_dataset = torchvision.datasets.MNIST(\n", + " local_dataset_path,\n", + " train=True,\n", + " download=True,\n", + " transform=transform_mnist\n", + ")\n", + "\n", + "training_data = torch.utils.data.DataLoader(\n", + " training_dataset,\n", + " batch_size=batch_size * device_iterations,\n", + " shuffle=True,\n", + " drop_last=True\n", + ")\n", + "\n", + "test_dataset = torchvision.datasets.MNIST(\n", + " local_dataset_path,\n", + " train=False,\n", + " download=True,\n", + " transform=transform_mnist\n", + ")\n", + "\n", + "test_data = torch.utils.data.DataLoader(\n", + " test_dataset,\n", + " batch_size=test_batch_size,\n", + " shuffle=True,\n", + " drop_last=True\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "737749d5", + "metadata": {}, + "source": [ + "Let's define the elements of our neural network. We first create a `Block`\n", + "instance consisting of a 2D convolutional layer with pooling, followed by\n", + "a ReLU activation." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d6aec255", + "metadata": {}, + "outputs": [], + "source": [ + "class Block(nn.Module):\n", + " def __init__(self, in_channels, num_filters, kernel_size, pool_size):\n", + " super(Block, self).__init__()\n", + " self.conv = nn.Conv2d(in_channels,\n", + " num_filters,\n", + " kernel_size=kernel_size)\n", + " self.pool = nn.MaxPool2d(kernel_size=pool_size)\n", + " self.relu = nn.ReLU()\n", + "\n", + " def forward(self, x):\n", + " x = self.conv(x)\n", + " x = self.pool(x)\n", + " x = self.relu(x)\n", + " return x" + ] + }, + { + "cell_type": "markdown", + "id": "79e798c3", + "metadata": {}, + "source": [ + "Now, let's construct our neural network." 
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "ec965bda",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "class Network(nn.Module):\n",
+    "    def __init__(self):\n",
+    "        super(Network, self).__init__()\n",
+    "        self.layer1 = Block(1, 32, 3, 2)\n",
+    "        self.layer2 = Block(32, 64, 3, 2)\n",
+    "        self.layer3 = nn.Linear(1600, 128)\n",
+    "        self.layer3_act = nn.ReLU()\n",
+    "        self.layer3_dropout = torch.nn.Dropout(0.5)\n",
+    "        self.layer4 = nn.Linear(128, 10)\n",
+    "\n",
+    "    def forward(self, x):\n",
+    "        x = self.layer1(x)\n",
+    "        x = self.layer2(x)\n",
+    "        # Flatten layer\n",
+    "        x = x.view(-1, 1600)\n",
+    "        x = self.layer3_act(self.layer3(x))\n",
+    "        x = self.layer4(self.layer3_dropout(x))\n",
+    "        return x"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "7ffdad87",
+   "metadata": {},
+   "source": [
+    "Next we define a thin wrapper around the `torch.nn.Module` that will use\n",
+    "the cross-entropy loss function.\n",
+    "\n",
+    "This class creates a custom module to compose the Neural Network and \n",
+    "the Cross Entropy module into one object, which under the hood will invoke \n",
+    "the `__call__` function on `nn.Module` and consequently the `forward` method."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "54c2c3aa",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "class TrainingModelWithLoss(torch.nn.Module):\n",
+    "    def __init__(self, model):\n",
+    "        super().__init__()\n",
+    "        self.model = model\n",
+    "        self.loss = torch.nn.CrossEntropyLoss()\n",
+    "\n",
+    "    def forward(self, args, loss_inputs=None):\n",
+    "        output = self.model(args)\n",
+    "        loss = self.loss(output, loss_inputs)\n",
+    "        return output, loss"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "08c28dd5",
+   "metadata": {},
+   "source": [
+    "Let's initialise the neural network from our defined classes."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a6a0efcf",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "model = Network()\n",
+    "model_with_loss = TrainingModelWithLoss(model)\n",
+    "model_opts = poptorch.Options().deviceIterations(device_iterations)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "0b5b8690",
+   "metadata": {},
+   "source": [
+    "Next we will set the `AnchorMode` for our training. By default, PopTorch will\n",
+    "return only a limited set of information to the host machine, for performance\n",
+    "reasons. This is represented by the default `AnchorMode.Final`, which\n",
+    "means that only the final batch of the internal loop is returned to the host.\n",
+    "When inspecting the training performance as it is executing, values like \n",
+    "accuracy or losses will then be calculated only for that last batch, \n",
+    "that is, for `batch_size` samples out of the whole step of \n",
+    "`batch_size*device_iterations`.\n",
+    "We can set this to `AnchorMode.All` to return the full information.\n",
+    "This has an impact on the speed of training, due to the overhead of transferring\n",
+    "more data to the host machine.\n",
+    "To learn about all values for `AnchorMode`, please see the [PopTorch API documentation](https://docs.graphcore.ai/projects/poptorch-user-guide/en/latest/reference.html?highlight=anchorMode#poptorch.Options.anchorMode)."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c7f68254",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "model_opts = model_opts.anchorMode(poptorch.AnchorMode.All)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "0f9091f0",
+   "metadata": {},
+   "source": [
+    "We can check if the model is assembled correctly by printing the string \n",
+    "representation of the model object."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "d78ec229",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "print(model_with_loss)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "3899a3a8",
+   "metadata": {},
+   "source": [
+    "Now we apply the model wrapping function, which will perform a shallow copy\n",
+    "of the PyTorch model. To train the model, we will use [SGD](https://docs.graphcore.ai/projects/poptorch-user-guide/en/latest/reference.html#poptorch.optim.SGD),\n",
+    "stochastic gradient descent with no momentum."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "6543bb8b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "training_model = poptorch.trainingModel(\n",
+    "    model_with_loss,\n",
+    "    model_opts,\n",
+    "    optimizer=optim.SGD(model.parameters(), lr=learning_rate)\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "8547b538",
+   "metadata": {},
+   "source": [
+    "We are ready to start training. However, to track the accuracy while training,\n",
+    "we need to define one more helper function. During training, not every \n",
+    "sample's prediction is returned, for efficiency reasons, so this helper function\n",
+    "will check accuracy for the labels where predictions are available. This behavior\n",
+    "is controlled by setting `AnchorMode` in `poptorch.Options()`."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "83a377a3",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def accuracy(predictions, labels):\n",
+    "    _, ind = torch.max(predictions, 1)\n",
+    "    labels = labels[-predictions.size()[0]:]\n",
+    "    accuracy = \\\n",
+    "        torch.sum(torch.eq(ind, labels)).item() / labels.size()[0] * 100.0\n",
+    "    return accuracy"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "517bed89",
+   "metadata": {},
+   "source": [
+    "This code will perform the training over the requested number of epochs\n",
+    "and batches using the configured Graphcore IPUs."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "03698d31",
+   "metadata": {
+    "tags": [
+     "sst_hide_output"
+    ]
+   },
+   "outputs": [],
+   "source": [
+    "nr_batches = len(training_data)\n",
+    "\n",
+    "for epoch in tqdm(range(1, epochs+1), leave=True, desc=\"Epochs\", total=epochs):\n",
+    "    with tqdm(training_data, total=nr_batches, leave=False) as bar:\n",
+    "        for data, labels in bar:\n",
+    "            preds, losses = training_model(data, labels)\n",
+    "\n",
+    "            mean_loss = torch.mean(losses).item()\n",
+    "\n",
+    "            acc = accuracy(preds, labels)\n",
+    "            bar.set_description(\n",
+    "                \"Loss: {:0.4f} | Accuracy: {:05.2F}% \".format(mean_loss, acc)\n",
+    "            )"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "58449f5e",
+   "metadata": {},
+   "source": [
+    "Release resources:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "58350390",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "training_model.detachFromDevice()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "3473427d",
+   "metadata": {},
+   "source": [
+    "## Evaluating the trained model\n",
+    "\n",
+    "Let's check the validation accuracy on the IPU using the trained model. The weights \n",
+    "in `model.parameters()` will be copied from the IPU to the host. The weights\n",
+    "from the trained model will be reused to compile the new inference model."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a113faad",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "inference_model = poptorch.inferenceModel(model)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "58a38c10",
+   "metadata": {},
+   "source": [
+    "Perform validation."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "88b45015",
+   "metadata": {
+    "tags": [
+     "sst_hide_output"
+    ]
+   },
+   "outputs": [],
+   "source": [
+    "nr_batches = len(test_data)\n",
+    "sum_acc = 0.0\n",
+    "with tqdm(test_data, total=nr_batches, leave=False) as bar:\n",
+    "    for data, labels in bar:\n",
+    "        output = inference_model(data)\n",
+    "        sum_acc += accuracy(output, labels)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "17879607",
+   "metadata": {},
+   "source": [
+    "Finally, the accuracy on the test set is:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "d9879831",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "print(\"Accuracy on test set: {:0.2f}%\".format(sum_acc / len(test_data)))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "d8fa3c59",
+   "metadata": {},
+   "source": [
+    "Release resources:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "934ffd0a",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "inference_model.detachFromDevice()"
+   ]
+  }
+ ],
+ "metadata": {},
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/simple_applications/pytorch/mnist/mnist_poptorch.py b/simple_applications/pytorch/mnist/mnist_poptorch.py
index df5b56f8..c5ca9e48 100644
--- a/simple_applications/pytorch/mnist/mnist_poptorch.py
+++ b/simple_applications/pytorch/mnist/mnist_poptorch.py
@@ -1,37 +1,108 @@
 #!/usr/bin/env python3
-# Copyright (c) 2020 Graphcore Ltd. All rights reserved.
-import argparse
-from tqdm import tqdm
+"""
+Copyright (c) 2020 Graphcore Ltd. All rights reserved.
+"""
+"""
+# PyTorch (PopTorch) MNIST Training Demo
+
+This example demonstrates how to train a network on the MNIST dataset using
+PopTorch. To learn more about PopTorch, see our [PyTorch for the IPU: User Guide](https://docs.graphcore.ai/projects/poptorch-user-guide/en/latest/index.html).
+"""
+"""
+## How to use this demo
+
+### Environment preparation
+
+Install the Poplar SDK following the instructions in the [Getting Started](https://docs.graphcore.ai/en/latest/getting-started.html)
+guide for your IPU system. Make sure to run the `enable.sh` scripts for Poplar
+and PopART and activate a Python3 virtualenv with PopTorch installed.
+
+Then install the package requirements:
+```bash
+pip install -r requirements.txt
+```
+"""
+"""
+### Setting hyperparameters
+Set the hyperparameters for this demo. If you're running this example in
+a Jupyter notebook and wish to modify them, re-run all the cells below.
+"""
+# Learning rate.
+learning_rate = 0.03
+
+# Number of epochs to train.
+epochs = 10
+
+# Batch size for training.
+batch_size = 8
+
+# Batch size for testing.
+test_batch_size = 80
+
+# Device iteration - batches per step. Number of iterations the device should
+# run over the data before returning to the user.
+# This is equivalent to running the IPU in a loop over the specified
+# number of iterations, with a new batch of data each time. However, increasing
+# deviceIterations is more efficient because the loop runs on the IPU directly.
+device_iterations = 50
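+"""
+Conceptually, `deviceIterations(n)` behaves like the host-side loop sketched
+below, except that the loop itself runs on the IPU, so there is no host
+round-trip between iterations (illustrative pseudocode only; `batches` and
+`run_on_ipu` are hypothetical names, not PopTorch API):
+
+```python
+# for _ in range(device_iterations):
+#     batch = next(batches)   # a fresh batch each iteration
+#     run_on_ipu(batch)       # executes on the device
+```
+"""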
+"""
+## Training a PopTorch model for MNIST classification
+"""
+"""
+Import required libraries:
+"""
+from tqdm.auto import tqdm
 import torch
 import torch.nn as nn
 import torchvision
 import poptorch
 import torch.optim as optim
+"""
+Download the datasets for MNIST and set up data loaders.
+Source: [The MNIST Database](http://yann.lecun.com/exdb/mnist/).
+"""
+local_dataset_path = '~/.torch/datasets'
 
-# The following is a workaround for pytorch issue #1938
-from six.moves import urllib
-opener = urllib.request.build_opener()
-opener.addheaders = [("User-agent", "Mozilla/5.0")]
-urllib.request.install_opener(opener)
-
+transform_mnist = torchvision.transforms.Compose(
+    [
+        torchvision.transforms.ToTensor(),
+        torchvision.transforms.Normalize((0.1307, ), (0.3081, ))
+    ]
+)
 
-def get_mnist_data(opts):
-    training_data = torch.utils.data.DataLoader(
-        torchvision.datasets.MNIST('mnist_data/', train=True, download=True,
-                                   transform=torchvision.transforms.Compose([
-                                       torchvision.transforms.ToTensor(),
-                                       torchvision.transforms.Normalize((0.1307, ), (0.3081, ))])),
-        batch_size=opts.batch_size * opts.batches_per_step, shuffle=True, drop_last=True)
+training_dataset = torchvision.datasets.MNIST(
+    local_dataset_path,
+    train=True,
+    download=True,
+    transform=transform_mnist
+)
 
-    validation_data = torch.utils.data.DataLoader(
-        torchvision.datasets.MNIST('mnist_data/', train=False, download=True,
-                                   transform=torchvision.transforms.Compose([
-                                       torchvision.transforms.ToTensor(),
-                                       torchvision.transforms.Normalize((0.1307, ), (0.3081, ))])),
-        batch_size=opts.test_batch_size, shuffle=True, drop_last=True)
-    return training_data, validation_data
+training_data = torch.utils.data.DataLoader(
+    training_dataset,
+    batch_size=batch_size * device_iterations,
+    shuffle=True,
+    drop_last=True
+)
 
+test_dataset = torchvision.datasets.MNIST(
+    local_dataset_path,
+    train=False,
+    download=True,
+    transform=transform_mnist
+)
 
+test_data = torch.utils.data.DataLoader(
+    test_dataset,
+    batch_size=test_batch_size,
+    shuffle=True,
+    drop_last=True
+)
+# sst_hide_output
+"""
+Let's define the elements of our neural network. We first create a `Block`
+instance consisting of a 2D convolutional layer with pooling, followed by
+a ReLU activation.
+"""
 class Block(nn.Module):
     def __init__(self, in_channels, num_filters, kernel_size, pool_size):
         super(Block, self).__init__()
@@ -46,8 +117,9 @@ def forward(self, x):
         x = self.pool(x)
         x = self.relu(x)
         return x
-
-
+"""
+Now, let's construct our neural network.
+"""
 class Network(nn.Module):
     def __init__(self):
         super(Network, self).__init__()
@@ -57,7 +129,6 @@ def __init__(self):
         self.layer3_act = nn.ReLU()
         self.layer3_dropout = torch.nn.Dropout(0.5)
         self.layer4 = nn.Linear(128, 10)
-        self.softmax = nn.Softmax(1)
 
     def forward(self, x):
         x = self.layer1(x)
@@ -66,10 +137,16 @@ def forward(self, x):
         x = x.view(-1, 1600)
         x = self.layer3_act(self.layer3(x))
         x = self.layer4(self.layer3_dropout(x))
-        x = self.softmax(x)
         return x
 
+"""
+Next we define a thin wrapper around the `torch.nn.Module` that will use
+the cross-entropy loss function.
+
+This class creates a custom module to compose the Neural Network and
+the Cross Entropy module into one object, which under the hood will invoke
+the `__call__` function on `nn.Module` and consequently the `forward` method.
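+
+For illustration, once `model_with_loss` is created below, a call like the
+following dispatches through `nn.Module.__call__` to `forward`, returning
+both the predictions and the loss (a sketch with hypothetical placeholder
+tensors, kept commented out):
+
+```python
+# x = torch.randn(8, 1, 28, 28)    # dummy MNIST-shaped batch
+# y = torch.randint(0, 10, (8,))   # dummy labels
+# output, loss = model_with_loss(x, y)
+```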
+""" class TrainingModelWithLoss(torch.nn.Module): def __init__(self, model): super().__init__() @@ -78,68 +155,102 @@ def __init__(self, model): def forward(self, args, loss_inputs=None): output = self.model(args) - if loss_inputs is None: - return output - else: - loss = self.loss(output, loss_inputs) - return output, loss - - + loss = self.loss(output, loss_inputs) + return output, loss +""" +Let's initialise the neural network from our defined classes. +""" +model = Network() +model_with_loss = TrainingModelWithLoss(model) +model_opts = poptorch.Options().deviceIterations(device_iterations) +""" +Next we will set the `AnchorMode` for our training. By default, PopTorch will +return to the host machine only a limited set of information for performance +reasons. This is represented by having `AnchorMode.Final` as the default, which +means that only the final batch of the internal loop is returned to the host. +When inspecting the training performance as it is executing, values like +accuracy or losses will then be calculated only for that last batch, +specifically the `batch_size` out of the whole step which is +`batch_size*device_iterations`. +We can set this to `AnchorMode.All` to be able to present the full information. +This has an impact on the speed of training, due to overhead of transferring +more data to the host machine. +To learn about all values for `AnchorMode`, please see the [PopTorch API documentation](https://docs.graphcore.ai/projects/poptorch-user-guide/en/latest/reference.html?highlight=anchorMode#poptorch.Options.anchorMode). +""" +model_opts = model_opts.anchorMode(poptorch.AnchorMode.All) +""" +We can check if the model is assembled correctly by printing the string +representation of the model object. +""" +print(model_with_loss) +""" +Now we apply the model wrapping function, which will perform a shallow copy +of the PyTorch model. To train the model, we will use [SGD](https://docs.graphcore.ai/projects/poptorch-user-guide/en/latest/reference.html#poptorch.optim.SGD), +the Stochastic Gradient Descent with no momentum. +""" +training_model = poptorch.trainingModel( + model_with_loss, + model_opts, + optimizer=optim.SGD(model.parameters(), lr=learning_rate) +) +""" +We are ready to start training. However to track the accuracy while training +we need to define one more helper function. During the training, not every +samples prediction is returned for efficiency reasons, so this helper function +will check accuracy for labels where prediction is available. This behavior +is controlled by setting `AnchorMode` in `poptorch.Options()`. +""" def accuracy(predictions, labels): _, ind = torch.max(predictions, 1) - # provide labels only for samples, where prediction is available (during the training, not every samples prediction is returned for efficiency reasons) labels = labels[-predictions.size()[0]:] - accuracy = torch.sum(torch.eq(ind, labels)).item() / labels.size()[0] * 100.0 + accuracy = \ + torch.sum(torch.eq(ind, labels)).item() / labels.size()[0] * 100.0 return accuracy +""" +This code will perform the training over the requested amount of epochs +and batches using the configured Graphcore IPUs. 
+""" +nr_batches = len(training_data) - -def train(training_model, training_data, opts): - nr_batches = len(training_data) - for epoch in range(1, opts.epochs+1): - print("Epoch {0}/{1}".format(epoch, opts.epochs)) - bar = tqdm(training_data, total=nr_batches) +for epoch in tqdm(range(1, epochs+1), leave=True, desc="Epochs", total=epochs): + with tqdm(training_data, total=nr_batches, leave=False) as bar: for data, labels in bar: preds, losses = training_model(data, labels) - with torch.no_grad(): - mean_loss = torch.mean(losses).item() - acc = accuracy(preds, labels) - bar.set_description("Loss:{:0.4f} | Accuracy:{:0.2f}%".format(mean_loss, acc)) - - -def test(inference_model, test_data): - nr_batches = len(test_data) - sum_acc = 0.0 - with torch.no_grad(): - for data, labels in tqdm(test_data, total=nr_batches): - output = inference_model(data) - sum_acc += accuracy(output, labels) - print("Accuracy on test set: {:0.2f}%".format(sum_acc / len(test_data))) - - -if __name__ == '__main__': - parser = argparse.ArgumentParser(description='MNIST training in PopTorch') - parser.add_argument('--batch-size', type=int, default=8, help='batch size for training (default: 8)') - parser.add_argument('--batches-per-step', type=int, default=50, help='device iteration (default:50)') - parser.add_argument('--test-batch-size', type=int, default=80, help='batch size for testing (default: 80)') - parser.add_argument('--epochs', type=int, default=10, help='number of epochs to train (default: 10)') - parser.add_argument('--lr', type=float, default=0.05, help='learning rate (default: 0.05)') - opts = parser.parse_args() - - training_data, test_data = get_mnist_data(opts) - model = Network() - model_with_loss = TrainingModelWithLoss(model) - model_opts = poptorch.Options().deviceIterations(opts.batches_per_step) - training_model = poptorch.trainingModel(model_with_loss, model_opts, optimizer=optim.SGD(model.parameters(), lr=opts.lr)) - - inference_model = poptorch.inferenceModel(model) - - # run training, on IPU - train(training_model, training_data, opts) - - # Update the weights in model by copying from the training IPU. This updates (model.parameters()) - training_model.copyWeightsToHost() - - # Check validation loss on IPU once trained. Because PopTorch will be compiled on first call the - # weights in model.parameters() will be copied implicitly. Subsequent calls will need to call - # inference_model.copyWeightsToDevice() - test(inference_model, test_data) + + mean_loss = torch.mean(losses).item() + + acc = accuracy(preds, labels) + bar.set_description( + "Loss: {:0.4f} | Accuracy: {:05.2F}% ".format(mean_loss, acc) + ) +# sst_hide_output +""" +Release resources: +""" +training_model.detachFromDevice() +""" +## Evaluating the trained model + +Let's check the validation loss on IPU using the trained model. The weights +in `model.parameters()` will be copied from the IPU to the host. The weights +from the trained model will be reused to compile the new inference model. +""" +inference_model = poptorch.inferenceModel(model) +""" +Perform validation. 
+""" +nr_batches = len(test_data) +sum_acc = 0.0 +with tqdm(test_data, total=nr_batches, leave=False) as bar: + for data, labels in bar: + output = inference_model(data) + sum_acc += accuracy(output, labels) +# sst_hide_output +""" +Finally the accuracy on the test set is: +""" +print("Accuracy on test set: {:0.2f}%".format(sum_acc / len(test_data))) +""" +Release resources: +""" +inference_model.detachFromDevice() diff --git a/simple_applications/pytorch/mnist/mnist_poptorch_code_only.py b/simple_applications/pytorch/mnist/mnist_poptorch_code_only.py new file mode 100644 index 00000000..4f0c80c8 --- /dev/null +++ b/simple_applications/pytorch/mnist/mnist_poptorch_code_only.py @@ -0,0 +1,158 @@ +# Copyright (c) 2020 Graphcore Ltd. All rights reserved. +# Learning rate. +learning_rate = 0.03 + +# Number of epochs to train. +epochs = 10 + +# Batch size for training. +batch_size = 8 + +# Batch size for testing. +test_batch_size = 80 + +# Device iteration - batches per step. Number of iterations the device should +# run over the data before returning to the user. +# This is equivalent to running the IPU in a loop over that the specified +# number of iterations, with a new batch of data each time. However, increasing +# deviceIterations is more efficient because the loop runs on the IPU directly. +device_iterations = 50 + +from tqdm.auto import tqdm +import torch +import torch.nn as nn +import torchvision +import poptorch +import torch.optim as optim + +local_dataset_path = '~/.torch/datasets' + +transform_mnist = torchvision.transforms.Compose( + [ + torchvision.transforms.ToTensor(), + torchvision.transforms.Normalize((0.1307, ), (0.3081, )) + ] +) + +training_dataset = torchvision.datasets.MNIST( + local_dataset_path, + train=True, + download=True, + transform=transform_mnist +) + +training_data = torch.utils.data.DataLoader( + training_dataset, + batch_size=batch_size * device_iterations, + shuffle=True, + drop_last=True +) + +test_dataset = torchvision.datasets.MNIST( + local_dataset_path, + train=False, + download=True, + transform=transform_mnist +) + +test_data = torch.utils.data.DataLoader( + test_dataset, + batch_size=test_batch_size, + shuffle=True, + drop_last=True +) + +class Block(nn.Module): + def __init__(self, in_channels, num_filters, kernel_size, pool_size): + super(Block, self).__init__() + self.conv = nn.Conv2d(in_channels, + num_filters, + kernel_size=kernel_size) + self.pool = nn.MaxPool2d(kernel_size=pool_size) + self.relu = nn.ReLU() + + def forward(self, x): + x = self.conv(x) + x = self.pool(x) + x = self.relu(x) + return x + +class Network(nn.Module): + def __init__(self): + super(Network, self).__init__() + self.layer1 = Block(1, 32, 3, 2) + self.layer2 = Block(32, 64, 3, 2) + self.layer3 = nn.Linear(1600, 128) + self.layer3_act = nn.ReLU() + self.layer3_dropout = torch.nn.Dropout(0.5) + self.layer4 = nn.Linear(128, 10) + + def forward(self, x): + x = self.layer1(x) + x = self.layer2(x) + # Flatten layer + x = x.view(-1, 1600) + x = self.layer3_act(self.layer3(x)) + x = self.layer4(self.layer3_dropout(x)) + return x + +class TrainingModelWithLoss(torch.nn.Module): + def __init__(self, model): + super().__init__() + self.model = model + self.loss = torch.nn.CrossEntropyLoss() + + def forward(self, args, loss_inputs=None): + output = self.model(args) + loss = self.loss(output, loss_inputs) + return output, loss + +model = Network() +model_with_loss = TrainingModelWithLoss(model) +model_opts = poptorch.Options().deviceIterations(device_iterations) + +model_opts = 
model_opts.anchorMode(poptorch.AnchorMode.All) + +print(model_with_loss) + +training_model = poptorch.trainingModel( + model_with_loss, + model_opts, + optimizer=optim.SGD(model.parameters(), lr=learning_rate) +) + +def accuracy(predictions, labels): + _, ind = torch.max(predictions, 1) + labels = labels[-predictions.size()[0]:] + accuracy = \ + torch.sum(torch.eq(ind, labels)).item() / labels.size()[0] * 100.0 + return accuracy + +nr_batches = len(training_data) + +for epoch in tqdm(range(1, epochs+1), leave=True, desc="Epochs", total=epochs): + with tqdm(training_data, total=nr_batches, leave=False) as bar: + for data, labels in bar: + preds, losses = training_model(data, labels) + + mean_loss = torch.mean(losses).item() + + acc = accuracy(preds, labels) + bar.set_description( + "Loss: {:0.4f} | Accuracy: {:05.2F}% ".format(mean_loss, acc) + ) + +training_model.detachFromDevice() + +inference_model = poptorch.inferenceModel(model) + +nr_batches = len(test_data) +sum_acc = 0.0 +with tqdm(test_data, total=nr_batches, leave=False) as bar: + for data, labels in bar: + output = inference_model(data) + sum_acc += accuracy(output, labels) + +print("Accuracy on test set: {:0.2f}%".format(sum_acc / len(test_data))) + +inference_model.detachFromDevice()