from tqdm import tqdm
import numpy as np
import torch as th
from torch import nn
import gym
import minerl
"""
Your task: Implement behavioural cloning for MineRLTreechop-v0.
Behavioural cloning is perhaps the simplest way of using a dataset of demonstrations to train an agent:
learn to predict what actions they would take, and take those actions.
In other machine learning terms, this is almost like building a classifier to classify observations to
different actions, and taking those actions.
For simplicity, we build a limited set of actions ("agent actions"), map dataset actions to these actions
and train on the agent actions. During evaluation, we transform these agent actions (integerse) back into
MineRL actions (dictionaries).
To do this task, fill in the "TODO"s and remove `raise NotImplementedError`s.
Note: For this task you need to download the "MineRLTreechop-v0" dataset. See here:
https://minerl.readthedocs.io/en/latest/tutorials/data_sampling.html#downloading-the-minerl-dataset-with-minerl-data-download
"""


class ConvNet(nn.Module):
    """
    :param input_shape: A three-item tuple telling image dimensions in (C, H, W)
    :param output_dim: Dimensionality of the output vector
    """

    def __init__(self, input_shape, output_dim):
        super().__init__()
        # TODO Create a torch neural network here to turn images (of shape `input_shape`) into
        #      a vector of shape `output_dim`. This output_dim matches the number of available actions.
        #      See examples of CNN networks here: https://pytorch.org/tutorials/beginner/nn_tutorial.html#switch-to-cnn
        raise NotImplementedError("TODO implement a simple convolutional neural network here")

    def forward(self, observations: th.Tensor) -> th.Tensor:
        # TODO with the layers you created in __init__, transform the `observations` (a tensor of shape (B, C, H, W))
        #      into a tensor of shape (B, D), where D is the `output_dim`
        raise NotImplementedError("TODO implement forward function of the neural network")


def agent_action_to_environment(noop_action, agent_action):
    """
    Turn an agent action (an integer) into an environment action.

    This should match `environment_action_batch_to_agent_actions`,
    e.g. if attack=1 action was mapped to agent_action=0, then agent_action=0
    should be mapped back to attack=1.

    noop_action is a MineRL action that does nothing. You may want to
    use this as a template for the action you return.
    """
    raise NotImplementedError("TODO implement agent_action_to_environment (see docstring)")


def environment_action_batch_to_agent_actions(dataset_actions):
    """
    Turn a batch of actions from the environment (from BufferedBatchIter) into a numpy
    array of agent actions.

    Agent actions _have to_ start from 0 and go up from there!

    For MineRLTreechop, you want to have actions for the following at the very least:
    - Forward movement
    - Jumping
    - Turning camera left, right, up and down
    - Attack

    For example, you could have seven agent actions that mean the following:
    0 = forward
    1 = jump
    2 = turn camera left
    3 = turn camera right
    4 = turn camera up
    5 = turn camera down
    6 = attack

    This should match `agent_action_to_environment`, by converting dictionary
    actions into individual integers.

    If a dataset action (dict) does not have a mapping to an agent action (int),
    set it to -1.
    """
    # There are dummy dimensions of shape one
    batch_size = len(dataset_actions["camera"])
    actions = np.zeros((batch_size,), dtype=np.int64)

    for i in range(batch_size):
        # TODO this will make all actions invalid. Replace with something
        #      more clever
        actions[i] = -1
        raise NotImplementedError("TODO map dataset action at index i to an agent action, or if no mapping, -1")
    return actions
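
# One possible sketch of the per-sample mapping inside the loop (an assumption: a 5-degree
# threshold on camera movement and the same 0-6 ordering as in the docstring; the camera
# indexing/sign conventions are a guess and must match `agent_action_to_environment`):
#
#     camera = dataset_actions["camera"][i].squeeze()
#     if camera[1] < -5:
#         actions[i] = 2          # turn camera left
#     elif camera[1] > 5:
#         actions[i] = 3          # turn camera right
#     elif camera[0] < -5:
#         actions[i] = 4          # turn camera up
#     elif camera[0] > 5:
#         actions[i] = 5          # turn camera down
#     elif dataset_actions["attack"][i] == 1:
#         actions[i] = 6
#     elif dataset_actions["jump"][i] == 1:
#         actions[i] = 1
#     elif dataset_actions["forward"][i] == 1:
#         actions[i] = 0
#     else:
#         actions[i] = -1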


def train():
    # Path to where MineRL dataset resides (should contain "MineRLTreechop-v0" directory)
    DATA_DIR = "."
    # How many times we train over the dataset and how large batches we use.
    # Larger batch size takes more memory but generally provides more stable learning.
    EPOCHS = 1
    BATCH_SIZE = 32

    # TODO create data iterators for going over MineRL data using BufferedBatchIter
    #      https://minerl.readthedocs.io/en/latest/tutorials/data_sampling.html#sampling-the-dataset-with-buffered-batch-iter
    # NOTE: You have to download the Treechop dataset first for this to work, see:
    #       https://minerl.readthedocs.io/en/latest/tutorials/data_sampling.html#downloading-the-minerl-dataset-with-minerl-data-download
    raise NotImplementedError("TODO create dataset samplers")
    iterator = None

    number_of_actions = None
    # TODO we need to tell the network how many possible actions there are,
    #      so assign the value to the variable above
    raise NotImplementedError("TODO add number of actions to `number_of_actions`")

    network = ConvNet((3, 64, 64), number_of_actions).cuda()

    # TODO create optimizer and loss functions for training
    #      see examples here: https://pytorch.org/tutorials/beginner/basics/optimization_tutorial.html
    raise NotImplementedError("TODO Create an optimizer and a loss function.")
    optimizer = None
    loss_function = None
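    # For example (one possible choice, not the only valid one):
    #     optimizer = th.optim.Adam(network.parameters(), lr=1e-4)
    #     loss_function = nn.CrossEntropyLoss()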

    iter_count = 0
    losses = []
    for dataset_obs, dataset_actions, _, _, _ in tqdm(iterator.buffered_batch_iter(num_epochs=EPOCHS, batch_size=BATCH_SIZE)):
        # We only use camera observations here
        obs = dataset_obs["pov"].astype(np.float32)
        # Transpose observations to be channel-first (BCHW instead of BHWC)
        obs = obs.transpose(0, 3, 1, 2)
        # Normalize observations, otherwise the neural network will get spooked
        obs /= 255.0

        # Turn dataset actions into agent actions
        actions = environment_action_batch_to_agent_actions(dataset_actions)
        assert actions.shape == (obs.shape[0],), "Array from environment_action_batch_to_agent_actions should be of shape {}".format((obs.shape[0],))

        # Remove samples that had no corresponding action
        mask = actions != -1
        obs = obs[mask]
        actions = actions[mask]

        # TODO perform optimization step:
        #      - Predict actions using the neural network (input is `obs`)
        #      - Compute loss with the predictions and true actions. Store loss into variable `loss`
        #      - Use optimizer to do a single update step
        #      See https://pytorch.org/tutorials/beginner/basics/optimization_tutorial.html for a tutorial
        # NOTE: Variables `obs` and `actions` are numpy arrays. You need to convert them into torch tensors.
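        # One possible sketch of the update step (an assumption; it uses the `network`,
        # `optimizer` and `loss_function` created above):
        #     obs = th.from_numpy(obs).cuda()
        #     actions = th.from_numpy(actions).long().cuda()
        #     logits = network(obs)
        #     loss = loss_function(logits, actions)
        #     optimizer.zero_grad()
        #     loss.backward()
        #     optimizer.step()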

        # Keep track of how training is going by printing out the loss
        iter_count += 1
        losses.append(loss.item())
        if (iter_count % 1000) == 0:
            mean_loss = sum(losses) / len(losses)
            tqdm.write("Iteration {}. Loss {:<10.3f}".format(iter_count, mean_loss))
            losses.clear()

    # Store the network
    th.save(network, "behavioural_cloning.pth")


def enjoy():
    # Load up the trained network
    network = th.load("behavioural_cloning.pth").cuda()

    env = gym.make('MineRLTreechop-v0')

    # Play 10 games with the model
    for game_i in range(10):
        obs = env.reset()
        done = False
        reward_sum = 0
        while not done:
            # TODO Process the observation:
            #      - Take only the camera observation
            #      - Add/remove batch dimensions
            #      - Transpose the image (needs to be channels-first, as during training)
            #      - Normalize the image
            #      - Store network output to `logits`
            #      For hints, see what preprocessing was done during training
            raise NotImplementedError("TODO process the observation and run it through network")
            logits = None
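            # One possible sketch (an assumption; it mirrors the training preprocessing
            # and assumes the network lives on the GPU, as in train()):
            #     pov = obs["pov"].astype(np.float32).transpose(2, 0, 1) / 255.0
            #     pov = th.from_numpy(pov[None]).cuda()
            #     logits = network(pov)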

            # Turn logits into probabilities
            probabilities = th.softmax(logits, dim=1)[0]
            # Into numpy
            probabilities = probabilities.detach().cpu().numpy()

            # TODO Pick an action based on the probabilities above.
            #      The `probabilities` vector tells the probability of choosing each of the agent actions.
            #      You have two options:
            #      1) Pick the action with the highest probability
            #      2) Sample an action based on the probabilities
            #      Option 2 works better empirically.
            agent_action = None
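            # For example, sampling (option 2) could look like this (an assumption, not the only way):
            #     agent_action = np.random.choice(len(probabilities), p=probabilities)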

            noop_action = env.action_space.noop()
            environment_action = agent_action_to_environment(noop_action, agent_action)
            obs, reward, done, info = env.step(environment_action)
            reward_sum += reward

        print("Game {}, total reward {}".format(game_i, reward_sum))
    env.close()


if __name__ == "__main__":
    # First train the model...
    train()
    # ... then play it on the environment to see how it does
    enjoy()