
Commit bd53a22

committed Oct 2, 2017
ddpg on torcs
1 parent d252ede commit bd53a22

11 files changed, +1237 −0 lines
 

‎__init__.py

Whitespace-only changes.

‎autostart.sh

+12
@@ -0,0 +1,12 @@
#!/bin/bash
# Simulate the key presses needed to start a race from the TORCS menu
# (requires xte from the xautomation package).
xte 'key Return'
xte 'usleep 100000'
xte 'key Return'
xte 'usleep 100000'
xte 'key Up'
xte 'usleep 100000'
xte 'key Up'
xte 'usleep 100000'
xte 'key Return'
xte 'usleep 100000'
xte 'key Return'

‎ddpg_actor.h5

769 KB
Binary file not shown.

‎ddpg_critic.h5

2.13 MB
Binary file not shown.

‎example_1.py

+29
@@ -0,0 +1,29 @@
from gym_torcs import TorcsEnv

env = TorcsEnv(vision=False, throttle=True, gear_change=False)

print('testing environment')

for i in range(10):
    print("episode: ", i)

    if i % 3 == 0:
        env.reset(relaunch=True)
    else:
        env.reset()

    for step in range(100):
        # action = np.random.random(3)
        # print(action)
        action = [0.2, 1, 0]
        observe, reward, done, info = env.step(action)

        # print(observe.rpm)
        # print(observe.wheelSpinVel)
        # print(observe.track, done)
        # trackPos is normalized to roughly -1 ~ 1
        # print('the distance between car and track: ', observe.trackPos)
        # angle is normalized: -1 is -180 degrees, +1 is +180 degrees
        # print('the angle between car and track: ', observe.angle)
        # print("x speed of car: ", observe.speedX, " y speed of car: ",
        #       observe.speedY, " z speed of car: ", observe.speedZ)
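With throttle=True and gear_change=False, env.step() expects a three-element action. A quick sketch of the layout, with illustrative values, mirroring agent_to_torcs() in gym_torcs.py below:

# Sketch only: action layout for TorcsEnv(throttle=True, gear_change=False).
steer = 0.2   # in [-1, 1], matching the actor's tanh output
accel = 1.0   # in [0, 1], matching the actor's sigmoid output
brake = 0.0   # in [0, 1]
action = [steer, accel, brake]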

‎example_2.py

+59
@@ -0,0 +1,59 @@
from gym_torcs import TorcsEnv
from collections import deque
import numpy as np
from keras.layers import Dense, Input, Concatenate
from keras.initializers import RandomNormal
from keras.models import Model


class DDPGAgent:
    def __init__(self):
        self.actor = self.build_actor()
        self.memory = deque(maxlen=100000)

    def build_actor(self):
        print("build actor network")
        state_input = Input(shape=[29])
        h1 = Dense(300, activation='relu')(state_input)
        h2 = Dense(600, activation='relu')(h1)
        # near-zero initial weights for the output layers
        init = RandomNormal(mean=0.0, stddev=1e-4)
        steer = Dense(1, activation='tanh', kernel_initializer=init)(h2)
        accel = Dense(1, activation='sigmoid', kernel_initializer=init)(h2)
        brake = Dense(1, activation='sigmoid', kernel_initializer=init)(h2)
        action = Concatenate()([steer, accel, brake])
        actor = Model(inputs=state_input, outputs=action)
        return actor

    def get_action(self, state):
        action = self.actor.predict(state)[0]
        return action


agent = DDPGAgent()
env = TorcsEnv(vision=False, throttle=True, gear_change=False)

print('testing sample agent on torcs')

for i in range(10):
    if i % 3 == 0:
        observe = env.reset(relaunch=True)
    else:
        observe = env.reset()

    state = np.hstack((observe.angle, observe.track, observe.trackPos,
                       observe.speedX, observe.speedY, observe.speedZ,
                       observe.wheelSpinVel / 100.0, observe.rpm))
    state = np.reshape(state, [1, np.shape(state)[0]])
    done = False

    while not done:
        action = agent.get_action(state)
        observe, reward, done, info = env.step(action)
        next_state = np.hstack((observe.angle, observe.track, observe.trackPos,
                                observe.speedX, observe.speedY, observe.speedZ,
                                observe.wheelSpinVel / 100.0, observe.rpm))
        next_state = np.reshape(next_state, [1, np.shape(next_state)[0]])

        state = next_state
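For reference, the 29-dimensional state assembled above breaks down as follows; the counts assume the standard TORCS sensors (19 track range finders, 4 wheel-spin values):

# Sketch only: where the 29 state dimensions come from.
state_layout = {
    'angle': 1,          # heading relative to the track axis
    'track': 19,         # range-finder distances to the track edge
    'trackPos': 1,       # lateral offset from the track centre
    'speedX/Y/Z': 3,     # longitudinal, lateral and vertical speed
    'wheelSpinVel': 4,   # wheel rotation speeds (scaled by 1/100 above)
    'rpm': 1,            # engine revolutions
}
assert sum(state_layout.values()) == 29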

‎example_ddpg.py

+197
@@ -0,0 +1,197 @@
from gym_torcs import TorcsEnv
from collections import deque
import numpy as np
from keras.layers import Dense, Input, Add, Concatenate
from keras.optimizers import Adam
from keras.models import Model
from keras import backend as K
import tensorflow as tf
import random


def ou_noise(x, mu, theta, sigma):
    # Ornstein-Uhlenbeck process: drift toward mu plus Gaussian diffusion
    return theta * (mu - x) + sigma * np.random.randn(1)


def normal(shape, scale=0.05, name=None):
    return K.variable(np.random.normal(loc=0.0, scale=scale, size=shape),
                      name=name)


class DDPGAgent:
    def __init__(self):
        self.action_size = 3
        self.state_size = 29

        self.actor, self.actor_weight = self.build_actor()
        self.actor_target, self.actor_target_weight = self.build_actor()
        self.critic, self.critic_state, self.critic_action = self.build_critic()
        self.critic_target, _, _ = self.build_critic()

        # actor optimizer: ascend the critic's action gradient
        self.action_grads = K.placeholder(shape=[None, self.action_size])
        params_grad = tf.gradients(self.actor.output, self.actor_weight,
                                   -self.action_grads)
        grads = zip(params_grad, self.actor_weight)
        self.optimize = tf.train.AdamOptimizer(0.0001).apply_gradients(grads)

        self.memory = deque(maxlen=100000)
        self.batch_size = 32
        self.discount_factor = 0.99
        self.tau = 0.001
        self.epsilon = 1
        self.epsilon_decay = 1 / 100000

        self.sess = tf.Session()
        K.set_session(self.sess)
        self.sess.run(tf.global_variables_initializer())

    def build_actor(self):
        print("building actor network")
        state_input = Input(shape=[self.state_size])
        h1 = Dense(300, activation='relu')(state_input)
        h2 = Dense(600, activation='relu')(h1)
        steer = Dense(1, activation='tanh')(h2)
        accel = Dense(1, activation='sigmoid')(h2)
        brake = Dense(1, activation='sigmoid')(h2)
        action = Concatenate()([steer, accel, brake])
        actor = Model(inputs=state_input, outputs=action)
        return actor, actor.trainable_weights

    '''
    def actor_optimizer(self):
        self.action_grads = K.placeholder(shape=[None, self.action_size])
        # loss = -self.actor.output * action_grads
        params_grad = tf.gradients(self.actor.output, self.actor_weight,
                                   -self.action_grads)
        grads = zip(params_grad, self.actor_weight)
        self.optimize = tf.train.AdamOptimizer(0.0001).apply_gradients(grads)
        # optimizer = Adam(lr=0.0001)
        # updates = optimizer.get_updates(self.actor_weight, [], loss)
        # train = K.function([self.actor.input, action_grads], [], updates=updates)
        # return train
    '''

    def update_actor(self, states, gradient):
        self.sess.run(self.optimize, feed_dict={
            self.actor.input: states,
            self.action_grads: gradient
        })

    def build_critic(self):
        print("building critic network")
        state = Input(shape=[29])
        action = Input(shape=[3], name='action_input')
        w1 = Dense(300, activation='relu')(state)
        h1 = Dense(600, activation='linear')(w1)
        a1 = Dense(600, activation='linear')(action)
        h2 = Add()([h1, a1])
        h3 = Dense(600, activation='relu')(h2)
        V = Dense(1, activation='linear')(h3)
        model = Model(inputs=[state, action], outputs=V)
        model.compile(loss='mse', optimizer=Adam(lr=0.001))
        # model.summary()
        return model, state, action

    def get_action(self, state):
        self.epsilon -= self.epsilon_decay
        noise = np.zeros([self.action_size])
        action = self.actor.predict(state)[0]
        # exploration noise, annealed by epsilon: steer / accel / brake
        noise[0] = max(self.epsilon, 0) * ou_noise(action[0], 0.0, 0.60, 0.30)
        noise[1] = max(self.epsilon, 0) * ou_noise(action[1], 0.5, 1.00, 0.10)
        noise[2] = max(self.epsilon, 0) * ou_noise(action[2], -0.1, 1.00, 0.05)
        real = action + noise
        return real

    def save_sample(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))

    def train_model(self):
        mini_batch = random.sample(self.memory, self.batch_size)

        states = np.asarray([e[0] for e in mini_batch])
        actions = np.asarray([e[1] for e in mini_batch])
        rewards = np.asarray([e[2] for e in mini_batch])
        next_states = np.asarray([e[3] for e in mini_batch])
        dones = np.asarray([e[4] for e in mini_batch])

        # bootstrapped critic targets from the target networks
        target_q_values = self.critic_target.predict(
            [next_states, self.actor_target.predict(next_states)])

        targets = np.zeros([self.batch_size, 1])
        for i in range(self.batch_size):
            if dones[i]:
                targets[i] = rewards[i]
            else:
                targets[i] = rewards[i] + self.discount_factor * target_q_values[i]

        loss = 0
        loss += self.critic.train_on_batch([states, actions], targets)

        # actor update: feed the critic's dQ/da back through the actor
        a_for_grad = self.actor.predict(states)
        action_grads = tf.gradients(self.critic.output, self.critic_action)
        grads = self.sess.run(action_grads, feed_dict={
            self.critic_state: states, self.critic_action: a_for_grad})[0]
        self.update_actor(states, grads)

        # soft update of the target networks
        actor_weights = self.actor.get_weights()
        actor_target_weights = self.actor_target.get_weights()
        for i in range(len(actor_weights)):
            actor_target_weights[i] = self.tau * actor_weights[i] + \
                (1 - self.tau) * actor_target_weights[i]
        self.actor_target.set_weights(actor_target_weights)

        critic_weights = self.critic.get_weights()
        critic_target_weights = self.critic_target.get_weights()
        for i in range(len(critic_weights)):
            critic_target_weights[i] = self.tau * critic_weights[i] + \
                (1 - self.tau) * critic_target_weights[i]
        self.critic_target.set_weights(critic_target_weights)


agent = DDPGAgent()
env = TorcsEnv(vision=False, throttle=True, gear_change=False)

print('testing sample agent on torcs')
global_step = 0

for e in range(2000):
    step = 0
    score = 0
    if e % 10 == 0:
        observe = env.reset(relaunch=True)
        print("saving model weights")
        agent.actor.save_weights("ddpg_actor.h5", overwrite=True)
        agent.critic.save_weights("ddpg_critic.h5", overwrite=True)
    else:
        observe = env.reset()

    state = np.hstack((observe.angle, observe.track, observe.trackPos,
                       observe.speedX, observe.speedY, observe.speedZ,
                       observe.wheelSpinVel / 100.0, observe.rpm))
    done = False

    while not done:
        step += 1
        global_step += 1
        action = agent.get_action(state.reshape(1, state.shape[0]))
        observe, reward, done, info = env.step(action)
        score += reward
        next_state = np.hstack((observe.angle, observe.track, observe.trackPos,
                                observe.speedX, observe.speedY, observe.speedZ,
                                observe.wheelSpinVel / 100.0, observe.rpm))

        agent.save_sample(state, action, reward, next_state, done)

        if global_step > 1000:
            agent.train_model()

        # print(' step: ', step, ' action: ', action, ' reward: ', reward)
        state = next_state

        if done:
            print('episode: ', e, ' score: ', score, ' step: ', global_step,
                  ' epsilon: ', agent.epsilon)
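For reference, train_model() above implements the standard DDPG updates (Lillicrap et al.), with \gamma = discount_factor = 0.99 and \tau = tau = 0.001:

y_i = r_i + \gamma\, Q'\!\big(s_{i+1}, \mu'(s_{i+1})\big) \quad (y_i = r_i \text{ at terminal transitions})

L_{\text{critic}} = \tfrac{1}{N} \textstyle\sum_i \big(y_i - Q(s_i, a_i)\big)^2

\nabla_{\theta^\mu} J \approx \tfrac{1}{N} \textstyle\sum_i \nabla_a Q(s, a)\big|_{s=s_i,\,a=\mu(s_i)}\, \nabla_{\theta^\mu}\mu(s)\big|_{s_i}

\theta^{\text{target}} \leftarrow \tau\,\theta + (1 - \tau)\,\theta^{\text{target}} \quad \text{(soft update for both actor and critic)}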

‎gym_torcs.py

+302
@@ -0,0 +1,302 @@
import gym
from gym import spaces
import numpy as np
# from os import path
import snakeoil3_gym as snakeoil3
import copy
import collections as col
import os
import time


class TorcsEnv:
    terminal_judge_start = 100  # episode may terminate if there is still no progress after 100 timesteps
    termination_limit_progress = 5  # [km/h], episode terminates if the car runs slower than this limit
    default_speed = 50

    initial_reset = True

    def __init__(self, vision=False, throttle=False, gear_change=False):
        self.vision = vision
        self.throttle = throttle
        self.gear_change = gear_change

        self.initial_run = True

        # print("launch torcs")
        os.system('pkill torcs')
        time.sleep(0.5)
        if self.vision is True:
            os.system('torcs -nofuel -nodamage -nolaptime -vision &')
        else:
            os.system('torcs -nofuel -nolaptime &')
        time.sleep(0.5)
        os.system('sh autostart.sh')
        time.sleep(0.5)

        """
        # Modify here if you use multiple tracks in the environment
        self.client = snakeoil3.Client(p=3101, vision=self.vision)  # Open new UDP in vtorcs
        self.client.MAX_STEPS = np.inf

        client = self.client
        client.get_servers_input()  # Get the initial input from torcs

        obs = client.S.d  # Get the current full observation from torcs
        """
        if throttle is False:
            self.action_space = spaces.Box(low=-1.0, high=1.0, shape=(1,))
        else:
            self.action_space = spaces.Box(low=-1.0, high=1.0, shape=(2,))

        if vision is False:
            high = np.array(
                [1., np.inf, np.inf, np.inf, 1., np.inf, 1., np.inf])
            low = np.array(
                [0., -np.inf, -np.inf, -np.inf, 0., -np.inf, 0., -np.inf])
            self.observation_space = spaces.Box(low=low, high=high)
        else:
            high = np.array(
                [1., np.inf, np.inf, np.inf, 1., np.inf, 1., np.inf, 255])
            low = np.array(
                [0., -np.inf, -np.inf, -np.inf, 0., -np.inf, 0., -np.inf, 0])
            self.observation_space = spaces.Box(low=low, high=high)

    def step(self, u):
        # print("Step")
        # convert the agent's action to the actual torcs action dict
        client = self.client

        this_action = self.agent_to_torcs(u)

        # Apply Action
        action_torcs = client.R.d

        # Steering
        action_torcs['steer'] = this_action['steer']  # in [-1, 1]

        # Simple Automatic Throttle Control by Snakeoil
        if self.throttle is False:
            target_speed = self.default_speed
            if client.S.d['speedX'] < target_speed - (client.R.d['steer'] * 50):
                client.R.d['accel'] += .01
            else:
                client.R.d['accel'] -= .01

            if client.R.d['accel'] > 0.2:
                client.R.d['accel'] = 0.2

            if client.S.d['speedX'] < 10:
                client.R.d['accel'] += 1 / (client.S.d['speedX'] + .1)

            # Traction Control System
            if ((client.S.d['wheelSpinVel'][2] + client.S.d['wheelSpinVel'][3]) -
                    (client.S.d['wheelSpinVel'][0] + client.S.d['wheelSpinVel'][1]) > 5):
                action_torcs['accel'] -= .2
        else:
            action_torcs['accel'] = this_action['accel']
            action_torcs['brake'] = this_action['brake']

        # Gear change
        if self.gear_change is True:
            action_torcs['gear'] = this_action['gear']
        else:
            # otherwise shift gears automatically based on speed
            action_torcs['gear'] = 1
            if self.throttle:
                if client.S.d['speedX'] > 50:
                    action_torcs['gear'] = 2
                if client.S.d['speedX'] > 80:
                    action_torcs['gear'] = 3
                if client.S.d['speedX'] > 110:
                    action_torcs['gear'] = 4
                if client.S.d['speedX'] > 140:
                    action_torcs['gear'] = 5
                if client.S.d['speedX'] > 170:
                    action_torcs['gear'] = 6

        # Save the previous full observation from torcs for the reward calculation
        obs_pre = copy.deepcopy(client.S.d)

        # One-Step Dynamics Update #################################
        # Apply the agent's action to torcs
        client.respond_to_server()
        # Get the response of TORCS
        client.get_servers_input()

        # Get the current full observation from torcs
        obs = client.S.d

        # Make an observation from the raw observation vector from TORCS
        self.observation = self.make_observaton(obs)

        # Reward setting here #######################################
        # direction-dependent positive reward
        track = np.array(obs['track'])
        trackPos = np.array(obs['trackPos'])
        sp = np.array(obs['speedX'])
        damage = np.array(obs['damage'])
        rpm = np.array(obs['rpm'])

        progress = sp * np.cos(obs['angle']) - np.abs(
            sp * np.sin(obs['angle'])) - sp * np.abs(obs['trackPos'])
        reward = progress

        # collision detection
        if obs['damage'] - obs_pre['damage'] > 0:
            reward = -1

        # Termination judgement #########################
        episode_terminate = False
        # if (abs(track.any()) > 1 or abs(trackPos) > 1):  # Episode is terminated if the car is out of track
        #     reward = -200
        #     episode_terminate = True
        #     client.R.d['meta'] = True

        # if self.terminal_judge_start < self.time_step:  # Episode terminates if the progress of agent is small
        #     if progress < self.termination_limit_progress:
        #         print("No progress")
        #         episode_terminate = True
        #         client.R.d['meta'] = True

        if np.cos(obs['angle']) < 0:  # Episode is terminated if the agent runs backward
            episode_terminate = True
            client.R.d['meta'] = True

        if client.R.d['meta'] is True:  # Send a reset signal
            self.initial_run = False
            client.respond_to_server()

        self.time_step += 1

        return self.get_obs(), reward, client.R.d['meta'], {}

    def reset(self, relaunch=False):
        # print("Reset")

        self.time_step = 0

        if self.initial_reset is not True:
            self.client.R.d['meta'] = True
            self.client.respond_to_server()

            ## TENTATIVE. Restarting TORCS every episode suffers from the memory leak bug!
            if relaunch is True:
                self.reset_torcs()
                print("### TORCS is RELAUNCHED ###")

        # Modify here if you use multiple tracks in the environment
        self.client = snakeoil3.Client(p=3101,
                                       vision=self.vision)  # Open new UDP in vtorcs
        self.client.MAX_STEPS = np.inf

        client = self.client
        client.get_servers_input()  # Get the initial input from torcs

        obs = client.S.d  # Get the current full observation from torcs
        self.observation = self.make_observaton(obs)

        self.last_u = None

        self.initial_reset = False
        return self.get_obs()

    def end(self):
        os.system('pkill torcs')

    def get_obs(self):
        return self.observation

    def reset_torcs(self):
        # print("relaunch torcs")
        os.system('pkill torcs')
        time.sleep(0.5)
        if self.vision is True:
            os.system('torcs -nofuel -nodamage -nolaptime -vision &')
        else:
            os.system('torcs -nofuel -nolaptime &')
        time.sleep(0.5)
        os.system('sh autostart.sh')
        time.sleep(0.5)

    def agent_to_torcs(self, u):
        torcs_action = {'steer': u[0]}

        if self.throttle is True:  # throttle action is enabled
            torcs_action.update({'accel': u[1]})
            torcs_action.update({'brake': u[2]})

        if self.gear_change is True:  # gear change action is enabled
            torcs_action.update({'gear': int(u[3])})

        return torcs_action

    def obs_vision_to_image_rgb(self, obs_image_vec):
        image_vec = obs_image_vec
        r = image_vec[0:len(image_vec):3]
        g = image_vec[1:len(image_vec):3]
        b = image_vec[2:len(image_vec):3]

        sz = (64, 64)
        r = np.array(r).reshape(sz)
        g = np.array(g).reshape(sz)
        b = np.array(b).reshape(sz)
        return np.array([r, g, b], dtype=np.uint8)

    def make_observaton(self, raw_obs):
        if self.vision is False:
            names = ['focus',
                     'speedX', 'speedY', 'speedZ', 'angle', 'damage',
                     'opponents',
                     'rpm',
                     'track',
                     'trackPos',
                     'wheelSpinVel']
            Observation = col.namedtuple('Observation', names)
            return Observation(
                focus=np.array(raw_obs['focus'], dtype=np.float32) / 200.,
                speedX=np.array(raw_obs['speedX'], dtype=np.float32) / 300.0,
                speedY=np.array(raw_obs['speedY'], dtype=np.float32) / 300.0,
                speedZ=np.array(raw_obs['speedZ'], dtype=np.float32) / 300.0,
                angle=np.array(raw_obs['angle'], dtype=np.float32) / 3.1416,
                damage=np.array(raw_obs['damage'], dtype=np.float32),
                opponents=np.array(raw_obs['opponents'],
                                   dtype=np.float32) / 200.,
                rpm=np.array(raw_obs['rpm'], dtype=np.float32) / 10000,
                track=np.array(raw_obs['track'], dtype=np.float32) / 200.,
                trackPos=np.array(raw_obs['trackPos'], dtype=np.float32) / 1.,
                wheelSpinVel=np.array(raw_obs['wheelSpinVel'],
                                      dtype=np.float32))
        else:
            names = ['focus',
                     'speedX', 'speedY', 'speedZ', 'angle',
                     'opponents',
                     'rpm',
                     'track',
                     'trackPos',
                     'wheelSpinVel',
                     'img']
            Observation = col.namedtuple('Observation', names)

            # Get the RGB image from the vision observation
            image_rgb = self.obs_vision_to_image_rgb(raw_obs['img'])

            return Observation(
                focus=np.array(raw_obs['focus'], dtype=np.float32) / 200.,
                speedX=np.array(raw_obs['speedX'],
                                dtype=np.float32) / self.default_speed,
                speedY=np.array(raw_obs['speedY'],
                                dtype=np.float32) / self.default_speed,
                speedZ=np.array(raw_obs['speedZ'],
                                dtype=np.float32) / self.default_speed,
                angle=np.array(raw_obs['angle'], dtype=np.float32) / 3.1416,
                opponents=np.array(raw_obs['opponents'],
                                   dtype=np.float32) / 200.,
                rpm=np.array(raw_obs['rpm'], dtype=np.float32),
                track=np.array(raw_obs['track'], dtype=np.float32) / 200.,
                trackPos=np.array(raw_obs['trackPos'], dtype=np.float32) / 1.,
                wheelSpinVel=np.array(raw_obs['wheelSpinVel'],
                                      dtype=np.float32),
                img=image_rgb)
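In summary, the per-step reward computed in step() above, with v_x = speedX and \theta = angle, is

r_t = v_x \cos\theta - |v_x \sin\theta| - v_x\,|\mathrm{trackPos}|,

overridden to r_t = -1 whenever damage increases; the episode ends (the meta flag is set) once \cos\theta < 0, i.e. when the car faces backward.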

‎gym_torcs.pyc

7.38 KB
Binary file not shown.

‎snakeoil3_gym.py

+638
Large diffs are not rendered by default.

‎snakeoil3_gym.pyc

17.9 KB
Binary file not shown.
