+ import random
+
from agents.AbstractAgent import AbstractAgent
import pandas as pd
import numpy as np
@@ -25,33 +27,64 @@ def __init__(self, train, screen_size, explore=1):
        self.states = []
        for x in range(-64, 65):
            for y in range(-64, 65):
-                self.states.append((x, y))
+                self.states.append("(" + str(x) + "," + str(y) + ")")
        self.q_table = self.init_q_table()
        self.alpha = 0.1
        self.gamma = 0.9
        self.old_state = None
        self.old_action = None

-    def step(self, obs):
+    def step(self, obs, epsilon):
        # TODO step method
        if self._MOVE_SCREEN.id in obs.observation.available_actions:
+            # derive the q_state from the marine and beacon positions
            marine = self._get_marine(obs)
            if marine is None:
                return self._NO_OP
            marine_coordinates = self._get_unit_pos(marine)
-            action = self.get_new_action(marine_coordinates)
+            beacon = self._get_beacon(obs)
+            if beacon is None:
+                return self._NO_OP
+            beacon_coordinates = self._get_unit_pos(beacon)
+
+            q_state = self.get_q_state_from_position(marine_position=marine_coordinates,
+                                                     beacon_position=beacon_coordinates)
+
+            # epsilon-greedy: exploit the Q-table with probability 1 - epsilon, otherwise explore
+            rnd = random.random()
+            if rnd > epsilon:
+                action = self.get_new_action(q_state)
+            else:
+                action = random.choice(list(self.actions))
+
            if self.train:
-                pass
+                if self.old_state is None and self.old_action is None:
+                    # first step where there is no previous state
+                    self.old_state = get_row_index_in_string_format(q_state)
+                    self.old_action = action
+                else:
+                    t = obs.reward == 1  # terminate when beacon reached
+                    self.update_q_value(self.old_state, self.old_action, q_state, obs.reward, t)  # update q_value with the new state
+
+                # set previous state and action
+                self.old_state = get_row_index_in_string_format(q_state)
+                self.old_action = action
+
+                return self._dir_to_sc2_action(action, marine_coordinates)
            else:
                return self._dir_to_sc2_action(action, marine_coordinates)
        else:
+            self.old_state = None
+            self.old_action = None
            return self._SELECT_ARMY  # initialize army in first step

    def save_model(self, path):
-        self.q_table.to_pickle(path)
+        # save model as pkl
+        self.q_table.to_pickle(path + ".pkl")

    def load_model(self, path):
-        self.q_table = pd.read_pickle(path)
+        # load model from pkl
+        self.q_table = pd.read_pickle(path + ".pkl")

    def get_new_action(self, state):
        """
@@ -65,8 +98,12 @@ def get_new_action(self, state):
        """
        # TODO get_new_action method
        index = get_row_index_in_string_format(state)
-        action = np.argmax(self.q_table.loc[index])
-        return self.actions[action]
+        options = self.q_table.loc[index]
+        m = max(options)
+        indices = [i for i, value in enumerate(options) if value == m]  # all actions with the maximal value
+        choice = random.choice(indices)  # break ties at random
+        action = list(self.actions)[choice]
+        return action

    def get_q_value(self, q_table_column_index, q_table_row_index):
        """
@@ -80,20 +117,22 @@ def get_q_value(self, q_table_column_index, q_table_row_index):
            action (float): The value for the given indices.
        """
        # TODO get_new_action method
-        q_value = self.q_table.loc[q_table_row_index, q_table_column_index]
+        q_value = self.q_table.loc[q_table_column_index][q_table_row_index]
        return float(q_value)

    def update_q_value(self, old_state, old_action, new_state, reward, terminal):
        # TODO update_q_value method
-        old_state_str = get_row_index_in_string_format(old_state)
        new_state_str = get_row_index_in_string_format(new_state)
-        q_value = self.q_table[old_state_str, old_action]
+        q_value = self.get_q_value(q_table_column_index=old_state,
+                                   q_table_row_index=old_action)
        if not terminal:
-            new_q_value = q_value + self.alpha + (reward + self.gamma * max(self.q_table[new_state_str]) + q_value)
+            max_new = max(self.q_table.loc[new_state_str])
+            new_q_value = q_value + self.alpha * (reward + (self.gamma * max_new) - q_value)
        else:
-            new_q_value = q_value + self.alpha + (reward - q_value)
+            new_q_value = q_value + self.alpha * (reward - q_value)
+            print("final", old_state, new_q_value)

-        self.q_table[old_state_str, old_action] = new_q_value
+        self.q_table.at[old_state, old_action] = new_q_value

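For reference, the lines added above implement the standard tabular Q-learning update, with the bootstrap term dropped on terminal steps:

    Q(s, a) \leftarrow Q(s, a) + \alpha \bigl( r + \gamma \max_{a'} Q(s', a') - Q(s, a) \bigr)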
@@ -122,4 +161,6 @@ def init_q_table(self):
        The row indices must be in the format '(x,y)'
        The column indices must be in the format 'action' (e.g. 'W')
        """
-        return pd.DataFrame(np.random.rand(len(self.states), len(self.actions)), index=self.states, columns=self.actions)
+        return pd.DataFrame(np.random.rand(len(self.states), len(self.actions)), index=self.states, columns=self.actions)
+
+        # return pd.DataFrame(0, index=self.states, columns=self.actions)
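As a standalone illustration of the pattern this commit builds toward, here is a minimal sketch of an epsilon-greedy lookup and tabular Q-update against a pandas Q-table keyed by '(x,y)' state strings. The action labels, hyperparameter values, and helper names below are assumptions for the example, not the project's API.

import random

import numpy as np
import pandas as pd

# Illustrative stand-ins for the agent's attributes (assumed values, not the real class)
actions = ["N", "E", "S", "W"]
states = ["(" + str(x) + "," + str(y) + ")" for x in range(-64, 65) for y in range(-64, 65)]
q_table = pd.DataFrame(np.random.rand(len(states), len(actions)), index=states, columns=actions)
alpha, gamma, epsilon = 0.1, 0.9, 0.2

def choose_action(q_state_str):
    # epsilon-greedy: act greedily with probability 1 - epsilon, otherwise explore
    if random.random() > epsilon:
        options = q_table.loc[q_state_str]
        best = max(options)
        ties = [a for a, v in options.items() if v == best]
        return random.choice(ties)
    return random.choice(actions)

def update_q(old_state_str, action, new_state_str, reward, terminal):
    # tabular Q-learning update; terminal steps drop the bootstrap term
    q_value = float(q_table.at[old_state_str, action])
    target = reward if terminal else reward + gamma * max(q_table.loc[new_state_str])
    q_table.at[old_state_str, action] = q_value + alpha * (target - q_value)

# one illustrative transition (state strings and reward chosen arbitrarily)
a = choose_action("(3,4)")
update_q("(3,4)", a, "(2,4)", reward=0, terminal=False)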