fix

2020-04-21 23:00:41 +02:00 · 2020-04-21 23:00:41 +02:00 · 8d7f01f010
commit 8d7f01f010
parent 427a67e77a
3 changed files with 246 additions and 1 deletions
--- a/.gitignore
+++ b/.gitignore
@ -4,4 +4,3 @@ pyenv
 .vscode
 __pycache__
 .idea
-Deep_Q_Learning
--- a/Deep_Q_Learning/Deep_Q_Learning.py
+++ b/Deep_Q_Learning/Deep_Q_Learning.py
@ -0,0 +1,177 @@
+import numpy as np
+from time import time
+import keras.backend.tensorflow_backend as backend
+from keras import backend as K
+from keras.models import Sequential
+from keras.layers import Dense, Dropout, Input, Activation, Flatten
+from keras.optimizers import Adam
+from keras.callbacks import TensorBoard
+import tensorflow as tf
+from collections import deque
+import random
+from Deep_Q_Learning.GC_Env import GC_Env
+
+# Discount factor applied to the best future Q-value when building training targets
+DISCOUNT = 0.99
+REPLAY_MEMORY_SIZE = 50_000  # Max number of most recent transitions kept in replay memory
+# Training is skipped until replay memory holds at least this many transitions
+MIN_REPLAY_MEMORY_SIZE = 1_000
+MINIBATCH_SIZE = 64  # Number of transitions sampled from replay memory per training call
+UPDATE_TARGET_EVERY = 5  # Threshold (in terminal states, i.e. finished episodes) for syncing the target network
+# Used as the prefix of the TensorBoard log directory name
+MODEL_NAME = 'l-3_n-26-12-6-relu4l_adjusted_rewards'
+MIN_REWARD = -200  # For model save (not referenced in the code visible here)
+# NOTE(review): not referenced in the code visible here; presumably a GPU memory fraction - confirm
+MEMORY_FRACTION = 0.20
+
+# Environment settings
+EPISODES = 20_000  # presumably the number of training episodes; used outside the code visible here
+
+# Exploration settings
+epsilon = 1  # not a constant, going to be decayed
+EPSILON_DECAY = 0.99975  # presumably a per-episode multiplier for epsilon - confirm against the training loop
+MIN_EPSILON = 0.001  # presumably the floor below which epsilon is not decayed - confirm
+
+# Stats settings
+AGGREGATE_STATS_EVERY = 50  # episodes
+SHOW_PREVIEW = False  # not referenced in the code visible here
+
+# Further, whenever we call load_model (remember, we need it for the target network), we will need to pass custom_objects={'huber_loss': huber_loss} as an argument to tell Keras where to find huber_loss.
+def huber_loss(a, b, in_keras=True):
+    """Element-wise Huber loss (threshold 1) between ``a`` and ``b``.
+
+    Where the absolute difference exceeds 1 the loss is ``|a - b| - 1/2``;
+    elsewhere it is ``(a - b)^2 / 2``.
+
+    ``in_keras`` is accepted but not used by this implementation.
+    """
+    diff = a - b
+    magnitude = abs(diff)
+
+    # 1.0 where the linear branch applies, 0.0 where the quadratic one does
+    linear_mask = K.cast(magnitude > 1.0, 'float32')
+
+    squared_branch = diff*diff / 2
+    linear_branch = magnitude - 1/2
+    return linear_mask * linear_branch + (1-linear_mask) * squared_branch
+
+
+# Own Tensorboard class
+
+
+class ModifiedTensorBoard(TensorBoard):
+    """TensorBoard callback that shares one writer and one step counter across .fit() calls.
+
+    ``self.step`` starts at 1 and is never advanced inside this class; code
+    that uses the callback is expected to update it so that successive
+    .fit() calls do not all log at the same step.
+    """
+
+    # Overriding init to set initial step and writer (we want one log file for all .fit() calls)
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+        self.step = 1
+        self.writer = tf.summary.FileWriter(self.log_dir)
+
+    # Overriding this method to stop creating default log writer
+    def set_model(self, model):
+        pass
+
+    # Overridden, saves logs with our step number
+    # (otherwise every .fit() will start writing from 0th step)
+    def on_epoch_end(self, epoch, logs=None):
+        # logs defaults to None, and unpacking None with ** raises TypeError,
+        # so fall back to an empty mapping in that case.
+        self.update_stats(**(logs or {}))
+
+    # Overridden
+    # We train for one batch only, no need to save anything at batch end
+    def on_batch_end(self, batch, logs=None):
+        pass
+
+    # Overridden, so won't close writer
+    def on_train_end(self, _):
+        pass
+
+    # Custom method for saving own metrics
+    # Passes the given metrics to _write_logs together with the current self.step
+    def update_stats(self, **stats):
+        self._write_logs(stats, self.step)
+
+
+class DQNAgent:
+    """Deep Q-Network agent with a replay memory and a separate target network.
+
+    ``env`` must expose ``OBSERVATION_SPACE_VALUES`` (the network input shape)
+    and ``ACTION_SPACE_SIZE`` (the number of outputs); both are read when the
+    networks are built.
+    """
+
+    def __init__(self, env):
+
+        self.env = env
+
+        # Main model, fitted on every train() call
+        self.model = self.create_model()
+
+        # Target network, starts with the main model's weights and is
+        # re-synced from it inside train()
+        self.target_model = self.create_model()
+        self.target_model.set_weights(self.model.get_weights())
+
+        # Bounded buffer with the last REPLAY_MEMORY_SIZE transitions
+        self.replay_memory = deque(maxlen=REPLAY_MEMORY_SIZE)
+
+        # Custom tensorboard object
+        self.tensorboard = ModifiedTensorBoard(
+            log_dir="logs/{}-{}".format(MODEL_NAME, int(time())))
+
+        # Used to count when to update target network with main network's weights
+        self.target_update_counter = 0
+
+    def create_model(self):
+        """Build and compile the Q-network: 26-12-6 ReLU layers and one output per action."""
+        model = Sequential([
+            Dense(26, input_shape=self.env.OBSERVATION_SPACE_VALUES, activation='relu'),
+            Dense(12, activation='relu'),
+            Dense(6, activation='relu'),
+            # Q-values are regression targets (reward + discounted future Q),
+            # so the output must be unbounded. A softmax here would squash
+            # every output into (0, 1) and force them to sum to 1, which
+            # cannot represent negative or large returns.
+            Dense(self.env.ACTION_SPACE_SIZE, activation='linear'),
+        ])
+        model.compile(loss='mse', optimizer=Adam(
+            lr=0.001), metrics=['accuracy'])
+        return model
+
+    # Adds step's data to a memory replay array
+    # (observation space, action, reward, new observation space, done)
+    def update_replay_memory(self, transition):
+        self.replay_memory.append(transition)
+
+    def train(self, terminal_state, step):
+        """Fit the main network on one random minibatch from replay memory.
+
+        Does nothing until replay memory holds MIN_REPLAY_MEMORY_SIZE
+        transitions.
+
+        terminal_state: truthy when the current step ended an episode; enables
+            TensorBoard logging for this fit and advances the target-sync counter.
+        step: accepted but not used by this method.
+        """
+
+        # Start training only if certain number of samples is already saved
+        if len(self.replay_memory) < MIN_REPLAY_MEMORY_SIZE:
+            return
+
+        # Get a minibatch of random samples from memory replay table
+        minibatch = random.sample(self.replay_memory, MINIBATCH_SIZE)
+
+        # Current states -> Q-values from the main network
+        current_states = np.array([transition[0]
+                                   for transition in minibatch])
+        current_qs_list = self.model.predict(current_states)
+
+        # Next states -> Q-values from the target network
+        new_current_states = np.array(
+            [transition[3] for transition in minibatch])
+        future_qs_list = self.target_model.predict(new_current_states)
+
+        # Overwrite, row by row, the Q-value of the action actually taken
+        # with its target; the other actions keep their predicted values.
+        for index, (_, action, reward, _, done) in enumerate(minibatch):
+
+            # If not a terminal state, add the discounted best future Q,
+            # otherwise the target is just the reward
+            if not done:
+                max_future_q = np.max(future_qs_list[index])
+                new_q = reward + DISCOUNT * max_future_q
+            else:
+                new_q = reward
+
+            current_qs_list[index][action] = new_q
+
+        # Fit on all samples as one batch, log only on terminal state
+        self.model.fit(current_states, current_qs_list, batch_size=MINIBATCH_SIZE, verbose=0,
+                       shuffle=False, callbacks=[self.tensorboard] if terminal_state else None)
+
+        # Update target network counter every episode
+        if terminal_state:
+            self.target_update_counter += 1
+
+        # If counter exceeds set value, update target network with weights of main network
+        # NOTE(review): the strict '>' means the sync happens once the counter
+        # reaches UPDATE_TARGET_EVERY + 1 - confirm this is intended.
+        if self.target_update_counter > UPDATE_TARGET_EVERY:
+            self.target_model.set_weights(self.model.get_weights())
+            self.target_update_counter = 0
+
+    # Queries main network for Q values given current observation space (environment state)
+    def get_qs(self, state):
+        # Take the input shape from the environment, the same source
+        # create_model() uses, instead of a hard-coded width.
+        return self.model.predict(
+            np.array(state).reshape(-1, *self.env.OBSERVATION_SPACE_VALUES))
--- a/Deep_Q_Learning/GC_Env.py
+++ b/Deep_Q_Learning/GC_Env.py
@ -0,0 +1,69 @@
+from models.Garbage_Collector import Garbage_Collector
+from helpler import Render_Element
+from models.House import House
+from config import MAP_WIDTH, MAP_HEIGHT
+import numpy as np
+
+
+class GC_Env:
+    """Environment wrapper around the garbage-collector map for the DQN agent.
+
+    reset() builds a fresh map and collector and returns the first
+    observation; step(action) runs one action and returns
+    (observation, reward, done). reset() must be called before step().
+    """
+
+    # 2 collector coordinates + 4 collector trash levels + 4 trash levels per
+    # house (the 6 * 4 term presumably assumes 6 houses on the map - confirm)
+    OBSERVATION_SPACE_VALUES = (2 + 1 * 4 + 6 * 4,)
+    ACTION_SPACE_SIZE = 6
+
+    # Trash attributes read from every house, in observation order
+    _TRASH_KINDS = ("mixed", "paper", "glass", "plastic")
+
+    def reset(self):
+        """Rebuild the map, the collector and the action table; return the initial observation."""
+        self.draw_items = {(x, y): Render_Element(x, y)
+                           for x in range(MAP_WIDTH) for y in range(MAP_HEIGHT)}
+        self.gc = Garbage_Collector(self.draw_items)
+        self.actions = {
+            0: self.gc.move_up,
+            1: self.gc.move_down,
+            2: self.gc.move_left,
+            3: self.gc.move_right,
+            4: self.gc.pick_trash,
+            5: self.gc.leave_trash
+        }
+        return self._observation(self._houses())
+
+    def step(self, action):
+        """Run the action with the given index and return (new_observation, reward, done).
+
+        Reward: -10 when the action returns False, -0.1 when it returns True,
+        otherwise the returned value itself. done is True only when the
+        collector and every house report is_empty().
+        """
+        action_result = self.actions[action]()
+        houses = self._houses()
+        new_observation = self._observation(houses)
+
+        # Check the type instead of using == True / == False: in Python
+        # 0 == False and 1 == True, so a numeric reward of 0 or 1 would
+        # otherwise be mistaken for a boolean outcome and overwritten.
+        if isinstance(action_result, (bool, np.bool_)):
+            reward = -0.1 if action_result else -10
+        else:
+            reward = action_result
+
+        done = self.gc.is_empty() and all(
+            house.is_empty() for house in houses)
+
+        return new_observation, reward, done
+
+    # Returns every House currently on the map, in draw_items order
+    def _houses(self):
+        return [item for item in self.draw_items.values()
+                if isinstance(item, House)]
+
+    # Builds the observation list: collector position scaled by the map size,
+    # collector trash levels scaled by its limit, then each house's trash
+    # levels scaled by that house's limit
+    def _observation(self, houses):
+        observation = [
+            self.gc.col / (MAP_WIDTH - 1),
+            self.gc.row / (MAP_HEIGHT - 1),
+            self.gc.mixed / self.gc.limit, self.gc.paper / self.gc.limit,
+            self.gc.glass / self.gc.limit, self.gc.plastic / self.gc.limit,
+        ]
+        for house in houses:
+            for kind in self._TRASH_KINDS:
+                observation.append(getattr(house, kind) / house.limit)
+        return observation