fix
This commit is contained in:
parent
427a67e77a
commit
8d7f01f010
1
.gitignore
vendored
1
.gitignore
vendored
@ -4,4 +4,3 @@ pyenv
|
||||
.vscode
|
||||
__pycache__
|
||||
.idea
|
||||
Deep_Q_Learning
|
||||
|
177
Deep_Q_Learning/Deep_Q_Learning.py
Normal file
177
Deep_Q_Learning/Deep_Q_Learning.py
Normal file
@ -0,0 +1,177 @@
|
||||
import numpy as np
|
||||
from time import time
|
||||
import keras.backend.tensorflow_backend as backend
|
||||
from keras import backend as K
|
||||
from keras.models import Sequential
|
||||
from keras.layers import Dense, Dropout, Input, Activation, Flatten
|
||||
from keras.optimizers import Adam
|
||||
from keras.callbacks import TensorBoard
|
||||
import tensorflow as tf
|
||||
from collections import deque
|
||||
import random
|
||||
from Deep_Q_Learning.GC_Env import GC_Env
|
||||
|
||||
# --- Training hyper-parameters -------------------------------------------
# Discount factor applied to the best future Q-value when building targets.
DISCOUNT = 0.99
REPLAY_MEMORY_SIZE = 50_000  # How many last steps to keep for model training
# Minimum number of steps in a memory to start training
MIN_REPLAY_MEMORY_SIZE = 1_000
MINIBATCH_SIZE = 64  # How many steps (samples) to use for training
# Counted in terminal states (end of episodes). The agent syncs the target
# network once its counter is strictly greater than this value.
UPDATE_TARGET_EVERY = 5
# Used as the prefix of the TensorBoard log directory name.
MODEL_NAME = 'l-3_n-26-12-6-relu4l_adjusted_rewards'
MIN_REWARD = -200  # For model save
# NOTE(review): not referenced in the visible code; presumably a GPU memory
# fraction for the TensorFlow session -- confirm against the training script.
MEMORY_FRACTION = 0.20

# Environment settings
EPISODES = 20_000

# Exploration settings
epsilon = 1  # not a constant, going to be decayed
EPSILON_DECAY = 0.99975
MIN_EPSILON = 0.001

# Stats settings
AGGREGATE_STATS_EVERY = 50  # episodes
SHOW_PREVIEW = False
|
||||
|
||||
# Whenever we call load_model() (remember, we need it for the target network), we must pass custom_objects={'huber_loss': huber_loss} as an argument to tell Keras where to find huber_loss.
|
||||
def huber_loss(a, b, in_keras=True):
    """Return the element-wise Huber loss (delta = 1) between ``a`` and ``b``.

    The loss is quadratic (``error ** 2 / 2``) while ``|error| <= 1`` and
    linear (``|error| - 1/2``) beyond that.

    Args:
        a: First operand (Keras tensor, NumPy array or plain number).
        b: Second operand, compatible with ``a`` for subtraction.
        in_keras: When True (default) the boolean selection mask is cast to
            float32 through the Keras backend so it can be multiplied with
            tensors. Pass False to evaluate the loss on plain numbers or
            NumPy arrays without touching the backend.

    Returns:
        The loss, with the same shape as ``a - b``.
    """
    error = a - b
    quadratic_term = error * error / 2
    linear_term = abs(error) - 1 / 2
    use_linear_term = abs(error) > 1.0
    if in_keras:
        # Backend tensors cannot multiply floats by booleans, so the mask is
        # turned into 0.0 / 1.0 explicitly. Outside Keras the implicit
        # bool -> number promotion of Python / NumPy is sufficient.
        use_linear_term = K.cast(use_linear_term, 'float32')
    return use_linear_term * linear_term + (1 - use_linear_term) * quadratic_term
|
||||
|
||||
|
||||
# Own Tensorboard class
|
||||
|
||||
|
||||
class ModifiedTensorBoard(TensorBoard):
    """TensorBoard callback that keeps one log file across many ``.fit()`` calls.

    The agent calls ``fit`` many times; this subclass owns a single writer
    and its own ``step`` counter, and turns the per-fit lifecycle hooks of
    the base class into no-ops.
    """

    def __init__(self, **kwargs):
        """Forward ``kwargs`` to ``TensorBoard`` and set the step and writer."""
        super().__init__(**kwargs)
        self.step = 1
        self.writer = tf.summary.FileWriter(self.log_dir)

    def set_model(self, model):
        """Do nothing, so the base class does not create its default writer."""
        pass

    def on_epoch_end(self, epoch, logs=None):
        """Log the epoch metrics under ``self.step`` instead of ``epoch``.

        Otherwise every ``.fit()`` call would start writing from step 0 again.
        ``logs`` may be None, in which case nothing is passed on.
        """
        # `**None` raises TypeError, so fall back to an empty mapping.
        self.update_stats(**(logs or {}))

    def on_batch_end(self, batch, logs=None):
        """Do nothing; training runs one batch per fit, nothing to record here."""
        pass

    def on_train_end(self, _):
        """Do nothing, so the shared writer is not closed after each fit."""
        pass

    def update_stats(self, **stats):
        """Write the given metrics at the current ``self.step``.

        Delegates to the inherited ``_write_logs``; the caller is responsible
        for advancing ``self.step``.
        """
        self._write_logs(stats, self.step)
|
||||
|
||||
|
||||
class DQNAgent:
    """Deep Q-learning agent with a main network, a target network and replay memory.

    ``env`` must expose ``OBSERVATION_SPACE_VALUES`` (input shape tuple) and
    ``ACTION_SPACE_SIZE`` (number of discrete actions).
    """

    def __init__(self, env):
        """Build both networks, the replay buffer and the TensorBoard logger."""
        self.env = env

        # Main model: the one that is fitted every step.
        self.model = self.create_model()

        # Target network: starts as a copy of the main model and is only
        # refreshed periodically in train().
        self.target_model = self.create_model()
        self.target_model.set_weights(self.model.get_weights())

        # Bounded buffer holding the last REPLAY_MEMORY_SIZE transitions.
        self.replay_memory = deque(maxlen=REPLAY_MEMORY_SIZE)

        # Custom tensorboard object
        self.tensorboard = ModifiedTensorBoard(
            log_dir="logs/{}-{}".format(MODEL_NAME, int(time())))

        # Counts terminal states since the last target-network sync.
        self.target_update_counter = 0

    def create_model(self):
        """Return a compiled dense network mapping an observation to Q-values."""
        model = Sequential([
            Dense(26, input_shape=self.env.OBSERVATION_SPACE_VALUES, activation='relu'),
            Dense(12, activation='relu'),
            Dense(6, activation='relu'),
            # Q-values are unbounded regression targets
            # (reward + DISCOUNT * max future Q). A softmax head would force
            # the outputs into (0, 1) summing to 1 and could never fit them,
            # so the output layer must be linear.
            Dense(self.env.ACTION_SPACE_SIZE, activation='linear'),
        ])
        model.compile(loss='mse', optimizer=Adam(lr=0.001), metrics=['accuracy'])
        return model

    def update_replay_memory(self, transition):
        """Append one transition to the replay memory.

        train() unpacks each transition as
        ``(state, action, reward, new_state, done)``.
        """
        self.replay_memory.append(transition)

    def train(self, terminal_state, step):
        """Fit the main network on one random minibatch from replay memory.

        Args:
            terminal_state: True when the current step ended the episode;
                enables TensorBoard logging for this fit and advances the
                target-update counter.
            step: Unused; kept for interface compatibility with callers.
        """
        # Start training only if certain number of samples is already saved
        if len(self.replay_memory) < MIN_REPLAY_MEMORY_SIZE:
            return

        # Get a minibatch of random samples from memory replay table
        minibatch = random.sample(self.replay_memory, MINIBATCH_SIZE)

        # Q-values of the sampled states according to the main network.
        current_states = np.array([transition[0] for transition in minibatch])
        current_qs_list = self.model.predict(current_states)

        # Q-values of the successor states according to the target network.
        new_current_states = np.array([transition[3] for transition in minibatch])
        future_qs_list = self.target_model.predict(new_current_states)

        X = []
        y = []

        for index, (current_state, action, reward, new_current_state, done) in enumerate(minibatch):
            # Non-terminal transitions bootstrap from the best future Q;
            # terminal ones use the bare reward.
            if not done:
                max_future_q = np.max(future_qs_list[index])
                new_q = reward + DISCOUNT * max_future_q
            else:
                new_q = reward

            # Only the Q-value of the action actually taken is changed.
            current_qs = current_qs_list[index]
            current_qs[action] = new_q

            X.append(current_state)
            y.append(current_qs)

        # Fit on all samples as one batch, log only on terminal state
        self.model.fit(np.array(X), np.array(y), batch_size=MINIBATCH_SIZE, verbose=0,
                       shuffle=False, callbacks=[self.tensorboard] if terminal_state else None)

        # Update target network counter every episode
        if terminal_state:
            self.target_update_counter += 1

        # If counter exceeds the set value, copy the main network's weights
        # into the target network.
        if self.target_update_counter > UPDATE_TARGET_EVERY:
            self.target_model.set_weights(self.model.get_weights())
            self.target_update_counter = 0

    def get_qs(self, state):
        """Return the main network's prediction for a single observation.

        The observation is reshaped into a batch using the environment's
        declared observation shape rather than a hard-coded width, so the
        agent keeps working if the observation size changes.
        """
        batch = np.array(state).reshape(-1, *self.env.OBSERVATION_SPACE_VALUES)
        return self.model.predict(batch)
|
69
Deep_Q_Learning/GC_Env.py
Normal file
69
Deep_Q_Learning/GC_Env.py
Normal file
@ -0,0 +1,69 @@
|
||||
from models.Garbage_Collector import Garbage_Collector
|
||||
from helpler import Render_Element
|
||||
from models.House import House
|
||||
from config import MAP_WIDTH, MAP_HEIGHT
|
||||
import numpy as np
|
||||
|
||||
|
||||
class GC_Env:
    """Garbage-collector environment exposing ``reset`` / ``step``.

    An observation is a flat list: the collector's normalised column and row,
    its four trash fill ratios, then four fill ratios for every ``House``
    found in ``draw_items``.
    """

    # 2 position values + 4 collector ratios + 4 ratios per house.
    # NOTE(review): the "6 * 4" term presumably assumes six houses on the
    # map -- confirm against the map setup.
    OBSERVATION_SPACE_VALUES = (2 + 1 * 4 + 6 * 4,)
    ACTION_SPACE_SIZE = 6

    # Trash attributes read from the collector and from every house, in the
    # order they appear in an observation.
    _TRASH_KINDS = ("mixed", "paper", "glass", "plastic")

    def reset(self):
        """Rebuild the map and the collector, and return the first observation."""
        self.draw_items = {(x, y): Render_Element(x, y)
                           for x in range(MAP_WIDTH) for y in range(MAP_HEIGHT)}
        self.gc = Garbage_Collector(self.draw_items)
        # Action index -> bound collector method.
        self.actions = {
            0: self.gc.move_up,
            1: self.gc.move_down,
            2: self.gc.move_left,
            3: self.gc.move_right,
            4: self.gc.pick_trash,
            5: self.gc.leave_trash,
        }
        return self._get_observation()

    def step(self, action):
        """Run ``action`` and return ``(new_observation, reward, done)``.

        A boolean action result is mapped to a fixed reward (-10 for False,
        -0.1 for True); any other result is used as the reward itself. The
        episode is done once the collector and every house are empty.
        """
        action_result = self.actions[action]()
        new_observation = self._get_observation()
        reward = self._reward_from_result(action_result)
        done = bool(self.gc.is_empty()) and all(
            house.is_empty() for house in self._houses())
        return new_observation, reward, done

    def _houses(self):
        """Return every ``House`` in ``draw_items``, in dict iteration order."""
        return [item for item in self.draw_items.values()
                if isinstance(item, House)]

    def _get_observation(self):
        """Build the observation list from the collector and house state."""
        observation = [
            self.gc.col / (MAP_WIDTH - 1),
            self.gc.row / (MAP_HEIGHT - 1),
        ]
        observation.extend(
            getattr(self.gc, kind) / self.gc.limit for kind in self._TRASH_KINDS)
        for house in self._houses():
            observation.extend(
                getattr(house, kind) / house.limit for kind in self._TRASH_KINDS)
        return observation

    @staticmethod
    def _reward_from_result(action_result):
        """Translate an action's return value into a reward.

        Only genuine booleans are treated as failure/success flags. Comparing
        with ``== False`` / ``== True`` would also match the numbers 0 and 1,
        silently replacing a numeric reward of 0 or 1 with -10 or -0.1.
        """
        if isinstance(action_result, (bool, np.bool_)):
            return -0.1 if action_result else -10
        return action_result
|
Loading…
Reference in New Issue
Block a user