commit be83e7028c
parent a209fae515

    model

(deleted file, 91 lines)
@@ -1,91 +0,0 @@
from os.path import isfile
import numpy as np
from time import time
from models.House import House
from random import randint
from helpler import Render_Element
from models.Garbage_Collector import Garbage_Collector
from config import MAP_WIDTH, MAP_HEIGHT
from models.Road import Road

from keras.models import Sequential
from keras.layers import Dense, Activation, Flatten
from keras.optimizers import Adam

from rl.agents.dqn import DQNAgent
from rl.policy import EpsGreedyQPolicy
from rl.memory import SequentialMemory


class Deep_Q_Learning:
    def __init__(self, gc=None, draw_items=None, q_table=None):
        self.q_table = q_table
        self.gc = gc
        self.draw_items = draw_items
        self.actions = {
            0: self.gc.move_up,
            1: self.gc.move_down,
            2: self.gc.move_left,
            3: self.gc.move_right,
            4: self.gc.pick_trash,
            5: self.gc.leave_trash
        }
        self.runs = 0

    def reset(self):
        self.draw_items = {(x, y): Render_Element(x, y)
                           for x in range(MAP_WIDTH) for y in range(MAP_HEIGHT)}
        self.gc = Garbage_Collector(self.draw_items)

    def step(self, action):
        action_result = self.actions[action]()
        houses = list(map(lambda item: self.draw_items[item], list(filter(lambda item: isinstance(
            self.draw_items[item], House), self.draw_items))))

        new_observation = {"gc-pos": (self.gc.col / (MAP_WIDTH - 1),
                                      self.gc.row / (MAP_HEIGHT - 1)),
                           "gc-trash": (self.gc.mixed / self.gc.limit, self.gc.paper / self.gc.limit,
                                        self.gc.glass / self.gc.limit, self.gc.plastic / self.gc.limit),
                           "houses": ((house.mixed / house.limit, house.paper / house.limit,
                                       house.glass / house.limit, house.plastic / house.limit)
                                      for house in houses)
                           }

        if action_result == False:
            reward = -10
        elif action_result == True:
            reward = -0.1
        else:
            reward = action_result

        done = True
        if not self.gc.is_empty():
            done = False
        else:
            for item in self.draw_items:
                if isinstance(self.draw_items[item], House) and not self.draw_items[item].is_empty():
                    done = False
                    break

        return new_observation, reward, done

    def run(self):
        # self.set_raw_game()
        self.houses = list(map(lambda item: self.draw_items[item], list(filter(lambda item: isinstance(
            self.draw_items[item], House), self.draw_items))))

        input_shape = (2 * 4 * len(self.houses) * 4,)
        model = Sequential([
            Dense(units=512, activation="sigmoid", input_shape=input_shape),
            Dense(512, activation='sigmoid'),
            Dense(units=6, activation='softmax')
        ])
        print(model.summary())

        policy = EpsGreedyQPolicy()
        memory = SequentialMemory(limit=50000, window_length=1)
        dqn = DQNAgent(model=model, nb_actions=len(self.actions), memory=memory, nb_steps_warmup=10,
                       target_model_update=1e-2, policy=policy)
        dqn.compile(Adam(lr=1e-3), metrics=['mae'])


dql = Deep_Q_Learning()
dql.run()
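The reward scheme inside `step()` above is terse, so here is a tiny, runnable restatement of it. The helper name `reward_for` and the reading of `False` as "the action was rejected" are my own inferences, not something stated in the code:

def reward_for(action_result):
    # Mirrors the if/elif/else chain in step(): a rejected action (False) costs -10,
    # an accepted but unproductive action (True) costs a small step penalty, and any
    # numeric result (e.g. from pick_trash / leave_trash) is passed through unchanged.
    if action_result is False:
        return -10
    if action_result is True:
        return -0.1
    return action_result


print(reward_for(False), reward_for(True), reward_for(2.5))  # -10 -0.1 2.5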
Deep_Q_Learning/Deep_Q_Learning.py (new file, 166 lines)
@@ -0,0 +1,166 @@
import numpy as np
from time import time
import keras.backend.tensorflow_backend as backend
from keras.models import Sequential, Model
from keras.layers import Dense, Dropout, Input, Activation, Flatten
from keras.optimizers import Adam
from keras.callbacks import TensorBoard
import tensorflow as tf
from collections import deque
import random
from Deep_Q_Learning.GC_Env import GC_Env

DISCOUNT = 0.99
REPLAY_MEMORY_SIZE = 50_000  # How many last steps to keep for model training
# Minimum number of steps in a memory to start training
MIN_REPLAY_MEMORY_SIZE = 1_000
MINIBATCH_SIZE = 64  # How many steps (samples) to use for training
UPDATE_TARGET_EVERY = 5  # Terminal states (end of episodes)
MODEL_NAME = '2x256'
MIN_REWARD = -200  # For model save
MEMORY_FRACTION = 0.20

# Environment settings
EPISODES = 20_000

# Exploration settings
epsilon = 1  # not a constant, going to be decayed
EPSILON_DECAY = 0.99975
MIN_EPSILON = 0.001

# Stats settings
AGGREGATE_STATS_EVERY = 50  # episodes
SHOW_PREVIEW = False


# Own Tensorboard class
class ModifiedTensorBoard(TensorBoard):

    # Overriding init to set initial step and writer (we want one log file for all .fit() calls)
    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        self.step = 1
        self.writer = tf.summary.FileWriter(self.log_dir)

    # Overriding this method to stop creating default log writer
    def set_model(self, model):
        pass

    # Overridden, saves logs with our step number
    # (otherwise every .fit() will start writing from 0th step)
    def on_epoch_end(self, epoch, logs=None):
        self.update_stats(**logs)

    # Overridden
    # We train for one batch only, no need to save anything at epoch end
    def on_batch_end(self, batch, logs=None):
        pass

    # Overridden, so won't close writer
    def on_train_end(self, _):
        pass

    # Custom method for saving own metrics
    # Creates writer, writes custom metrics and closes writer
    def update_stats(self, **stats):
        self._write_logs(stats, self.step)


class DQNAgent:
    def __init__(self, env):

        self.env = env

        # Main model
        self.model = self.create_model()

        # Target network
        self.target_model = self.create_model()
        self.target_model.set_weights(self.model.get_weights())

        # An array with last n steps for training
        self.replay_memory = deque(maxlen=REPLAY_MEMORY_SIZE)

        # Custom tensorboard object
        self.tensorboard = ModifiedTensorBoard(
            log_dir="logs/{}-{}".format(MODEL_NAME, int(time())))

        # Used to count when to update target network with main network's weights
        self.target_update_counter = 0

    def create_model(self):
        model = Sequential([
            Dense(32, input_shape=self.env.OBSERVATION_SPACE_VALUES),
            Activation('relu'),
            Dense(self.env.ACTION_SPACE_SIZE, activation='linear'),
        ])
        model.compile(loss="mse", optimizer=Adam(
            lr=0.001), metrics=['accuracy'])
        return model

    # Adds step's data to a memory replay array
    # (observation space, action, reward, new observation space, done)
    def update_replay_memory(self, transition):
        self.replay_memory.append(transition)

    # Trains main network every step during episode
    def train(self, terminal_state, step):

        # Start training only if certain number of samples is already saved
        if len(self.replay_memory) < MIN_REPLAY_MEMORY_SIZE:
            return

        # Get a minibatch of random samples from memory replay table
        minibatch = random.sample(self.replay_memory, MINIBATCH_SIZE)

        # Get current states from minibatch, then query NN model for Q values
        current_states = np.array([transition[0]
                                   for transition in minibatch])
        current_qs_list = self.model.predict(current_states)

        # Get future states from minibatch, then query NN model for Q values
        # When using target network, query it, otherwise main network should be queried
        new_current_states = np.array(
            [transition[3] for transition in minibatch])
        future_qs_list = self.target_model.predict(new_current_states)

        X = []
        y = []

        # Now we need to enumerate our batches
        for index, (current_state, action, reward, new_current_state, done) in enumerate(minibatch):

            # If not a terminal state, get new q from future states, otherwise use the raw reward
            # almost like with Q Learning, but we use just part of equation here
            if not done:
                max_future_q = np.max(future_qs_list[index])
                new_q = reward + DISCOUNT * max_future_q
            else:
                new_q = reward

            # Update Q value for given state
            current_qs = current_qs_list[index]
            current_qs[action] = new_q

            # And append to our training data
            X.append(current_state)
            y.append(current_qs)

        # Fit on all samples as one batch, log only on terminal state
        self.model.fit(np.array(X), np.array(y), batch_size=MINIBATCH_SIZE, verbose=0,
                       shuffle=False, callbacks=[self.tensorboard] if terminal_state else None)

        # Update target network counter every episode
        if terminal_state:
            self.target_update_counter += 1

        # If counter reaches set value, update target network with weights of main network
        if self.target_update_counter > UPDATE_TARGET_EVERY:
            self.target_model.set_weights(self.model.get_weights())
            self.target_update_counter = 0

    # Queries main network for Q values given current observation space (environment state)
    def get_qs(self, state):
        return self.model.predict(np.array(state).reshape(-1, *state.shape))[0]
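The core of `train()` is the one-step Q-learning target `new_q = reward + DISCOUNT * max_future_q`, computed from the target network's predictions. A self-contained numeric illustration (the reward and Q values below are invented for the example, not produced by the environment):

import numpy as np

DISCOUNT = 0.99
reward = -0.1                                           # e.g. the small per-step cost
future_qs = np.array([0.2, 1.5, -0.3, 0.0, 0.7, 0.1])   # target-network Qs for the next state
new_q = reward + DISCOUNT * np.max(future_qs)           # non-terminal transition
print(round(new_q, 3))  # 1.385; for a terminal transition the target is just the reward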
Deep_Q_Learning/GC_Env.py (new file, 73 lines)
@@ -0,0 +1,73 @@
from models.Garbage_Collector import Garbage_Collector
from helpler import Render_Element
from models.House import House
from config import MAP_WIDTH, MAP_HEIGHT
import numpy as np


class GC_Env:
    OBSERVATION_SPACE_VALUES = (2 + 1 * 4 + 6 * 4,)
    ACTION_SPACE_SIZE = 6

    def __init__(self):
        self.reset()
        self.actions = {
            0: self.gc.move_up,
            1: self.gc.move_down,
            2: self.gc.move_left,
            3: self.gc.move_right,
            4: self.gc.pick_trash,
            5: self.gc.leave_trash
        }
        self.runs = 0

    def reset(self):
        self.draw_items = {(x, y): Render_Element(x, y)
                           for x in range(MAP_WIDTH) for y in range(MAP_HEIGHT)}
        self.gc = Garbage_Collector(self.draw_items)
        houses = list(map(lambda item: self.draw_items[item], list(filter(lambda item: isinstance(
            self.draw_items[item], House), self.draw_items))))
        observation = [
            self.gc.col / (MAP_WIDTH - 1),
            self.gc.row / (MAP_HEIGHT - 1),
            self.gc.mixed / self.gc.limit, self.gc.paper / self.gc.limit,
            self.gc.glass / self.gc.limit, self.gc.plastic / self.gc.limit,
        ]
        for house in houses:
            for item in ["mixed", "paper", "glass", "plastic"]:
                observation.append(getattr(house, item) / house.limit)

        return observation

    def step(self, action):
        action_result = self.actions[action]()
        houses = list(map(lambda item: self.draw_items[item], list(filter(lambda item: isinstance(
            self.draw_items[item], House), self.draw_items))))

        new_observation = [
            self.gc.col / (MAP_WIDTH - 1),
            self.gc.row / (MAP_HEIGHT - 1),
            self.gc.mixed / self.gc.limit, self.gc.paper / self.gc.limit,
            self.gc.glass / self.gc.limit, self.gc.plastic / self.gc.limit,
        ]
        for house in houses:
            for item in ["mixed", "paper", "glass", "plastic"]:
                new_observation.append(getattr(house, item) / house.limit)

        if action_result == False:
            reward = -10
        elif action_result == True:
            reward = -0.1
        else:
            reward = action_result

        done = True
        if not self.gc.is_empty():
            done = False
        else:
            for item in self.draw_items:
                if isinstance(self.draw_items[item], House) and not self.draw_items[item].is_empty():
                    done = False
                    break

        return new_observation, reward, done
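`OBSERVATION_SPACE_VALUES = (2 + 1 * 4 + 6 * 4,)` works out to `(30,)`: 2 entries for the collector's position, 4 for its own fill levels, and 4 fill levels per house, which implies a map with 6 houses (the house count is inferred from the constant, not stated anywhere in the diff). A quick stand-alone check:

N_HOUSES = 6  # assumption inferred from the constant above
obs_len = 2 + 4 + 4 * N_HOUSES
assert obs_len == 30 == 2 + 1 * 4 + 6 * 4
print(obs_len)  # 30 entries in the flat list built by reset() and step()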
Deep_Q_Learning/__init__.py (new empty file)
@@ -25,7 +25,7 @@ MIN_REWARD = -200  # For model save
 MEMORY_FRACTION = 0.20

 # Environment settings
-EPISODES = 20_0
+EPISODES = 20_000

 # Exploration settings
 epsilon = 1  # not a constant, going to be decayed

@@ -335,6 +335,8 @@ class DQNAgent:
         self.model.fit(np.array(X)/255, np.array(y), batch_size=MINIBATCH_SIZE, verbose=0,
                        shuffle=False, callbacks=[self.tensorboard] if terminal_state else None)

+        arr = np.array(X)/255
+
         # Update target network counter every episode
         if terminal_state:
             self.target_update_counter += 1
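The second hunk, against a pre-existing module, stores the same `/255` scaling that the `fit` call above already applies. Dividing a uint8 image batch by 255 maps pixel values into [0, 1]; a minimal illustration with invented values (this snippet is not part of the commit):

import numpy as np

X = [np.array([[0, 127, 255]], dtype=np.uint8)]  # one tiny fake "image" observation
arr = np.array(X) / 255
print(arr)  # [[[0.  0.498...  1.]]] -- every value now lies in [0, 1]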
Makefile (2 changed lines)
@@ -11,4 +11,4 @@ ql-test:
 	bash -c "source env/bin/activate && python ql_tester.py"

 dql:
-	bash -c "source env/bin/activate && python dql.py"
+	bash -c "source env/bin/activate && python dql_runner.py"
compile.py (deleted file, 16 lines)
@@ -1,16 +0,0 @@
from distutils.core import setup
from distutils.extension import Extension
from Cython.Distutils import build_ext

ext_modules = [
    Extension("q_learning", ["q_learning.py"]),
    # ... all your modules that need to be compiled ...
    Extension("ql_runner", ["ql_runner.py"]),
]

setup(
    name='ql-learn',
    cmdclass={'build_ext': build_ext},
    ext_modules=ext_modules
)
dql_runner.py (new file, 96 lines)
@@ -0,0 +1,96 @@
from Deep_Q_Learning.Deep_Q_Learning import DQNAgent
from numpy import genfromtxt
import os
import csv
import time  # needed for time.time() in the model checkpoint filename below
import numpy as np
from tqdm import tqdm
from Deep_Q_Learning.GC_Env import GC_Env

DISCOUNT = 0.99
REPLAY_MEMORY_SIZE = 50_000  # How many last steps to keep for model training
# Minimum number of steps in a memory to start training
MIN_REPLAY_MEMORY_SIZE = 1_000
MINIBATCH_SIZE = 64  # How many steps (samples) to use for training
UPDATE_TARGET_EVERY = 5  # Terminal states (end of episodes)
MODEL_NAME = '2x256'
MIN_REWARD = -200  # For model save
MEMORY_FRACTION = 0.20

# Environment settings
EPISODES = 20_000

# Exploration settings
epsilon = 1  # not a constant, going to be decayed
EPSILON_DECAY = 0.99975
MIN_EPSILON = 0.001

# Stats settings
AGGREGATE_STATS_EVERY = 50  # episodes
SHOW_PREVIEW = False

env = GC_Env()

# For stats
ep_rewards = [-200]

agent = DQNAgent(env=env)

for episode in tqdm(range(1, EPISODES + 1), ascii=True, unit='episodes'):

    # Update tensorboard step every episode
    agent.tensorboard.step = episode

    # Restarting episode - reset episode reward and step number
    episode_reward = 0
    step = 1

    # Reset environment and get initial state
    current_state = env.reset()

    # Reset flag and start iterating until episode ends
    done = False
    while not done:

        # This part stays mostly the same, the change is to query a model for Q values
        if np.random.random() > epsilon:
            # Get action from Q network
            action = np.argmax(agent.get_qs(current_state))
        else:
            # Get random action
            action = np.random.randint(0, env.ACTION_SPACE_SIZE)

        new_state, reward, done = env.step(action)

        # Count episode reward
        episode_reward += reward

        if SHOW_PREVIEW and not episode % AGGREGATE_STATS_EVERY:
            env.render()

        # Every step we update replay memory and train main network
        agent.update_replay_memory(
            (current_state, action, reward, new_state, done))
        agent.train(done, step)

        current_state = new_state
        step += 1

    # Append episode reward to a list and log stats (every given number of episodes)
    ep_rewards.append(episode_reward)
    if not episode % AGGREGATE_STATS_EVERY or episode == 1:
        average_reward = sum(
            ep_rewards[-AGGREGATE_STATS_EVERY:])/len(ep_rewards[-AGGREGATE_STATS_EVERY:])
        min_reward = min(ep_rewards[-AGGREGATE_STATS_EVERY:])
        max_reward = max(ep_rewards[-AGGREGATE_STATS_EVERY:])
        agent.tensorboard.update_stats(
            reward_avg=average_reward, reward_min=min_reward, reward_max=max_reward, epsilon=epsilon)

        # Save model, but only when min reward is greater than or equal to a set value
        if min_reward >= MIN_REWARD:
            agent.model.save(
                f'models/{MODEL_NAME}__{max_reward:_>7.2f}max_{average_reward:_>7.2f}avg_{min_reward:_>7.2f}min__{int(time.time())}.model')

    # Decay epsilon
    if epsilon > MIN_EPSILON:
        epsilon *= EPSILON_DECAY
        epsilon = max(MIN_EPSILON, epsilon)
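One consequence of the exploration schedule above: with `EPSILON_DECAY = 0.99975` applied once per episode, epsilon never actually reaches `MIN_EPSILON` within the configured 20 000 episodes. A small stand-alone check using only the constants defined above:

import math

EPSILON_DECAY = 0.99975
MIN_EPSILON = 0.001
EPISODES = 20_000

episodes_to_floor = math.ceil(math.log(MIN_EPSILON) / math.log(EPSILON_DECAY))
epsilon_at_end = EPSILON_DECAY ** EPISODES

print(episodes_to_floor)         # 27628 decay steps needed to hit the floor
print(round(epsilon_at_end, 4))  # 0.0067 -- the agent still explores ~0.7% of the time at the end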
models/__init__.py (new empty file)