styling

parent c230182b01
commit 2d57249021
@@ -1,16 +1,11 @@
import numpy as np
import random
from collections import deque
from datetime import datetime
from time import asctime
import keras.backend.tensorflow_backend as backend
from keras import backend as K
import numpy as np
from keras.models import Sequential
from keras.layers import Dense, Dropout, Input, Activation, Flatten, Conv2D
from keras.optimizers import Adam
from keras.layers import Dense
from keras.callbacks import TensorBoard
import tensorflow as tf
from collections import deque
import random
from Deep_Q_Learning.GC_Env import GC_Env

DISCOUNT = 0.9
REPLAY_MEMORY_SIZE = 500_000  # How many last steps to keep for model training
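REPLAY_MEMORY_SIZE caps a deque that acts as the replay buffer. A minimal standalone sketch of that behaviour (the tiny sizes and dummy states are illustrative only; the real transition is the 6-tuple this commit introduces):

import random
from collections import deque

replay_memory = deque(maxlen=5)  # the agent uses maxlen=500_000

# Once maxlen is reached, append() silently drops the oldest entry,
# so the buffer always holds the most recent transitions.
for step in range(8):
    transition = (f"s{step}", 0, -1, f"s{step + 1}", f"s{step - 1}", False)
    replay_memory.append(transition)

assert len(replay_memory) == 5
minibatch = random.sample(replay_memory, 3)  # uniform sampling breaks step-to-step correlation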
@@ -40,7 +35,7 @@ class ModifiedTensorBoard(TensorBoard):

    # Overridden, saves logs with our step number
    # (otherwise every .fit() will start writing from 0th step)
-    def on_epoch_end(self, epoch, logs=None):
+    def on_epoch_end(self, _, logs=None):
        self.update_stats(**logs)

    # Overridden
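The epoch argument becomes `_` because Keras hands on_epoch_end the per-call epoch index, which restarts at 0 on every .fit(); the class logs against its own self.step instead, which the training loop advances once per episode. A toy sketch of that idea (class and names are illustrative, not from the repo):

class StepLogger:
    # Keeps one global step across many short training calls.
    def __init__(self):
        self.step = 1
        self.history = {}

    def on_epoch_end(self, _, logs=None):
        # Ignore the local epoch index; record against the global step.
        self.history[self.step] = logs or {}

logger = StepLogger()
for episode in range(1, 4):
    logger.step = episode              # advanced by the outer training loop
    logger.on_epoch_end(0, {"loss": 1.0 / episode})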
@@ -113,7 +108,7 @@ class DQNAgent:
            # self.negative_memory.append(transition)

    # Trains main network every step during episode
-    def train(self, terminal_state, step):
+    def train(self, terminal_state):

        # Start training only if certain number of samples is already saved
        if len(self.replay_memory) < MIN_REPLAY_MEMORY_SIZE:
@@ -133,15 +128,17 @@ class DQNAgent:
            [transition[3] for transition in minibatch])
        future_qs_list = self.target_model.predict(new_current_states)

-        X = []
-        y = []
+        __x__ = []
+        __y__ = []

        # Now we need to enumerate our batches
-        for index, (current_state, action, reward, new_current_state, old_state, done) in enumerate(minibatch):
+        for index, (current_state, action, reward, new_current_state, old_state, done) \
+                in enumerate(minibatch):

            # If not a terminal state, get new q from future states, otherwise set it to 0
            # almost like with Q Learning, but we use just part of equation here
-            if not done and not np.array_equal(current_state, new_current_state) and not np.array_equal(old_state, new_current_state):
+            if not done and not np.array_equal(current_state, new_current_state) and \
+                    not np.array_equal(old_state, new_current_state):
                max_future_q = np.max(future_qs_list[index])
                new_q = reward + DISCOUNT * max_future_q
            else:
@@ -152,11 +149,11 @@ class DQNAgent:
            current_qs[action] = new_q

            # And append to our training data
-            X.append(current_state)
-            y.append(current_qs)
+            __x__.append(current_state)
+            __y__.append(current_qs)

        # Fit on all samples as one batch, log only on terminal state
-        self.model.fit(np.array(X), np.array(y), batch_size=MINIBATCH_SIZE, verbose=0,
+        self.model.fit(np.array(__x__), np.array(__y__), batch_size=MINIBATCH_SIZE, verbose=0,
                       shuffle=False, callbacks=[self.tensorboard] if terminal_state else None)

        # Update target network counter every episode
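The update in this hunk is the sampled Bellman backup, new_q = reward + DISCOUNT * max_future_q, with terminal (and stuck-in-place) transitions pinned to the raw reward. A standalone sketch of just the target computation (the function name is illustrative):

import numpy as np

DISCOUNT = 0.9

def td_target(reward, future_qs, done):
    # Terminal transitions contribute only their immediate reward;
    # otherwise bootstrap from the best action under the target network.
    if done:
        return reward
    return reward + DISCOUNT * np.max(future_qs)

# A non-terminal step with reward -1 and future Qs [0.2, 0.9, 0.1]:
assert abs(td_target(-1, np.array([0.2, 0.9, 0.1]), False) - (-0.19)) < 1e-9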
@@ -1,12 +1,11 @@
import pygame
-from config import CELL_SIZE, MAP_HEIGHT, MAP_WIDTH, MAP, FONT, BLACK, BLUE, GREEN, YELLOW, GARBAGE_COLLECTOR_IMAGE, TRASH_TYPES
-from random import randint
+from config import MAP_HEIGHT, MAP_WIDTH, MAP, TRASH_TYPES
from models.__house__ import House
from models.__numbers__ import Numbers
from models.__trash__ import Trash


-class Garbage_Collector(Numbers):
+class GarbageCollector(Numbers):
    def __init__(self, draw_items):
        self.road_positions = {row_index: {
            col_index: (True if MAP[row_index][col_index] == "Road" else False)
@@ -64,7 +63,8 @@ class Garbage_Collector(Numbers):
            return False

    def pick_trash(self):
-        if self.mixed == self.limit and self.glass == self.limit and self.paper == self.limit and self.plastic == self.limit:
+        if self.mixed == self.limit and self.glass == self.limit and \
+                self.paper == self.limit and self.plastic == self.limit:
            return -1

        to_check = [
@@ -76,7 +76,8 @@ class Garbage_Collector(Numbers):
        houses_around = False
        transfered = 0
        for field in to_check:
-            if field["row"] >= 0 and field["row"] < MAP_HEIGHT and field["col"] >= 0 and field["col"] < MAP_WIDTH:
+            if field["row"] >= 0 and field["row"] < MAP_HEIGHT and \
+                    field["col"] >= 0 and field["col"] < MAP_WIDTH:
                item = self.draw_items[(field["col"], field["row"])]
                if isinstance(item, House):
                    houses_around = True
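The repeated bounds guard could also be written with Python's chained comparisons; a sketch of the equivalent check (a possible further cleanup, not something this commit does):

MAP_HEIGHT, MAP_WIDTH = 7, 9  # values from config in this diff

def in_bounds(field):
    # Equivalent to the four explicit >= / < comparisons above.
    return 0 <= field["row"] < MAP_HEIGHT and 0 <= field["col"] < MAP_WIDTH

assert in_bounds({"row": 0, "col": 8})
assert not in_bounds({"row": 7, "col": 0})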
@@ -94,8 +95,7 @@ class Garbage_Collector(Numbers):

        if houses_around and transfered:
            return 1
-        else:
-            return -1
+        return -1

    def leave_trash(self):
        to_check = [
@@ -107,7 +107,8 @@ class Garbage_Collector(Numbers):
        transfered = 0
        trashes_around = False
        for field in to_check:
-            if field["row"] >= 0 and field["row"] < MAP_HEIGHT and field["col"] >= 0 and field["col"] < MAP_WIDTH:
+            if field["row"] >= 0 and field["row"] < MAP_HEIGHT and \
+                    field["col"] >= 0 and field["col"] < MAP_WIDTH:
                item = self.draw_items[(field["col"], field["row"])]
                if isinstance(item, Trash):
                    trashes_around = True
@@ -121,5 +122,4 @@ class Garbage_Collector(Numbers):

        if trashes_around and transfered:
            return 1
-        else:
-            return -1
+        return -1
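Both of these hunks apply the same early-return styling; the two forms are behaviourally identical, e.g.:

def reward_for(transferred_anything):  # illustrative name
    # Post-commit form: no else branch needed after a return.
    if transferred_anything:
        return 1
    return -1

assert reward_for(2) == 1 and reward_for(0) == -1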
@@ -1,36 +1,40 @@
-from Deep_Q_Learning.q_gc import Garbage_Collector
+import numpy as np
+from Deep_Q_Learning.__gc__ import GarbageCollector
from helpler import __render_element__
from models.__house__ import House
from models.__road__ import Road
from config import MAP_WIDTH, MAP_HEIGHT, NUMBER_OF_HOUSES
-import numpy as np
-from timeit import default_timer as timer


-class GC_Env:
+class GcEnv:
    OBSERVATION_SPACE_VALUES = (36 + NUMBER_OF_HOUSES,)
    ACTION_SPACE_SIZE = 6

+    def __init__(self):
+        self.draw_items = None
+        self.__gc__ = None
+        self.actions = None
+
    def reset(self):
        self.draw_items = {(x, y): __render_element__(x, y)
                           for x in range(MAP_WIDTH) for y in range(MAP_HEIGHT)}
-        self.gc = Garbage_Collector(self.draw_items)
+        self.__gc__ = GarbageCollector(self.draw_items)
        self.actions = {
-            0: self.gc.move_up,
-            1: self.gc.move_down,
-            2: self.gc.move_left,
-            3: self.gc.move_right,
-            4: self.gc.pick_trash,
-            5: self.gc.leave_trash
+            0: self.__gc__.move_up,
+            1: self.__gc__.move_down,
+            2: self.__gc__.move_left,
+            3: self.__gc__.move_right,
+            4: self.__gc__.pick_trash,
+            5: self.__gc__.leave_trash
        }

-        return self.observe(self.gc, self.draw_items)
+        return self.observe(self.__gc__, self.draw_items)

-    def observe(self, gc, draw_items):
+    def observe(self, __gc__, draw_items):
        roads = list(filter(lambda item: isinstance(
            draw_items[item], Road), draw_items))

-        gc_pos = roads.index((gc.col, gc.row))
+        gc_pos = roads.index((__gc__.col, __gc__.row))

        observation = np.full(self.OBSERVATION_SPACE_VALUES, -1)
        observation[gc_pos] = 1
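observe() builds a fixed-length vector filled with -1 and flips the collector's road-cell index to 1; the remaining NUMBER_OF_HOUSES slots are presumably filled further down the method. A standalone sketch of the position encoding, assuming NUMBER_OF_HOUSES = 4:

import numpy as np

NUMBER_OF_HOUSES = 4  # assumed here; the repo reads it from config
OBSERVATION_SPACE_VALUES = (36 + NUMBER_OF_HOUSES,)

def encode(road_index):
    # Everything starts at -1; only the agent's current road cell is 1.
    observation = np.full(OBSERVATION_SPACE_VALUES, -1)
    observation[road_index] = 1
    return observation

obs = encode(road_index=12)
assert obs.shape == (40,) and obs[12] == 1 and obs.sum() == -38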
@@ -53,7 +57,7 @@ class GC_Env:
    def step(self, action):
        action_result = self.actions[action]()

-        new_observation = self.observe(self.gc, self.draw_items)
+        new_observation = self.observe(self.__gc__, self.draw_items)

        if action_result is False:
            reward = -1
@@ -63,7 +67,7 @@ class GC_Env:
            reward = action_result

        done = True
-        if not self.gc.is_empty():
+        if not self.__gc__.is_empty():
            done = False
        else:
            for item in self.draw_items:
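Taken together, reset()/step()/observe() give the environment a Gym-like surface. A hedged usage sketch against the renamed API (the fixed action 0 is a placeholder; the training script later in this diff drives it with an epsilon-greedy policy):

from Deep_Q_Learning.__gc_env__ import GcEnv  # module path as imported later in this diff

env = GcEnv()
state = env.reset()
done = False
while not done:
    action = 0                      # indices 0-5 map to the actions dict above
    state, reward, done = env.step(action)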
Deep_Q_Learning/__init__.py (new, empty file)
@@ -1,417 +0,0 @@
import numpy as np
import keras.backend.tensorflow_backend as backend
from keras.models import Sequential
from keras.layers import Dense, Dropout, Conv2D, MaxPooling2D, Activation, Flatten
from keras.optimizers import Adam
from keras.callbacks import TensorBoard
import tensorflow as tf
from collections import deque
import time
import random
from tqdm import tqdm
import os
from PIL import Image
import cv2


DISCOUNT = 0.99
REPLAY_MEMORY_SIZE = 50_000  # How many last steps to keep for model training
# Minimum number of steps in a memory to start training
MIN_REPLAY_MEMORY_SIZE = 1_000
MINIBATCH_SIZE = 64  # How many steps (samples) to use for training
UPDATE_TARGET_EVERY = 5  # Terminal states (end of episodes)
MODEL_NAME = '2x256'
MIN_REWARD = -200  # For model save
MEMORY_FRACTION = 0.20

# Environment settings
EPISODES = 20_000

# Exploration settings
epsilon = 1  # not a constant, going to be decayed
EPSILON_DECAY = 0.99975
MIN_EPSILON = 0.001

# Stats settings
AGGREGATE_STATS_EVERY = 50  # episodes
SHOW_PREVIEW = False


class Blob:
    def __init__(self, size):
        self.size = size
        self.x = np.random.randint(0, size)
        self.y = np.random.randint(0, size)

    def __str__(self):
        return f"Blob ({self.x}, {self.y})"

    def __sub__(self, other):
        return (self.x-other.x, self.y-other.y)

    def __eq__(self, other):
        return self.x == other.x and self.y == other.y

    def action(self, choice):
        '''
        Gives us 9 total movement options. (0,1,2,3,4,5,6,7,8)
        '''
        if choice == 0:
            self.move(x=1, y=1)
        elif choice == 1:
            self.move(x=-1, y=-1)
        elif choice == 2:
            self.move(x=-1, y=1)
        elif choice == 3:
            self.move(x=1, y=-1)

        elif choice == 4:
            self.move(x=1, y=0)
        elif choice == 5:
            self.move(x=-1, y=0)

        elif choice == 6:
            self.move(x=0, y=1)
        elif choice == 7:
            self.move(x=0, y=-1)

        elif choice == 8:
            self.move(x=0, y=0)

    def move(self, x=False, y=False):

        # If no value for x, move randomly
        if not x:
            self.x += np.random.randint(-1, 2)
        else:
            self.x += x

        # If no value for y, move randomly
        if not y:
            self.y += np.random.randint(-1, 2)
        else:
            self.y += y

        # If we are out of bounds, fix!
        if self.x < 0:
            self.x = 0
        elif self.x > self.size-1:
            self.x = self.size-1
        if self.y < 0:
            self.y = 0
        elif self.y > self.size-1:
            self.y = self.size-1


class BlobEnv:
    SIZE = 10
    RETURN_IMAGES = True
    MOVE_PENALTY = 1
    ENEMY_PENALTY = 300
    FOOD_REWARD = 25
    OBSERVATION_SPACE_VALUES = (SIZE, SIZE, 3)  # 4
    ACTION_SPACE_SIZE = 9
    PLAYER_N = 1  # player key in dict
    FOOD_N = 2  # food key in dict
    ENEMY_N = 3  # enemy key in dict
    # the dict! (colors)
    d = {1: (255, 175, 0),
         2: (0, 255, 0),
         3: (0, 0, 255)}

    def reset(self):
        self.player = Blob(self.SIZE)
        self.food = Blob(self.SIZE)
        while self.food == self.player:
            self.food = Blob(self.SIZE)
        self.enemy = Blob(self.SIZE)
        while self.enemy == self.player or self.enemy == self.food:
            self.enemy = Blob(self.SIZE)

        self.episode_step = 0

        if self.RETURN_IMAGES:
            observation = np.array(self.get_image())
        else:
            observation = (self.player-self.food) + (self.player-self.enemy)
        return observation

    def step(self, action):
        self.episode_step += 1
        self.player.action(action)

        #### MAYBE ###
        # enemy.move()
        # food.move()
        ##############

        if self.RETURN_IMAGES:
            new_observation = np.array(self.get_image())
        else:
            new_observation = (self.player-self.food) + \
                (self.player-self.enemy)

        if self.player == self.enemy:
            reward = -self.ENEMY_PENALTY
        elif self.player == self.food:
            reward = self.FOOD_REWARD
        else:
            reward = -self.MOVE_PENALTY

        done = False
        if reward == self.FOOD_REWARD or reward == -self.ENEMY_PENALTY or self.episode_step >= 200:
            done = True

        return new_observation, reward, done

    def render(self):
        img = self.get_image()
        # resizing so we can see our agent in all its glory.
        img = img.resize((300, 300))
        cv2.imshow("image", np.array(img))  # show it!
        cv2.waitKey(1)

    # FOR CNN #
    def get_image(self):
        # starts an rgb array of our size
        env = np.zeros((self.SIZE, self.SIZE, 3), dtype=np.uint8)
        # sets the food location tile to green color
        env[self.food.x][self.food.y] = self.d[self.FOOD_N]
        # sets the enemy location to red
        env[self.enemy.x][self.enemy.y] = self.d[self.ENEMY_N]
        # sets the player tile to blue
        env[self.player.x][self.player.y] = self.d[self.PLAYER_N]
        # reading to rgb. Apparently. Even though color definitions are bgr. ???
        img = Image.fromarray(env, 'RGB')
        return img


env = BlobEnv()

# For stats
ep_rewards = [-200]

# For reproducible results
random.seed(1)
np.random.seed(1)
tf.set_random_seed(1)

# Memory fraction, used mostly when training multiple agents
# gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=MEMORY_FRACTION)
# backend.set_session(tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)))

# Create models folder
if not os.path.isdir('models'):
    os.makedirs('models')


# Own Tensorboard class
class ModifiedTensorBoard(TensorBoard):

    # Overriding init to set initial step and writer (we want one log file for all .fit() calls)
    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        self.step = 1
        self.writer = tf.summary.FileWriter(self.log_dir)

    # Overriding this method to stop creating default log writer
    def set_model(self, model):
        pass

    # Overridden, saves logs with our step number
    # (otherwise every .fit() will start writing from 0th step)
    def on_epoch_end(self, epoch, logs=None):
        self.update_stats(**logs)

    # Overridden
    # We train for one batch only, no need to save anything at epoch end
    def on_batch_end(self, batch, logs=None):
        pass

    # Overridden, so won't close writer
    def on_train_end(self, _):
        pass

    # Custom method for saving own metrics
    # Creates writer, writes custom metrics and closes writer
    def update_stats(self, **stats):
        self._write_logs(stats, self.step)


# Agent class
class DQNAgent:
    def __init__(self):

        # Main model
        self.model = self.create_model()

        # Target network
        self.target_model = self.create_model()
        self.target_model.set_weights(self.model.get_weights())

        # An array with last n steps for training
        self.replay_memory = deque(maxlen=REPLAY_MEMORY_SIZE)

        # Custom tensorboard object
        self.tensorboard = ModifiedTensorBoard(
            log_dir="logs/{}-{}".format(MODEL_NAME, int(time.time())))

        # Used to count when to update target network with main network's weights
        self.target_update_counter = 0

    def create_model(self):
        model = Sequential()

        # OBSERVATION_SPACE_VALUES = (10, 10, 3) a 10x10 RGB image.
        model.add(Conv2D(256, (3, 3), input_shape=env.OBSERVATION_SPACE_VALUES))
        model.add(Activation('relu'))
        model.add(MaxPooling2D(pool_size=(2, 2)))
        model.add(Dropout(0.2))

        model.add(Conv2D(256, (3, 3)))
        model.add(Activation('relu'))
        model.add(MaxPooling2D(pool_size=(2, 2)))
        model.add(Dropout(0.2))

        # this converts our 3D feature maps to 1D feature vectors
        model.add(Flatten())
        model.add(Dense(64))

        # ACTION_SPACE_SIZE = how many choices (9)
        model.add(Dense(env.ACTION_SPACE_SIZE, activation='linear'))
        model.compile(loss="mse", optimizer=Adam(
            lr=0.001), metrics=['accuracy'])
        print(model.summary())
        return model

    # Adds step's data to a memory replay array
    # (observation space, action, reward, new observation space, done)
    def update_replay_memory(self, transition):
        self.replay_memory.append(transition)

    # Trains main network every step during episode
    def train(self, terminal_state, step):

        # Start training only if certain number of samples is already saved
        if len(self.replay_memory) < MIN_REPLAY_MEMORY_SIZE:
            return

        # Get a minibatch of random samples from memory replay table
        minibatch = random.sample(self.replay_memory, MINIBATCH_SIZE)

        # Get current states from minibatch, then query NN model for Q values
        current_states = np.array([transition[0]
                                   for transition in minibatch])/255
        current_qs_list = self.model.predict(current_states)

        # Get future states from minibatch, then query NN model for Q values
        # When using target network, query it, otherwise main network should be queried
        new_current_states = np.array(
            [transition[3] for transition in minibatch])/255
        future_qs_list = self.target_model.predict(new_current_states)

        X = []
        y = []

        # Now we need to enumerate our batches
        for index, (current_state, action, reward, new_current_state, done) in enumerate(minibatch):

            # If not a terminal state, get new q from future states, otherwise set it to 0
            # almost like with Q Learning, but we use just part of equation here
            if not done:
                max_future_q = np.max(future_qs_list[index])
                new_q = reward + DISCOUNT * max_future_q
            else:
                new_q = reward

            # Update Q value for given state
            current_qs = current_qs_list[index]
            current_qs[action] = new_q

            # And append to our training data
            X.append(current_state)
            y.append(current_qs)

        # Fit on all samples as one batch, log only on terminal state
        self.model.fit(np.array(X)/255, np.array(y), batch_size=MINIBATCH_SIZE, verbose=0,
                       shuffle=False, callbacks=[self.tensorboard] if terminal_state else None)

        # Update target network counter every episode
        if terminal_state:
            self.target_update_counter += 1

        # If counter reaches set value, update target network with weights of main network
        if self.target_update_counter > UPDATE_TARGET_EVERY:
            self.target_model.set_weights(self.model.get_weights())
            self.target_update_counter = 0

    # Queries main network for Q values given current observation space (environment state)
    def get_qs(self, state):
        arr = np.array(state).reshape(-1, *state.shape)/255
        rate = self.model.predict(
            np.array(state).reshape(-1, *state.shape)/255)[0]
        return rate


agent = DQNAgent()

# Iterate over episodes
for episode in tqdm(range(1, EPISODES + 1), ascii=True, unit='episodes'):

    # Update tensorboard step every episode
    agent.tensorboard.step = episode

    # Restarting episode - reset episode reward and step number
    episode_reward = 0
    step = 1

    # Reset environment and get initial state
    current_state = env.reset()

    # Reset flag and start iterating until episode ends
    done = False
    while not done:

        # This part stays mostly the same, the change is to query a model for Q values
        if np.random.random() > epsilon:
            # Get action from Q table
            action = np.argmax(agent.get_qs(current_state))
        else:
            # Get random action
            action = np.random.randint(0, env.ACTION_SPACE_SIZE)

        new_state, reward, done = env.step(action)

        # Transform new continuous state to new discrete state and count reward
        episode_reward += reward

        if SHOW_PREVIEW and not episode % AGGREGATE_STATS_EVERY:
            env.render()

        # Every step we update replay memory and train main network
        agent.update_replay_memory(
            (current_state, action, reward, new_state, done))
        agent.train(done, step)

        current_state = new_state
        step += 1

    # Append episode reward to a list and log stats (every given number of episodes)
    ep_rewards.append(episode_reward)
    if not episode % AGGREGATE_STATS_EVERY or episode == 1:
        average_reward = sum(
            ep_rewards[-AGGREGATE_STATS_EVERY:])/len(ep_rewards[-AGGREGATE_STATS_EVERY:])
        min_reward = min(ep_rewards[-AGGREGATE_STATS_EVERY:])
        max_reward = max(ep_rewards[-AGGREGATE_STATS_EVERY:])
        agent.tensorboard.update_stats(
            reward_avg=average_reward, reward_min=min_reward, reward_max=max_reward, epsilon=epsilon)

        # Save model, but only when min reward is greater than or equal to a set value
        if min_reward >= MIN_REWARD:
            agent.model.save(
                f'models/{MODEL_NAME}__{max_reward:_>7.2f}max_{average_reward:_>7.2f}avg_{min_reward:_>7.2f}min__{int(time.time())}.model')

    # Decay epsilon
    if epsilon > MIN_EPSILON:
        epsilon *= EPSILON_DECAY
        epsilon = max(MIN_EPSILON, epsilon)
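The deleted file appears to be the generic blob-world tutorial agent this repo's DQN code was adapted from. Its delayed target-network copy is worth restating in isolation; a minimal sketch, assuming any pair of models with Keras-style get_weights()/set_weights():

UPDATE_TARGET_EVERY = 5  # terminal states between syncs, as above

class TargetSync:
    def __init__(self, model, target_model):
        self.model = model
        self.target_model = target_model
        self.target_update_counter = 0

    def on_terminal_state(self):
        # The target network stays frozen between syncs so the
        # bootstrapped Q targets are stable while the main network trains.
        self.target_update_counter += 1
        if self.target_update_counter > UPDATE_TARGET_EVERY:
            self.target_model.set_weights(self.model.get_weights())
            self.target_update_counter = 0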
Knowledge.py
@@ -1,52 +0,0 @@
# from models.House import House
# from models.Trash import Trash


# class Knowledge:
#     def __init__(self, draw_items, gc):
#         self.draw_items = draw_items
#         self.gc = gc
#         self.update()

#     def add_to_dict(self, item, quantity, trash_quantity_variable):
#         if quantity in trash_quantity_variable:
#             trash_quantity_variable[quantity].append(
#                 {"col": item.col, "row": item.row})
#         else:
#             trash_quantity_variable[quantity] = [
#                 {"col": item.col, "row": item.row}]

#     def update(self):
#         self.mixed_trash_quantity_houses = {}
#         self.paper_trash_quantity_houses = {}
#         self.glass_trash_quantity_houses = {}
#         self.plastic_trash_quantity_houses = {}
#         self.trashes = {}
#         for item in self.draw_items:

#             if isinstance(self.draw_items[item], House):
#                 if not self.draw_items[item].mixed and not self.draw_items[item].paper and not self.draw_items[item].glass and not self.draw_items[item].plastic:
#                     # print(self.draw_items[item].col, self.draw_items[item].row)
#                     pass
#                 if self.draw_items[item].mixed:
#                     self.add_to_dict(self.draw_items[item], self.draw_items[item].mixed,
#                                      self.mixed_trash_quantity_houses)
#                 if self.draw_items[item].paper:
#                     self.add_to_dict(self.draw_items[item], self.draw_items[item].paper,
#                                      self.paper_trash_quantity_houses)
#                 if self.draw_items[item].glass:
#                     self.add_to_dict(self.draw_items[item], self.draw_items[item].glass,
#                                      self.glass_trash_quantity_houses)
#                 if self.draw_items[item].plastic:
#                     self.add_to_dict(self.draw_items[item], self.draw_items[item].plastic,
#                                      self.plastic_trash_quantity_houses)
#             elif isinstance(self.draw_items[item], Trash):
#                 self.trashes[self.draw_items[item].trash_type] = {
#                     "col": self.draw_items[item].col, "row": self.draw_items[item].row, "trash": self.draw_items[item].trash}

#     def show(self):
#         print({"Trash": {"mixed": self.trashes["Mixed"], "glass": self.trashes["Paper"],
#                          "paper": self.trashes["Glass"], "plastic": self.trashes["Plastic"]}},
#               {"Garbage Collector": {"mixed": self.gc.mixed, "glass": self.gc.glass,
#                                      "paper": self.gc.paper, "plastic": self.gc.plastic}}
#               )
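For the record, the deleted add_to_dict grouped house coordinates by trash quantity; a defaultdict expresses the same grouping without the key-exists branch (a sketch, with a plain dict standing in for the item object):

from collections import defaultdict

quantity_houses = defaultdict(list)

def add_to_dict(item, quantity):
    # Same effect as the deleted if/else: the list is created on first use.
    quantity_houses[quantity].append({"col": item["col"], "row": item["row"]})

add_to_dict({"col": 2, "row": 3}, quantity=5)
add_to_dict({"col": 4, "row": 1}, quantity=5)
assert len(quantity_houses[5]) == 2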
@@ -1,4 +1,3 @@
-from platform import system
CELL_SIZE = 64
MAP_HEIGHT = 7
MAP_WIDTH = 9
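These constants size the grid; assuming the usual cell-grid arithmetic (CELL_SIZE presumably still drives rendering elsewhere), the full window works out to:

CELL_SIZE = 64
MAP_HEIGHT = 7
MAP_WIDTH = 9

WINDOW_SIZE = (MAP_WIDTH * CELL_SIZE, MAP_HEIGHT * CELL_SIZE)  # (576, 448), assumed use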
@@ -1,11 +1,9 @@
-import time
+from datetime import datetime
import numpy as np
+from numpy.random import random
from tqdm import tqdm
from Deep_Q_Learning.Deep_Q_Learning import DQNAgent, MODEL_NAME
-from Deep_Q_Learning.GC_Env import GC_Env
-from keras.utils import plot_model
-from keras.models import load_model
-from datetime import datetime
+from Deep_Q_Learning.__gc_env__ import GcEnv

MIN_REWARD = 0  # For model save
STEP_LIMIT = 500
@@ -14,37 +12,37 @@ STEP_LIMIT = 500
EPISODES = 20_000

# Exploration settings
-epsilon = 1  # not a constant, going to be decayed
+EPSILON = 1  # not a constant, going to be decayed
EPSILON_DECAY = 0.99975
MIN_EPSILON = 0.01

# Stats settings
AGGREGATE_STATS_EVERY = 20  # episodes

-env = GC_Env()
+ENV = GcEnv()

# For stats
-ep_rewards = []
-steps = []
+EP_REWARDS = []
+STEPS = []

# model = load_model(
-#     'trained_models\\lr=0.001_gamma=0.5___-35.90max_-1172.30avg_-4394.80min__2020-05-01_23-03.model')
+#     '')

-model = None
+MODEL = None

-agent = DQNAgent(env=env, model=model)
+AGENT = DQNAgent(env=ENV, model=MODEL)

for episode in tqdm(range(1, EPISODES + 1), ascii=True, unit='episodes'):

    # Update tensorboard step every episode
-    agent.tensorboard.step = episode
+    AGENT.tensorboard.step = episode

    # Restarting episode - reset episode reward and step number
    episode_reward = 0
    step = 1

    # Reset environment and get initial state
-    current_state = env.reset()
+    current_state = ENV.reset()
    old_state = np.zeros(1)

    # Reset flag and start iterating until episode ends
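The episode loop below picks actions epsilon-greedily against these constants. A standalone sketch of that policy (the function name is illustrative):

import numpy as np

def choose_action(q_values, epsilon, action_space_size):
    # Exploit the current Q estimate with probability 1 - epsilon,
    # otherwise explore uniformly at random.
    if np.random.random() > epsilon:
        return int(np.argmax(q_values))
    return np.random.randint(0, action_space_size)

action = choose_action(np.array([0.1, 0.7, -0.2]), epsilon=0.05, action_space_size=3)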
@@ -52,54 +50,57 @@ for episode in tqdm(range(1, EPISODES + 1), ascii=True, unit='episodes'):
    while not done and step <= STEP_LIMIT:

        # This part stays mostly the same, the change is to query a model for Q values
-        if np.random.random() > epsilon:
+        if random() > EPSILON:
            # Get action from Q table
-            action = np.argmax(agent.get_qs(current_state))
+            action = np.argmax(AGENT.get_qs(current_state))
        else:
            # Get random action
-            action = np.random.randint(0, env.ACTION_SPACE_SIZE)
+            action = np.random.randint(0, ENV.ACTION_SPACE_SIZE)

-        new_state, reward, done = env.step(action)
+        new_state, reward, done = ENV.step(action)

        # Transform new continuous state to new discrete state and count reward
        episode_reward += reward

        # Every step we update replay memory and train main network
-        agent.update_replay_memory(
+        AGENT.update_replay_memory(
            (current_state, action, reward, new_state, old_state, done))
-        agent.train(done or step >= STEP_LIMIT, step)
+        AGENT.train(done or step >= STEP_LIMIT)

        old_state = current_state
        current_state = new_state
        step += 1

-    agent.tensorboard.update_stats(reward=episode_reward)
+    AGENT.tensorboard.update_stats(reward=episode_reward)

    # Append episode reward to a list and log stats (every given number of episodes)
-    ep_rewards.append(episode_reward)
-    steps.append(step)
+    EP_REWARDS.append(episode_reward)
+    STEPS.append(step)
    if not episode % AGGREGATE_STATS_EVERY or episode == 1:
        average_reward = sum(
-            ep_rewards[-AGGREGATE_STATS_EVERY:]) / len(ep_rewards[-AGGREGATE_STATS_EVERY:])
-        min_reward = min(ep_rewards[-AGGREGATE_STATS_EVERY:])
-        max_reward = max(ep_rewards[-AGGREGATE_STATS_EVERY:])
-        average_steps = sum(steps[-AGGREGATE_STATS_EVERY:]) / \
-            len(steps[-AGGREGATE_STATS_EVERY:])
-        agent.tensorboard.update_stats(
-            reward_avg=average_reward, reward_min=min_reward, reward_max=max_reward, epsilon=epsilon, average_steps=average_steps)
+            EP_REWARDS[-AGGREGATE_STATS_EVERY:]) / len(EP_REWARDS[-AGGREGATE_STATS_EVERY:])
+        min_reward = min(EP_REWARDS[-AGGREGATE_STATS_EVERY:])
+        max_reward = max(EP_REWARDS[-AGGREGATE_STATS_EVERY:])
+        average_steps = sum(STEPS[-AGGREGATE_STATS_EVERY:]) / \
+            len(STEPS[-AGGREGATE_STATS_EVERY:])
+        AGENT.tensorboard.update_stats(
+            reward_avg=average_reward, reward_min=min_reward,
+            reward_max=max_reward, epsilon=EPSILON, average_steps=average_steps)

        # Save model, but only when min reward is greater than or equal to a set value
        if min_reward >= MIN_REWARD:
-            agent.model.save(
-                f'trained_models/{MODEL_NAME}__{max_reward:_>7.2f}max_{average_reward:_>7.2f}avg_{min_reward:_>7.2f}min__{datetime.now().strftime("%Y-%m-%d_%H-%M")}.model')
+            AGENT.model.save(
+                f'trained_models/{MODEL_NAME}__{max_reward:_>7.2f}max_{average_reward:_>7.2f}'
+                f'avg_{min_reward:_>7.2f}min__{datetime.now().strftime("%Y-%m-%d_%H-%M")}.model')

    if not episode % EPISODES:
-        agent.model.save(
-            f'trained_models/{MODEL_NAME}__{max_reward:_>7.2f}max_{average_reward:_>7.2f}avg_{min_reward:_>7.2f}min__{datetime.now().strftime("%Y-%m-%d_%H-%M")}.model')
+        AGENT.model.save(
+            f'trained_models/{MODEL_NAME}__{max_reward:_>7.2f}max_{average_reward:_>7.2f}'
+            f'avg_{min_reward:_>7.2f}min__{datetime.now().strftime("%Y-%m-%d_%H-%M")}.model')

    # plot_model(agent.model, to_file='model.png')

    # Decay epsilon
-    if epsilon > MIN_EPSILON:
-        epsilon *= EPSILON_DECAY
-        epsilon = max(MIN_EPSILON, epsilon)
+    if EPSILON > MIN_EPSILON:
+        EPSILON *= EPSILON_DECAY
+        EPSILON = max(MIN_EPSILON, EPSILON)
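A quick sanity check on the decay schedule: multiplicative decay with a floor means EPSILON ≈ EPSILON_DECAY ** episode until the floor is reached, which with these constants takes about log(0.01) / log(0.99975) ≈ 18,400 episodes, so most of the 20,000-episode run keeps at least some exploration:

EPSILON = 1
EPSILON_DECAY = 0.99975
MIN_EPSILON = 0.01

episodes_to_floor = 0
while EPSILON > MIN_EPSILON:
    EPSILON *= EPSILON_DECAY
    EPSILON = max(MIN_EPSILON, EPSILON)
    episodes_to_floor += 1

print(episodes_to_floor)  # roughly 18,400 with these constants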