Individual Project #2; s442720
This commit is contained in:
parent
a73862b48b
commit
71dc3e81a2
489
main_training.py
Normal file
489
main_training.py
Normal file
@ -0,0 +1,489 @@
|
|||||||
|
from __future__ import print_function
|
||||||
|
import os, sys, time, datetime, json, random
|
||||||
|
import numpy as np
|
||||||
|
from keras.models import Sequential
|
||||||
|
from keras.layers.core import Dense, Activation
|
||||||
|
from keras.optimizers import SGD , Adam, RMSprop
|
||||||
|
from keras.layers.advanced_activations import PReLU
|
||||||
|
import matplotlib.pyplot as plt
|
||||||
|
import pickle
|
||||||
|
|
||||||
|
visited_mark = 0.8 # Cells visited by the rat will be painted by gray 0.8
|
||||||
|
rat_mark = 0.5 # The current rat cell will be painteg by gray 0.5
|
||||||
|
LEFT = 0
|
||||||
|
UP = 1
|
||||||
|
RIGHT = 2
|
||||||
|
DOWN = 3
|
||||||
|
|
||||||
|
# Actions dictionary
|
||||||
|
actions_dict = {
|
||||||
|
LEFT: 'left',
|
||||||
|
UP: 'up',
|
||||||
|
RIGHT: 'right',
|
||||||
|
DOWN: 'down',
|
||||||
|
}
|
||||||
|
|
||||||
|
num_actions = len(actions_dict)
|
||||||
|
|
||||||
|
# Exploration factor
|
||||||
|
epsilon = 0.1
|
||||||
|
file_name_num = 1
|
||||||
|
win_targets = [(4, 4),(4, 9),(4, 14),(9, 4)]
|
||||||
|
|
||||||
|
class Qmaze(object):
|
||||||
|
def __init__(self, maze, rat=(12,12)):
|
||||||
|
global win_targets
|
||||||
|
self._maze = np.array(maze)
|
||||||
|
nrows, ncols = self._maze.shape
|
||||||
|
#self.target = (nrows-1, ncols-1) # target cell where the "cheese" is
|
||||||
|
self.target = win_targets[0]
|
||||||
|
self.free_cells = [(r,c) for r in range(nrows) for c in range(ncols) if self._maze[r,c] == 1.0]
|
||||||
|
self.free_cells.remove(win_targets[-1])
|
||||||
|
if self._maze[self.target] == 0.0:
|
||||||
|
raise Exception("Invalid maze: target cell cannot be blocked!")
|
||||||
|
if not rat in self.free_cells:
|
||||||
|
raise Exception("Invalid Rat Location: must sit on a free cell")
|
||||||
|
self.reset(rat)
|
||||||
|
|
||||||
|
def reset(self, rat):
|
||||||
|
global win_targets
|
||||||
|
self.rat = rat
|
||||||
|
self.maze = np.copy(self._maze)
|
||||||
|
nrows, ncols = self.maze.shape
|
||||||
|
row, col = rat
|
||||||
|
self.maze[row, col] = rat_mark
|
||||||
|
self.state = (row, col, 'start')
|
||||||
|
self.min_reward = -0.5 * self.maze.size
|
||||||
|
self.total_reward = 0
|
||||||
|
self.visited = list()
|
||||||
|
self.curr_win_targets = win_targets[:]
|
||||||
|
|
||||||
|
def update_state(self, action):
|
||||||
|
nrows, ncols = self.maze.shape
|
||||||
|
nrow, ncol, nmode = rat_row, rat_col, mode = self.state
|
||||||
|
|
||||||
|
if self.maze[rat_row, rat_col] > 0.0:
|
||||||
|
self.visited.append((rat_row, rat_col)) # mark visited cell
|
||||||
|
|
||||||
|
valid_actions = self.valid_actions()
|
||||||
|
|
||||||
|
if not valid_actions:
|
||||||
|
nmode = 'blocked'
|
||||||
|
elif action in valid_actions:
|
||||||
|
nmode = 'valid'
|
||||||
|
if action == LEFT:
|
||||||
|
ncol -= 1
|
||||||
|
elif action == UP:
|
||||||
|
nrow -= 1
|
||||||
|
if action == RIGHT:
|
||||||
|
ncol += 1
|
||||||
|
elif action == DOWN:
|
||||||
|
nrow += 1
|
||||||
|
else: # invalid action, no change in rat position
|
||||||
|
mode = 'invalid'
|
||||||
|
|
||||||
|
# new state
|
||||||
|
self.state = (nrow, ncol, nmode)
|
||||||
|
|
||||||
|
def get_reward(self):
|
||||||
|
win_target_x, win_target_y = self.target
|
||||||
|
rat_row, rat_col, mode = self.state
|
||||||
|
nrows, ncols = self.maze.shape
|
||||||
|
if rat_row == win_target_x and rat_col == win_target_y:
|
||||||
|
return 1.0
|
||||||
|
if mode == 'blocked': # move to the block in the grid
|
||||||
|
return -1.0
|
||||||
|
if (rat_row, rat_col) in self.visited:
|
||||||
|
return -0.5 # default -0.25 -> -0.5
|
||||||
|
if mode == 'invalid':
|
||||||
|
return -0.75 # default -0.75 move to the boundary
|
||||||
|
if mode == 'valid': # default -0.04 -> -0.1
|
||||||
|
return -0.04
|
||||||
|
if (rat_row, rat_col) in self.curr_win_targets:
|
||||||
|
return 1.0
|
||||||
|
|
||||||
|
def act(self, action):
|
||||||
|
self.update_state(action)
|
||||||
|
reward = self.get_reward()
|
||||||
|
self.total_reward += reward
|
||||||
|
status = self.game_status()
|
||||||
|
envstate = self.observe()
|
||||||
|
return envstate, reward, status
|
||||||
|
|
||||||
|
def observe(self):
|
||||||
|
canvas = self.draw_env()
|
||||||
|
envstate = canvas.reshape((1, -1))
|
||||||
|
return envstate
|
||||||
|
|
||||||
|
def draw_env(self):
|
||||||
|
canvas = np.copy(self.maze)
|
||||||
|
nrows, ncols = self.maze.shape
|
||||||
|
# clear all visual marks
|
||||||
|
for r in range(nrows):
|
||||||
|
for c in range(ncols):
|
||||||
|
if canvas[r,c] > 0.0:
|
||||||
|
canvas[r,c] = 1.0
|
||||||
|
# draw the rat
|
||||||
|
row, col, valid = self.state
|
||||||
|
canvas[row, col] = rat_mark
|
||||||
|
return canvas
|
||||||
|
|
||||||
|
def game_status(self):
|
||||||
|
if self.total_reward < self.min_reward:
|
||||||
|
return 'lose'
|
||||||
|
rat_row, rat_col, mode = self.state
|
||||||
|
nrows, ncols = self.maze.shape
|
||||||
|
|
||||||
|
curPos = (rat_row, rat_col)
|
||||||
|
|
||||||
|
if curPos in self.curr_win_targets:
|
||||||
|
self.curr_win_targets.remove(curPos)
|
||||||
|
if len(self.curr_win_targets) == 0:
|
||||||
|
return 'win'
|
||||||
|
else:
|
||||||
|
self.target = self.curr_win_targets[0]
|
||||||
|
|
||||||
|
return 'not_over'
|
||||||
|
|
||||||
|
def valid_actions(self, cell=None):
|
||||||
|
if cell is None:
|
||||||
|
row, col, mode = self.state
|
||||||
|
else:
|
||||||
|
row, col = cell
|
||||||
|
actions = [0, 1, 2, 3]
|
||||||
|
nrows, ncols = self.maze.shape
|
||||||
|
if row == 0:
|
||||||
|
actions.remove(1)
|
||||||
|
elif row == nrows-1:
|
||||||
|
actions.remove(3)
|
||||||
|
|
||||||
|
if col == 0:
|
||||||
|
actions.remove(0)
|
||||||
|
elif col == ncols-1:
|
||||||
|
actions.remove(2)
|
||||||
|
|
||||||
|
if row>0 and self.maze[row-1,col] == 0.0:
|
||||||
|
actions.remove(1)
|
||||||
|
if row<nrows-1 and self.maze[row+1,col] == 0.0:
|
||||||
|
actions.remove(3)
|
||||||
|
|
||||||
|
if col>0 and self.maze[row,col-1] == 0.0:
|
||||||
|
actions.remove(0)
|
||||||
|
if col<ncols-1 and self.maze[row,col+1] == 0.0:
|
||||||
|
actions.remove(2)
|
||||||
|
|
||||||
|
return actions
|
||||||
|
|
||||||
|
def show(qmaze):
|
||||||
|
global win_target
|
||||||
|
win_target_row, win_target_col = win_target
|
||||||
|
plt.grid('on')
|
||||||
|
nrows, ncols = qmaze.maze.shape
|
||||||
|
ax = plt.gca()
|
||||||
|
ax.set_xticks(np.arange(0.5, nrows, 1))
|
||||||
|
ax.set_yticks(np.arange(0.5, ncols, 1))
|
||||||
|
ax.set_xticklabels([])
|
||||||
|
ax.set_yticklabels([])
|
||||||
|
canvas = np.copy(qmaze.maze)
|
||||||
|
for row,col in qmaze.visited:
|
||||||
|
canvas[row,col] = 0.6
|
||||||
|
rat_row, rat_col, _ = qmaze.state
|
||||||
|
canvas[rat_row, rat_col] = 0.3 # rat cell
|
||||||
|
canvas[win_target_row, win_target_col] = 0.9 # cheese cell
|
||||||
|
img = plt.imshow(canvas, interpolation='none', cmap='gray')
|
||||||
|
return img
|
||||||
|
|
||||||
|
|
||||||
|
def save_pic(qmaze):
|
||||||
|
global file_name_num
|
||||||
|
global win_target
|
||||||
|
win_target_row, win_target_col = win_target
|
||||||
|
plt.grid('on')
|
||||||
|
nrows, ncols = qmaze.maze.shape
|
||||||
|
ax = plt.gca()
|
||||||
|
ax.set_xticks(np.arange(0.5, nrows, 1))
|
||||||
|
ax.set_yticks(np.arange(0.5, ncols, 1))
|
||||||
|
ax.set_xticklabels([])
|
||||||
|
ax.set_yticklabels([])
|
||||||
|
canvas = np.copy(qmaze.maze)
|
||||||
|
for row,col in qmaze.visited:
|
||||||
|
canvas[row,col] = 0.6
|
||||||
|
rat_row, rat_col, _ = qmaze.state
|
||||||
|
canvas[rat_row, rat_col] = 0.3 # rat cell
|
||||||
|
canvas[win_target_row, win_target_col] = 0.9 # cheese cell
|
||||||
|
plt.imshow(canvas, interpolation='none', cmap='gray')
|
||||||
|
plt.savefig(str(file_name_num) + ".png")
|
||||||
|
file_name_num += 1
|
||||||
|
|
||||||
|
def output_route(qmaze):
|
||||||
|
global win_target
|
||||||
|
win_target_row, win_target_col = win_target
|
||||||
|
print(qmaze._maze)
|
||||||
|
|
||||||
|
def play_game(model, qmaze, rat_cell):
|
||||||
|
qmaze.reset(rat_cell)
|
||||||
|
envstate = qmaze.observe()
|
||||||
|
while True:
|
||||||
|
prev_envstate = envstate
|
||||||
|
# get next action
|
||||||
|
q = model.predict(prev_envstate)
|
||||||
|
action = np.argmax(q[0])
|
||||||
|
|
||||||
|
# apply action, get rewards and new state
|
||||||
|
envstate, reward, game_status = qmaze.act(action)
|
||||||
|
if game_status == 'win':
|
||||||
|
return True
|
||||||
|
elif game_status == 'lose':
|
||||||
|
return False
|
||||||
|
|
||||||
|
|
||||||
|
def completion_check(model, qmaze):
|
||||||
|
for cell in qmaze.free_cells:
|
||||||
|
if not qmaze.valid_actions(cell):
|
||||||
|
return False
|
||||||
|
if not play_game(model, qmaze, cell):
|
||||||
|
return False
|
||||||
|
return True
|
||||||
|
|
||||||
|
|
||||||
|
class Experience(object):
|
||||||
|
def __init__(self, model, max_memory=100, discount=0.9):
|
||||||
|
self.model = model
|
||||||
|
self.max_memory = max_memory
|
||||||
|
self.discount = discount
|
||||||
|
self.memory = list()
|
||||||
|
self.num_actions = model.output_shape[-1]
|
||||||
|
|
||||||
|
def remember(self, episode):
|
||||||
|
# episode = [envstate, action, reward, envstate_next, game_over]
|
||||||
|
# memory[i] = episode
|
||||||
|
# envstate == flattened 1d maze cells info, including rat cell (see method: observe)
|
||||||
|
self.memory.append(episode)
|
||||||
|
if len(self.memory) > self.max_memory:
|
||||||
|
del self.memory[0]
|
||||||
|
|
||||||
|
def predict(self, envstate):
|
||||||
|
return self.model.predict(envstate)[0]
|
||||||
|
|
||||||
|
def get_data(self, data_size=10):
|
||||||
|
env_size = self.memory[0][0].shape[1] # envstate 1d size (1st element of episode)
|
||||||
|
mem_size = len(self.memory)
|
||||||
|
data_size = min(mem_size, data_size)
|
||||||
|
inputs = np.zeros((data_size, env_size))
|
||||||
|
targets = np.zeros((data_size, self.num_actions))
|
||||||
|
for i, j in enumerate(np.random.choice(range(mem_size), data_size, replace=False)):
|
||||||
|
envstate, action, reward, envstate_next, game_over = self.memory[j]
|
||||||
|
inputs[i] = envstate
|
||||||
|
# There should be no target values for actions not taken.
|
||||||
|
targets[i] = self.predict(envstate)
|
||||||
|
# Q_sa = derived policy = max quality env/action = max_a' Q(s', a')
|
||||||
|
Q_sa = np.max(self.predict(envstate_next))
|
||||||
|
if game_over:
|
||||||
|
targets[i, action] = reward
|
||||||
|
else:
|
||||||
|
# reward + gamma * max_a' Q(s', a')
|
||||||
|
targets[i, action] = reward + self.discount * Q_sa
|
||||||
|
return inputs, targets
|
||||||
|
|
||||||
|
def qtrain(model, maze, **opt):
|
||||||
|
global epsilon
|
||||||
|
n_epoch = opt.get('n_epoch', 15000)
|
||||||
|
max_memory = opt.get('max_memory', 1000)
|
||||||
|
data_size = opt.get('data_size', 50)
|
||||||
|
weights_file = opt.get('weights_file', "")
|
||||||
|
name = opt.get('name', 'model')
|
||||||
|
start_time = datetime.datetime.now()
|
||||||
|
|
||||||
|
# If you want to continue training from a previous model,
|
||||||
|
# just supply the h5 file name to weights_file option
|
||||||
|
if weights_file:
|
||||||
|
print("loading weights from file: %s" % (weights_file,))
|
||||||
|
model.load_weights(weights_file)
|
||||||
|
|
||||||
|
# Construct environment/game from numpy array: maze (see above)
|
||||||
|
qmaze = Qmaze(maze)
|
||||||
|
|
||||||
|
# Initialize experience replay object
|
||||||
|
experience = Experience(model, max_memory=max_memory)
|
||||||
|
|
||||||
|
win_history = [] # history of win/lose game
|
||||||
|
n_free_cells = len(qmaze.free_cells)
|
||||||
|
hsize = qmaze.maze.size//2 # history window size
|
||||||
|
win_rate = 0.0
|
||||||
|
imctr = 1
|
||||||
|
pre_episodes = 2**31 - 1
|
||||||
|
|
||||||
|
for epoch in range(n_epoch):
|
||||||
|
loss = 0.0
|
||||||
|
#rat_cell = random.choice(qmaze.free_cells)
|
||||||
|
#rat_cell = (0, 0)
|
||||||
|
rat_cell = (12, 12)
|
||||||
|
|
||||||
|
qmaze.reset(rat_cell)
|
||||||
|
game_over = False
|
||||||
|
|
||||||
|
# get initial envstate (1d flattened canvas)
|
||||||
|
envstate = qmaze.observe()
|
||||||
|
|
||||||
|
n_episodes = 0
|
||||||
|
while not game_over:
|
||||||
|
valid_actions = qmaze.valid_actions()
|
||||||
|
if not valid_actions: break
|
||||||
|
prev_envstate = envstate
|
||||||
|
# Get next action
|
||||||
|
if np.random.rand() < epsilon:
|
||||||
|
action = random.choice(valid_actions)
|
||||||
|
else:
|
||||||
|
action = np.argmax(experience.predict(prev_envstate))
|
||||||
|
|
||||||
|
# Apply action, get reward and new envstate
|
||||||
|
envstate, reward, game_status = qmaze.act(action)
|
||||||
|
if game_status == 'win':
|
||||||
|
print("win")
|
||||||
|
win_history.append(1)
|
||||||
|
game_over = True
|
||||||
|
# save_pic(qmaze)
|
||||||
|
if n_episodes <= pre_episodes:
|
||||||
|
# output_route(qmaze)
|
||||||
|
print(qmaze.visited)
|
||||||
|
with open('res.data', 'wb') as filehandle:
|
||||||
|
pickle.dump(qmaze.visited, filehandle)
|
||||||
|
pre_episodes = n_episodes
|
||||||
|
|
||||||
|
elif game_status == 'lose':
|
||||||
|
print("lose")
|
||||||
|
win_history.append(0)
|
||||||
|
game_over = True
|
||||||
|
# save_pic(qmaze)
|
||||||
|
else:
|
||||||
|
game_over = False
|
||||||
|
|
||||||
|
# Store episode (experience)
|
||||||
|
episode = [prev_envstate, action, reward, envstate, game_over]
|
||||||
|
experience.remember(episode)
|
||||||
|
n_episodes += 1
|
||||||
|
|
||||||
|
# Train neural network model
|
||||||
|
inputs, targets = experience.get_data(data_size=data_size)
|
||||||
|
h = model.fit(
|
||||||
|
inputs,
|
||||||
|
targets,
|
||||||
|
epochs=8,
|
||||||
|
batch_size=16,
|
||||||
|
verbose=0,
|
||||||
|
)
|
||||||
|
loss = model.evaluate(inputs, targets, verbose=0)
|
||||||
|
|
||||||
|
|
||||||
|
if len(win_history) > hsize:
|
||||||
|
win_rate = sum(win_history[-hsize:]) / hsize
|
||||||
|
|
||||||
|
dt = datetime.datetime.now() - start_time
|
||||||
|
t = format_time(dt.total_seconds())
|
||||||
|
|
||||||
|
template = "Epoch: {:03d}/{:d} | Loss: {:.4f} | Episodes: {:d} | Win count: {:d} | Win rate: {:.3f} | time: {}"
|
||||||
|
print(template.format(epoch, n_epoch-1, loss, n_episodes, sum(win_history), win_rate, t))
|
||||||
|
# we simply check if training has exhausted all free cells and if in all
|
||||||
|
# cases the agent won
|
||||||
|
if win_rate > 0.9 : epsilon = 0.05
|
||||||
|
train_max = 192
|
||||||
|
# print(sum(win_history[-192*1.5:]))
|
||||||
|
# print(192)
|
||||||
|
if sum(win_history[-192:]) >= 192:
|
||||||
|
print("Reached 100%% win rate at epoch: %d" % (epoch,))
|
||||||
|
break
|
||||||
|
|
||||||
|
# Save trained model weights and architecture, this will be used by the visualization code
|
||||||
|
h5file = name + ".h5"
|
||||||
|
json_file = name + ".json"
|
||||||
|
model.save_weights(h5file, overwrite=True)
|
||||||
|
with open(json_file, "w") as outfile:
|
||||||
|
json.dump(model.to_json(), outfile)
|
||||||
|
end_time = datetime.datetime.now()
|
||||||
|
dt = datetime.datetime.now() - start_time
|
||||||
|
seconds = dt.total_seconds()
|
||||||
|
t = format_time(seconds)
|
||||||
|
print('files: %s, %s' % (h5file, json_file))
|
||||||
|
print("n_epoch: %d, max_mem: %d, data: %d, time: %s" % (epoch, max_memory, data_size, t))
|
||||||
|
return seconds
|
||||||
|
|
||||||
|
# This is a small utility for printing readable time strings:
|
||||||
|
def format_time(seconds):
|
||||||
|
if seconds < 400:
|
||||||
|
s = float(seconds)
|
||||||
|
return "%.1f seconds" % (s,)
|
||||||
|
elif seconds < 4000:
|
||||||
|
m = seconds / 60.0
|
||||||
|
return "%.2f minutes" % (m,)
|
||||||
|
else:
|
||||||
|
h = seconds / 3600.0
|
||||||
|
return "%.2f hours" % (h,)
|
||||||
|
|
||||||
|
def build_model(maze, lr=0.001):
|
||||||
|
model = Sequential()
|
||||||
|
model.add(Dense(maze.size, input_shape=(maze.size,)))
|
||||||
|
model.add(PReLU())
|
||||||
|
model.add(Dense(maze.size))
|
||||||
|
model.add(PReLU())
|
||||||
|
model.add(Dense(num_actions))
|
||||||
|
model.compile(optimizer='adam', loss='mse')
|
||||||
|
return model
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
class Table:
|
||||||
|
def __init__(self, coordinate_i, coordinate_j):
|
||||||
|
self.coordinate_i = coordinate_i
|
||||||
|
self.coordinate_j = coordinate_j
|
||||||
|
change_value(coordinate_i, coordinate_j, 2, 0.)
|
||||||
|
def get_destination_coor(self):
|
||||||
|
return [self.coordinate_i, self.coordinate_j-1]
|
||||||
|
|
||||||
|
class Kitchen:
|
||||||
|
def __init__(self, coordinate_i, coordinate_j):
|
||||||
|
self.coordinate_i = coordinate_i
|
||||||
|
self.coordinate_j = coordinate_j
|
||||||
|
change_value(coordinate_i, coordinate_j, 3, 0.)
|
||||||
|
|
||||||
|
if __name__== "__main__":
|
||||||
|
|
||||||
|
def change_value(i, j, width, n):
|
||||||
|
for r in range (i, i+width):
|
||||||
|
for c in range (j, j+width):
|
||||||
|
grid[r][c] = n
|
||||||
|
|
||||||
|
grid = [[1 for x in range(16)] for y in range(16)]
|
||||||
|
table1 = Table(2, 2)
|
||||||
|
table2 = Table (2,7)
|
||||||
|
table3 = Table(2, 12)
|
||||||
|
table4 = Table(7, 2)
|
||||||
|
table5 = Table(7, 7)
|
||||||
|
table6 = Table(7, 12)
|
||||||
|
table7 = Table(12, 2)
|
||||||
|
table8 = Table(12, 7)
|
||||||
|
|
||||||
|
|
||||||
|
kitchen = Kitchen(13, 13)
|
||||||
|
maze = np.array(grid)
|
||||||
|
|
||||||
|
# print(maze)
|
||||||
|
# maze = np.array([
|
||||||
|
# [ 1., 0., 1., 1., 1., 1., 1., 1.],
|
||||||
|
# [ 1., 1., 1., 0., 0., 1., 0., 1.],
|
||||||
|
# [ 1., 1., 1., 1., 1., 1., 0., 1.],
|
||||||
|
# [ 1., 1., 1., 1., 0., 0., 1., 1.],
|
||||||
|
# [ 1., 0., 0., 0., 1., 1., 1., 1.],
|
||||||
|
# [ 1., 0., 1., 1., 1., 1., 1., 1.],
|
||||||
|
# [ 1., 1., 1., 0., 1., 1., 1., 1.]
|
||||||
|
# ])
|
||||||
|
# print(maze)
|
||||||
|
|
||||||
|
|
||||||
|
# qmaze = Qmaze(maze)
|
||||||
|
# show(qmaze)
|
||||||
|
|
||||||
|
model = build_model(maze)
|
||||||
|
qtrain(model, maze, epochs=1000, max_memory=8*maze.size, data_size=32)
|
||||||
|
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue
Block a user