Waiter_group/main_training.py

490 lines
16 KiB
Python

from __future__ import print_function
import os, sys, time, datetime, json, random
import numpy as np
from keras.models import Sequential
from keras.layers.core import Dense, Activation
from keras.optimizers import SGD , Adam, RMSprop
from keras.layers.advanced_activations import PReLU
import matplotlib.pyplot as plt
import pickle
visited_mark = 0.8 # Cells visited by the rat will be painted by gray 0.8
rat_mark = 0.5 # The current rat cell will be painteg by gray 0.5
LEFT = 0
UP = 1
RIGHT = 2
DOWN = 3
# Actions dictionary
actions_dict = {
LEFT: 'left',
UP: 'up',
RIGHT: 'right',
DOWN: 'down',
}
num_actions = len(actions_dict)
# Exploration factor
epsilon = 0.1
file_name_num = 1
win_targets = [(4, 4),(4, 9),(4, 14),(9, 4)]
class Qmaze(object):
def __init__(self, maze, rat=(12,12)):
global win_targets
self._maze = np.array(maze)
nrows, ncols = self._maze.shape
#self.target = (nrows-1, ncols-1) # target cell where the "cheese" is
self.target = win_targets[0]
self.free_cells = [(r,c) for r in range(nrows) for c in range(ncols) if self._maze[r,c] == 1.0]
self.free_cells.remove(win_targets[-1])
if self._maze[self.target] == 0.0:
raise Exception("Invalid maze: target cell cannot be blocked!")
if not rat in self.free_cells:
raise Exception("Invalid Rat Location: must sit on a free cell")
self.reset(rat)
def reset(self, rat):
global win_targets
self.rat = rat
self.maze = np.copy(self._maze)
nrows, ncols = self.maze.shape
row, col = rat
self.maze[row, col] = rat_mark
self.state = (row, col, 'start')
self.min_reward = -0.5 * self.maze.size
self.total_reward = 0
self.visited = list()
self.curr_win_targets = win_targets[:]
def update_state(self, action):
nrows, ncols = self.maze.shape
nrow, ncol, nmode = rat_row, rat_col, mode = self.state
if self.maze[rat_row, rat_col] > 0.0:
self.visited.append((rat_row, rat_col)) # mark visited cell
valid_actions = self.valid_actions()
if not valid_actions:
nmode = 'blocked'
elif action in valid_actions:
nmode = 'valid'
if action == LEFT:
ncol -= 1
elif action == UP:
nrow -= 1
if action == RIGHT:
ncol += 1
elif action == DOWN:
nrow += 1
else: # invalid action, no change in rat position
mode = 'invalid'
# new state
self.state = (nrow, ncol, nmode)
def get_reward(self):
win_target_x, win_target_y = self.target
rat_row, rat_col, mode = self.state
nrows, ncols = self.maze.shape
if rat_row == win_target_x and rat_col == win_target_y:
return 1.0
if mode == 'blocked': # move to the block in the grid
return -1.0
if (rat_row, rat_col) in self.visited:
return -0.5 # default -0.25 -> -0.5
if mode == 'invalid':
return -0.75 # default -0.75 move to the boundary
if mode == 'valid': # default -0.04 -> -0.1
return -0.04
if (rat_row, rat_col) in self.curr_win_targets:
return 1.0
def act(self, action):
self.update_state(action)
reward = self.get_reward()
self.total_reward += reward
status = self.game_status()
envstate = self.observe()
return envstate, reward, status
def observe(self):
canvas = self.draw_env()
envstate = canvas.reshape((1, -1))
return envstate
def draw_env(self):
canvas = np.copy(self.maze)
nrows, ncols = self.maze.shape
# clear all visual marks
for r in range(nrows):
for c in range(ncols):
if canvas[r,c] > 0.0:
canvas[r,c] = 1.0
# draw the rat
row, col, valid = self.state
canvas[row, col] = rat_mark
return canvas
def game_status(self):
if self.total_reward < self.min_reward:
return 'lose'
rat_row, rat_col, mode = self.state
nrows, ncols = self.maze.shape
curPos = (rat_row, rat_col)
if curPos in self.curr_win_targets:
self.curr_win_targets.remove(curPos)
if len(self.curr_win_targets) == 0:
return 'win'
else:
self.target = self.curr_win_targets[0]
return 'not_over'
def valid_actions(self, cell=None):
if cell is None:
row, col, mode = self.state
else:
row, col = cell
actions = [0, 1, 2, 3]
nrows, ncols = self.maze.shape
if row == 0:
actions.remove(1)
elif row == nrows-1:
actions.remove(3)
if col == 0:
actions.remove(0)
elif col == ncols-1:
actions.remove(2)
if row>0 and self.maze[row-1,col] == 0.0:
actions.remove(1)
if row<nrows-1 and self.maze[row+1,col] == 0.0:
actions.remove(3)
if col>0 and self.maze[row,col-1] == 0.0:
actions.remove(0)
if col<ncols-1 and self.maze[row,col+1] == 0.0:
actions.remove(2)
return actions
def show(qmaze):
global win_target
win_target_row, win_target_col = win_target
plt.grid('on')
nrows, ncols = qmaze.maze.shape
ax = plt.gca()
ax.set_xticks(np.arange(0.5, nrows, 1))
ax.set_yticks(np.arange(0.5, ncols, 1))
ax.set_xticklabels([])
ax.set_yticklabels([])
canvas = np.copy(qmaze.maze)
for row,col in qmaze.visited:
canvas[row,col] = 0.6
rat_row, rat_col, _ = qmaze.state
canvas[rat_row, rat_col] = 0.3 # rat cell
canvas[win_target_row, win_target_col] = 0.9 # cheese cell
img = plt.imshow(canvas, interpolation='none', cmap='gray')
return img
def save_pic(qmaze):
global file_name_num
global win_target
win_target_row, win_target_col = win_target
plt.grid('on')
nrows, ncols = qmaze.maze.shape
ax = plt.gca()
ax.set_xticks(np.arange(0.5, nrows, 1))
ax.set_yticks(np.arange(0.5, ncols, 1))
ax.set_xticklabels([])
ax.set_yticklabels([])
canvas = np.copy(qmaze.maze)
for row,col in qmaze.visited:
canvas[row,col] = 0.6
rat_row, rat_col, _ = qmaze.state
canvas[rat_row, rat_col] = 0.3 # rat cell
canvas[win_target_row, win_target_col] = 0.9 # cheese cell
plt.imshow(canvas, interpolation='none', cmap='gray')
plt.savefig(str(file_name_num) + ".png")
file_name_num += 1
def output_route(qmaze):
global win_target
win_target_row, win_target_col = win_target
print(qmaze._maze)
def play_game(model, qmaze, rat_cell):
qmaze.reset(rat_cell)
envstate = qmaze.observe()
while True:
prev_envstate = envstate
# get next action
q = model.predict(prev_envstate)
action = np.argmax(q[0])
# apply action, get rewards and new state
envstate, reward, game_status = qmaze.act(action)
if game_status == 'win':
return True
elif game_status == 'lose':
return False
def completion_check(model, qmaze):
for cell in qmaze.free_cells:
if not qmaze.valid_actions(cell):
return False
if not play_game(model, qmaze, cell):
return False
return True
class Experience(object):
def __init__(self, model, max_memory=100, discount=0.9):
self.model = model
self.max_memory = max_memory
self.discount = discount
self.memory = list()
self.num_actions = model.output_shape[-1]
def remember(self, episode):
# episode = [envstate, action, reward, envstate_next, game_over]
# memory[i] = episode
# envstate == flattened 1d maze cells info, including rat cell (see method: observe)
self.memory.append(episode)
if len(self.memory) > self.max_memory:
del self.memory[0]
def predict(self, envstate):
return self.model.predict(envstate)[0]
def get_data(self, data_size=10):
env_size = self.memory[0][0].shape[1] # envstate 1d size (1st element of episode)
mem_size = len(self.memory)
data_size = min(mem_size, data_size)
inputs = np.zeros((data_size, env_size))
targets = np.zeros((data_size, self.num_actions))
for i, j in enumerate(np.random.choice(range(mem_size), data_size, replace=False)):
envstate, action, reward, envstate_next, game_over = self.memory[j]
inputs[i] = envstate
# There should be no target values for actions not taken.
targets[i] = self.predict(envstate)
# Q_sa = derived policy = max quality env/action = max_a' Q(s', a')
Q_sa = np.max(self.predict(envstate_next))
if game_over:
targets[i, action] = reward
else:
# reward + gamma * max_a' Q(s', a')
targets[i, action] = reward + self.discount * Q_sa
return inputs, targets
def qtrain(model, maze, **opt):
global epsilon
n_epoch = opt.get('n_epoch', 15000)
max_memory = opt.get('max_memory', 1000)
data_size = opt.get('data_size', 50)
weights_file = opt.get('weights_file', "")
name = opt.get('name', 'model')
start_time = datetime.datetime.now()
# If you want to continue training from a previous model,
# just supply the h5 file name to weights_file option
if weights_file:
print("loading weights from file: %s" % (weights_file,))
model.load_weights(weights_file)
# Construct environment/game from numpy array: maze (see above)
qmaze = Qmaze(maze)
# Initialize experience replay object
experience = Experience(model, max_memory=max_memory)
win_history = [] # history of win/lose game
n_free_cells = len(qmaze.free_cells)
hsize = qmaze.maze.size//2 # history window size
win_rate = 0.0
imctr = 1
pre_episodes = 2**31 - 1
for epoch in range(n_epoch):
loss = 0.0
#rat_cell = random.choice(qmaze.free_cells)
#rat_cell = (0, 0)
rat_cell = (12, 12)
qmaze.reset(rat_cell)
game_over = False
# get initial envstate (1d flattened canvas)
envstate = qmaze.observe()
n_episodes = 0
while not game_over:
valid_actions = qmaze.valid_actions()
if not valid_actions: break
prev_envstate = envstate
# Get next action
if np.random.rand() < epsilon:
action = random.choice(valid_actions)
else:
action = np.argmax(experience.predict(prev_envstate))
# Apply action, get reward and new envstate
envstate, reward, game_status = qmaze.act(action)
if game_status == 'win':
print("win")
win_history.append(1)
game_over = True
# save_pic(qmaze)
if n_episodes <= pre_episodes:
# output_route(qmaze)
print(qmaze.visited)
with open('res.data', 'wb') as filehandle:
pickle.dump(qmaze.visited, filehandle)
pre_episodes = n_episodes
elif game_status == 'lose':
print("lose")
win_history.append(0)
game_over = True
# save_pic(qmaze)
else:
game_over = False
# Store episode (experience)
episode = [prev_envstate, action, reward, envstate, game_over]
experience.remember(episode)
n_episodes += 1
# Train neural network model
inputs, targets = experience.get_data(data_size=data_size)
h = model.fit(
inputs,
targets,
epochs=8,
batch_size=16,
verbose=0,
)
loss = model.evaluate(inputs, targets, verbose=0)
if len(win_history) > hsize:
win_rate = sum(win_history[-hsize:]) / hsize
dt = datetime.datetime.now() - start_time
t = format_time(dt.total_seconds())
template = "Epoch: {:03d}/{:d} | Loss: {:.4f} | Episodes: {:d} | Win count: {:d} | Win rate: {:.3f} | time: {}"
print(template.format(epoch, n_epoch-1, loss, n_episodes, sum(win_history), win_rate, t))
# we simply check if training has exhausted all free cells and if in all
# cases the agent won
if win_rate > 0.9 : epsilon = 0.05
train_max = 192
# print(sum(win_history[-192*1.5:]))
# print(192)
if sum(win_history[-192:]) >= 192:
print("Reached 100%% win rate at epoch: %d" % (epoch,))
break
# Save trained model weights and architecture, this will be used by the visualization code
h5file = name + ".h5"
json_file = name + ".json"
model.save_weights(h5file, overwrite=True)
with open(json_file, "w") as outfile:
json.dump(model.to_json(), outfile)
end_time = datetime.datetime.now()
dt = datetime.datetime.now() - start_time
seconds = dt.total_seconds()
t = format_time(seconds)
print('files: %s, %s' % (h5file, json_file))
print("n_epoch: %d, max_mem: %d, data: %d, time: %s" % (epoch, max_memory, data_size, t))
return seconds
# This is a small utility for printing readable time strings:
def format_time(seconds):
if seconds < 400:
s = float(seconds)
return "%.1f seconds" % (s,)
elif seconds < 4000:
m = seconds / 60.0
return "%.2f minutes" % (m,)
else:
h = seconds / 3600.0
return "%.2f hours" % (h,)
def build_model(maze, lr=0.001):
model = Sequential()
model.add(Dense(maze.size, input_shape=(maze.size,)))
model.add(PReLU())
model.add(Dense(maze.size))
model.add(PReLU())
model.add(Dense(num_actions))
model.compile(optimizer='adam', loss='mse')
return model
class Table:
def __init__(self, coordinate_i, coordinate_j):
self.coordinate_i = coordinate_i
self.coordinate_j = coordinate_j
change_value(coordinate_i, coordinate_j, 2, 0.)
def get_destination_coor(self):
return [self.coordinate_i, self.coordinate_j-1]
class Kitchen:
def __init__(self, coordinate_i, coordinate_j):
self.coordinate_i = coordinate_i
self.coordinate_j = coordinate_j
change_value(coordinate_i, coordinate_j, 3, 0.)
if __name__== "__main__":
def change_value(i, j, width, n):
for r in range (i, i+width):
for c in range (j, j+width):
grid[r][c] = n
grid = [[1 for x in range(16)] for y in range(16)]
table1 = Table(2, 2)
table2 = Table (2,7)
table3 = Table(2, 12)
table4 = Table(7, 2)
table5 = Table(7, 7)
table6 = Table(7, 12)
table7 = Table(12, 2)
table8 = Table(12, 7)
kitchen = Kitchen(13, 13)
maze = np.array(grid)
# print(maze)
# maze = np.array([
# [ 1., 0., 1., 1., 1., 1., 1., 1.],
# [ 1., 1., 1., 0., 0., 1., 0., 1.],
# [ 1., 1., 1., 1., 1., 1., 0., 1.],
# [ 1., 1., 1., 1., 0., 0., 1., 1.],
# [ 1., 0., 0., 0., 1., 1., 1., 1.],
# [ 1., 0., 1., 1., 1., 1., 1., 1.],
# [ 1., 1., 1., 0., 1., 1., 1., 1.]
# ])
# print(maze)
# qmaze = Qmaze(maze)
# show(qmaze)
model = build_model(maze)
qtrain(model, maze, epochs=1000, max_memory=8*maze.size, data_size=32)