diff --git a/main_training.py b/main_training.py
new file mode 100644
index 0000000..de6e84d
--- /dev/null
+++ b/main_training.py
@@ -0,0 +1,489 @@
+from __future__ import print_function
+import os, sys, time, datetime, json, random
+import numpy as np
+from keras.models import Sequential
+from keras.layers.core import Dense, Activation
+from keras.optimizers import SGD, Adam, RMSprop
+from keras.layers.advanced_activations import PReLU
+import matplotlib.pyplot as plt
+import pickle
+
+visited_mark = 0.8  # Cells visited by the rat will be painted gray 0.8
+rat_mark = 0.5      # The current rat cell will be painted gray 0.5
+LEFT = 0
+UP = 1
+RIGHT = 2
+DOWN = 3
+
+# Actions dictionary
+actions_dict = {
+    LEFT: 'left',
+    UP: 'up',
+    RIGHT: 'right',
+    DOWN: 'down',
+}
+
+num_actions = len(actions_dict)
+
+# Exploration factor
+epsilon = 0.1
+file_name_num = 1
+win_targets = [(4, 4), (4, 9), (4, 14), (9, 4)]
+
+
+class Qmaze(object):
+    def __init__(self, maze, rat=(12, 12)):
+        global win_targets
+        self._maze = np.array(maze)
+        nrows, ncols = self._maze.shape
+        # self.target = (nrows-1, ncols-1)   # target cell where the "cheese" is
+        self.target = win_targets[0]
+        self.free_cells = [(r, c) for r in range(nrows) for c in range(ncols) if self._maze[r, c] == 1.0]
+        self.free_cells.remove(win_targets[-1])
+        if self._maze[self.target] == 0.0:
+            raise Exception("Invalid maze: target cell cannot be blocked!")
+        if rat not in self.free_cells:
+            raise Exception("Invalid Rat Location: must sit on a free cell")
+        self.reset(rat)
+
+    def reset(self, rat):
+        global win_targets
+        self.rat = rat
+        self.maze = np.copy(self._maze)
+        nrows, ncols = self.maze.shape
+        row, col = rat
+        self.maze[row, col] = rat_mark
+        self.state = (row, col, 'start')
+        self.min_reward = -0.5 * self.maze.size
+        self.total_reward = 0
+        self.visited = list()
+        self.curr_win_targets = win_targets[:]
+
+    def update_state(self, action):
+        nrows, ncols = self.maze.shape
+        nrow, ncol, nmode = rat_row, rat_col, mode = self.state
+
+        if self.maze[rat_row, rat_col] > 0.0:
+            self.visited.append((rat_row, rat_col))  # mark visited cell
+
+        valid_actions = self.valid_actions()
+
+        if not valid_actions:
+            nmode = 'blocked'
+        elif action in valid_actions:
+            nmode = 'valid'
+            if action == LEFT:
+                ncol -= 1
+            elif action == UP:
+                nrow -= 1
+            elif action == RIGHT:
+                ncol += 1
+            elif action == DOWN:
+                nrow += 1
+        else:  # invalid action, no change in rat position
+            nmode = 'invalid'
+
+        # new state
+        self.state = (nrow, ncol, nmode)
+
+    def get_reward(self):
+        win_target_x, win_target_y = self.target
+        rat_row, rat_col, mode = self.state
+        nrows, ncols = self.maze.shape
+        if rat_row == win_target_x and rat_col == win_target_y:
+            return 1.0
+        if mode == 'blocked':  # moved into a blocked cell
+            return -1.0
+        if (rat_row, rat_col) in self.visited:
+            return -0.5  # default -0.25 -> -0.5
+        if mode == 'invalid':
+            return -0.75  # default -0.75, moved into the boundary
+        if (rat_row, rat_col) in self.curr_win_targets:
+            return 1.0  # reached one of the remaining win targets
+        if mode == 'valid':  # default -0.04 -> -0.1
+            return -0.04
+
+    def act(self, action):
+        self.update_state(action)
+        reward = self.get_reward()
+        self.total_reward += reward
+        status = self.game_status()
+        envstate = self.observe()
+        return envstate, reward, status
+
+    def observe(self):
+        canvas = self.draw_env()
+        envstate = canvas.reshape((1, -1))
+        return envstate
+
+    def draw_env(self):
+        canvas = np.copy(self.maze)
+        nrows, ncols = self.maze.shape
+        # clear all visual marks
+        for r in range(nrows):
+            for c in range(ncols):
+                if canvas[r, c] > 0.0:
+                    canvas[r, c] = 1.0
+        # draw the rat
+        row, col, valid = self.state
+        canvas[row, col] = rat_mark
+        return canvas
+
+    def game_status(self):
+        if self.total_reward < self.min_reward:
+            return 'lose'
+        rat_row, rat_col, mode = self.state
+        nrows, ncols = self.maze.shape
+
+        curPos = (rat_row, rat_col)
+
+        if curPos in self.curr_win_targets:
+            self.curr_win_targets.remove(curPos)
+            if len(self.curr_win_targets) == 0:
+                return 'win'
+            else:
+                self.target = self.curr_win_targets[0]
+
+        return 'not_over'
+
+    def valid_actions(self, cell=None):
+        if cell is None:
+            row, col, mode = self.state
+        else:
+            row, col = cell
+        actions = [0, 1, 2, 3]
+        nrows, ncols = self.maze.shape
+        if row == 0:
+            actions.remove(1)
+        elif row == nrows-1:
+            actions.remove(3)
+
+        if col == 0:
+            actions.remove(0)
+        elif col == ncols-1:
+            actions.remove(2)
+
+        if row > 0 and self.maze[row-1, col] == 0.0:
+            actions.remove(1)
+        if row < nrows-1 and self.maze[row+1, col] == 0.0:
+            actions.remove(3)
+
+        if col > 0 and self.maze[row, col-1] == 0.0:
+            actions.remove(0)
+        if col < ncols-1 and self.maze[row, col+1] == 0.0:
+            actions.remove(2)
+
+        return actions
+
+
+class Experience(object):
+    def __init__(self, model, max_memory=100, discount=0.95):
+        self.model = model
+        self.max_memory = max_memory
+        self.discount = discount  # discount factor for future rewards
+        self.memory = list()
+        self.num_actions = model.output_shape[-1]
+
+    def remember(self, episode):
+        # episode = [envstate, action, reward, envstate_next, game_over]
+        self.memory.append(episode)
+        if len(self.memory) > self.max_memory:
+            del self.memory[0]
+
+    def predict(self, envstate):
+        return self.model.predict(envstate)[0]
+
+    def get_data(self, data_size=10):
+        env_size = self.memory[0][0].shape[1]  # envstate 1d size (1st element of episode)
+        mem_size = len(self.memory)
+        data_size = min(mem_size, data_size)
+        inputs = np.zeros((data_size, env_size))
+        targets = np.zeros((data_size, self.num_actions))
+        for i, j in enumerate(np.random.choice(range(mem_size), data_size, replace=False)):
+            envstate, action, reward, envstate_next, game_over = self.memory[j]
+            inputs[i] = envstate
+            # There should be no target values for actions not taken.
+            targets[i] = self.predict(envstate)
+            # Q_sa = derived policy = max quality env/action = max_a' Q(s', a')
+            Q_sa = np.max(self.predict(envstate_next))
+            if game_over:
+                targets[i, action] = reward
+            else:
+                # reward + gamma * max_a' Q(s', a')
+                targets[i, action] = reward + self.discount * Q_sa
+        return inputs, targets
+
+
+def qtrain(model, maze, **opt):
+    global epsilon
+    n_epoch = opt.get('n_epoch', 15000)
+    max_memory = opt.get('max_memory', 1000)
+    data_size = opt.get('data_size', 50)
+    weights_file = opt.get('weights_file', "")
+    name = opt.get('name', 'model')
+    start_time = datetime.datetime.now()
+
+    # If you want to continue training from a previous model,
+    # just supply the h5 file name to the weights_file option
+    if weights_file:
+        print("loading weights from file: %s" % (weights_file,))
+        model.load_weights(weights_file)
+
+    # Construct environment/game from numpy array: maze (see above)
+    qmaze = Qmaze(maze)
+
+    # Initialize experience replay object
+    experience = Experience(model, max_memory=max_memory)
+
+    win_history = []  # history of win/lose games
+    n_free_cells = len(qmaze.free_cells)
+    hsize = qmaze.maze.size//2  # history window size
+    win_rate = 0.0
+    imctr = 1
+    pre_episodes = 2**31 - 1
+
+    for epoch in range(n_epoch):
+        loss = 0.0
+        # rat_cell = random.choice(qmaze.free_cells)
+        # rat_cell = (0, 0)
+        rat_cell = (12, 12)
+
+        qmaze.reset(rat_cell)
+        game_over = False
+
+        # get initial envstate (1d flattened canvas)
+        envstate = qmaze.observe()
+
+        n_episodes = 0
+        while not game_over:
+            valid_actions = qmaze.valid_actions()
+            if not valid_actions:
+                break
+            prev_envstate = envstate
+            # Get next action
+            if np.random.rand() < epsilon:
+                action = random.choice(valid_actions)
+            else:
+                action = np.argmax(experience.predict(prev_envstate))
+
+            # Apply action, get reward and new envstate
+            envstate, reward, game_status = qmaze.act(action)
+            if game_status == 'win':
+                print("win")
+                win_history.append(1)
+                game_over = True
+                # save_pic(qmaze)
+                if n_episodes <= pre_episodes:
+                    # output_route(qmaze)
+                    print(qmaze.visited)
+                    with open('res.data', 'wb') as filehandle:
+                        pickle.dump(qmaze.visited, filehandle)
+                    pre_episodes = n_episodes
+
+            elif game_status == 'lose':
+                print("lose")
+                win_history.append(0)
+                game_over = True
+                # save_pic(qmaze)
+            else:
+                game_over = False
+
+            # Store episode (experience)
+            episode = [prev_envstate, action, reward, envstate, game_over]
+            experience.remember(episode)
+            n_episodes += 1
+
+            # Train neural network model
+            inputs, targets = experience.get_data(data_size=data_size)
+            h = model.fit(
+                inputs,
+                targets,
+                epochs=8,
+                batch_size=16,
+                verbose=0,
+            )
+            loss = model.evaluate(inputs, targets, verbose=0)
+
+        if len(win_history) > hsize:
+            win_rate = sum(win_history[-hsize:]) / hsize
+
+        dt = datetime.datetime.now() - start_time
+        t = format_time(dt.total_seconds())
+
+        template = "Epoch: {:03d}/{:d} | Loss: {:.4f} | Episodes: {:d} | Win count: {:d} | Win rate: {:.3f} | time: {}"
+        print(template.format(epoch, n_epoch-1, loss, n_episodes, sum(win_history), win_rate, t))
+        # once the recent win rate is high enough, reduce exploration;
+        # stop when the last 192 games were all wins
+        if win_rate > 0.9:
+            epsilon = 0.05
+        train_max = 192
+        # print(sum(win_history[-192*1.5:]))
+        # print(192)
+        if sum(win_history[-192:]) >= 192:
+            print("Reached 100%% win rate at epoch: %d" % (epoch,))
+            break
+
+    # Save trained model weights and architecture, this will be used by the visualization code
+    h5file = name + ".h5"
+    json_file = name + ".json"
+    model.save_weights(h5file, overwrite=True)
+    with open(json_file, "w") as outfile:
+        json.dump(model.to_json(), outfile)
+    end_time = datetime.datetime.now()
+    dt = end_time - start_time
+    seconds = dt.total_seconds()
+    t = format_time(seconds)
+    print('files: %s, %s' % (h5file, json_file))
+    print("n_epoch: %d, max_mem: %d, data: %d, time: %s" % (epoch, max_memory, data_size, t))
+    return seconds
+
+
+# This is a small utility for printing readable time strings:
+def format_time(seconds):
+    if seconds < 400:
+        s = float(seconds)
+        return "%.1f seconds" % (s,)
+    elif seconds < 4000:
+        m = seconds / 60.0
+        return "%.2f minutes" % (m,)
+    else:
+        h = seconds / 3600.0
+        return "%.2f hours" % (h,)
+
+
+def build_model(maze, lr=0.001):
+    model = Sequential()
+    model.add(Dense(maze.size, input_shape=(maze.size,)))
+    model.add(PReLU())
+    model.add(Dense(maze.size))
+    model.add(PReLU())
+    model.add(Dense(num_actions))
+    model.compile(optimizer='adam', loss='mse')
+    return model
+
+
+class Table:
+    def __init__(self, coordinate_i, coordinate_j):
+        self.coordinate_i = coordinate_i
+        self.coordinate_j = coordinate_j
+        change_value(coordinate_i, coordinate_j, 2, 0.)
+
+    def get_destination_coor(self):
+        return [self.coordinate_i, self.coordinate_j-1]
+
+
+class Kitchen:
+    def __init__(self, coordinate_i, coordinate_j):
+        self.coordinate_i = coordinate_i
+        self.coordinate_j = coordinate_j
+        change_value(coordinate_i, coordinate_j, 3, 0.)
+
+
+if __name__ == "__main__":
+
+    def change_value(i, j, width, n):
+        for r in range(i, i+width):
+            for c in range(j, j+width):
+                grid[r][c] = n
+
+    grid = [[1 for x in range(16)] for y in range(16)]
+    table1 = Table(2, 2)
+    table2 = Table(2, 7)
+    table3 = Table(2, 12)
+    table4 = Table(7, 2)
+    table5 = Table(7, 7)
+    table6 = Table(7, 12)
+    table7 = Table(12, 2)
+    table8 = Table(12, 7)
+
+    kitchen = Kitchen(13, 13)
+    maze = np.array(grid)
+
+    # print(maze)
+    # maze = np.array([
+    #     [ 1.,  0.,  1.,  1.,  1.,  1.,  1.,  1.],
+    #     [ 1.,  1.,  1.,  0.,  0.,  1.,  0.,  1.],
+    #     [ 1.,  1.,  1.,  1.,  1.,  1.,  0.,  1.],
+    #     [ 1.,  1.,  1.,  1.,  0.,  0.,  1.,  1.],
+    #     [ 1.,  0.,  0.,  0.,  1.,  1.,  1.,  1.],
+    #     [ 1.,  0.,  1.,  1.,  1.,  1.,  1.,  1.],
+    #     [ 1.,  1.,  1.,  0.,  1.,  1.,  1.,  1.]
+    # ])
+    # print(maze)
+
+    # qmaze = Qmaze(maze)
+    # show(qmaze)
+
+    model = build_model(maze)
+    # qtrain reads the 'n_epoch' option, so pass n_epoch (not 'epochs') here
+    qtrain(model, maze, n_epoch=1000, max_memory=8*maze.size, data_size=32)