Individual Project #2; s442720

2020-05-10 12:27:15 +00:00 · 2020-05-10 12:27:15 +00:00 · 71dc3e81a2
commit 71dc3e81a2
parent a73862b48b
1 changed files with 489 additions and 0 deletions
--- a/main_training.py
+++ b/main_training.py
@ -0,0 +1,489 @@
+from __future__ import print_function
+import os, sys, time, datetime, json, random
+import numpy as np
+from keras.models import Sequential
+from keras.layers.core import Dense, Activation
+from keras.optimizers import SGD , Adam, RMSprop
+from keras.layers.advanced_activations import PReLU
+import matplotlib.pyplot as plt
+import pickle
+
+# Gray levels used to paint cells of the maze canvas.
+visited_mark = 0.8  # cells the rat has already visited
+rat_mark = 0.5      # cell the rat currently occupies
+
+# Integer codes of the four moves.
+LEFT, UP, RIGHT, DOWN = 0, 1, 2, 3
+
+# Readable name of every action code.
+actions_dict = dict((
+    (LEFT, 'left'),
+    (UP, 'up'),
+    (RIGHT, 'right'),
+    (DOWN, 'down'),
+))
+
+num_actions = len(actions_dict)
+
+# Exploration factor: probability of choosing a random valid action in qtrain.
+epsilon = 0.1
+# Counter save_pic uses to name the next PNG file.
+file_name_num = 1
+# Cells the rat has to reach during one game.
+win_targets = [(4, 4), (4, 9), (4, 14), (9, 4)]
+
+class Qmaze(object):
+    """Grid maze in which a rat has to reach every cell listed in win_targets.
+
+    Maze cells hold 1.0 when free and 0.0 when blocked. The environment state
+    is a (row, col, mode) tuple whose mode is one of 'start', 'valid',
+    'invalid' or 'blocked'.
+    """
+
+    def __init__(self, maze, rat=(12,12)):
+        """Store the maze, validate it and put the rat on its start cell.
+
+        Args:
+            maze: 2-D array-like of 1 (free) / 0 (blocked) cells.
+            rat: (row, col) start cell of the rat.
+
+        Raises:
+            Exception: if the first target is blocked or the rat does not
+                start on a free cell.
+            ValueError: if the last win target is not a free cell
+                (raised by list.remove).
+        """
+        # Force floats: in an integer array rat_mark (0.5) would be truncated
+        # to 0 and the rat cell would look like a blocked cell.
+        self._maze = np.array(maze, dtype=float)
+        nrows, ncols = self._maze.shape
+        #self.target = (nrows-1, ncols-1)   # target cell where the "cheese" is
+        self.target = win_targets[0]
+        self.free_cells = [(r,c) for r in range(nrows) for c in range(ncols) if self._maze[r,c] == 1.0]
+        self.free_cells.remove(win_targets[-1])
+        if self._maze[self.target] == 0.0:
+            raise Exception("Invalid maze: target cell cannot be blocked!")
+        if rat not in self.free_cells:
+            raise Exception("Invalid Rat Location: must sit on a free cell")
+        self.reset(rat)
+
+    def reset(self, rat):
+        """Start a new game with the rat on cell `rat` and all targets pending."""
+        self.rat = rat
+        self.maze = np.copy(self._maze)
+        row, col = rat
+        self.maze[row, col] = rat_mark
+        self.state = (row, col, 'start')
+        # The game is lost once the accumulated reward drops below this value.
+        self.min_reward = -0.5 * self.maze.size
+        self.total_reward = 0
+        self.visited = list()
+        self.curr_win_targets = win_targets[:]
+        # game_status() advances self.target while a game runs; without this
+        # line a new game would start with the target left over from the
+        # previous one.
+        self.target = self.curr_win_targets[0]
+
+    def update_state(self, action):
+        """Apply `action` to the rat and store the resulting state.
+
+        The cell the rat is leaving (or staying on) is appended to
+        self.visited. The new mode is 'blocked' when no move is possible at
+        all, 'valid' when the move was carried out and 'invalid' when the
+        requested move is not allowed (the rat then stays where it is).
+        """
+        nrow, ncol, nmode = rat_row, rat_col, _ = self.state
+
+        if self.maze[rat_row, rat_col] > 0.0:
+            self.visited.append((rat_row, rat_col))  # mark visited cell
+
+        valid_actions = self.valid_actions()
+
+        if not valid_actions:
+            nmode = 'blocked'
+        elif action in valid_actions:
+            nmode = 'valid'
+            if action == LEFT:
+                ncol -= 1
+            elif action == UP:
+                nrow -= 1
+            elif action == RIGHT:
+                ncol += 1
+            elif action == DOWN:
+                nrow += 1
+        else:                  # invalid action, no change in rat position
+            # Must be written to nmode: that is the value stored in the state.
+            nmode = 'invalid'
+
+        # new state
+        self.state = (nrow, ncol, nmode)
+
+    def get_reward(self):
+        """Return the reward earned by the move that led to the current state.
+
+        Returns:
+            1.0 on the current target or any still pending target, -1.0 when
+            the rat is blocked, -0.75 for an invalid move, -0.5 for stepping
+            on an already visited cell, -0.04 for any other valid move and
+            0.0 when no move has been made yet.
+        """
+        rat_row, rat_col, mode = self.state
+        position = (rat_row, rat_col)
+        # Checked first: a valid move onto a pending target must not be
+        # swallowed by the plain 'valid' move cost below.
+        if position == tuple(self.target) or position in self.curr_win_targets:
+            return 1.0
+        if mode == 'blocked':  # no move possible at all
+            return -1.0
+        # Tested before the visited check: after an invalid move the rat is
+        # still on the cell update_state() just marked as visited, which would
+        # otherwise hide this penalty.
+        if mode == 'invalid':
+            return -0.75    # default -0.75 move to the boundary
+        if position in self.visited:
+            return -0.5    # default -0.25 -> -0.5
+        if mode == 'valid': # default -0.04 -> -0.1
+            return -0.04
+        return 0.0          # mode 'start': nothing happened yet
+
+    def act(self, action):
+        """Perform `action` and return (envstate, reward, game status)."""
+        self.update_state(action)
+        reward = self.get_reward()
+        self.total_reward += reward
+        status = self.game_status()
+        envstate = self.observe()
+        return envstate, reward, status
+
+    def observe(self):
+        """Return the drawn environment flattened to shape (1, number of cells)."""
+        canvas = self.draw_env()
+        envstate = canvas.reshape((1, -1))
+        return envstate
+
+    def draw_env(self):
+        """Return a copy of the maze with only the rat's current cell marked."""
+        canvas = np.copy(self.maze)
+        # clear all visual marks
+        canvas[canvas > 0.0] = 1.0
+        # draw the rat
+        row, col, _ = self.state
+        canvas[row, col] = rat_mark
+        return canvas
+
+    def game_status(self):
+        """Return 'win', 'lose' or 'not_over' for the current state.
+
+        Side effect: a pending target the rat is standing on is removed from
+        self.curr_win_targets and self.target moves on to the next pending one.
+        """
+        if self.total_reward < self.min_reward:
+            return 'lose'
+        rat_row, rat_col, _ = self.state
+
+        curPos = (rat_row, rat_col)
+
+        if curPos in self.curr_win_targets:
+            self.curr_win_targets.remove(curPos)
+            if len(self.curr_win_targets) == 0:
+                return 'win'
+            else:
+                self.target = self.curr_win_targets[0]
+
+        return 'not_over'
+
+    def valid_actions(self, cell=None):
+        """Return the action codes that lead from `cell` to a free in-grid cell.
+
+        Args:
+            cell: (row, col) to inspect; the rat's current cell when None.
+
+        Returns:
+            List of action codes in ascending order (LEFT, UP, RIGHT, DOWN).
+        """
+        if cell is None:
+            row, col, _ = self.state
+        else:
+            row, col = cell
+        nrows, ncols = self.maze.shape
+        # Each direction is tested on its own, so mazes with a single row or
+        # column are handled as well.
+        moves = (
+            (LEFT, row, col - 1),
+            (UP, row - 1, col),
+            (RIGHT, row, col + 1),
+            (DOWN, row + 1, col),
+        )
+        return [
+            action
+            for action, r, c in moves
+            if 0 <= r < nrows and 0 <= c < ncols and self.maze[r, c] != 0.0
+        ]
+    
+def show(qmaze):
+    """Draw the maze with visited cells, the rat and the current target.
+
+    Args:
+        qmaze: environment exposing maze, visited, state and target.
+
+    Returns:
+        The image object returned by plt.imshow.
+    """
+    # The target is read from the environment; the module defines no
+    # `win_target` global, so looking one up raised NameError.
+    win_target_row, win_target_col = qmaze.target
+    plt.grid(True)
+    nrows, ncols = qmaze.maze.shape
+    ax = plt.gca()
+    # imshow puts columns on the x axis and rows on the y axis.
+    ax.set_xticks(np.arange(0.5, ncols, 1))
+    ax.set_yticks(np.arange(0.5, nrows, 1))
+    ax.set_xticklabels([])
+    ax.set_yticklabels([])
+    canvas = np.copy(qmaze.maze)
+    for row,col in qmaze.visited:
+        canvas[row,col] = 0.6
+    rat_row, rat_col, _ = qmaze.state
+    canvas[rat_row, rat_col] = 0.3   # rat cell
+    canvas[win_target_row, win_target_col] = 0.9 # cheese cell
+    img = plt.imshow(canvas, interpolation='none', cmap='gray')
+    return img
+
+
+def save_pic(qmaze):
+    """Draw the maze and save it as '<file_name_num>.png'.
+
+    The module-level counter file_name_num is incremented after every save so
+    consecutive calls write consecutive files.
+
+    Args:
+        qmaze: environment exposing maze, visited, state and target.
+    """
+    global file_name_num
+    # The target is read from the environment; the module defines no
+    # `win_target` global, so looking one up raised NameError.
+    win_target_row, win_target_col = qmaze.target
+    plt.grid(True)
+    nrows, ncols = qmaze.maze.shape
+    ax = plt.gca()
+    # imshow puts columns on the x axis and rows on the y axis.
+    ax.set_xticks(np.arange(0.5, ncols, 1))
+    ax.set_yticks(np.arange(0.5, nrows, 1))
+    ax.set_xticklabels([])
+    ax.set_yticklabels([])
+    canvas = np.copy(qmaze.maze)
+    for row,col in qmaze.visited:
+        canvas[row,col] = 0.6
+    rat_row, rat_col, _ = qmaze.state
+    canvas[rat_row, rat_col] = 0.3   # rat cell
+    canvas[win_target_row, win_target_col] = 0.9 # cheese cell
+    plt.imshow(canvas, interpolation='none', cmap='gray')
+    plt.savefig(str(file_name_num) + ".png")
+    file_name_num += 1
+
+def output_route(qmaze):
+    """Print the unmarked maze layout stored in qmaze._maze."""
+    # The former lookup of a `win_target` global was dropped: no such global
+    # exists (NameError) and its value was never used.
+    print(qmaze._maze)
+
+def play_game(model, qmaze, rat_cell):
+    """Play one greedy game from `rat_cell` and tell whether it was won.
+
+    At every step the action with the highest predicted Q-value is applied
+    until the environment reports 'win' or 'lose'.
+
+    Returns:
+        True for a win, False for a loss.
+    """
+    qmaze.reset(rat_cell)
+    observation = qmaze.observe()
+    outcome = 'not_over'
+    while outcome not in ('win', 'lose'):
+        # greedy action for the current observation
+        q_values = model.predict(observation)
+        best_action = np.argmax(q_values[0])
+        # apply it; the reward is irrelevant when merely playing
+        observation, _, outcome = qmaze.act(best_action)
+    return outcome == 'win'
+
+
+def completion_check(model, qmaze):
+    """Return True when the model wins from every free cell of the maze.
+
+    A free cell without any valid action, or a single lost game, makes the
+    check fail; the remaining cells are then not played.
+    """
+    return all(
+        qmaze.valid_actions(start) and play_game(model, qmaze, start)
+        for start in qmaze.free_cells
+    )
+
+
+class Experience(object):
+    """Bounded replay memory that builds Q-learning training batches.
+
+    Every stored episode is a list
+    [envstate, action, reward, envstate_next, game_over] where the two
+    envstates are flattened mazes of shape (1, number of cells).
+    """
+
+    def __init__(self, model, max_memory=100, discount=0.9):
+        """Args:
+            model: network offering predict() and output_shape.
+            max_memory: maximal number of remembered episodes.
+            discount: factor applied to the best Q-value of the next state.
+        """
+        self.model = model
+        self.max_memory = max_memory
+        self.discount = discount
+        self.memory = list()
+        self.num_actions = model.output_shape[-1]
+
+    def remember(self, episode):
+        """Append `episode`; drop the oldest one when the memory is full."""
+        self.memory.append(episode)
+        if len(self.memory) > self.max_memory:
+            del self.memory[0]
+
+    def predict(self, envstate):
+        """Return the predicted Q-values of a single envstate."""
+        return self.model.predict(envstate)[0]
+
+    def get_data(self, data_size=10):
+        """Sample up to `data_size` episodes and build (inputs, targets).
+
+        Targets start as the model's current predictions, so actions that
+        were not taken contribute no error; only the entry of the action
+        taken is replaced by the reward (terminal step) or by
+        reward + discount * max_a' Q(s', a').
+
+        Returns:
+            inputs of shape (n, env_size) and targets of shape
+            (n, num_actions) with n = min(len(memory), data_size).
+        """
+        env_size = self.memory[0][0].shape[1]   # envstate 1d size (1st element of episode)
+        mem_size = len(self.memory)
+        data_size = min(mem_size, data_size)
+        inputs = np.zeros((data_size, env_size))
+        next_states = np.zeros((data_size, env_size))
+        if data_size == 0:
+            return inputs, np.zeros((data_size, self.num_actions))
+
+        picks = np.random.choice(mem_size, data_size, replace=False)
+        batch = [self.memory[j] for j in picks]
+        for i, episode in enumerate(batch):
+            inputs[i] = episode[0]
+            next_states[i] = episode[3]
+
+        # Two batched predictions instead of two model calls per sample.
+        targets = np.array(self.model.predict(inputs), dtype=float)
+        # Q_sa = max_a' Q(s', a') for every sampled transition
+        best_next_q = np.max(self.model.predict(next_states), axis=1)
+
+        for i, (_, action, reward, _, game_over) in enumerate(batch):
+            if game_over:
+                targets[i, action] = reward
+            else:
+                # reward + gamma * max_a' Q(s', a')
+                targets[i, action] = reward + self.discount * best_next_q[i]
+        return inputs, targets
+
+def qtrain(model, maze, **opt):
+    """Train `model` with Q-learning on `maze` and save it to disk.
+
+    Options (keyword arguments):
+        n_epoch: maximal number of games to play (default 15000); the
+            spelling `epochs` is accepted as an alias.
+        max_memory: capacity of the experience replay memory (default 1000).
+        data_size: number of episodes sampled per training step (default 50).
+        weights_file: h5 file to load initial weights from (default none).
+        name: base name of the written '<name>.h5' / '<name>.json' files.
+
+    Side effects: lowers the global epsilon once the win rate exceeds 0.9 and
+    pickles the visited cells of the shortest winning game to 'res.data'.
+
+    Returns:
+        The training time in seconds.
+    """
+    global epsilon
+    # Callers have been passing `epochs`; honour it instead of silently
+    # falling back to the default.
+    n_epoch = opt.get('n_epoch', opt.get('epochs', 15000))
+    max_memory = opt.get('max_memory', 1000)
+    data_size = opt.get('data_size', 50)
+    weights_file = opt.get('weights_file', "")
+    name = opt.get('name', 'model')
+    start_time = datetime.datetime.now()
+
+    # If you want to continue training from a previous model,
+    # just supply the h5 file name to weights_file option
+    if weights_file:
+        print("loading weights from file: %s" % (weights_file,))
+        model.load_weights(weights_file)
+
+    # Construct environment/game from numpy array: maze (see above)
+    qmaze = Qmaze(maze)
+
+    # Initialize experience replay object
+    experience = Experience(model, max_memory=max_memory)
+
+    win_history = []   # history of win/lose game
+    hsize = qmaze.maze.size//2   # history window size
+    win_rate = 0.0
+    pre_episodes = 2**31 - 1   # length of the shortest winning game so far
+    train_max = 192            # consecutive wins that end the training
+    epoch = 0                  # keeps the final report valid when n_epoch is 0
+
+    for epoch in range(n_epoch):
+        loss = 0.0
+        #rat_cell = random.choice(qmaze.free_cells)
+        #rat_cell = (0, 0)
+        rat_cell = (12, 12)
+
+        qmaze.reset(rat_cell)
+        game_over = False
+
+        # get initial envstate (1d flattened canvas)
+        envstate = qmaze.observe()
+
+        n_episodes = 0
+        while not game_over:
+            valid_actions = qmaze.valid_actions()
+            if not valid_actions: break
+            prev_envstate = envstate
+            # Get next action: explore with probability epsilon, else exploit
+            if np.random.rand() < epsilon:
+                action = random.choice(valid_actions)
+            else:
+                action = np.argmax(experience.predict(prev_envstate))
+
+            # Apply action, get reward and new envstate
+            envstate, reward, game_status = qmaze.act(action)
+            if game_status == 'win':
+                print("win")
+                win_history.append(1)
+                game_over = True
+                # save_pic(qmaze)
+                if n_episodes <= pre_episodes:
+                    # output_route(qmaze)
+                    print(qmaze.visited)
+                    with open('res.data', 'wb') as filehandle:
+                        pickle.dump(qmaze.visited, filehandle)
+                    pre_episodes = n_episodes
+
+            elif game_status == 'lose':
+                print("lose")
+                win_history.append(0)
+                game_over = True
+                # save_pic(qmaze)
+            else:
+                game_over = False
+
+            # Store episode (experience)
+            episode = [prev_envstate, action, reward, envstate, game_over]
+            experience.remember(episode)
+            n_episodes += 1
+
+            # Train neural network model
+            inputs, targets = experience.get_data(data_size=data_size)
+            model.fit(
+                inputs,
+                targets,
+                epochs=8,
+                batch_size=16,
+                verbose=0,
+            )
+            loss = model.evaluate(inputs, targets, verbose=0)
+
+        if len(win_history) > hsize:
+            # float() keeps this a true division on Python 2 as well
+            win_rate = sum(win_history[-hsize:]) / float(hsize)
+
+        dt = datetime.datetime.now() - start_time
+        t = format_time(dt.total_seconds())
+
+        template = "Epoch: {:03d}/{:d} | Loss: {:.4f} | Episodes: {:d} | Win count: {:d} | Win rate: {:.3f} | time: {}"
+        print(template.format(epoch, n_epoch-1, loss, n_episodes, sum(win_history), win_rate, t))
+        # Explore less once the agent wins most of its games.
+        if win_rate > 0.9 : epsilon = 0.05
+        # Stop as soon as the last train_max games were all won.
+        if sum(win_history[-train_max:]) >= train_max:
+            print("Reached 100%% win rate at epoch: %d" % (epoch,))
+            break
+
+    # Save trained model weights and architecture, this will be used by the visualization code
+    h5file = name + ".h5"
+    json_file = name + ".json"
+    model.save_weights(h5file, overwrite=True)
+    with open(json_file, "w") as outfile:
+        # NOTE: to_json() already returns a JSON string, so the file holds a
+        # JSON-encoded string; kept as is because readers may rely on it.
+        json.dump(model.to_json(), outfile)
+    dt = datetime.datetime.now() - start_time
+    seconds = dt.total_seconds()
+    t = format_time(seconds)
+    print('files: %s, %s' % (h5file, json_file))
+    print("n_epoch: %d, max_mem: %d, data: %d, time: %s" % (epoch, max_memory, data_size, t))
+    return seconds
+
+# This is a small utility for printing readable time strings:
+def format_time(seconds):
+    """Format a duration given in seconds as seconds, minutes or hours.
+
+    Durations below 400 s are shown in seconds, below 4000 s in minutes and
+    everything else in hours.
+    """
+    scales = (
+        (400, 1.0, "%.1f seconds"),
+        (4000, 60.0, "%.2f minutes"),
+    )
+    for upper_bound, divisor, pattern in scales:
+        if seconds < upper_bound:
+            return pattern % (seconds / divisor,)
+    return "%.2f hours" % (seconds / 3600.0,)
+
+def build_model(maze, lr=0.001):
+    """Build the Q-network for `maze`.
+
+    The network has two hidden Dense layers of maze.size units, each followed
+    by PReLU, and a linear output layer with one unit per action. It is
+    compiled with the Adam optimizer and mean squared error loss.
+
+    Args:
+        maze: array whose `size` fixes the input and hidden layer widths.
+        lr: learning rate handed to Adam.
+    """
+    model = Sequential()
+    model.add(Dense(maze.size, input_shape=(maze.size,)))
+    model.add(PReLU())
+    model.add(Dense(maze.size))
+    model.add(PReLU())
+    model.add(Dense(num_actions))
+    # An optimizer instance is required for `lr` to take effect; the string
+    # 'adam' always used the optimizer's built-in default rate.
+    model.compile(optimizer=Adam(lr=lr), loss='mse')
+    return model
+
+
+            
+class Table:
+    """Table occupying a 2x2 square of the grid.
+
+    Creating a table writes 0. into the square whose top-left corner is
+    (coordinate_i, coordinate_j) through change_value, which is defined in
+    the __main__ section of this file.
+    """
+
+    def __init__(self, coordinate_i, coordinate_j):
+        self.coordinate_i, self.coordinate_j = coordinate_i, coordinate_j
+        change_value(coordinate_i, coordinate_j, 2, 0.)
+
+    def get_destination_coor(self):
+        """Return [row, column] of the cell left of the table's top-left corner."""
+        destination_col = self.coordinate_j - 1
+        return [self.coordinate_i, destination_col]
+        
+class Kitchen:
+    """Kitchen occupying a 3x3 square of the grid.
+
+    Creating a kitchen writes 0. into the square whose top-left corner is
+    (coordinate_i, coordinate_j) through change_value, which is defined in
+    the __main__ section of this file.
+    """
+
+    def __init__(self, coordinate_i, coordinate_j):
+        self.coordinate_i, self.coordinate_j = coordinate_i, coordinate_j
+        change_value(coordinate_i, coordinate_j, 3, 0.)
+
+if __name__== "__main__":
+
+    def change_value(i, j, width, n):
+        """Write n into the width x width square of `grid` starting at (i, j)."""
+        for r in range (i, i+width):
+            for c in range (j, j+width):
+                grid[r][c] = n
+
+    # 16x16 room: 1 marks a free cell; tables and the kitchen overwrite
+    # their cells with 0.
+    grid = [[1 for x in range(16)] for y in range(16)]
+    table1 = Table(2, 2)
+    table2 = Table(2, 7)
+    table3 = Table(2, 12)
+    table4 = Table(7, 2)
+    table5 = Table(7, 7)
+    table6 = Table(7, 12)
+    table7 = Table(12, 2)
+    table8 = Table(12, 7)
+
+    kitchen = Kitchen(13, 13)
+    # Explicit float dtype: the environment paints fractional marks into the maze.
+    maze = np.array(grid, dtype=float)
+
+    model = build_model(maze)
+    # qtrain reads the number of training games from its 'n_epoch' option.
+    qtrain(model, maze, n_epoch=1000, max_memory=8*maze.size, data_size=32)
+
+
+