Computer_Vision/Chapter16/Pong_Deep_Q_Learning_with_F...


import gym
import numpy as np
import cv2
import random
from collections import namedtuple, deque
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
%matplotlib inline

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

env = gym.make('PongDeterministic-v0')
state_size = env.observation_space.shape[0] # raw frame height (210); the network itself consumes 4x80x80 stacks
action_size = env.action_space.n # 6 discrete actions in Pong
def preprocess_frame(frame):
    # Crop out the scoreboard and bottom border, downsample by 2 in both axes,
    # subtract the background colour and scale by 255, yielding an 80x80 grayscale image
    bkg_color = np.array([144, 72, 17])
    img = np.mean(frame[34:-16:2, ::2] - bkg_color, axis=-1) / 255.
    return img
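A quick shape check of the preprocessing (a minimal sketch; it assumes the classic Gym API used throughout this notebook, where env.reset() returns a raw 210x160x3 RGB frame):

raw = env.reset()
processed = preprocess_frame(raw)
print(processed.shape) # expected: (80, 80)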
def stack_frames(stacked_frames, state, is_new_episode):
    # Preprocess the raw frame to an 80x80 grayscale image
    frame = preprocess_frame(state)
    stack_size = 4
    if is_new_episode:
        # New episode: reset the deque and repeat the first frame 4x
        stacked_frames = deque([frame for _ in range(stack_size)], maxlen=stack_size)
    else:
        # Append the frame; the deque automatically drops the oldest one
        stacked_frames.append(frame)
    # Stack along a new leading axis (first dimension indexes the frames) -> (4, 80, 80)
    stacked_state = np.stack(stacked_frames, axis=0)
    return stacked_state, stacked_frames
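The stacking can be sanity-checked the same way (a sketch; since is_new_episode=True replaces the deque contents, an empty deque suffices as input):

frames = deque(maxlen=4)
state, frames = stack_frames(frames, env.reset(), True)
print(state.shape) # expected: (4, 80, 80)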
class DQNetwork(nn.Module):
    def __init__(self, states, action_size):
        # `states` is unused here; the input shape (4, 80, 80) is fixed by the conv stack below
        super(DQNetwork, self).__init__()

        self.conv1 = nn.Conv2d(4, 32, (8, 8), stride=4)   # (4, 80, 80) -> (32, 19, 19)
        self.conv2 = nn.Conv2d(32, 64, (4, 4), stride=2)  # (32, 19, 19) -> (64, 8, 8)
        self.conv3 = nn.Conv2d(64, 64, (3, 3), stride=1)  # (64, 8, 8) -> (64, 6, 6)
        self.flatten = nn.Flatten()
        self.fc1 = nn.Linear(2304, 512)                   # 64 * 6 * 6 = 2304
        self.fc2 = nn.Linear(512, action_size)

    def forward(self, state):
        x = F.relu(self.conv1(state))
        x = F.relu(self.conv2(x))
        x = F.relu(self.conv3(x))
        x = self.flatten(x)
        x = F.relu(self.fc1(x))
        x = self.fc2(x)                                   # raw Q-values, one per action
        return x
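To confirm the 2304-unit flatten size implied by the conv arithmetic above, a dummy forward pass (a minimal sketch):

net = DQNetwork(state_size, action_size).to(device)
dummy = torch.zeros(1, 4, 80, 80).to(device)
print(net(dummy).shape) # expected: torch.Size([1, 6]) for Pong's 6 actions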
class Agent():
    def __init__(self, state_size, action_size):
        
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(0)

        ## hyperparameters
        self.buffer_size = 10000
        self.batch_size = 32
        self.gamma = 0.99
        self.lr = 0.0001
        self.update_every = 4
        self.update_every_target = 1000 
        self.learn_every_target_counter = 0
        # Q-Network
        self.local = DQNetwork(state_size, action_size).to(device)
        self.target = DQNetwork(state_size, action_size).to(device)
        self.optimizer = optim.Adam(self.local.parameters(), lr=self.lr)

        # Replay memory
        self.memory = deque(maxlen=self.buffer_size) 
        self.experience = namedtuple("Experience", field_names=["state", "action", "reward", "next_state", "done"])
        # Initialize time step (for updating every few steps)
        self.t_step = 0
    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.append(self.experience(state[None], action, reward, next_state[None], done))
        
        # Learn every update_every time steps.
        self.t_step = (self.t_step + 1) % self.update_every
        if self.t_step == 0:
            # If enough samples are available in memory, get a random subset and learn
            if len(self.memory) > self.batch_size:
                experiences = self.sample_experiences()
                self.learn(experiences, self.gamma)
    def act(self, state, eps=0.):
        # Epsilon-greedy action selection
        if random.random() > eps:
            state = torch.from_numpy(state).float().unsqueeze(0).to(device)
            self.local.eval()
            with torch.no_grad():
                action_values = self.local(state)
            self.local.train()
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))
    def learn(self, experiences, gamma):
        self.learn_every_target_counter += 1
        states, actions, rewards, next_states, dones = experiences
        # Get expected Q values from the local model
        Q_expected = self.local(states).gather(1, actions)

        # Get max predicted Q values (for next states) from target model
        Q_targets_next = self.target(next_states).detach().max(1)[0].unsqueeze(1)
        # Compute Q targets for current state
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        
        # Compute loss
        loss = F.mse_loss(Q_expected, Q_targets)

        # Minimize the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        if self.learn_every_target_counter % self.update_every_target == 0:
            self.target_update()
    def target_update(self):
        print('target updating')
        self.target.load_state_dict(self.local.state_dict())
    def sample_experiences(self):
        experiences = random.sample(self.memory, k=self.batch_size)        
        states = torch.from_numpy(np.vstack([e.state for e in experiences if e is not None])).float().to(device)
        actions = torch.from_numpy(np.vstack([e.action for e in experiences if e is not None])).long().to(device)
        rewards = torch.from_numpy(np.vstack([e.reward for e in experiences if e is not None])).float().to(device)
        next_states = torch.from_numpy(np.vstack([e.next_state for e in experiences if e is not None])).float().to(device)
        dones = torch.from_numpy(np.vstack([e.done for e in experiences if e is not None]).astype(np.uint8)).float().to(device)        
        return (states, actions, rewards, next_states, dones)
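For reference, the learn method implements the fixed-targets update that gives this notebook its name: the local network is regressed toward

$$Q_{\text{target}} = r + \gamma \,(1 - \text{done})\, \max_{a'} \hat{Q}(s', a')$$

where $\hat{Q}$ is the target network, synced to the local network only every update_every_target (1,000) learning steps rather than after every step, which keeps the regression target stable.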
agent = Agent(state_size, action_size)
n_episodes=5000
max_t=5000
eps_start=1.0
eps_end=0.02
eps_decay=0.995
scores = [] # list containing scores from each episode
scores_window = deque(maxlen=100) # last 100 scores
eps = eps_start
stack_size = 4
stacked_frames = deque([np.zeros((80,80), dtype=np.uint8) for i in range(stack_size)], maxlen=stack_size)
for i_episode in range(1, n_episodes+1):
    state = env.reset()
    state, frames = stack_frames(stacked_frames, state, True)
    score = 0
    for i in range(max_t):
        action = agent.act(state, eps)
        next_state, reward, done, _ = env.step(action)
        next_state, frames = stack_frames(frames, next_state, False)
        agent.step(state, action, reward, next_state, done)
        state = next_state
        score += reward
        if done:
            break 
    scores_window.append(score) # save most recent score
    scores.append(score) # save most recent score
    eps = max(eps_end, eps_decay*eps) # decrease epsilon
    print('\rEpisode {}\tReward {} \tAverage Score: {:.2f} \tEpsilon: {}'.format(i_episode,score,np.mean(scores_window), eps), end="")
    if i_episode % 100 == 0:
        print('\rEpisode {}\tAverage Score: {:.2f} \tEpsilon: {}'.format(i_episode, np.mean(scores_window), eps))
Episode 4	Reward -20.0 	Average Score: -20.25 	Epsilon: 0.9801495006250001target updating
Episode 8	Reward -21.0 	Average Score: -20.62 	Epsilon: 0.960693043575437target updating
Episode 13	Reward -21.0 	Average Score: -20.31 	Epsilon: 0.9369146928798039target updating
Episode 17	Reward -21.0 	Average Score: -20.47 	Epsilon: 0.918316468354365target updating
Episode 22	Reward -18.0 	Average Score: -20.36 	Epsilon: 0.8955869907338783target updating
Episode 26	Reward -21.0 	Average Score: -20.46 	Epsilon: 0.8778091417340573target updating
Episode 30	Reward -20.0 	Average Score: -20.47 	Epsilon: 0.8603841919146962target updating
Episode 35	Reward -20.0 	Average Score: -20.49 	Epsilon: 0.8390886103705794target updating
Episode 39	Reward -20.0 	Average Score: -20.44 	Epsilon: 0.8224322824348486target updating
Episode 43	Reward -21.0 	Average Score: -20.42 	Epsilon: 0.8061065909263957target updating
Episode 48	Reward -21.0 	Average Score: -20.42 	Epsilon: 0.7861544476842928target updating
Episode 52	Reward -20.0 	Average Score: -20.38 	Epsilon: 0.7705488893118823target updating
Episode 56	Reward -19.0 	Average Score: -20.30 	Epsilon: 0.7552531090661897target updating
Episode 60	Reward -20.0 	Average Score: -20.30 	Epsilon: 0.7402609576967045target updating
Episode 64	Reward -20.0 	Average Score: -20.31 	Epsilon: 0.7255664080186093target updating
Episode 68	Reward -18.0 	Average Score: -20.26 	Epsilon: 0.7111635524897149target updating
Episode 72	Reward -19.0 	Average Score: -20.24 	Epsilon: 0.697046600835495target updating
Episode 76	Reward -20.0 	Average Score: -20.24 	Epsilon: 0.6832098777212641target updating
Episode 80	Reward -20.0 	Average Score: -20.26 	Epsilon: 0.6696478204705644target updating
Episode 83	Reward -20.0 	Average Score: -20.24 	Epsilon: 0.6596532430440636target updating
Episode 87	Reward -20.0 	Average Score: -20.22 	Epsilon: 0.6465587967553006target updating
Episode 91	Reward -20.0 	Average Score: -20.21 	Epsilon: 0.6337242817644086target updating
Episode 95	Reward -21.0 	Average Score: -20.18 	Epsilon: 0.6211445383053219target updating
Episode 98	Reward -21.0 	Average Score: -20.16 	Epsilon: 0.6118738784280476target updating
Episode 100	Average Score: -20.16 	Epsilon: 0.6057704364907278
Episode 102	Reward -20.0 	Average Score: -20.15 	Epsilon: 0.5997278763867329target updating
Episode 106	Reward -20.0 	Average Score: -20.11 	Epsilon: 0.5878229785513479target updating
Episode 109	Reward -21.0 	Average Score: -20.06 	Epsilon: 0.5790496471185967target updating
Episode 111	Reward -16.0 	Average Score: -19.99 	Epsilon: 0.5732736268885887target updating
Episode 115	Reward -20.0 	Average Score: -19.97 	Epsilon: 0.5618938591163328target updating
Episode 117	Reward -18.0 	Average Score: -19.91 	Epsilon: 0.5562889678716474target updating
Episode 120	Reward -18.0 	Average Score: -19.83 	Epsilon: 0.547986285490042target updating
Episode 124	Reward -19.0 	Average Score: -19.79 	Epsilon: 0.5371084840724134target updating
Episode 126	Reward -19.0 	Average Score: -19.71 	Epsilon: 0.531750826943791target updating
Episode 129	Reward -18.0 	Average Score: -19.68 	Epsilon: 0.5238143793828016target updating
Episode 132	Reward -17.0 	Average Score: -19.64 	Epsilon: 0.5159963842937159target updating
Episode 134	Reward -18.0 	Average Score: -19.58 	Epsilon: 0.510849320360386target updating
Episode 137	Reward -15.0 	Average Score: -19.49 	Epsilon: 0.5032248303978422target updating
Episode 139	Reward -21.0 	Average Score: -19.48 	Epsilon: 0.4982051627146237target updating
Episode 142	Reward -17.0 	Average Score: -19.45 	Epsilon: 0.4907693883854626target updating
Episode 145	Reward -18.0 	Average Score: -19.31 	Epsilon: 0.483444593917636target updating
Episode 147	Reward -19.0 	Average Score: -19.30 	Epsilon: 0.47862223409330756target updating
Episode 150	Reward -17.0 	Average Score: -19.22 	Epsilon: 0.47147873742168567target updating
Episode 152	Reward -21.0 	Average Score: -19.23 	Epsilon: 0.46677573701590436target updating
Episode 155	Reward -19.0 	Average Score: -19.22 	Epsilon: 0.4598090507939749target updating
Episode 157	Reward -20.0 	Average Score: -19.19 	Epsilon: 0.45522245551230495target updating
Episode 159	Reward -21.0 	Average Score: -19.20 	Epsilon: 0.4506816115185697target updating
Episode 162	Reward -20.0 	Average Score: -19.16 	Epsilon: 0.4439551321314536target updating
Episode 164	Reward -19.0 	Average Score: -19.11 	Epsilon: 0.43952667968844233target updating
Episode 167	Reward -18.0 	Average Score: -19.07 	Epsilon: 0.43296668905325736target updating
Episode 169	Reward -18.0 	Average Score: -19.05 	Epsilon: 0.4286478463299511target updating
Episode 171	Reward -17.0 	Average Score: -18.98 	Epsilon: 0.42437208406280985target updating
Episode 174	Reward -20.0 	Average Score: -18.98 	Epsilon: 0.4180382776616619target updating
Episode 176	Reward -17.0 	Average Score: -18.92 	Epsilon: 0.41386834584198684target updating
Episode 178	Reward -17.0 	Average Score: -18.84 	Epsilon: 0.40974000909221303target updating
Episode 181	Reward -17.0 	Average Score: -18.79 	Epsilon: 0.4036245882390106target updating
Episode 183	Reward -17.0 	Average Score: -18.75 	Epsilon: 0.3995984329713264target updating
Episode 185	Reward -18.0 	Average Score: -18.70 	Epsilon: 0.39561243860243744target updating
Episode 187	Reward -20.0 	Average Score: -18.69 	Epsilon: 0.39166620452737816target updating
Episode 190	Reward -20.0 	Average Score: -18.69 	Epsilon: 0.3858205374665315target updating
Episode 192	Reward -17.0 	Average Score: -18.66 	Epsilon: 0.3819719776053028target updating
Episode 194	Reward -16.0 	Average Score: -18.58 	Epsilon: 0.37816180712868996target updating
Episode 196	Reward -16.0 	Average Score: -18.50 	Epsilon: 0.3743896431025813target updating
Episode 198	Reward -15.0 	Average Score: -18.44 	Epsilon: 0.3706551064126331target updating
Episode 200	Average Score: -18.39 	Epsilon: 0.3669578217261671
target updating
Episode 202	Reward -20.0 	Average Score: -18.34 	Epsilon: 0.3632974174544486target updating
Episode 204	Reward -16.0 	Average Score: -18.27 	Epsilon: 0.3596735257153405target updating
Episode 205	Reward -14.0 	Average Score: -18.22 	Epsilon: 0.3578751580867638target updating
Episode 208	Reward -19.0 	Average Score: -18.21 	Epsilon: 0.35253382661792404target updating
Episode 210	Reward -19.0 	Average Score: -18.23 	Epsilon: 0.34901730169741024target updating
Episode 212	Reward -19.0 	Average Score: -18.25 	Epsilon: 0.3455358541129786target updating
Episode 214	Reward -16.0 	Average Score: -18.14 	Epsilon: 0.3420891339682016target updating
Episode 216	Reward -19.0 	Average Score: -18.13 	Epsilon: 0.3386767948568688target updating
Episode 218	Reward -16.0 	Average Score: -18.10 	Epsilon: 0.3352984938281715target updating
Episode 220	Reward -20.0 	Average Score: -18.12 	Epsilon: 0.33195389135223546target updating
Episode 222	Reward -14.0 	Average Score: -18.06 	Epsilon: 0.32864265128599696target updating
Episode 224	Reward -17.0 	Average Score: -18.03 	Epsilon: 0.3253644408394192target updating
Episode 225	Reward -15.0 	Average Score: -18.03 	Epsilon: 0.3237376186352221target updating
Episode 227	Reward -20.0 	Average Score: -18.00 	Epsilon: 0.32050833588933575target updating
Episode 229	Reward -16.0 	Average Score: -17.95 	Epsilon: 0.3173112652388396target updating
Episode 231	Reward -17.0 	Average Score: -17.85 	Epsilon: 0.3141460853680822target updating
Episode 233	Reward -16.0 	Average Score: -17.83 	Epsilon: 0.31101247816653554target updating
Episode 235	Reward -18.0 	Average Score: -17.79 	Epsilon: 0.3079101286968243target updating
Episode 236	Reward -17.0 	Average Score: -17.77 	Epsilon: 0.3063705780533402target updating
Episode 238	Reward -14.0 	Average Score: -17.70 	Epsilon: 0.3033145315372582target updating
Episode 240	Reward -18.0 	Average Score: -17.62 	Epsilon: 0.30028896908517405target updating
Episode 242	Reward -16.0 	Average Score: -17.60 	Epsilon: 0.29729358661854943target updating
Episode 244	Reward -20.0 	Average Score: -17.70 	Epsilon: 0.2943280830920294target updating
Episode 246	Reward -15.0 	Average Score: -17.68 	Epsilon: 0.2913921604631864target updating
Episode 248	Reward -17.0 	Average Score: -17.66 	Epsilon: 0.2884855236625661target updating
Episode 251	Reward -20.0 	Average Score: -17.71 	Epsilon: 0.28417984116121187target updating
Episode 252	Reward -18.0 	Average Score: -17.68 	Epsilon: 0.2827589419554058target updating
Episode 254	Reward -16.0 	Average Score: -17.61 	Epsilon: 0.2799384215094006target updating
Episode 257	Reward -17.0 	Average Score: -17.57 	Epsilon: 0.2757603055760701target updating
Episode 258	Reward -17.0 	Average Score: -17.54 	Epsilon: 0.2743815040481898target updating
Episode 260	Reward -20.0 	Average Score: -17.52 	Epsilon: 0.27164454854530906target updating
Episode 262	Reward -16.0 	Average Score: -17.49 	Epsilon: 0.2689348941735696target updating
Episode 263	Reward -12.0 	Average Score: -17.45 	Epsilon: 0.26759021970270175target updating
Episode 265	Reward -17.0 	Average Score: -17.37 	Epsilon: 0.2649210072611673target updating
Episode 267	Reward -18.0 	Average Score: -17.35 	Epsilon: 0.26227842021373715target updating
Episode 268	Reward -16.0 	Average Score: -17.33 	Epsilon: 0.2609670281126685target updating
Episode 270	Reward -16.0 	Average Score: -17.30 	Epsilon: 0.2583638820072446target updating
Episode 272	Reward -18.0 	Average Score: -17.27 	Epsilon: 0.25578670228422234target updating
Episode 274	Reward -15.0 	Average Score: -17.18 	Epsilon: 0.2532352299289372target updating
Episode 276	Reward -18.0 	Average Score: -17.16 	Epsilon: 0.2507092085103961target updating
Episode 278	Reward -19.0 	Average Score: -17.20 	Epsilon: 0.24820838415550486target updating
Episode 280	Reward -14.0 	Average Score: -17.12 	Epsilon: 0.2457325055235537target updating
Episode 281	Reward -15.0 	Average Score: -17.10 	Epsilon: 0.24450384299593592target updating
Episode 283	Reward -21.0 	Average Score: -17.03 	Epsilon: 0.24206491716205145target updating
Episode 285	Reward -17.0 	Average Score: -16.98 	Epsilon: 0.23965031961336target updating
Episode 287	Reward -17.0 	Average Score: -16.92 	Epsilon: 0.23725980767521673target updating
Episode 289	Reward -18.0 	Average Score: -16.81 	Epsilon: 0.23489314109365644target updating
Episode 290	Reward -12.0 	Average Score: -16.73 	Epsilon: 0.23371867538818816target updating
Episode 292	Reward -16.0 	Average Score: -16.69 	Epsilon: 0.231387331601191target updating
Episode 294	Reward -15.0 	Average Score: -16.71 	Epsilon: 0.2290792429684691target updating
Episode 296	Reward -16.0 	Average Score: -16.72 	Epsilon: 0.22679417751985861target updating
Episode 298	Reward -15.0 	Average Score: -16.66 	Epsilon: 0.22453190559909803target updating
Episode 299	Reward -14.0 	Average Score: -16.64 	Epsilon: 0.22340924607110255target updating
Episode 300	Average Score: -16.61 	Epsilon: 0.22229219984074702
Episode 301	Reward -17.0 	Average Score: -16.63 	Epsilon: 0.2211807388415433target updating
Episode 302	Reward -17.0 	Average Score: -16.60 	Epsilon: 0.22007483514733558target updating
Episode 304	Reward -13.0 	Average Score: -16.59 	Epsilon: 0.2178795886667409target updating
Episode 305	Reward -14.0 	Average Score: -16.59 	Epsilon: 0.2167901907234072target updating
Episode 307	Reward -19.0 	Average Score: -16.57 	Epsilon: 0.21462770857094118target updating
Episode 309	Reward -18.0 	Average Score: -16.54 	Epsilon: 0.21248679717794605target updating
Episode 310	Reward -15.0 	Average Score: -16.50 	Epsilon: 0.21142436319205632target updating
Episode 312	Reward -14.0 	Average Score: -16.44 	Epsilon: 0.20931540516921554target updating
Episode 313	Reward -18.0 	Average Score: -16.49 	Epsilon: 0.20826882814336947target updating
Episode 314	Reward -19.0 	Average Score: -16.52 	Epsilon: 0.20722748400265262target updating
Episode 316	Reward -17.0 	Average Score: -16.45 	Epsilon: 0.20516038984972615target updating
Episode 317	Reward -14.0 	Average Score: -16.41 	Epsilon: 0.2041345879004775target updating
Episode 319	Reward -16.0 	Average Score: -16.42 	Epsilon: 0.20209834538617025target updating
Episode 321	Reward -12.0 	Average Score: -16.33 	Epsilon: 0.2000824143909432target updating
Episode 322	Reward -16.0 	Average Score: -16.35 	Epsilon: 0.19908200231898848target updating
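The logged output above ends at episode 322. Once training finishes (or is stopped), the collected scores can be visualised; a minimal sketch reusing the matplotlib import from the top of the notebook:

plt.figure(figsize=(10, 5))
plt.plot(scores, alpha=0.4, label='episode reward')
# 100-episode moving average, mirroring scores_window in the training loop
moving_avg = [np.mean(scores[max(0, i - 99):i + 1]) for i in range(len(scores))]
plt.plot(moving_avg, label='100-episode average')
plt.xlabel('Episode')
plt.ylabel('Reward')
plt.legend()
plt.show()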