26 KiB
26 KiB
import gym
import numpy as np
import cv2
from collections import deque
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import random
from collections import namedtuple, deque
import torch
import torch.nn.functional as F
import torch.optim as optim
import matplotlib.pyplot as plt
%matplotlib inline
# Run on GPU when available, otherwise fall back to CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Deterministic Pong variant (fixed frame-skip, no sticky actions).
env = gym.make('PongDeterministic-v0')
# NOTE(review): shape[0] is the raw frame height (presumably 210 for Atari
# RGB frames) — the network below ignores it and assumes 80x80 inputs.
state_size = env.observation_space.shape[0]
action_size = env.action_space.n  # number of discrete actions
def preprocess_frame(frame):
    """Convert a raw Atari RGB frame into a small normalized grayscale image.

    Crops the score bar and bottom border (rows 34 to -16), downsamples by
    keeping every other pixel in both dimensions, subtracts Pong's background
    colour, averages the RGB channels and scales by 255.  For a standard
    210x160x3 frame the result is an 80x80 float array.

    Parameters
    ----------
    frame : np.ndarray
        Raw RGB observation, shape (210, 160, 3).

    Returns
    -------
    np.ndarray
        Preprocessed 80x80 float image, roughly in [-1, 1].
    """
    bkg_color = np.array([144, 72, 17])  # Pong background colour (RGB)
    # Crop + stride-2 downsample, remove background, grayscale, normalize.
    return np.mean(frame[34:-16:2, ::2] - bkg_color, axis=-1) / 255.
def stack_frames(stacked_frames, state, is_new_episode):
    """Preprocess `state` and push it onto the rolling 4-frame history.

    Parameters
    ----------
    stacked_frames : deque
        History of preprocessed 2-D frames (maxlen 4), most recent last.
        Ignored and rebuilt when `is_new_episode` is True.
    state : np.ndarray
        Raw RGB frame from the environment.
    is_new_episode : bool
        When True the history is reset and filled with four copies of the
        current frame (the agent has no past at episode start).

    Returns
    -------
    tuple
        (stacked_state, stacked_frames) where stacked_state is a
        (4, 80, 80) array — channels-first, ready for the conv net.
    """
    frame = preprocess_frame(state)
    stack_size = 4
    if is_new_episode:
        # No history yet: replicate the first frame stack_size times.
        # (The original built a zero-filled deque first, then overwrote
        # all four slots with `frame` — the zeros were never observed.)
        stacked_frames = deque([frame] * stack_size, maxlen=stack_size)
    else:
        # Append newest frame; the deque drops the oldest automatically.
        stacked_frames.append(frame)
    # One stack along a leading channel axis; equivalent to the original
    # np.stack(..., axis=2).transpose(2, 0, 1) in a single step.
    stacked_state = np.stack(stacked_frames, axis=0)
    return stacked_state, stacked_frames
class DQNetwork(nn.Module):
    """Convolutional Q-network mapping a (N, 4, 80, 80) frame stack to one
    Q-value per action (the classic DQN architecture)."""

    def __init__(self, states, action_size):
        super(DQNetwork, self).__init__()
        # Three conv stages shrink the spatial map: 80 -> 19 -> 8 -> 6.
        self.conv1 = nn.Conv2d(4, 32, (8, 8), stride=4)
        self.conv2 = nn.Conv2d(32, 64, (4, 4), stride=2)
        self.conv3 = nn.Conv2d(64, 64, (3, 3), stride=1)
        self.flatten = nn.Flatten()
        self.fc1 = nn.Linear(2304, 512)  # 64 channels * 6 * 6 = 2304
        self.fc2 = nn.Linear(512, action_size)

    def forward(self, state):
        """Return Q-values for each action, shape (N, action_size)."""
        hidden = state
        for conv in (self.conv1, self.conv2, self.conv3):
            hidden = F.relu(conv(hidden))
        hidden = F.relu(self.fc1(self.flatten(hidden)))
        return self.fc2(hidden)
class Agent():
    """DQN agent with uniform experience replay and a periodically hard-synced
    target network (Mnih et al. style)."""

    def __init__(self, state_size, action_size):
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(0)  # seeds the global RNG; attribute holds None

        ## hyperparameters
        self.buffer_size = 10000          # replay buffer capacity
        self.batch_size = 32              # minibatch size per learning step
        self.gamma = 0.99                 # discount factor
        self.lr = 0.0001                  # Adam learning rate
        self.update_every = 4             # env steps between learning updates
        self.update_every_target = 1000   # learn calls between target syncs
        self.learn_every_target_counter = 0

        # Q-Networks: `local` is trained; `target` provides stable bootstrap
        # targets and is only updated by hard copies.
        self.local = DQNetwork(state_size, action_size).to(device)
        self.target = DQNetwork(state_size, action_size).to(device)
        # Fix: start both networks with identical weights so early TD targets
        # come from the same function (the original left them independently
        # random until the first sync, ~1000 learn steps in).
        self.target.load_state_dict(self.local.state_dict())
        self.optimizer = optim.Adam(self.local.parameters(), lr=self.lr)

        # Replay memory
        self.memory = deque(maxlen=self.buffer_size)
        self.experience = namedtuple("Experience", field_names=["state", "action", "reward", "next_state", "done"])
        # Time step counter (mod update_every) for scheduling learning.
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        """Record one transition and trigger learning every `update_every` steps."""
        # state[None] adds a leading batch axis so np.vstack in
        # sample_experiences can stack transitions into a batch.
        self.memory.append(self.experience(state[None], action, reward, next_state[None], done))
        self.t_step = (self.t_step + 1) % self.update_every
        # Only learn once enough samples have accumulated.
        if self.t_step == 0 and len(self.memory) > self.batch_size:
            self.learn(self.sample_experiences(), self.gamma)

    def act(self, state, eps=0.):
        """Epsilon-greedy action selection for one (stacked) state."""
        if random.random() > eps:
            state = torch.from_numpy(state).float().unsqueeze(0).to(device)
            self.local.eval()  # inference mode for the forward pass
            with torch.no_grad():
                action_values = self.local(state)
            self.local.train()
            return np.argmax(action_values.cpu().data.numpy())
        else:
            # random.randrange avoids building a throwaway np.arange array.
            return random.randrange(self.action_size)

    def learn(self, experiences, gamma):
        """Run one gradient step on a sampled minibatch of transitions."""
        self.learn_every_target_counter += 1
        states, actions, rewards, next_states, dones = experiences
        # Q(s, a) from the online network for the actions actually taken.
        Q_expected = self.local(states).gather(1, actions)
        # max_a' Q_target(s', a'), detached so no gradient flows to target.
        Q_targets_next = self.target(next_states).detach().max(1)[0].unsqueeze(1)
        # TD target; terminal transitions contribute only the reward.
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        loss = F.mse_loss(Q_expected, Q_targets)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        # ------------------- update target network ------------------- #
        # Use the named hyperparameter instead of the magic constant 1000.
        if self.learn_every_target_counter % self.update_every_target == 0:
            self.target_update()

    def target_update(self):
        """Hard-copy online network weights into the target network."""
        print('target updating')
        self.target.load_state_dict(self.local.state_dict())

    def sample_experiences(self):
        """Uniformly sample a minibatch and convert it to batched device tensors."""
        experiences = random.sample(self.memory, k=self.batch_size)
        states = torch.from_numpy(np.vstack([e.state for e in experiences if e is not None])).float().to(device)
        actions = torch.from_numpy(np.vstack([e.action for e in experiences if e is not None])).long().to(device)
        rewards = torch.from_numpy(np.vstack([e.reward for e in experiences if e is not None])).float().to(device)
        next_states = torch.from_numpy(np.vstack([e.next_state for e in experiences if e is not None])).float().to(device)
        dones = torch.from_numpy(np.vstack([e.done for e in experiences if e is not None]).astype(np.uint8)).float().to(device)
        return (states, actions, rewards, next_states, dones)
agent = Agent(state_size, action_size)

# Training hyperparameters
n_episodes = 5000   # maximum number of training episodes
max_t = 5000        # maximum env steps per episode
eps_start = 1.0     # initial epsilon (fully random policy)
eps_end = 0.02      # epsilon floor
eps_decay = 0.995   # per-episode multiplicative epsilon decay

scores = []                        # score of every episode
scores_window = deque(maxlen=100)  # rolling window of the last 100 scores
eps = eps_start
stack_size = 4
# Seed the frame history with zeros; it is rebuilt on the first
# stack_frames(..., is_new_episode=True) call each episode.
# Fix: np.int was deprecated in NumPy 1.20 and removed in 1.24 -- use an
# explicit dtype (np.uint8, matching the reset inside stack_frames).
stacked_frames = deque([np.zeros((80, 80), dtype=np.uint8) for i in range(stack_size)], maxlen=stack_size)

for i_episode in range(1, n_episodes + 1):
    state = env.reset()
    state, frames = stack_frames(stacked_frames, state, True)
    score = 0
    for i in range(max_t):
        action = agent.act(state, eps)
        next_state, reward, done, _ = env.step(action)
        next_state, frames = stack_frames(frames, next_state, False)
        agent.step(state, action, reward, next_state, done)
        state = next_state
        score += reward
        if done:
            break
    scores_window.append(score)  # save most recent score
    scores.append(score)         # save most recent score
    eps = max(eps_end, eps_decay * eps)  # decay epsilon, clipped at the floor
    print('\rEpisode {}\tReward {} \tAverage Score: {:.2f} \tEpsilon: {}'.format(i_episode, score, np.mean(scores_window), eps), end="")
    if i_episode % 100 == 0:
        print('\rEpisode {}\tAverage Score: {:.2f} \tEpsilon: {}'.format(i_episode, np.mean(scores_window), eps))
Episode 4 Reward -20.0 Average Score: -20.25 Epsilon: 0.9801495006250001target updating Episode 8 Reward -21.0 Average Score: -20.62 Epsilon: 0.960693043575437target updating Episode 13 Reward -21.0 Average Score: -20.31 Epsilon: 0.9369146928798039target updating Episode 17 Reward -21.0 Average Score: -20.47 Epsilon: 0.918316468354365target updating Episode 22 Reward -18.0 Average Score: -20.36 Epsilon: 0.8955869907338783target updating Episode 26 Reward -21.0 Average Score: -20.46 Epsilon: 0.8778091417340573target updating Episode 30 Reward -20.0 Average Score: -20.47 Epsilon: 0.8603841919146962target updating Episode 35 Reward -20.0 Average Score: -20.49 Epsilon: 0.8390886103705794target updating Episode 39 Reward -20.0 Average Score: -20.44 Epsilon: 0.8224322824348486target updating Episode 43 Reward -21.0 Average Score: -20.42 Epsilon: 0.8061065909263957target updating Episode 48 Reward -21.0 Average Score: -20.42 Epsilon: 0.7861544476842928target updating Episode 52 Reward -20.0 Average Score: -20.38 Epsilon: 0.7705488893118823target updating Episode 56 Reward -19.0 Average Score: -20.30 Epsilon: 0.7552531090661897target updating Episode 60 Reward -20.0 Average Score: -20.30 Epsilon: 0.7402609576967045target updating Episode 64 Reward -20.0 Average Score: -20.31 Epsilon: 0.7255664080186093target updating Episode 68 Reward -18.0 Average Score: -20.26 Epsilon: 0.7111635524897149target updating Episode 72 Reward -19.0 Average Score: -20.24 Epsilon: 0.697046600835495target updating Episode 76 Reward -20.0 Average Score: -20.24 Epsilon: 0.6832098777212641target updating Episode 80 Reward -20.0 Average Score: -20.26 Epsilon: 0.6696478204705644target updating Episode 83 Reward -20.0 Average Score: -20.24 Epsilon: 0.6596532430440636target updating Episode 87 Reward -20.0 Average Score: -20.22 Epsilon: 0.6465587967553006target updating Episode 91 Reward -20.0 Average Score: -20.21 Epsilon: 0.6337242817644086target updating Episode 95 Reward -21.0 Average Score: -20.18 
Epsilon: 0.6211445383053219target updating Episode 98 Reward -21.0 Average Score: -20.16 Epsilon: 0.6118738784280476target updating Episode 100 Average Score: -20.16 Epsilon: 0.6057704364907278 Episode 102 Reward -20.0 Average Score: -20.15 Epsilon: 0.5997278763867329target updating Episode 106 Reward -20.0 Average Score: -20.11 Epsilon: 0.5878229785513479target updating Episode 109 Reward -21.0 Average Score: -20.06 Epsilon: 0.5790496471185967target updating Episode 111 Reward -16.0 Average Score: -19.99 Epsilon: 0.5732736268885887target updating Episode 115 Reward -20.0 Average Score: -19.97 Epsilon: 0.5618938591163328target updating Episode 117 Reward -18.0 Average Score: -19.91 Epsilon: 0.5562889678716474target updating Episode 120 Reward -18.0 Average Score: -19.83 Epsilon: 0.547986285490042target updating Episode 124 Reward -19.0 Average Score: -19.79 Epsilon: 0.5371084840724134target updating Episode 126 Reward -19.0 Average Score: -19.71 Epsilon: 0.531750826943791target updating Episode 129 Reward -18.0 Average Score: -19.68 Epsilon: 0.5238143793828016target updating Episode 132 Reward -17.0 Average Score: -19.64 Epsilon: 0.5159963842937159target updating Episode 134 Reward -18.0 Average Score: -19.58 Epsilon: 0.510849320360386target updating Episode 137 Reward -15.0 Average Score: -19.49 Epsilon: 0.5032248303978422target updating Episode 139 Reward -21.0 Average Score: -19.48 Epsilon: 0.4982051627146237target updating Episode 142 Reward -17.0 Average Score: -19.45 Epsilon: 0.4907693883854626target updating Episode 145 Reward -18.0 Average Score: -19.31 Epsilon: 0.483444593917636target updating Episode 147 Reward -19.0 Average Score: -19.30 Epsilon: 0.47862223409330756target updating Episode 150 Reward -17.0 Average Score: -19.22 Epsilon: 0.47147873742168567target updating Episode 152 Reward -21.0 Average Score: -19.23 Epsilon: 0.46677573701590436target updating Episode 155 Reward -19.0 Average Score: -19.22 Epsilon: 0.4598090507939749target updating 
Episode 157 Reward -20.0 Average Score: -19.19 Epsilon: 0.45522245551230495target updating Episode 159 Reward -21.0 Average Score: -19.20 Epsilon: 0.4506816115185697target updating Episode 162 Reward -20.0 Average Score: -19.16 Epsilon: 0.4439551321314536target updating Episode 164 Reward -19.0 Average Score: -19.11 Epsilon: 0.43952667968844233target updating Episode 167 Reward -18.0 Average Score: -19.07 Epsilon: 0.43296668905325736target updating Episode 169 Reward -18.0 Average Score: -19.05 Epsilon: 0.4286478463299511target updating Episode 171 Reward -17.0 Average Score: -18.98 Epsilon: 0.42437208406280985target updating Episode 174 Reward -20.0 Average Score: -18.98 Epsilon: 0.4180382776616619target updating Episode 176 Reward -17.0 Average Score: -18.92 Epsilon: 0.41386834584198684target updating Episode 178 Reward -17.0 Average Score: -18.84 Epsilon: 0.40974000909221303target updating Episode 181 Reward -17.0 Average Score: -18.79 Epsilon: 0.4036245882390106target updating Episode 183 Reward -17.0 Average Score: -18.75 Epsilon: 0.3995984329713264target updating Episode 185 Reward -18.0 Average Score: -18.70 Epsilon: 0.39561243860243744target updating Episode 187 Reward -20.0 Average Score: -18.69 Epsilon: 0.39166620452737816target updating Episode 190 Reward -20.0 Average Score: -18.69 Epsilon: 0.3858205374665315target updating Episode 192 Reward -17.0 Average Score: -18.66 Epsilon: 0.3819719776053028target updating Episode 194 Reward -16.0 Average Score: -18.58 Epsilon: 0.37816180712868996target updating Episode 196 Reward -16.0 Average Score: -18.50 Epsilon: 0.3743896431025813target updating Episode 198 Reward -15.0 Average Score: -18.44 Epsilon: 0.3706551064126331target updating Episode 200 Average Score: -18.39 Epsilon: 0.3669578217261671 target updating Episode 202 Reward -20.0 Average Score: -18.34 Epsilon: 0.3632974174544486target updating Episode 204 Reward -16.0 Average Score: -18.27 Epsilon: 0.3596735257153405target updating Episode 205 Reward 
-14.0 Average Score: -18.22 Epsilon: 0.3578751580867638target updating Episode 208 Reward -19.0 Average Score: -18.21 Epsilon: 0.35253382661792404target updating Episode 210 Reward -19.0 Average Score: -18.23 Epsilon: 0.34901730169741024target updating Episode 212 Reward -19.0 Average Score: -18.25 Epsilon: 0.3455358541129786target updating Episode 214 Reward -16.0 Average Score: -18.14 Epsilon: 0.3420891339682016target updating Episode 216 Reward -19.0 Average Score: -18.13 Epsilon: 0.3386767948568688target updating Episode 218 Reward -16.0 Average Score: -18.10 Epsilon: 0.3352984938281715target updating Episode 220 Reward -20.0 Average Score: -18.12 Epsilon: 0.33195389135223546target updating Episode 222 Reward -14.0 Average Score: -18.06 Epsilon: 0.32864265128599696target updating Episode 224 Reward -17.0 Average Score: -18.03 Epsilon: 0.3253644408394192target updating Episode 225 Reward -15.0 Average Score: -18.03 Epsilon: 0.3237376186352221target updating Episode 227 Reward -20.0 Average Score: -18.00 Epsilon: 0.32050833588933575target updating Episode 229 Reward -16.0 Average Score: -17.95 Epsilon: 0.3173112652388396target updating Episode 231 Reward -17.0 Average Score: -17.85 Epsilon: 0.3141460853680822target updating Episode 233 Reward -16.0 Average Score: -17.83 Epsilon: 0.31101247816653554target updating Episode 235 Reward -18.0 Average Score: -17.79 Epsilon: 0.3079101286968243target updating Episode 236 Reward -17.0 Average Score: -17.77 Epsilon: 0.3063705780533402target updating Episode 238 Reward -14.0 Average Score: -17.70 Epsilon: 0.3033145315372582target updating Episode 240 Reward -18.0 Average Score: -17.62 Epsilon: 0.30028896908517405target updating Episode 242 Reward -16.0 Average Score: -17.60 Epsilon: 0.29729358661854943target updating Episode 244 Reward -20.0 Average Score: -17.70 Epsilon: 0.2943280830920294target updating Episode 246 Reward -15.0 Average Score: -17.68 Epsilon: 0.2913921604631864target updating Episode 248 Reward -17.0 
Average Score: -17.66 Epsilon: 0.2884855236625661target updating Episode 251 Reward -20.0 Average Score: -17.71 Epsilon: 0.28417984116121187target updating Episode 252 Reward -18.0 Average Score: -17.68 Epsilon: 0.2827589419554058target updating Episode 254 Reward -16.0 Average Score: -17.61 Epsilon: 0.2799384215094006target updating Episode 257 Reward -17.0 Average Score: -17.57 Epsilon: 0.2757603055760701target updating Episode 258 Reward -17.0 Average Score: -17.54 Epsilon: 0.2743815040481898target updating Episode 260 Reward -20.0 Average Score: -17.52 Epsilon: 0.27164454854530906target updating Episode 262 Reward -16.0 Average Score: -17.49 Epsilon: 0.2689348941735696target updating Episode 263 Reward -12.0 Average Score: -17.45 Epsilon: 0.26759021970270175target updating Episode 265 Reward -17.0 Average Score: -17.37 Epsilon: 0.2649210072611673target updating Episode 267 Reward -18.0 Average Score: -17.35 Epsilon: 0.26227842021373715target updating Episode 268 Reward -16.0 Average Score: -17.33 Epsilon: 0.2609670281126685target updating Episode 270 Reward -16.0 Average Score: -17.30 Epsilon: 0.2583638820072446target updating Episode 272 Reward -18.0 Average Score: -17.27 Epsilon: 0.25578670228422234target updating Episode 274 Reward -15.0 Average Score: -17.18 Epsilon: 0.2532352299289372target updating Episode 276 Reward -18.0 Average Score: -17.16 Epsilon: 0.2507092085103961target updating Episode 278 Reward -19.0 Average Score: -17.20 Epsilon: 0.24820838415550486target updating Episode 280 Reward -14.0 Average Score: -17.12 Epsilon: 0.2457325055235537target updating Episode 281 Reward -15.0 Average Score: -17.10 Epsilon: 0.24450384299593592target updating Episode 283 Reward -21.0 Average Score: -17.03 Epsilon: 0.24206491716205145target updating Episode 285 Reward -17.0 Average Score: -16.98 Epsilon: 0.23965031961336target updating Episode 287 Reward -17.0 Average Score: -16.92 Epsilon: 0.23725980767521673target updating Episode 289 Reward -18.0 Average 
Score: -16.81 Epsilon: 0.23489314109365644target updating Episode 290 Reward -12.0 Average Score: -16.73 Epsilon: 0.23371867538818816target updating Episode 292 Reward -16.0 Average Score: -16.69 Epsilon: 0.231387331601191target updating Episode 294 Reward -15.0 Average Score: -16.71 Epsilon: 0.2290792429684691target updating Episode 296 Reward -16.0 Average Score: -16.72 Epsilon: 0.22679417751985861target updating Episode 298 Reward -15.0 Average Score: -16.66 Epsilon: 0.22453190559909803target updating Episode 299 Reward -14.0 Average Score: -16.64 Epsilon: 0.22340924607110255target updating Episode 300 Average Score: -16.61 Epsilon: 0.22229219984074702 Episode 301 Reward -17.0 Average Score: -16.63 Epsilon: 0.2211807388415433target updating Episode 302 Reward -17.0 Average Score: -16.60 Epsilon: 0.22007483514733558target updating Episode 304 Reward -13.0 Average Score: -16.59 Epsilon: 0.2178795886667409target updating Episode 305 Reward -14.0 Average Score: -16.59 Epsilon: 0.2167901907234072target updating Episode 307 Reward -19.0 Average Score: -16.57 Epsilon: 0.21462770857094118target updating Episode 309 Reward -18.0 Average Score: -16.54 Epsilon: 0.21248679717794605target updating Episode 310 Reward -15.0 Average Score: -16.50 Epsilon: 0.21142436319205632target updating Episode 312 Reward -14.0 Average Score: -16.44 Epsilon: 0.20931540516921554target updating Episode 313 Reward -18.0 Average Score: -16.49 Epsilon: 0.20826882814336947target updating Episode 314 Reward -19.0 Average Score: -16.52 Epsilon: 0.20722748400265262target updating Episode 316 Reward -17.0 Average Score: -16.45 Epsilon: 0.20516038984972615target updating Episode 317 Reward -14.0 Average Score: -16.41 Epsilon: 0.2041345879004775target updating Episode 319 Reward -16.0 Average Score: -16.42 Epsilon: 0.20209834538617025target updating Episode 321 Reward -12.0 Average Score: -16.33 Epsilon: 0.2000824143909432target updating Episode 322 Reward -16.0 Average Score: -16.35 Epsilon: 
0.19908200231898848target updating