{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"name": "Pong_Deep_Q_Learning_with_Fixed_targets.ipynb",
"provenance": [],
"include_colab_link": true
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"accelerator": "GPU"
},
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "view-in-github",
"colab_type": "text"
},
"source": [
"<a href=\"https://colab.research.google.com/github/PacktPublishing/Hands-On-Computer-Vision-with-PyTorch/blob/master/Chapter16/Pong_Deep_Q_Learning_with_Fixed_targets.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
]
},
{
"cell_type": "code",
"metadata": {
"id": "KeIaR6oerFRN"
},
"source": [
"import gym\n",
"import numpy as np\n",
"import cv2\n",
"import random\n",
"from collections import namedtuple, deque\n",
"\n",
"import matplotlib.pyplot as plt\n",
"%matplotlib inline\n",
"\n",
"import torch\n",
"import torch.nn as nn\n",
"import torch.nn.functional as F\n",
"import torch.optim as optim\n",
"\n",
"device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
"\n",
"env = gym.make('PongDeterministic-v0')\n"
],
"execution_count": null,
"outputs": []
},
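{
"cell_type": "markdown",
"metadata": {},
"source": [
"A quick inspection cell (an added sketch, not part of the original pipeline): Pong exposes raw 210x160x3 RGB frames and a discrete action space. The `get_action_meanings()` helper on the unwrapped Atari env lists what each action index does; the exact printed representations vary with the installed gym version."
]
},
{
"cell_type": "code",
"metadata": {},
"source": [
"# Inspect the raw observation and action spaces.\n",
"print(env.observation_space) # raw RGB frames, shape (210, 160, 3)\n",
"print(env.action_space) # Discrete(6) for Pong\n",
"print(env.unwrapped.get_action_meanings()) # e.g. ['NOOP', 'FIRE', 'RIGHT', 'LEFT', 'RIGHTFIRE', 'LEFTFIRE']"
],
"execution_count": null,
"outputs": []
},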
{
"cell_type": "code",
"metadata": {
"id": "t44gFycfrIeT"
},
"source": [
"state_size = env.observation_space.shape[0] # raw frame height (210); unused by the CNN, kept for the Agent interface\n",
"action_size = env.action_space.n"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "eewhAu_MrJ_P"
},
"source": [
"def preprocess_frame(frame):\n",
"    # Crop the score bar and bottom border, downsample by 2 in both axes,\n",
"    # subtract the background colour, and average the RGB channels to get\n",
"    # a normalised 80x80 grayscale image.\n",
"    bkg_color = np.array([144, 72, 17])\n",
"    img = np.mean(frame[34:-16:2, ::2] - bkg_color, axis=-1) / 255.\n",
"    return img"
],
"execution_count": null,
"outputs": []
},
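{
"cell_type": "markdown",
"metadata": {},
"source": [
"A sanity check of the preprocessing (a minimal added sketch, assuming the standard 210x160x3 Atari frame): cropping rows 34 to -16 keeps 160 rows, and the stride-2 slicing halves both axes, so the result should be an 80x80 grayscale array."
]
},
{
"cell_type": "code",
"metadata": {},
"source": [
"# Verify that preprocess_frame turns a raw Pong frame into an 80x80 array.\n",
"raw_frame = env.reset()\n",
"processed = preprocess_frame(raw_frame)\n",
"print(raw_frame.shape, '->', processed.shape) # expected: (210, 160, 3) -> (80, 80)\n",
"plt.imshow(processed, cmap='gray')\n",
"plt.title('preprocessed frame')\n",
"plt.show()"
],
"execution_count": null,
"outputs": []
},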
{
"cell_type": "code",
"metadata": {
"id": "F2JLUoEcrMY0"
},
"source": [
"def stack_frames(stacked_frames, state, is_new_episode):\n",
"    # Preprocess frame\n",
"    frame = preprocess_frame(state)\n",
"    stack_size = 4\n",
"    if is_new_episode:\n",
"        # Clear our stacked_frames\n",
"        stacked_frames = deque([np.zeros((80,80), dtype=np.uint8) for i in range(stack_size)], maxlen=stack_size)\n",
"        # Because we're in a new episode, copy the same frame 4x\n",
"        for i in range(stack_size):\n",
"            stacked_frames.append(frame)\n",
"    else:\n",
"        # Append frame to deque; the oldest frame is removed automatically\n",
"        stacked_frames.append(frame)\n",
"    # Build the stacked state (the first dimension indexes the frames)\n",
"    stacked_state = np.stack(stacked_frames, axis=2).transpose(2, 0, 1)\n",
"    return stacked_state, stacked_frames\n",
"\n",
"class DQNetwork(nn.Module):\n",
"    def __init__(self, state_size, action_size):\n",
"        # state_size is kept for interface symmetry; the network always\n",
"        # expects a 4x80x80 stack of frames as input.\n",
"        super(DQNetwork, self).__init__()\n",
"        self.conv1 = nn.Conv2d(4, 32, (8, 8), stride=4)\n",
"        self.conv2 = nn.Conv2d(32, 64, (4, 4), stride=2)\n",
"        self.conv3 = nn.Conv2d(64, 64, (3, 3), stride=1)\n",
"        self.flatten = nn.Flatten()\n",
"        self.fc1 = nn.Linear(2304, 512) # 64 channels * 6 * 6 after the convs\n",
"        self.fc2 = nn.Linear(512, action_size)\n",
"\n",
"    def forward(self, state):\n",
"        x = F.relu(self.conv1(state))\n",
"        x = F.relu(self.conv2(x))\n",
"        x = F.relu(self.conv3(x))\n",
"        x = self.flatten(x)\n",
"        x = F.relu(self.fc1(x))\n",
"        x = self.fc2(x)\n",
"        return x"
],
"execution_count": null,
"outputs": []
},
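{
"cell_type": "markdown",
"metadata": {},
"source": [
"Why `fc1` expects 2304 inputs: for an 80x80 frame stack, conv1 (8x8, stride 4) yields 19x19, conv2 (4x4, stride 2) yields 8x8, and conv3 (3x3, stride 1) yields 6x6, so the flattened feature map has 64 * 6 * 6 = 2304 values. The cell below is a small verification sketch (not part of the original pipeline) that traces a dummy batch through the network."
]
},
{
"cell_type": "code",
"metadata": {},
"source": [
"# Trace a dummy stacked state through DQNetwork to confirm the feature sizes.\n",
"net = DQNetwork(state_size, action_size).to(device)\n",
"dummy = torch.zeros(1, 4, 80, 80).to(device) # one batch of 4 stacked 80x80 frames\n",
"with torch.no_grad():\n",
"    x = F.relu(net.conv1(dummy)) # -> (1, 32, 19, 19)\n",
"    x = F.relu(net.conv2(x)) # -> (1, 64, 8, 8)\n",
"    x = F.relu(net.conv3(x)) # -> (1, 64, 6, 6)\n",
"    print(net.flatten(x).shape) # torch.Size([1, 2304])\n",
"    print(net(dummy).shape) # torch.Size([1, action_size]): one Q-value per action"
],
"execution_count": null,
"outputs": []
},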
{
"cell_type": "code",
"metadata": {
"id": "4rNQ6IthrUNB"
},
"source": [
"class Agent():\n",
"    def __init__(self, state_size, action_size):\n",
"        self.state_size = state_size\n",
"        self.action_size = action_size\n",
"        self.seed = random.seed(0)\n",
"\n",
"        ## hyperparameters\n",
"        self.buffer_size = 10000\n",
"        self.batch_size = 32\n",
"        self.gamma = 0.99\n",
"        self.lr = 0.0001\n",
"        self.update_every = 4 # learn every 4 environment steps\n",
"        self.update_every_target = 1000 # sync the target network every 1000 learn steps\n",
"        self.learn_every_target_counter = 0\n",
"        # Q-Network\n",
"        self.local = DQNetwork(state_size, action_size).to(device)\n",
"        self.target = DQNetwork(state_size, action_size).to(device)\n",
"        self.optimizer = optim.Adam(self.local.parameters(), lr=self.lr)\n",
"\n",
"        # Replay memory\n",
"        self.memory = deque(maxlen=self.buffer_size)\n",
"        self.experience = namedtuple(\"Experience\", field_names=[\"state\", \"action\", \"reward\", \"next_state\", \"done\"])\n",
"        # Initialize time step (for updating every few steps)\n",
"        self.t_step = 0\n",
"\n",
"    def step(self, state, action, reward, next_state, done):\n",
"        # Save experience in replay memory\n",
"        self.memory.append(self.experience(state[None], action, reward, next_state[None], done))\n",
"\n",
"        # Learn every update_every time steps\n",
"        self.t_step = (self.t_step + 1) % self.update_every\n",
"        if self.t_step == 0:\n",
"            # If enough samples are available in memory, get a random subset and learn\n",
"            if len(self.memory) > self.batch_size:\n",
"                experiences = self.sample_experiences()\n",
"                self.learn(experiences, self.gamma)\n",
"\n",
"    def act(self, state, eps=0.):\n",
"        # Epsilon-greedy action selection\n",
"        if random.random() > eps:\n",
"            state = torch.from_numpy(state).float().unsqueeze(0).to(device)\n",
"            self.local.eval()\n",
"            with torch.no_grad():\n",
"                action_values = self.local(state)\n",
"            self.local.train()\n",
"            return np.argmax(action_values.cpu().data.numpy())\n",
"        else:\n",
"            return random.choice(np.arange(self.action_size))\n",
"\n",
"    def learn(self, experiences, gamma):\n",
"        self.learn_every_target_counter += 1\n",
"        states, actions, rewards, next_states, dones = experiences\n",
"        # Get expected Q values from the local model\n",
"        Q_expected = self.local(states).gather(1, actions)\n",
"\n",
"        # Get max predicted Q values (for next states) from the target model\n",
"        Q_targets_next = self.target(next_states).detach().max(1)[0].unsqueeze(1)\n",
"        # Compute Q targets for current states\n",
"        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))\n",
"\n",
"        # Compute loss\n",
"        loss = F.mse_loss(Q_expected, Q_targets)\n",
"\n",
"        # Minimize the loss\n",
"        self.optimizer.zero_grad()\n",
"        loss.backward()\n",
"        self.optimizer.step()\n",
"\n",
"        # ------------------- update target network ------------------- #\n",
"        if self.learn_every_target_counter % self.update_every_target == 0:\n",
"            self.target_update()\n",
"\n",
"    def target_update(self):\n",
"        print('target updating')\n",
"        self.target.load_state_dict(self.local.state_dict())\n",
"\n",
"    def sample_experiences(self):\n",
"        experiences = random.sample(self.memory, k=self.batch_size)\n",
"        states = torch.from_numpy(np.vstack([e.state for e in experiences if e is not None])).float().to(device)\n",
"        actions = torch.from_numpy(np.vstack([e.action for e in experiences if e is not None])).long().to(device)\n",
"        rewards = torch.from_numpy(np.vstack([e.reward for e in experiences if e is not None])).float().to(device)\n",
"        next_states = torch.from_numpy(np.vstack([e.next_state for e in experiences if e is not None])).float().to(device)\n",
"        dones = torch.from_numpy(np.vstack([e.done for e in experiences if e is not None]).astype(np.uint8)).float().to(device)\n",
"        return (states, actions, rewards, next_states, dones)\n",
"\n",
"agent = Agent(state_size, action_size)"
],
"execution_count": null,
"outputs": []
},
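{
"cell_type": "markdown",
"metadata": {},
"source": [
"A quick smoke test of the agent API (an added sketch), using a throwaway `Agent` instance so the real replay buffer stays clean: `act` should return a valid action index under epsilon-greedy selection, and `step` should store one `(state, action, reward, next_state, done)` transition."
]
},
{
"cell_type": "code",
"metadata": {},
"source": [
"# Smoke-test act() and step() with random 4x80x80 states.\n",
"test_agent = Agent(state_size, action_size)\n",
"s = np.random.rand(4, 80, 80).astype(np.float32)\n",
"a = test_agent.act(s, eps=0.5)\n",
"print('sampled action:', a) # an integer in [0, action_size)\n",
"ns = np.random.rand(4, 80, 80).astype(np.float32)\n",
"test_agent.step(s, a, 0.0, ns, False)\n",
"print('replay buffer size:', len(test_agent.memory)) # expected: 1"
],
"execution_count": null,
"outputs": []
},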
{
"cell_type": "code",
"metadata": {
"id": "Y2ZVDl2yrrZX"
},
"source": [
"n_episodes = 5000\n",
"max_t = 5000\n",
"eps_start = 1.0\n",
"eps_end = 0.02\n",
"eps_decay = 0.995\n",
"scores = [] # list containing scores from each episode\n",
"scores_window = deque(maxlen=100) # last 100 scores\n",
"eps = eps_start\n",
"stack_size = 4\n",
"stacked_frames = deque([np.zeros((80,80), dtype=np.uint8) for i in range(stack_size)], maxlen=stack_size)"
],
"execution_count": null,
"outputs": []
},
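{
"cell_type": "markdown",
"metadata": {},
"source": [
"With `eps_decay=0.995`, epsilon after n episodes is `0.995**n`, so it reaches the `eps_end` floor of 0.02 after about log(0.02)/log(0.995), roughly 780 episodes. The sketch below previews the schedule; it does not affect training."
]
},
{
"cell_type": "code",
"metadata": {},
"source": [
"# Preview the epsilon schedule: eps_n = max(eps_end, eps_start * eps_decay**n).\n",
"episodes_to_floor = np.log(eps_end) / np.log(eps_decay)\n",
"print('episodes until the eps_end floor: ~%d' % episodes_to_floor) # ~780\n",
"schedule = [max(eps_end, eps_start * eps_decay**n) for n in range(n_episodes)]\n",
"plt.plot(schedule)\n",
"plt.xlabel('episode')\n",
"plt.ylabel('epsilon')\n",
"plt.title('epsilon decay schedule')\n",
"plt.show()"
],
"execution_count": null,
"outputs": []
},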
{
"cell_type": "code",
"metadata": {
"id": "iACTglwrrs1L",
"outputId": "1009afc5-f3fc-4c0e-b77f-8be3dca9253a",
"colab": {
"base_uri": "https://localhost:8080/"
}
},
"source": [
"# Training loop (assumes the classic Gym API: reset() returns an observation,\n",
"# step() returns (obs, reward, done, info)).\n",
"for i_episode in range(1, n_episodes+1):\n",
"    state = env.reset()\n",
"    state, frames = stack_frames(stacked_frames, state, True)\n",
"    score = 0\n",
"    for i in range(max_t):\n",
"        action = agent.act(state, eps)\n",
"        next_state, reward, done, _ = env.step(action)\n",
"        next_state, frames = stack_frames(frames, next_state, False)\n",
"        agent.step(state, action, reward, next_state, done)\n",
"        state = next_state\n",
"        score += reward\n",
"        if done:\n",
"            break\n",
"    scores_window.append(score) # save most recent score\n",
"    scores.append(score) # save most recent score\n",
"    eps = max(eps_end, eps_decay*eps) # decrease epsilon\n",
"    print('\\rEpisode {}\\tReward {} \\tAverage Score: {:.2f} \\tEpsilon: {}'.format(i_episode, score, np.mean(scores_window), eps), end=\"\")\n",
"    if i_episode % 100 == 0:\n",
"        print('\\rEpisode {}\\tAverage Score: {:.2f} \\tEpsilon: {}'.format(i_episode, np.mean(scores_window), eps))"
],
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"text": [
"Episode 4\tReward -20.0 \tAverage Score: -20.25 \tEpsilon: 0.9801495006250001target updating\n",
"Episode 8\tReward -21.0 \tAverage Score: -20.62 \tEpsilon: 0.960693043575437target updating\n",
"Episode 13\tReward -21.0 \tAverage Score: -20.31 \tEpsilon: 0.9369146928798039target updating\n",
"Episode 17\tReward -21.0 \tAverage Score: -20.47 \tEpsilon: 0.918316468354365target updating\n",
"Episode 22\tReward -18.0 \tAverage Score: -20.36 \tEpsilon: 0.8955869907338783target updating\n",
"Episode 26\tReward -21.0 \tAverage Score: -20.46 \tEpsilon: 0.8778091417340573target updating\n",
"Episode 30\tReward -20.0 \tAverage Score: -20.47 \tEpsilon: 0.8603841919146962target updating\n",
"Episode 35\tReward -20.0 \tAverage Score: -20.49 \tEpsilon: 0.8390886103705794target updating\n",
"Episode 39\tReward -20.0 \tAverage Score: -20.44 \tEpsilon: 0.8224322824348486target updating\n",
"Episode 43\tReward -21.0 \tAverage Score: -20.42 \tEpsilon: 0.8061065909263957target updating\n",
"Episode 48\tReward -21.0 \tAverage Score: -20.42 \tEpsilon: 0.7861544476842928target updating\n",
"Episode 52\tReward -20.0 \tAverage Score: -20.38 \tEpsilon: 0.7705488893118823target updating\n",
"Episode 56\tReward -19.0 \tAverage Score: -20.30 \tEpsilon: 0.7552531090661897target updating\n",
"Episode 60\tReward -20.0 \tAverage Score: -20.30 \tEpsilon: 0.7402609576967045target updating\n",
"Episode 64\tReward -20.0 \tAverage Score: -20.31 \tEpsilon: 0.7255664080186093target updating\n",
"Episode 68\tReward -18.0 \tAverage Score: -20.26 \tEpsilon: 0.7111635524897149target updating\n",
"Episode 72\tReward -19.0 \tAverage Score: -20.24 \tEpsilon: 0.697046600835495target updating\n",
"Episode 76\tReward -20.0 \tAverage Score: -20.24 \tEpsilon: 0.6832098777212641target updating\n",
"Episode 80\tReward -20.0 \tAverage Score: -20.26 \tEpsilon: 0.6696478204705644target updating\n",
"Episode 83\tReward -20.0 \tAverage Score: -20.24 \tEpsilon: 0.6596532430440636target updating\n",
"Episode 87\tReward -20.0 \tAverage Score: -20.22 \tEpsilon: 0.6465587967553006target updating\n",
"Episode 91\tReward -20.0 \tAverage Score: -20.21 \tEpsilon: 0.6337242817644086target updating\n",
"Episode 95\tReward -21.0 \tAverage Score: -20.18 \tEpsilon: 0.6211445383053219target updating\n",
"Episode 98\tReward -21.0 \tAverage Score: -20.16 \tEpsilon: 0.6118738784280476target updating\n",
"Episode 100\tAverage Score: -20.16 \tEpsilon: 0.6057704364907278\n",
"Episode 102\tReward -20.0 \tAverage Score: -20.15 \tEpsilon: 0.5997278763867329target updating\n",
"Episode 106\tReward -20.0 \tAverage Score: -20.11 \tEpsilon: 0.5878229785513479target updating\n",
"Episode 109\tReward -21.0 \tAverage Score: -20.06 \tEpsilon: 0.5790496471185967target updating\n",
"Episode 111\tReward -16.0 \tAverage Score: -19.99 \tEpsilon: 0.5732736268885887target updating\n",
"Episode 115\tReward -20.0 \tAverage Score: -19.97 \tEpsilon: 0.5618938591163328target updating\n",
"Episode 117\tReward -18.0 \tAverage Score: -19.91 \tEpsilon: 0.5562889678716474target updating\n",
"Episode 120\tReward -18.0 \tAverage Score: -19.83 \tEpsilon: 0.547986285490042target updating\n",
"Episode 124\tReward -19.0 \tAverage Score: -19.79 \tEpsilon: 0.5371084840724134target updating\n",
"Episode 126\tReward -19.0 \tAverage Score: -19.71 \tEpsilon: 0.531750826943791target updating\n",
"Episode 129\tReward -18.0 \tAverage Score: -19.68 \tEpsilon: 0.5238143793828016target updating\n",
"Episode 132\tReward -17.0 \tAverage Score: -19.64 \tEpsilon: 0.5159963842937159target updating\n",
"Episode 134\tReward -18.0 \tAverage Score: -19.58 \tEpsilon: 0.510849320360386target updating\n",
"Episode 137\tReward -15.0 \tAverage Score: -19.49 \tEpsilon: 0.5032248303978422target updating\n",
"Episode 139\tReward -21.0 \tAverage Score: -19.48 \tEpsilon: 0.4982051627146237target updating\n",
"Episode 142\tReward -17.0 \tAverage Score: -19.45 \tEpsilon: 0.4907693883854626target updating\n",
"Episode 145\tReward -18.0 \tAverage Score: -19.31 \tEpsilon: 0.483444593917636target updating\n",
"Episode 147\tReward -19.0 \tAverage Score: -19.30 \tEpsilon: 0.47862223409330756target updating\n",
"Episode 150\tReward -17.0 \tAverage Score: -19.22 \tEpsilon: 0.47147873742168567target updating\n",
"Episode 152\tReward -21.0 \tAverage Score: -19.23 \tEpsilon: 0.46677573701590436target updating\n",
"Episode 155\tReward -19.0 \tAverage Score: -19.22 \tEpsilon: 0.4598090507939749target updating\n",
"Episode 157\tReward -20.0 \tAverage Score: -19.19 \tEpsilon: 0.45522245551230495target updating\n",
"Episode 159\tReward -21.0 \tAverage Score: -19.20 \tEpsilon: 0.4506816115185697target updating\n",
"Episode 162\tReward -20.0 \tAverage Score: -19.16 \tEpsilon: 0.4439551321314536target updating\n",
"Episode 164\tReward -19.0 \tAverage Score: -19.11 \tEpsilon: 0.43952667968844233target updating\n",
"Episode 167\tReward -18.0 \tAverage Score: -19.07 \tEpsilon: 0.43296668905325736target updating\n",
"Episode 169\tReward -18.0 \tAverage Score: -19.05 \tEpsilon: 0.4286478463299511target updating\n",
"Episode 171\tReward -17.0 \tAverage Score: -18.98 \tEpsilon: 0.42437208406280985target updating\n",
"Episode 174\tReward -20.0 \tAverage Score: -18.98 \tEpsilon: 0.4180382776616619target updating\n",
"Episode 176\tReward -17.0 \tAverage Score: -18.92 \tEpsilon: 0.41386834584198684target updating\n",
"Episode 178\tReward -17.0 \tAverage Score: -18.84 \tEpsilon: 0.40974000909221303target updating\n",
"Episode 181\tReward -17.0 \tAverage Score: -18.79 \tEpsilon: 0.4036245882390106target updating\n",
"Episode 183\tReward -17.0 \tAverage Score: -18.75 \tEpsilon: 0.3995984329713264target updating\n",
"Episode 185\tReward -18.0 \tAverage Score: -18.70 \tEpsilon: 0.39561243860243744target updating\n",
"Episode 187\tReward -20.0 \tAverage Score: -18.69 \tEpsilon: 0.39166620452737816target updating\n",
"Episode 190\tReward -20.0 \tAverage Score: -18.69 \tEpsilon: 0.3858205374665315target updating\n",
"Episode 192\tReward -17.0 \tAverage Score: -18.66 \tEpsilon: 0.3819719776053028target updating\n",
"Episode 194\tReward -16.0 \tAverage Score: -18.58 \tEpsilon: 0.37816180712868996target updating\n",
"Episode 196\tReward -16.0 \tAverage Score: -18.50 \tEpsilon: 0.3743896431025813target updating\n",
"Episode 198\tReward -15.0 \tAverage Score: -18.44 \tEpsilon: 0.3706551064126331target updating\n",
"Episode 200\tAverage Score: -18.39 \tEpsilon: 0.3669578217261671\n",
"target updating\n",
"Episode 202\tReward -20.0 \tAverage Score: -18.34 \tEpsilon: 0.3632974174544486target updating\n",
"Episode 204\tReward -16.0 \tAverage Score: -18.27 \tEpsilon: 0.3596735257153405target updating\n",
"Episode 205\tReward -14.0 \tAverage Score: -18.22 \tEpsilon: 0.3578751580867638target updating\n",
"Episode 208\tReward -19.0 \tAverage Score: -18.21 \tEpsilon: 0.35253382661792404target updating\n",
"Episode 210\tReward -19.0 \tAverage Score: -18.23 \tEpsilon: 0.34901730169741024target updating\n",
"Episode 212\tReward -19.0 \tAverage Score: -18.25 \tEpsilon: 0.3455358541129786target updating\n",
"Episode 214\tReward -16.0 \tAverage Score: -18.14 \tEpsilon: 0.3420891339682016target updating\n",
"Episode 216\tReward -19.0 \tAverage Score: -18.13 \tEpsilon: 0.3386767948568688target updating\n",
"Episode 218\tReward -16.0 \tAverage Score: -18.10 \tEpsilon: 0.3352984938281715target updating\n",
"Episode 220\tReward -20.0 \tAverage Score: -18.12 \tEpsilon: 0.33195389135223546target updating\n",
"Episode 222\tReward -14.0 \tAverage Score: -18.06 \tEpsilon: 0.32864265128599696target updating\n",
"Episode 224\tReward -17.0 \tAverage Score: -18.03 \tEpsilon: 0.3253644408394192target updating\n",
"Episode 225\tReward -15.0 \tAverage Score: -18.03 \tEpsilon: 0.3237376186352221target updating\n",
"Episode 227\tReward -20.0 \tAverage Score: -18.00 \tEpsilon: 0.32050833588933575target updating\n",
"Episode 229\tReward -16.0 \tAverage Score: -17.95 \tEpsilon: 0.3173112652388396target updating\n",
"Episode 231\tReward -17.0 \tAverage Score: -17.85 \tEpsilon: 0.3141460853680822target updating\n",
"Episode 233\tReward -16.0 \tAverage Score: -17.83 \tEpsilon: 0.31101247816653554target updating\n",
"Episode 235\tReward -18.0 \tAverage Score: -17.79 \tEpsilon: 0.3079101286968243target updating\n",
"Episode 236\tReward -17.0 \tAverage Score: -17.77 \tEpsilon: 0.3063705780533402target updating\n",
"Episode 238\tReward -14.0 \tAverage Score: -17.70 \tEpsilon: 0.3033145315372582target updating\n",
"Episode 240\tReward -18.0 \tAverage Score: -17.62 \tEpsilon: 0.30028896908517405target updating\n",
"Episode 242\tReward -16.0 \tAverage Score: -17.60 \tEpsilon: 0.29729358661854943target updating\n",
"Episode 244\tReward -20.0 \tAverage Score: -17.70 \tEpsilon: 0.2943280830920294target updating\n",
"Episode 246\tReward -15.0 \tAverage Score: -17.68 \tEpsilon: 0.2913921604631864target updating\n",
"Episode 248\tReward -17.0 \tAverage Score: -17.66 \tEpsilon: 0.2884855236625661target updating\n",
"Episode 251\tReward -20.0 \tAverage Score: -17.71 \tEpsilon: 0.28417984116121187target updating\n",
"Episode 252\tReward -18.0 \tAverage Score: -17.68 \tEpsilon: 0.2827589419554058target updating\n",
"Episode 254\tReward -16.0 \tAverage Score: -17.61 \tEpsilon: 0.2799384215094006target updating\n",
"Episode 257\tReward -17.0 \tAverage Score: -17.57 \tEpsilon: 0.2757603055760701target updating\n",
"Episode 258\tReward -17.0 \tAverage Score: -17.54 \tEpsilon: 0.2743815040481898target updating\n",
"Episode 260\tReward -20.0 \tAverage Score: -17.52 \tEpsilon: 0.27164454854530906target updating\n",
"Episode 262\tReward -16.0 \tAverage Score: -17.49 \tEpsilon: 0.2689348941735696target updating\n",
"Episode 263\tReward -12.0 \tAverage Score: -17.45 \tEpsilon: 0.26759021970270175target updating\n",
"Episode 265\tReward -17.0 \tAverage Score: -17.37 \tEpsilon: 0.2649210072611673target updating\n",
"Episode 267\tReward -18.0 \tAverage Score: -17.35 \tEpsilon: 0.26227842021373715target updating\n",
"Episode 268\tReward -16.0 \tAverage Score: -17.33 \tEpsilon: 0.2609670281126685target updating\n",
"Episode 270\tReward -16.0 \tAverage Score: -17.30 \tEpsilon: 0.2583638820072446target updating\n",
"Episode 272\tReward -18.0 \tAverage Score: -17.27 \tEpsilon: 0.25578670228422234target updating\n",
"Episode 274\tReward -15.0 \tAverage Score: -17.18 \tEpsilon: 0.2532352299289372target updating\n",
"Episode 276\tReward -18.0 \tAverage Score: -17.16 \tEpsilon: 0.2507092085103961target updating\n",
"Episode 278\tReward -19.0 \tAverage Score: -17.20 \tEpsilon: 0.24820838415550486target updating\n",
"Episode 280\tReward -14.0 \tAverage Score: -17.12 \tEpsilon: 0.2457325055235537target updating\n",
"Episode 281\tReward -15.0 \tAverage Score: -17.10 \tEpsilon: 0.24450384299593592target updating\n",
"Episode 283\tReward -21.0 \tAverage Score: -17.03 \tEpsilon: 0.24206491716205145target updating\n",
"Episode 285\tReward -17.0 \tAverage Score: -16.98 \tEpsilon: 0.23965031961336target updating\n",
"Episode 287\tReward -17.0 \tAverage Score: -16.92 \tEpsilon: 0.23725980767521673target updating\n",
"Episode 289\tReward -18.0 \tAverage Score: -16.81 \tEpsilon: 0.23489314109365644target updating\n",
"Episode 290\tReward -12.0 \tAverage Score: -16.73 \tEpsilon: 0.23371867538818816target updating\n",
"Episode 292\tReward -16.0 \tAverage Score: -16.69 \tEpsilon: 0.231387331601191target updating\n",
"Episode 294\tReward -15.0 \tAverage Score: -16.71 \tEpsilon: 0.2290792429684691target updating\n",
"Episode 296\tReward -16.0 \tAverage Score: -16.72 \tEpsilon: 0.22679417751985861target updating\n",
"Episode 298\tReward -15.0 \tAverage Score: -16.66 \tEpsilon: 0.22453190559909803target updating\n",
"Episode 299\tReward -14.0 \tAverage Score: -16.64 \tEpsilon: 0.22340924607110255target updating\n",
"Episode 300\tAverage Score: -16.61 \tEpsilon: 0.22229219984074702\n",
"Episode 301\tReward -17.0 \tAverage Score: -16.63 \tEpsilon: 0.2211807388415433target updating\n",
"Episode 302\tReward -17.0 \tAverage Score: -16.60 \tEpsilon: 0.22007483514733558target updating\n",
"Episode 304\tReward -13.0 \tAverage Score: -16.59 \tEpsilon: 0.2178795886667409target updating\n",
"Episode 305\tReward -14.0 \tAverage Score: -16.59 \tEpsilon: 0.2167901907234072target updating\n",
"Episode 307\tReward -19.0 \tAverage Score: -16.57 \tEpsilon: 0.21462770857094118target updating\n",
"Episode 309\tReward -18.0 \tAverage Score: -16.54 \tEpsilon: 0.21248679717794605target updating\n",
"Episode 310\tReward -15.0 \tAverage Score: -16.50 \tEpsilon: 0.21142436319205632target updating\n",
"Episode 312\tReward -14.0 \tAverage Score: -16.44 \tEpsilon: 0.20931540516921554target updating\n",
"Episode 313\tReward -18.0 \tAverage Score: -16.49 \tEpsilon: 0.20826882814336947target updating\n",
"Episode 314\tReward -19.0 \tAverage Score: -16.52 \tEpsilon: 0.20722748400265262target updating\n",
"Episode 316\tReward -17.0 \tAverage Score: -16.45 \tEpsilon: 0.20516038984972615target updating\n",
"Episode 317\tReward -14.0 \tAverage Score: -16.41 \tEpsilon: 0.2041345879004775target updating\n",
"Episode 319\tReward -16.0 \tAverage Score: -16.42 \tEpsilon: 0.20209834538617025target updating\n",
"Episode 321\tReward -12.0 \tAverage Score: -16.33 \tEpsilon: 0.2000824143909432target updating\n",
"Episode 322\tReward -16.0 \tAverage Score: -16.35 \tEpsilon: 0.19908200231898848target updating\n"
],
"name": "stdout"
}
]
},
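{
"cell_type": "markdown",
"metadata": {},
"source": [
"A simple way to inspect the learning curve after (or during) training is to plot the collected `scores` list; the sketch below adds a 100-episode moving average, mirroring the `scores_window` statistic printed above."
]
},
{
"cell_type": "code",
"metadata": {},
"source": [
"# Plot per-episode scores plus a 100-episode moving average.\n",
"plt.plot(scores, alpha=0.4, label='score')\n",
"if len(scores) >= 100:\n",
"    moving_avg = np.convolve(scores, np.ones(100) / 100, mode='valid')\n",
"    plt.plot(range(99, len(scores)), moving_avg, label='100-episode average')\n",
"plt.xlabel('episode')\n",
"plt.ylabel('score')\n",
"plt.legend()\n",
"plt.show()"
],
"execution_count": null,
"outputs": []
},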
{
"cell_type": "code",
"metadata": {
"id": "BCNoMUGyrwrR"
},
"source": [
""
],
"execution_count": null,
"outputs": []
}
]
}