Computer_Vision/Chapter16/Building_Q_table.ipynb

import numpy as np
import gym
import random

# FrozenLake-v0 and the 4-tuple step() API used below assume the classic gym package;
# newer gym/gymnasium releases register FrozenLake-v1 and return extra values from
# step() and reset().
env = gym.make('FrozenLake-v0', is_slippery=False)
action_size = env.action_space.n
state_size = env.observation_space.n

# Q-table: one row per state, one column per action, initialized to zero
qtable = np.zeros((state_size, action_size))

episode_rewards = []
for i in range(10000):
    state = env.reset()
    total_rewards = 0
    for step in range(50):
        # Pure exploration: sample a random action
        action = env.action_space.sample()
        new_state, reward, done, info = env.step(action)
        # Tabular Q-learning update with learning rate 0.1 and discount factor 0.9
        qtable[state, action] += 0.1 * (reward + 0.9 * np.max(qtable[new_state, :]) - qtable[state, action])
        state = new_state
        total_rewards += reward
        if done:
            break
    episode_rewards.append(total_rewards)
print(qtable)
[[0.53143206 0.59047965 0.59048027 0.53143202]
 [0.5314317  0.         0.6560894  0.59048003]
 [0.5904795  0.72898869 0.5904782  0.65608855]
 [0.65608778 0.         0.59047782 0.59047755]
 [0.59047942 0.65608858 0.         0.53143178]
 [0.         0.         0.         0.        ]
 [0.         0.80998872 0.         0.65608495]
 [0.         0.         0.         0.        ]
 [0.65608789 0.         0.72898776 0.59047884]
 [0.65608665 0.80998743 0.80998606 0.        ]
 [0.72898105 0.89999117 0.         0.72898107]
 [0.         0.         0.         0.        ]
 [0.         0.         0.         0.        ]
 [0.         0.80994439 0.89999233 0.72897284]
 [0.80984792 0.8999768  0.99999765 0.80991361]
 [0.         0.         0.         0.        ]]
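The update in the cell above is the standard tabular Q-learning rule, here with learning rate $\alpha = 0.1$ and discount factor $\gamma = 0.9$:

$$Q(s, a) \leftarrow Q(s, a) + \alpha \left[ r + \gamma \max_{a'} Q(s', a') - Q(s, a) \right]$$

Since FrozenLake gives a reward of 1 only on reaching the goal, the optimal Q-values fall off geometrically with distance from the goal (1, 0.9, 0.81, 0.729, ...), which is what the table above is converging towards and what the fully converged table further below shows exactly.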
# Retrain with an epsilon-greedy policy: explore with probability epsilon,
# otherwise exploit the current Q-table; epsilon decays over the episodes.
episode_rewards = []
epsilon = 1
max_epsilon = 1
min_epsilon = 0.01
decay_rate = 0.005
for episode in range(1000):
    state = env.reset()
    total_rewards = 0
    for step in range(50):
        exp_exp_tradeoff = random.uniform(0, 1)
        ## Exploitation: take the action with the highest Q-value for this state
        if exp_exp_tradeoff > epsilon:
            action = np.argmax(qtable[state, :])
        else:
            ## Exploration: take a random action
            action = env.action_space.sample()
        new_state, reward, done, info = env.step(action)
        # Same Q-learning update, now with a larger learning rate of 0.9
        qtable[state, action] += 0.9 * (reward + 0.9 * np.max(qtable[new_state, :]) - qtable[state, action])
        state = new_state
        total_rewards += reward
        if done:
            break
    episode_rewards.append(total_rewards)
    # Exponential decay of epsilon (note the negative exponent)
    epsilon = min_epsilon + (max_epsilon - min_epsilon) * np.exp(-decay_rate * episode)
print(qtable)
[[0.531441   0.59049    0.59049    0.531441  ]
 [0.531441   0.         0.6561     0.59049   ]
 [0.59049    0.729      0.59049    0.6561    ]
 [0.6561     0.         0.59049    0.59049   ]
 [0.59049    0.6561     0.         0.531441  ]
 [0.         0.         0.         0.        ]
 [0.         0.81       0.         0.6561    ]
 [0.         0.         0.         0.        ]
 [0.6561     0.         0.729      0.59049   ]
 [0.6561     0.81       0.81       0.        ]
 [0.729      0.9        0.         0.729     ]
 [0.         0.         0.         0.        ]
 [0.         0.         0.         0.        ]
 [0.         0.80999999 0.9        0.729     ]
 [0.81       0.9        1.         0.81      ]
 [0.         0.         0.         0.        ]]
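Before replaying the learned policy, here is a minimal sketch (not part of the original notebook) of how the epsilon schedule above behaves: it starts at 1 (pure exploration) and decays exponentially towards min_epsilon.

import numpy as np

min_epsilon, max_epsilon, decay_rate = 0.01, 1.0, 0.005
for episode in (0, 100, 500, 999):
    # Same formula as in the training loop above, evaluated at a few episodes
    eps = min_epsilon + (max_epsilon - min_epsilon) * np.exp(-decay_rate * episode)
    print(f"episode {episode:4d}: epsilon = {eps:.3f}")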
# Play one episode greedily with the learned Q-table, rendering each step
env.reset()
for episode in range(1):
    state = env.reset()
    step = 0
    done = False
    print("-----------------------")
    print("Episode", episode)
    for step in range(50):
        env.render()
        # Always exploit: take the best-known action for the current state
        action = np.argmax(qtable[state, :])
        print(action)
        new_state, reward, done, info = env.step(action)
        if done:
            print("Number of Steps", step + 1)
            break
        state = new_state
env.close()
-----------------------
Episode 0

SFFF
FHFH
FFFH
HFFG
2
  (Right)
SFFF
FHFH
FFFH
HFFG
2
  (Right)
SFFF
FHFH
FFFH
HFFG
1
  (Down)
SFFF
FHFH
FFFH
HFFG
1
  (Down)
SFFF
FHFH
FFFH
HFFG
1
  (Down)
SFFF
FHFH
FFFH
HFFG
2
Number of Steps 6
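As a follow-up, a sketch (an assumption, not part of the original notebook) that replays the greedy policy for 100 episodes and reports how often it reaches the goal, reusing the trained qtable and the same non-slippery environment.

import numpy as np
import gym

env = gym.make('FrozenLake-v0', is_slippery=False)
n_eval = 100
successes = 0
for _ in range(n_eval):
    state = env.reset()
    for _ in range(50):
        action = np.argmax(qtable[state, :])          # always exploit the learned values
        state, reward, done, info = env.step(action)
        if done:
            successes += reward                       # reward is 1 only when the goal is reached
            break
env.close()
print("Greedy policy success rate:", successes / n_eval)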