import numpy as np
import gym
import random

# Deterministic 4x4 FrozenLake: 16 states, 4 actions
env = gym.make('FrozenLake-v0', is_slippery=False)
action_size = env.action_space.n
state_size = env.observation_space.n
qtable = np.zeros((state_size, action_size))

# First pass: learn the Q-table while acting completely at random
episode_rewards = []
for episode in range(10000):
    state = env.reset()
    total_rewards = 0
    for step in range(50):
        # Pure exploration: sample a random action
        action = env.action_space.sample()
        new_state, reward, done, info = env.step(action)
        # Q-learning update with learning rate 0.1 and discount factor 0.9
        qtable[state, action] += 0.1 * (reward + 0.9 * np.max(qtable[new_state, :]) - qtable[state, action])
        state = new_state
        total_rewards += reward
        if done:
            break
    episode_rewards.append(total_rewards)
print(qtable)
[[0.53143206 0.59047965 0.59048027 0.53143202]
 [0.5314317  0.         0.6560894  0.59048003]
 [0.5904795  0.72898869 0.5904782  0.65608855]
 [0.65608778 0.         0.59047782 0.59047755]
 [0.59047942 0.65608858 0.         0.53143178]
 [0.         0.         0.         0.        ]
 [0.         0.80998872 0.         0.65608495]
 [0.         0.         0.         0.        ]
 [0.65608789 0.         0.72898776 0.59047884]
 [0.65608665 0.80998743 0.80998606 0.        ]
 [0.72898105 0.89999117 0.         0.72898107]
 [0.         0.         0.         0.        ]
 [0.         0.         0.         0.        ]
 [0.         0.80994439 0.89999233 0.72897284]
 [0.80984792 0.8999768  0.99999765 0.80991361]
 [0.         0.         0.         0.        ]]
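As a quick sanity check (not part of the original output), the goal-reaching rate during this random-exploration phase can be read straight from episode_rewards, since FrozenLake only returns a reward of 1 when the goal is reached:

# Added sanity check: fraction of random-exploration episodes that reached the goal
print("Success rate during random exploration:", np.mean(episode_rewards))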
# Second pass: retrain with an epsilon-greedy policy, decaying epsilon
# from max_epsilon towards min_epsilon as episodes progress
episode_rewards = []
epsilon = 1
max_epsilon = 1
min_epsilon = 0.01
decay_rate = 0.005

for episode in range(1000):
    state = env.reset()
    total_rewards = 0
    for step in range(50):
        exp_exp_tradeoff = random.uniform(0, 1)
        if exp_exp_tradeoff > epsilon:
            ## Exploitation: take the best known action
            action = np.argmax(qtable[state, :])
        else:
            ## Exploration: take a random action
            action = env.action_space.sample()
        new_state, reward, done, info = env.step(action)
        # Q-learning update with learning rate 0.9 and discount factor 0.9
        qtable[state, action] += 0.9 * (reward + 0.9 * np.max(qtable[new_state, :]) - qtable[state, action])
        state = new_state
        total_rewards += reward
        if done:
            break
    episode_rewards.append(total_rewards)
    # Exponential decay needs a negative exponent, otherwise epsilon grows past 1
    epsilon = min_epsilon + (max_epsilon - min_epsilon) * np.exp(-decay_rate * episode)
print(qtable)
[[0.531441   0.59049    0.59049    0.531441  ]
 [0.531441   0.         0.6561     0.59049   ]
 [0.59049    0.729      0.59049    0.6561    ]
 [0.6561     0.         0.59049    0.59049   ]
 [0.59049    0.6561     0.         0.531441  ]
 [0.         0.         0.         0.        ]
 [0.         0.81       0.         0.6561    ]
 [0.         0.         0.         0.        ]
 [0.6561     0.         0.729      0.59049   ]
 [0.6561     0.81       0.81       0.        ]
 [0.729      0.9        0.         0.729     ]
 [0.         0.         0.         0.        ]
 [0.         0.         0.         0.        ]
 [0.         0.80999999 0.9        0.729     ]
 [0.81       0.9        1.         0.81      ]
 [0.         0.         0.         0.        ]]
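For reference (not in the original run, and assuming matplotlib is available), the epsilon schedule produced by the decay formula above can be visualised like this:

import matplotlib.pyplot as plt

# Added sketch: how epsilon falls from max_epsilon to min_epsilon over 1000 episodes
episodes = np.arange(1000)
eps_schedule = min_epsilon + (max_epsilon - min_epsilon) * np.exp(-decay_rate * episodes)
plt.plot(episodes, eps_schedule)
plt.xlabel("episode")
plt.ylabel("epsilon")
plt.title("Epsilon decay schedule")
plt.show()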
# Play one episode greedily with the learned Q-table, rendering each step
env.reset()
for episode in range(1):
    state = env.reset()
    step = 0
    done = False
    print("-----------------------")
    print("Episode", episode)
    for step in range(50):
        env.render()
        # Always take the action with the highest Q-value in this state
        action = np.argmax(qtable[state, :])
        print(action)
        new_state, reward, done, info = env.step(action)
        if done:
            print("Number of Steps", step + 1)
            break
        state = new_state
env.close()
-----------------------
Episode 0
(rendered 4x4 map at each step: SFFF / FHFH / FFFH / HFFG, with the agent's current cell highlighted)
Actions taken: 2 (Right), 2 (Right), 1 (Down), 1 (Down), 1 (Down), 2 (Right)
Number of Steps 6
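As an optional follow-up (a sketch, not part of the original notebook), the greedy policy can be evaluated over many episodes on a fresh environment to confirm it reaches the goal reliably on this deterministic map:

# Added check: success rate of the greedy policy over 100 evaluation episodes
eval_env = gym.make('FrozenLake-v0', is_slippery=False)
wins = 0
for _ in range(100):
    state = eval_env.reset()
    for step in range(50):
        action = np.argmax(qtable[state, :])
        state, reward, done, info = eval_env.step(action)
        if done:
            wins += reward  # reward is 1 only when the goal tile is reached
            break
eval_env.close()
print("Greedy policy success rate:", wins / 100)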