CartPole is a classic control task in OpenAI Gym:
https://gym.openai.com/envs/CartPole-v1/
The goal is for the agent to learn, through reinforcement learning, to move the cart so that the pole stays upright for as long as possible. Here, CartPole is used as a concrete problem for understanding Q-learning.
Q matrix definition:
The state of CartPole is stored in the observation, which holds 4 variables: cart position, cart velocity, pole angle, and pole angular velocity. All of them are continuous, so they must be discretized first. Their ranges are:
cart position: -2.4 ~ 2.4
cart velocity: -inf ~ inf
pole angle: -0.5 ~ 0.5 (radians)
pole angular velocity: -inf ~ inf
Each variable is discretized into 6 bins, and there are 4 variables, so there are 6^4 = 1296 states. There are only 2 actions, push left and push right, so the Q matrix has size 1296 * 2 (a quick worked example of the binning and indexing follows).
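As a quick sanity check on the discretization, here is a standalone sketch (the names edges, digitized, and index are just for illustration) showing how NumPy's linspace and digitize behave:

import numpy as np

NUM_DIGITIZED = 6
# interior bin edges for cart position in [-2.4, 2.4]
edges = np.linspace(-2.4, 2.4, NUM_DIGITIZED + 1)[1:-1]  # [-1.6, -0.8, 0.0, 0.8, 1.6]
np.digitize(0.1, edges)  # -> 3 (bins are numbered 0..5)

# the 4 bin indices are packed into a single base-6 number
digitized = [1, 2, 3, 4]
index = sum(d * NUM_DIGITIZED**i for i, d in enumerate(digitized))  # 1 + 2*6 + 3*36 + 4*216 = 985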
Define 3 classes, Agent, Brain, and Environment. Their relationship is as follows:
(1) The Agent passes the current observation to the Brain
(2) The Brain discretizes the observation into a state, decides the action according to the Q matrix, and returns the action to the Agent
(3) The Agent applies the action to the Environment, and the Environment returns the resulting observation_t+1 and reward_t+1 to the Agent
(4) The Agent passes observation_t, action_t, observation_t+1, and reward_t+1 to the Brain
(5) The Brain updates the Q matrix (using the update rule shown below)
Then set observation = observation_t+1 and repeat (1) ~ (5).
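The update in step (5) is the standard Q-learning rule. Written with the constant names used in the code below, it is:

Q(state, action) = Q(state, action) + ETA * (reward + GAMMA * max_a Q(state_next, a) - Q(state, action))

ETA is the learning rate, GAMMA is the discount rate, and the max is taken over both actions in the next state.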
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import gym
ENV = 'CartPole-v1'
NUM_DIGITIZED = 6  # number of bins per state variable
GAMMA = 0.99  # discount rate
ETA = 0.5  # learning rate
MAX_STEPS = 200  # max steps per episode
NUM_EPISODES = 2000  # number of episodes
class Agent:
    def __init__(self, num_states, num_actions):
        self.brain = Brain(num_states, num_actions)

    # update the Q function
    def update_Q_function(self, observation, action, reward, observation_next):
        self.brain.update_Q_table(
            observation, action, reward, observation_next)

    # get the action
    def get_action(self, observation, step):
        action = self.brain.decide_action(observation, step)
        return action
class Brain:
    # does the Q-learning
    def __init__(self, num_states, num_actions):
        self.num_actions = num_actions  # number of CartPole actions (2)
        # create the Q table; rows are discrete states (NUM_DIGITIZED^num_states), columns are actions (left, right)
        self.q_table = np.random.uniform(low=0, high=1, size=(NUM_DIGITIZED**num_states, num_actions))

    def bins(self, clip_min, clip_max, num):
        # interior edges that split [clip_min, clip_max] into num bins
        return np.linspace(clip_min, clip_max, num + 1)[1:-1]  # num bins need num+1 edge values

    def digitize_state(self, observation):
        # map an observation to one of the 1296 discrete states
        cart_pos, cart_v, pole_angle, pole_v = observation
        digitized = [
            np.digitize(cart_pos, bins=self.bins(-2.4, 2.4, NUM_DIGITIZED)),
            np.digitize(cart_v, bins=self.bins(-3.0, 3.0, NUM_DIGITIZED)),
            np.digitize(pole_angle, bins=self.bins(-0.5, 0.5, NUM_DIGITIZED)),  # angle in radians
            np.digitize(pole_v, bins=self.bins(-2.0, 2.0, NUM_DIGITIZED))
        ]
        # pack the 4 bin indices into a single base-6 index
        return sum([x * (NUM_DIGITIZED**i) for i, x in enumerate(digitized)])

    def update_Q_table(self, observation, action, reward, observation_next):
        state = self.digitize_state(observation)
        state_next = self.digitize_state(observation_next)
        Max_Q_next = max(self.q_table[state_next][:])
        self.q_table[state, action] = self.q_table[state, action] + \
            ETA * (reward + GAMMA * Max_Q_next - self.q_table[state, action])

    def decide_action(self, observation, episode):
        # epsilon-greedy: explore less as the episode number grows
        state = self.digitize_state(observation)
        epsilon = 0.5 * (1 / (episode + 1))
        if epsilon <= np.random.uniform(0, 1):
            action = np.argmax(self.q_table[state][:])
        else:
            action = np.random.choice(self.num_actions)
        return action
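Note how decide_action anneals exploration: epsilon starts at 0.5 in episode 0 and decays as 0.5 / (episode + 1), so the agent acts greedily most of the time after only a handful of episodes:

[0.5 * (1 / (e + 1)) for e in range(5)]  # [0.5, 0.25, 0.1667, 0.125, 0.1]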
class Environment:
    def __init__(self):
        self.env = gym.make(ENV)
        num_states = self.env.observation_space.shape[0]  # 4
        num_actions = self.env.action_space.n  # 2
        self.agent = Agent(num_states, num_actions)  # create the agent

    def run(self):
        complete_episodes = 0  # count of consecutive episodes that held on for 195 steps or more
        is_episode_final = False  # flag for the last episode
        frames = []  # for the animation
        for episode in range(NUM_EPISODES):
            observation = self.env.reset()  # initialize the environment
            for step in range(MAX_STEPS):
                if is_episode_final:  # record frames only in the final episode
                    frames.append(self.env.render(mode='rgb_array'))
                action = self.agent.get_action(observation, episode)  # pass episode, not step
                # get state_t+1 and done from action_t
                observation_next, _, done, _ = self.env.step(action)  # default reward and info are not needed
                # to use the default reward instead, replace the reward shaping below with:
                # observation_next, reward, done, _ = self.env.step(action)
                # self.agent.update_Q_function(observation, action, reward, observation_next)
                # observation = observation_next
                # hand-crafted reward
                if done:  # the pole fell or MAX_STEPS was reached
                    if step < 195:
                        reward = -1  # punish if the episode ended before 195 steps
                        complete_episodes = 0  # reset the success counter
                    else:
                        reward = 1
                        complete_episodes += 1
                else:
                    reward = 0  # reward is 0 until the episode ends
                # update the Q table
                self.agent.update_Q_function(observation, action, reward, observation_next)
                # update the observation
                observation = observation_next
                if done:
                    print('{0} Episode: Finished after {1} time steps'.format(episode, step + 1))
                    break
            if is_episode_final:  # save the animation
                display_frames_as_gif(frames)
                break
            if complete_episodes >= 10:
                print('succeeded for 10 times')
                is_episode_final = True
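run() calls display_frames_as_gif, which is not defined above. A minimal sketch using matplotlib's animation module (the function name matches the call above; the output filename and the 'pillow' writer are assumptions, and plt is the pyplot import from earlier):

from matplotlib import animation

def display_frames_as_gif(frames):
    # draw the first frame, then update the image data for each subsequent frame
    plt.figure(figsize=(frames[0].shape[1] / 72.0, frames[0].shape[0] / 72.0), dpi=72)
    patch = plt.imshow(frames[0])
    plt.axis('off')

    def animate(i):
        patch.set_data(frames[i])

    anim = animation.FuncAnimation(plt.gcf(), animate, frames=len(frames), interval=50)
    anim.save('cartpole.gif', writer='pillow')  # writer choice is an assumption

With the classes in place, training is started with:

cartpole_env = Environment()
cartpole_env.run()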
The tested convergence is as follows: at the beginning episodes end quickly (the pole falls almost immediately), while later the agent holds out for the full 200 steps.