""" A simple example for Reinforcement Learning using table lookup Q-learning method. An agent "o" is on the left of a 1 dimensional world, the treasure is on the rightmost location. Run this program and to see how the agent will improve its strategy of finding the treasure. View more on my tutorial page: https://morvanzhou.github.io/tutorials/ """ import numpy as np import pandas as pd import time #random will generate the same random number np.random.seed(2) # reproducible N_STATES = 6 # the length of the 1 dimensional world ACTIONS = ['left', 'right'] # available actions EPSILON = 0.9 # greedy police ALPHA = 0.1 # learning rate GAMMA = 0.9 # discount factor MAX_EPISODES = 13 # maximum episodes FRESH_TIME = 0.3 # fresh time for one move def build_q_table(n_states, actions): #Generate q_table table 6 rows and 2 columns and assign it to 0, the two columns are called left and right table = pd.DataFrame( np.zeros((n_states, len(actions))), # q_table initial values columns=actions, # actions's name ) # print(table) # show table return table def choose_action(state, q_table): # This is how to choose an action state_actions = q_table.iloc[state, :]#Get the value of the state of a row in the q_table table #print('\r') #print(state_actions) #Randomly generate random numbers between [0, 1] > EPSILON (random with 10% probability) or random when state_actions are all 0 if (np.random.uniform() > EPSILON) or ((state_actions == 0).all()): # act non-greedy or state-action have no value #select one randomly from ACTIONS action_name = np.random.choice(ACTIONS) print(' choice random:' + action_name) print(state_actions) else: # act greedy #return the index of the largest value action_name = state_actions.idxmax() # replace argmax to idxmax as argmax means a different function in newer version of pandas print('choice maxindex:'+action_name) print(state_actions) return action_name def get_env_feedback(S, A): #Enter current state S and action A # Return to the next state S_ and the reward after the action # This is how agent will interact with the environment if A == 'right': # move right if S == N_STATES - 2: # terminate S_ = 'terminal' R = 1 else: S_ = S + 1 R = 0 else: # move left R = 0 if S == 0: S_ = S # reach the wall else: S_ = S - 1 return S_, R def update_env(S, episode, step_counter): # This is how environment be updated env_list = ['-']*(N_STATES-1) + ['T'] # '---------T' our environment if S == 'terminal': interaction = 'Episode %s: total_steps = %s' % (episode+1, step_counter) print('\r{}'.format(interaction), end='') time.sleep(2) print('\r ', end='') else: env_list[S] = 'o' interaction = ''.join(env_list) print('\r{}'.format(interaction), end='') time.sleep(FRESH_TIME) def rl(): # main part of RL loop q_table = build_q_table(N_STATES, ACTIONS) for episode in range(MAX_EPISODES): step_counter = 0 S = 0 is_terminated = False update_env(S, episode, step_counter) while not is_terminated: A = choose_action(S, q_table) S_, R = get_env_feedback(S, A) # take action & get next state and reward q_predict = q_table.loc[S, A]#Prediction reward of action in table if S_ != 'terminal': #The actual reward after the action + GAMMA * the largest child of the predicted value of the next step (action reward at S_) q_target = R + GAMMA * q_table.iloc[S_, :].max() # next state is not terminal else: q_target = R # next state is terminal is_terminated = True # terminate this episode q_table.loc[S, A] += ALPHA * (q_target - q_predict) # update q_table 向q_target靠近 S = S_ # move to next state update_env(S, episode, 
step_counter+1)#Update environment step_counter += 1 return q_table if __name__ == "__main__": q_table = rl() print('\r\nQ-table:\n') print(q_table)
# SARSA only needs two changes relative to the Q-learning loop above
# (the identifiers RL, observation_, self.gamma and self.q_table come from a class-based version of this tutorial):
#   action_ = RL.choose_action(str(observation_))           # actually select the next action a_ instead of assuming the greedy one
#   q_target = r + self.gamma * self.q_table.loc[s_, a_]    # bootstrap from the action that will really be taken
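# A minimal sketch of how the while-loop in rl() above could be rewritten as SARSA,
# reusing the build_q_table / choose_action / get_env_feedback / update_env helpers
# defined in this file. The function name sarsa() and the variable A_ are illustrative
# assumptions, not part of the original tutorial code.
def sarsa():
    q_table = build_q_table(N_STATES, ACTIONS)
    for episode in range(MAX_EPISODES):
        step_counter = 0
        S = 0
        A = choose_action(S, q_table)                       # pick the first action before entering the loop
        is_terminated = False
        update_env(S, episode, step_counter)
        while not is_terminated:
            S_, R = get_env_feedback(S, A)                  # take action & get next state and reward
            q_predict = q_table.loc[S, A]
            if S_ != 'terminal':
                A_ = choose_action(S_, q_table)             # change 1: actually select the next action
                q_target = R + GAMMA * q_table.loc[S_, A_]  # change 2: bootstrap from that action, not the max
            else:
                A_ = None
                q_target = R
                is_terminated = True
            q_table.loc[S, A] += ALPHA * (q_target - q_predict)
            S, A = S_, A_                                   # move on, keeping the already-chosen action
            update_env(S, episode, step_counter + 1)
            step_counter += 1
    return q_table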