Q-learning 例子注释

"""
A simple example for Reinforcement Learning using table lookup Q-learning method.
An agent "o" is on the left of a 1 dimensional world, the treasure is on the rightmost location.
Run this program and to see how the agent will improve its strategy of finding the treasure.
View more on my tutorial page: https://morvanzhou.github.io/tutorials/
"""

import numpy as np
import pandas as pd
import time

#random会生成相同的随机数
np.random.seed(2)  # reproducible



N_STATES = 6   # the length of the 1 dimensional world
ACTIONS = ['left', 'right']     # available actions
EPSILON = 0.9   # greedy police
ALPHA = 0.1     # learning rate
GAMMA = 0.9    # discount factor
MAX_EPISODES = 13   # maximum episodes
FRESH_TIME = 0.3    # fresh time for one move


def build_q_table(n_states, actions):
    #生成q_table 表 6行2列 并赋值为0,两列叫left 和right
    table = pd.DataFrame(
        np.zeros((n_states, len(actions))),     # q_table initial values
        columns=actions,    # actions's name
    )
    # print(table)    # show table
    return table


def choose_action(state, q_table):
    # This is how to choose an action
    state_actions = q_table.iloc[state, :]#获取q_table表中 某一行的state 的值
    #print('\r')
    #print(state_actions)
    #随机生成【0，1】间的随机数>EPSILON(以10%的概率随机)或state_actions全为0时随机
    if (np.random.uniform() > EPSILON) or ((state_actions == 0).all()):  # act non-greedy or state-action have no value
        #从ACTIONS随机选一个
        action_name = np.random.choice(ACTIONS)
        print(' choice random：' + action_name)
        print(state_actions)
    else:   # act greedy
        #返回最大的数值的索引
        action_name = state_actions.idxmax()    # replace argmax to idxmax as argmax means a different function in newer version of pandas
        print('choice maxindex：'+action_name)
        print(state_actions)
    return action_name


def get_env_feedback(S, A):
    #输入当前状态S 和 动作 A
    #返回下一个状态S_和动作后的奖赏
    #  This is how agent will interact with the environment
    if A == 'right':    # move right
        if S == N_STATES - 2:   # terminate
            S_ = 'terminal'
            R = 1
        else:
            S_ = S + 1
            R = 0
    else:   # move left
        R = 0
        if S == 0:
            S_ = S  # reach the wall
        else:
            S_ = S - 1
    return S_, R


def update_env(S, episode, step_counter):
    # This is how environment be updated
    env_list = ['-']*(N_STATES-1) + ['T']   # '---------T' our environment
    if S == 'terminal':
        interaction = 'Episode %s: total_steps = %s' % (episode+1, step_counter)
        print('\r{}'.format(interaction), end='')
        time.sleep(2)
        print('\r                                ', end='')
    else:
        env_list[S] = 'o'
        interaction = ''.join(env_list)
        print('\r{}'.format(interaction), end='')
        time.sleep(FRESH_TIME)


def rl():
    # main part of RL loop
    q_table = build_q_table(N_STATES, ACTIONS)
    for episode in range(MAX_EPISODES):
        step_counter = 0
        S = 0
        is_terminated = False
        update_env(S, episode, step_counter)
        while not is_terminated:

            A = choose_action(S, q_table)
            S_, R = get_env_feedback(S, A)  # take action & get next state and reward
            q_predict = q_table.loc[S, A]#表格中动作的预测奖赏
            if S_ != 'terminal':
                #动作后的实际奖赏+GAMMA*下一步的预测值（S_时的动作奖赏）的最大子
                q_target = R + GAMMA * q_table.iloc[S_, :].max()   # next state is not terminal
            else:
                q_target = R     # next state is terminal
                is_terminated = True    # terminate this episode

            q_table.loc[S, A] += ALPHA * (q_target - q_predict)  # update q_table 向q_target靠近
            S = S_  # move to next state

            update_env(S, episode, step_counter+1)#更新环境
            step_counter += 1
    return q_table


if __name__ == "__main__":
    q_table = rl()
    print('\r\nQ-table:\n')
    print(q_table)

sarsa只需要改动两条
action_ = RL.choose_action(str(observation_))#由随机产生a_ 变成实际选出并做动作
q_target = r + self.gamma * self.q_table.loc[s_, a_]

Q-learning 例子注释

猜你喜欢