Reinforcement Learning with Code【Code 6. Advantage Actor-Critic (A2C)】

This note records how the author began to learn RL. Both theoretical understanding and code practice are presented. Many materials are referenced, such as Zhao Shiyu's Mathematical Foundations of Reinforcement Learning.

1. Actor-Critic's Various Forms

[Figure: the various forms of the Actor-Critic algorithm]

2. Review Advantage Actor-Critic (A2C)

First, recall the traditional AC algorithm (see Reinforcement Learning with Code [Chapter 10. Actor Critic] for details). The A2C algorithm adds a baseline to the traditional AC algorithm to reduce the variance of the policy-gradient estimate. When this baseline is chosen to be the state value function $v_\pi(s)$, we obtain the definition of the advantage function

$$\delta_\pi(S,A) = q_\pi(S,A) - v_\pi(S)$$
which describes how much better or worse the action selected in the current state is compared to the average, i.e. the state value. For the complete A2C algorithm, please refer to Reinforcement Learning with Code [Chapter 10. Actor Critic].
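In practice, as in the code below, the advantage is approximated by the one-step TD error computed from the learned state-value network, since $r_t + \gamma v(s_{t+1})$ is a sample estimate of $q_\pi(s_t, a_t)$:

$$\delta_t = r_t + \gamma\, v(s_{t+1}) - v(s_t)$$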

[Figure: pseudocode of the A2C algorithm]

3. A2C Code

When implementing A2C, the gym environment CartPole-v1 is still used.
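The code assumes the newer Gym API (gym >= 0.26 or gymnasium), in which env.reset returns (observation, info) and env.step returns five values. A minimal sketch to verify that the locally installed version behaves this way (an assumption about your setup, not part of the original script):

import gym

# Sanity check: the script below expects the gym >= 0.26 API, where
# reset() returns (obs, info) and step() returns a 5-tuple.
env = gym.make('CartPole-v1')
obs, info = env.reset(seed=0)
obs, reward, terminated, truncated, info = env.step(env.action_space.sample())
print(gym.__version__, obs.shape, terminated, truncated)
env.close()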

import gym
import torch
import torch.nn as nn
import torch.nn.functional as F
from tqdm import tqdm
import matplotlib.pyplot as plt
import numpy as np


# Policy Network
class PolicyNet(nn.Module):
    def __init__(self, state_dim, hidden_dim, action_dim):
        super(PolicyNet, self).__init__()
        self.fc1 = nn.Linear(state_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, action_dim)
    
    def forward(self, observation):
        x = F.relu(self.fc1(observation))
        return F.softmax(self.fc2(x), dim=1)

# State Value Network
class ValueNet(nn.Module):
    def __init__(self, state_dim, hidden_dim):
        super(ValueNet, self).__init__()
        self.fc1 = nn.Linear(state_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, 1)
    
    def forward(self, observation):
        x = F.relu(self.fc1(observation))
        return self.fc2(x)

# # Q Value Network
# class QValueNet(nn.Module):
#     def __init__(self, state_dim, hidden_dim, action_dim):
#         super(QValueNet,self).__init__()
#         self.fc1 = nn.Linear(state_dim, hidden_dim)
#         self.fc2 = nn.Linear(hidden_dim, action_dim)
    
#     def forward(self, observation):
#         x = F.relu(self.fc1(observation))
#         return self.fc2(x)


# QAC & A2C
class ActorCritic():
    def __init__(self, state_dim, hidden_dim, action_dim, actor_lr, critic_lr, gamma, ac_type, device):

        self.ac_type = ac_type
        if ac_type == "A2C":
            self.critic = ValueNet(state_dim, hidden_dim).to(device)
        elif ac_type == "QAC":
            self.critic = QValueNet(state_dim, hidden_dim, action_dim).to(device)

        self.actor = PolicyNet(state_dim, hidden_dim, action_dim).to(device)
        self.actor_optimizer = torch.optim.Adam(self.actor.parameters(), lr=actor_lr)
        self.critic_optimizer = torch.optim.Adam(self.critic.parameters(), lr=critic_lr)
        self.gamma = gamma
        self.device = device
    
    def choose_action(self, state):
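        # Sample an action from the categorical distribution defined by the
        # softmax output of the policy network (stochastic, on-policy behavior).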
        state = torch.tensor([state], dtype=torch.float).to(self.device)
        probs = self.actor(state)
        action_dist = torch.distributions.Categorical(probs)
        action = action_dist.sample().item()
        return action
    
    def learn(self, transition_dict):
        states = torch.tensor(np.array(transition_dict['states']), dtype=torch.float).to(self.device)
        rewards = torch.tensor(transition_dict['rewards'], dtype=torch.float).view(-1,1).to(self.device)
        actions = torch.tensor(transition_dict['actions'], dtype=torch.int64).view(-1,1).to(self.device)
        next_states = torch.tensor(np.array(transition_dict['next_states']), dtype=torch.float).to(self.device)
        dones = torch.tensor(transition_dict['dones'], dtype=torch.float).view(-1,1).to(self.device)

        if self.ac_type == 'A2C':
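            # TD target r + gamma * v(s') bootstraps from the value network;
            # (1 - dones) removes the bootstrap term at terminal states.
            # td_delta is the one-step TD error, used as the advantage estimate
            # that weights the policy-gradient (actor) loss.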
            td_target = rewards + self.gamma * self.critic(next_states) * (1-dones)
            td_delta = td_target - self.critic(states)
            log_probs = torch.log(self.actor(states).gather(dim=1, index=actions))
            actor_loss = torch.mean(- log_probs * td_delta.detach())
            critic_loss = torch.mean(F.mse_loss(td_target.detach(), self.critic(states)))
        # elif self.ac_type == 'QAC':
        #     td_target = rewards + self.gamma * self.critic(next_states).gather(dim=1, index=actions) * (1-dones)
        #     td_delta = self.critic(states).gather(dim=1, index=actions)
        #     log_probs = torch.log(self.actor(states).gather(dim=1, index=actions))
        #     actor_loss = torch.mean(- log_probs * td_delta.detach())
        #     critic_loss = torch.mean(F.mse_loss(td_target, td_delta))

        # clear accumulated gradients
        self.actor_optimizer.zero_grad()
        self.critic_optimizer.zero_grad()
        # calculate gradient
        actor_loss.backward()
        critic_loss.backward()
        # update parameters
        self.actor_optimizer.step()
        self.critic_optimizer.step()


def train_on_policy_agent(env, agent, num_episodes, seed):
    return_list = []
    for i in range(10):
        with tqdm(total = int(num_episodes/10), desc="Iteration %d"%(i+1)) as pbar:
            for i_episode in range(int(num_episodes/10)):
                episode_return = 0
                transition_dict = {
                    'states': [],
                    'actions': [],
                    'next_states': [],
                    'rewards': [],
                    'dones': []
                }
                observation, _ = env.reset(seed=seed)
                done = False
                while not done:
                    if render:
                        env.render()
                    action = agent.choose_action(observation)
                    observation_, reward, terminated, truncated, _ = env.step(action)
                    done = terminated or truncated
                    # save one episode experience into a dict
                    transition_dict['states'].append(observation)
                    transition_dict['actions'].append(action)
                    transition_dict['next_states'].append(observation_)
                    transition_dict['rewards'].append(reward)
                    transition_dict['dones'].append(done)
                    # swap state
                    observation = observation_
                    # compute one episode return
                    episode_return += reward
                return_list.append(episode_return)
                agent.learn(transition_dict)
                if((i_episode + 1) % 10 == 0):
                    pbar.set_postfix({
                        'episode': '%d'%(num_episodes / 10 * i + i_episode + 1),
                        'return': '%.3f'%(np.mean(return_list[-10:]))
                    })
                pbar.update(1)
    env.close()
    return return_list

def moving_average(a, window_size):
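    # Smooth the return curve with a sliding window; the two ends use
    # progressively shorter odd-length windows so the smoothed curve has
    # the same length as the input array.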
    cumulative_sum = np.cumsum(np.insert(a, 0, 0)) 
    middle = (cumulative_sum[window_size:] - cumulative_sum[:-window_size]) / window_size
    r = np.arange(1, window_size-1, 2)
    begin = np.cumsum(a[:window_size-1])[::2] / r
    end = (np.cumsum(a[:-window_size:-1])[::2] / r)[::-1]
    return np.concatenate((begin, middle, end))

def plot_curve(return_list, mv_return, algorithm_name, env_name):
    episodes_list = list(range(len(return_list)))
    plt.plot(episodes_list, return_list, c='gray', alpha=0.6)
    plt.plot(episodes_list, mv_return)
    plt.xlabel('Episodes')
    plt.ylabel('Returns')
    plt.title('{} on {}'.format(algorithm_name, env_name))
    plt.show()


if __name__ == "__main__":

    # reproducible
    seed_number = 0
    np.random.seed(seed_number)
    torch.manual_seed(seed_number)

    num_episodes = 1000     # number of training episodes
    hidden_dim = 256        # hidden layer dimension
    gamma = 0.98            # discount factor
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    env_name = 'CartPole-v1'
    ac_type = 'A2C'  # Actor-Critic Type: QAC or A2C

    # Note: the learning rates are important hyperparameters here
    actor_lr = 1e-3         # learning rate of actor
    if ac_type == 'A2C':
        critic_lr = 1e-2        # learning rate of critic
    # elif ac_type == 'QAC':
    #     critic_lr = 1e-3

    render = False
    if render:
        env = gym.make(id=env_name, render_mode='human')
    else:
        env = gym.make(id=env_name)
    
    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.n

    agent = ActorCritic(state_dim, hidden_dim, action_dim, actor_lr, critic_lr, gamma, ac_type, device)

    return_list = train_on_policy_agent(env, agent, num_episodes, seed_number)
    
    mv_return = moving_average(return_list, 9)
    plot_curve(return_list, mv_return, ac_type, env_name)
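After training, a quick way to sanity-check the learned policy is a greedy evaluation rollout. The sketch below is not part of the original script; it assumes it is appended inside the if __name__ == "__main__": block above, so that agent, env_name, seed_number and device are already defined.

    # Greedy evaluation rollout (illustrative only): pick argmax actions
    # from the trained actor and report the resulting episode return.
    eval_env = gym.make(env_name, render_mode='human')
    state, _ = eval_env.reset(seed=seed_number)
    done, eval_return = False, 0.0
    while not done:
        with torch.no_grad():
            probs = agent.actor(torch.tensor([state], dtype=torch.float).to(device))
        action = probs.argmax(dim=1).item()
        state, reward, terminated, truncated, _ = eval_env.step(action)
        done = terminated or truncated
        eval_return += reward
    eval_env.close()
    print('Greedy evaluation return:', eval_return)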

The final learning curve is shown in the figure below.

[Figure: learning curve of A2C on CartPole-v1]

Reference

Zhao Shiyu's course, Mathematical Foundations of Reinforcement Learning
Hands-on RL
Reinforcement Learning with Code [Chapter 10. Actor Critic]

Origin: blog.csdn.net/qq_44940689/article/details/132258446