[ROS] Combined use of ros-noetic and anaconda [Practical operation]

After introducing the basic way to use the two together (see this blog), the author wants to use ROS to train reinforcement learning algorithms in a gym environment.

1. Requirements

Manually install the following packages with pip:

torch
gym
tqdm
matplotlib

For example: pip install gym.
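
Assuming the conda environment from the previous post is named metaRL (matching the sys.path line in the script below), installing everything at once might look like this:

conda activate metaRL
pip install torch gym tqdm matplotlib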

2. Create a new Python script in the ROS package

roscd test_ros_python
cd scripts
touch test_python.py
chmod +x ./test_python.py

And remember to complete the configuration in the CMakeLists.txt file; I won't go into details here, but a minimal sketch follows.
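
As a minimal sketch, assuming the package is named test_ros_python as above, the relevant lines in CMakeLists.txt would look something like this:

catkin_install_python(PROGRAMS
  scripts/test_python.py
  DESTINATION ${CATKIN_PACKAGE_BIN_DESTINATION}
)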

3. SAC algorithm

After completing the environment configuration, take the code from Hands on RL's Off-policy Maximum Entropy Actor-Critic (SAC) and place it in the test_python.py script file, as follows:

#!/usr/bin/env python3
# -*- coding: utf-8 -*-

print('\n*****************************************\n\t[test libraries]:\n')
import rospy
import sys
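# make the conda environment's site-packages visible to the interpreter that
# ROS launches (adjust this path to your own conda environment)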
sys.path.append('/home/sjh/anaconda3/envs/metaRL/lib/python3.8/site-packages')
print(' - rospy.__file__ = %s'%rospy.__file__)

import torch
import torch.nn as nn
import torch.nn.functional as F

from torch.distributions import Normal

from tqdm import tqdm
import collections
import random
import numpy as np
import matplotlib.pyplot as plt
import gym
print(' - torch.__file__ = %s'%torch.__file__)
print(' - gym.__file__ = %s'%gym.__file__)

print('\n*****************************************\n\t[finish test]\n')

# replay buffer
class ReplayBuffer():
    def __init__(self, capacity):
        self.buffer = collections.deque(maxlen=capacity)
    
    def add(self, state, action, reward, next_state, done):
        self.buffer.append((state, action, reward, next_state, done))

    def sample(self, batch_size):
        transitions = random.sample(self.buffer, batch_size)
        states, actions, rewards, next_states, dones = zip(*transitions)
        return np.array(states), actions, rewards, np.array(next_states), dones
    
    def size(self):
        return len(self.buffer)

# Actor
class PolicyNet_Continuous(nn.Module):
    """动作空间符合高斯分布,输出动作空间的均值mu,和标准差std"""
    def __init__(self, state_dim, hidden_dim, action_dim, action_bound):
        super(PolicyNet_Continuous, self).__init__()
        self.fc1 = nn.Sequential(
            nn.Linear(in_features=state_dim, out_features=hidden_dim),
            nn.ReLU()
        )
        self.fc_mu = nn.Linear(in_features=hidden_dim, out_features=action_dim)
        self.fc_std = nn.Sequential(
            nn.Linear(in_features=hidden_dim, out_features=action_dim),
            nn.Softplus()
        )
        self.action_bound = action_bound

    def forward(self, s):
        x = self.fc1(s)
        mu = self.fc_mu(x)
        std = self.fc_std(x)
        distribution = Normal(mu, std)
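        # rsample() draws a reparameterized sample (mu + std * noise),
        # so gradients can flow back through the sampling step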
        normal_sample = distribution.rsample()
        normal_log_prob = distribution.log_prob(normal_sample)
        # get action limit to [-1,1]
        action = torch.tanh(normal_sample)
        # get tanh_normal log probability; action is already tanh(normal_sample),
        # so the change-of-variables correction uses 1 - action^2
        tanh_log_prob = normal_log_prob - torch.log(1 - action.pow(2) + 1e-7)
        # get action bounded
        action = action * self.action_bound
        return action, tanh_log_prob


# Critic
class QValueNet_Continuous(nn.Module):
    def __init__(self, state_dim, hidden_dim, action_dim):
        super(QValueNet_Continuous, self).__init__()
        self.fc1 = nn.Sequential(
            nn.Linear(in_features=state_dim + action_dim, out_features=hidden_dim),
            nn.ReLU()
        )
        self.fc2 = nn.Sequential(
            nn.Linear(in_features=hidden_dim, out_features=hidden_dim),
            nn.ReLU()
        )
        self.fc_out = nn.Linear(in_features=hidden_dim, out_features=1)
    
    def forward(self, s, a):
        cat = torch.cat([s,a], dim=1)
        x = self.fc1(cat)
        x = self.fc2(x)
        return self.fc_out(x)

# maximize entropy deep reinforcement learning SAC
class SAC_Continuous():
    def __init__(self, state_dim, hidden_dim, action_dim, action_bound,
                    actor_lr, critic_lr, alpha_lr, target_entropy, tau, gamma,
                    device):
        # actor
        self.actor = PolicyNet_Continuous(state_dim, hidden_dim, action_dim, action_bound).to(device)
        # two critics
        self.critic1 = QValueNet_Continuous(state_dim, hidden_dim, action_dim).to(device)
        self.critic2 = QValueNet_Continuous(state_dim, hidden_dim, action_dim).to(device)
        # two target critics
        self.target_critic1 = QValueNet_Continuous(state_dim, hidden_dim, action_dim).to(device)
        self.target_critic2 = QValueNet_Continuous(state_dim, hidden_dim, action_dim).to(device)
        # initialize with same parameters
        self.target_critic1.load_state_dict(self.critic1.state_dict())
        self.target_critic2.load_state_dict(self.critic2.state_dict())
        # specify optimizers
        self.optimizer_actor = torch.optim.Adam(self.actor.parameters(), lr=actor_lr)
        self.optimizer_critic1 = torch.optim.Adam(self.critic1.parameters(), lr=critic_lr)
        self.optimizer_critic2 = torch.optim.Adam(self.critic2.parameters(), lr=critic_lr)
        # optimizing the log of alpha makes training more stable
        self.log_alpha = torch.tensor(np.log(0.01), dtype=torch.float, requires_grad = True)
        self.optimizer_log_alpha = torch.optim.Adam([self.log_alpha], lr=alpha_lr)

        self.target_entropy = target_entropy
        self.gamma = gamma
        self.tau = tau
        self.device = device
    
    def take_action(self, state):
        state = torch.tensor(np.array([state]), dtype=torch.float).to(self.device)
        action, _ = self.actor(state)
        return [action.item()]
    
    # calculate td target
    def calc_target(self, rewards, next_states, dones):
        next_action, log_prob = self.actor(next_states)
        entropy = -log_prob
        q1_values = self.target_critic1(next_states, next_action)
        q2_values = self.target_critic2(next_states, next_action)
        next_values = torch.min(q1_values, q2_values) + self.log_alpha.exp() * entropy
        td_target = rewards + self.gamma * next_values * (1-dones)
        return td_target

    # soft update method
    def soft_update(self, net, target_net):
        for param_target, param in zip(target_net.parameters(), net.parameters()):
            param_target.data.copy_(param_target.data * (1.0-self.tau) + param.data * self.tau)
        
    def update(self, transition_dict):
        states = torch.tensor(transition_dict['states'], dtype=torch.float).to(self.device)
        rewards = torch.tensor(transition_dict['rewards'], dtype=torch.float).view(-1,1).to(self.device)
        actions = torch.tensor(transition_dict['actions'], dtype=torch.float).view(-1,1).to(self.device)
        next_states = torch.tensor(transition_dict['next_states'], dtype=torch.float).to(self.device)
        dones = torch.tensor(transition_dict['dones'], dtype=torch.float).view(-1,1).to(self.device)

        rewards = (rewards + 8.0) / 8.0     # reshape the Pendulum rewards to ease training

        # update two Q-value network
        td_target = self.calc_target(rewards, next_states, dones).detach()
        critic1_loss = torch.mean(F.mse_loss(td_target, self.critic1(states, actions)))
        critic2_loss = torch.mean(F.mse_loss(td_target, self.critic2(states, actions)))

        self.optimizer_critic1.zero_grad()
        critic1_loss.backward()
        self.optimizer_critic1.step()
        self.optimizer_critic2.zero_grad()
        critic2_loss.backward()
        self.optimizer_critic2.step()

        # update policy network
        new_actions, log_prob = self.actor(states)
        entropy = - log_prob
        q1_value = self.critic1(states, new_actions)
        q2_value = self.critic2(states, new_actions)
        actor_loss = torch.mean(-self.log_alpha.exp() * entropy - torch.min(q1_value, q2_value))
        self.optimizer_actor.zero_grad()
        actor_loss.backward()
        self.optimizer_actor.step()

        # update temperature alpha
        alpha_loss = torch.mean((entropy - self.target_entropy).detach() * self.log_alpha.exp())
        self.optimizer_log_alpha.zero_grad()
        alpha_loss.backward()
        self.optimizer_log_alpha.step()

        # soft update target Q-value network
        self.soft_update(self.critic1, self.target_critic1)
        self.soft_update(self.critic2, self.target_critic2)


def train_off_policy_agent(env, agent, num_episodes, replay_buffer, minimal_size, batch_size, render, seed_number):
    return_list = []
    for i in range(10):
        with tqdm(total=int(num_episodes/10), desc='Iteration %d'%(i+1)) as pbar:
            for i_episode in range(int(num_episodes/10)):
                observation, _ = env.reset(seed=seed_number)
                done = False
                episode_return = 0

                while not done:
                    if render:
                        env.render()
                    action = agent.take_action(observation)
                    observation_, reward, terminated, truncated, _ = env.step(action)
                    done = terminated or truncated
                    replay_buffer.add(observation, action, reward, observation_, done)
                    # swap states
                    observation = observation_
                    episode_return += reward
                    if replay_buffer.size() > minimal_size:
                        b_s, b_a, b_r, b_ns, b_d = replay_buffer.sample(batch_size)
                        transition_dict = {
                            'states': b_s,
                            'actions': b_a,
                            'rewards': b_r,
                            'next_states': b_ns,
                            'dones': b_d
                        }
                        agent.update(transition_dict)
                return_list.append(episode_return)
                if (i_episode+1) % 10 == 0:
                    pbar.set_postfix({
                        'episode': '%d'%(num_episodes/10 * i + i_episode + 1),
                        'return': "%.3f"%(np.mean(return_list[-10:]))
                    })
                pbar.update(1)
    env.close()
    return return_list

def moving_average(a, window_size):
    cumulative_sum = np.cumsum(np.insert(a, 0, 0)) 
    middle = (cumulative_sum[window_size:] - cumulative_sum[:-window_size]) / window_size
    r = np.arange(1, window_size-1, 2)
    begin = np.cumsum(a[:window_size-1])[::2] / r
    end = (np.cumsum(a[:-window_size:-1])[::2] / r)[::-1]
    return np.concatenate((begin, middle, end))

def plot_curve(return_list, mv_return, algorithm_name, env_name):
    episodes_list = list(range(len(return_list)))
    plt.plot(episodes_list, return_list, c='gray', alpha=0.6)
    plt.plot(episodes_list, mv_return)
    plt.xlabel('Episodes')
    plt.ylabel('Returns')
    plt.title('{} on {}'.format(algorithm_name, env_name))
    plt.show()



if __name__ == "__main__":
    rospy.init_node('ros_sac')
    rospy.loginfo(">>>>>>>>>> SAC for ROS >>>>>>>>>>")

    # reproducible
    seed_number = 0
    random.seed(seed_number)
    np.random.seed(seed_number)
    torch.manual_seed(seed_number)

    num_episodes = 150      # number of training episodes
    hidden_dim = 128        # hidden layers dimension
    gamma = 0.98            # discounted rate
    actor_lr = 1e-4         # lr of actor
    critic_lr = 1e-3        # lr of critic
    alpha_lr = 1e-4
    tau = 0.005             # soft update parameter
    buffer_size = 10000
    minimal_size = 1000
    batch_size = 64

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    env_name = 'Pendulum-v1'

    render = False
    if render:
        env = gym.make(id=env_name, render_mode='human')
    else:
        env = gym.make(id=env_name)

    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.shape[0]  
    action_bound = env.action_space.high[0]
    # the target entropy is set to the negative of the action-space dimension
    target_entropy = - env.action_space.shape[0]

    replaybuffer = ReplayBuffer(buffer_size)
    agent = SAC_Continuous(state_dim, hidden_dim, action_dim, action_bound, actor_lr, critic_lr, alpha_lr, target_entropy, tau, gamma, device)
    return_list = train_off_policy_agent(env, agent, num_episodes, replaybuffer, minimal_size, batch_size, render, seed_number)

    mv_return = moving_average(return_list, 9)
    plot_curve(return_list, mv_return, 'SAC', env_name)
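
To run the node, start a roscore and then launch the script with rosrun (assuming the package, script, and environment names used above):

roscore
# in another terminal
conda activate metaRL
rosrun test_ros_python test_python.py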

The result of running the script looks like this:

[Figure: output of the training run]
If you set render in the code to True, you will see an inverted pendulum as shown below. This requires pygame, which can be installed by executing the following statement:

pip install "gym[classic_control]"

[Figure: the rendered inverted pendulum]

References

[ROS] Combined use of ros-noetic and anaconda (1)
Hands on RL's Off-policy Maximum Entropy Actor-Critic (SAC)
