Policy gradient (PG) code in the PyTorch framework

I assume that if you can use a search engine like Baidu, you already know what the PG (policy gradient) algorithm is and have PyTorch installed. So I won't explain the PG algorithm here; I probably couldn't explain it clearly anyway.

The point of this post: most of the code you find online samples actions through the Categorical function (the categorical distribution in torch.distributions). Can it be written without Categorical? Answer: yes! This implementation was inspired by Mofan's TensorFlow version of policy gradient. For comparison, a short Categorical-based sketch follows.
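Here is a minimal sketch (my own illustration, not part of the code below; the policy and state arguments are placeholders) of how the Categorical-based versions usually sample an action:

# Sketch of the Categorical-based alternative, for comparison only
import torch
from torch.distributions import Categorical

def choose_with_categorical(policy, state):
    probs = policy(torch.FloatTensor(state))  # policy is assumed to output softmax probabilities
    dist = Categorical(probs)                 # categorical distribution over the actions
    action = dist.sample()                    # sample one action index
    log_prob = dist.log_prob(action)          # log-probability of that action, kept for the loss
    return action.item(), log_prob

The version in this post avoids Categorical entirely: it samples with np.random.choice and picks out the chosen actions' log-probabilities by indexing inside learn().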

Below, PG plays the CartPole game. Training takes only about 10 minutes, and during testing an episode can go on essentially forever (the code uses env.unwrapped, which removes CartPole-v1's built-in step limit).

GPU version (training runs on the GPU, so you need to install CUDA first).
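If you are not sure whether your CUDA installation is actually visible to PyTorch, a quick check (my suggestion, not part of the original code) is:

import torch
print(torch.cuda.is_available())  # should print True before you run the GPU version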

With that out of the way, the code is below. Take it without thanks, just copy it and use it, but don't blame me if something goes wrong!

#Author: Bright Fang
#Created: 2022/4/12 11:35
import torch
import torch.nn as nn
import torch.nn.functional as F
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
import numpy as np
import gym
LearningRate=0.01
Gamma=0.9#the larger Gamma is, the easier it converges
Switch=0#flag switching between training (0) and testing (1)
env=gym.make('CartPole-v1')
env=env.unwrapped
state_number=env.observation_space.shape[0]
action_number=env.action_space.n
'''Policy gradient, step 1: build the network'''
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.in_to_y1=nn.Linear(state_number,20)
        self.in_to_y1.weight.data.normal_(0,0.1)
        self.y1_to_y2=nn.Linear(20,10)
        self.y1_to_y2.weight.data.normal_(0,0.1)
        self.out=nn.Linear(10,action_number)
        self.out.weight.data.normal_(0,0.1)
    def forward(self,inputstate):
        inputstate=self.in_to_y1(inputstate)
        inputstate=F.relu(inputstate)
        inputstate=self.y1_to_y2(inputstate)
        inputstate=torch.sigmoid(inputstate)
        act=self.out(inputstate)
        # return act
        return F.softmax(act,dim=-1)
class PG():
    def __init__(self):
        self.policy = Net().cuda()
        self.rewards,self.obs,self.acts = [],[],[]
        self.renderflag=False
        self.optimizer=torch.optim.Adam(self.policy.parameters(),lr=LearningRate)
    '''Step 2: define the action-selection function'''
    def choose(self,inputstate):
        inputstate=torch.FloatTensor(inputstate).cuda()
        probs=self.policy(inputstate).cpu().detach().numpy()
        action=np.random.choice(np.arange(action_number),p=probs)
        return action
    '''Step 3: store the data of each episode'''
    def store_transtion(self,s,a,r):
        self.obs.append(s)
        self.acts.append(a)
        self.rewards.append(r)
    '''Step 4: learn'''
    def learn(self):
        # pass
        discounted_ep_r =np.zeros_like(self.rewards)
        running_add=0
        for t in reversed(range(0,len(self.rewards))):
            running_add=running_add*Gamma+self.rewards[t]
            discounted_ep_r[t]=running_add#e.g. if discounted_ep_r has length 87, its first value might be around 58 and its last value 1
        #Subtracting the mean and dividing by the standard deviation normalizes the returns: the middle of the list sits near 0, the leftmost value around +2.1, the rightmost around -1.9
        discounted_ep_r-=np.mean(discounted_ep_r)
        discounted_ep_r/=np.std(discounted_ep_r)
        discounted_ep_rs_norm=discounted_ep_r
        self.optimizer.zero_grad()
        #convert the episode's lists of states, actions and rewards into tensors
        self.obs=np.array(self.obs)
        state_tensor = torch.FloatTensor(self.obs).cuda()
        reward_tensor = torch.FloatTensor(discounted_ep_rs_norm).cuda()
        action_tensor = torch.LongTensor(self.acts).cuda()
        #We could learn directly from the raw returns G, but training usually works better after the data is normalized
        log_prob=torch.log(self.policy(state_tensor))#log_prob holds the log-probabilities of both actions for each state: one for pushing left, one for pushing right
        selected_log_probs =reward_tensor * log_prob[np.arange(len(action_tensor)), action_tensor]#np.arange(len(action_tensor)) indexes the rows of log_prob,
        # and action_tensor consists of 0s and 1s, so log_prob[np.arange(len(action_tensor)), action_tensor] picks out the log-probability of the action that was actually taken (one value per step)
        loss=-selected_log_probs.mean()
        loss.backward()
        self.optimizer.step()
        self.obs,self.acts,self.rewards=[],[],[]
'''Training'''
if Switch==0:
    print("训练PG中...")
    f=PG()
    for i in range(2000):
        r=0
        observation=env.reset()
        while True:
            if f.renderflag: env.render()
            action=f.choose(observation)
            observation_,reward,done,info=env.step(action)
            #reshape the reward: r1 rewards keeping the cart near the centre of the track, r2 rewards keeping the pole close to upright
            x, x_dot, theta, theta_dot = observation_
            r1 = (env.x_threshold - abs(x)) / env.x_threshold - 0.8
            r2 = (env.theta_threshold_radians - abs(theta)) / env.theta_threshold_radians - 0.5
            r3=3*r1+r2
            #You can also skip the reward shaping and use reward directly; it converges either way
            f.store_transtion(observation,action,r3)
            r+=r3
            if done:
                f.learn()
                break
            observation=observation_
        print("\rEp: {} rewards: {}".format(i,r), end="")
        if i % 10 == 0 and i > 500:
            save_data = {'net': f.policy.state_dict(), 'opt': f.optimizer.state_dict(), 'i': i}
            torch.save(save_data, r"D:\PyCharm 2019.3\mytorch_spacework\demo\model_PG.pth")
else:
    print("测试PG中...")
    c=PG()
    checkpoint = torch.load("D:\PyCharm 2019.3\mytorch_spacework\demo\model_PG.pth")
    c.policy.load_state_dict(checkpoint['net'])
    for j in range(10):
        state = env.reset()
        total_rewards = 0
        while True:
            env.render()
            state = torch.FloatTensor(state)
            action=c.choose(state)
            new_state, reward, done, info = env.step(action)  # take the action
            total_rewards += reward
            if done:
                print("Score", total_rewards)
                break
            state = new_state
    env.close()
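As an aside on the learn() method above, here is a tiny standalone illustration (with made-up rewards) of what the discount loop computes before the normalization step:

# Standalone illustration of the discount loop in learn(); the rewards here are made up
import numpy as np
rewards = [1.0, 1.0, 1.0]
Gamma = 0.9
discounted = np.zeros_like(rewards)
running_add = 0
for t in reversed(range(len(rewards))):
    running_add = running_add * Gamma + rewards[t]
    discounted[t] = running_add
print(discounted)  # approximately [2.71 1.9 1.] -- earlier steps accumulate more discounted future reward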

Unsurprisingly, the listing above can't just be copied and run as-is; I expected that, because:

  1. There is no PyCharm 2019.3\mytorch_spacework\demo folder on your D drive; after all, everyone keeps their code somewhere different, and that path is simply where mine lives. So either change my D:\PyCharm 2019.3\mytorch_spacework\demo\ to the folder that holds your own code, or simply change
torch.save(save_data, "D:\PyCharm 2019.3\mytorch_spacework\demo\model_PG.pth")

to

torch.save(save_data, "E:\model_PG.pth")

which drops the file directly onto your E drive. Don't tell me your computer doesn't have an E drive. (A more portable option, building the path relative to the script itself, is sketched at the end of this post.)
2. Not everyone has CUDA installed, so here is the CPU version of the PG algorithm playing CartPole:

CPU version

#Author: Bright Fang
#Created: 2022/4/12 11:35
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import gym
LearningRate=0.01
Gamma=0.9#the larger Gamma is, the easier it converges
Switch=0#flag switching between training (0) and testing (1)
env=gym.make('CartPole-v1')
env=env.unwrapped
state_number=env.observation_space.shape[0]
action_number=env.action_space.n
'''Policy gradient, step 1: build the network'''
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.in_to_y1=nn.Linear(state_number,20)
        self.in_to_y1.weight.data.normal_(0,0.1)
        self.y1_to_y2=nn.Linear(20,10)
        self.y1_to_y2.weight.data.normal_(0,0.1)
        self.out=nn.Linear(10,action_number)
        self.out.weight.data.normal_(0,0.1)
    def forward(self,inputstate):
        inputstate=self.in_to_y1(inputstate)
        inputstate=F.relu(inputstate)
        inputstate=self.y1_to_y2(inputstate)
        inputstate=torch.sigmoid(inputstate)
        act=self.out(inputstate)
        # return act
        return F.softmax(act,dim=-1)
class PG():
    def __init__(self):
        self.policy = Net()
        self.rewards,self.obs,self.acts = [],[],[]
        self.renderflag=False
        self.optimizer=torch.optim.Adam(self.policy.parameters(),lr=LearningRate)
    '''Step 2: define the action-selection function'''
    def choose(self,inputstate):
        inputstate=torch.FloatTensor(inputstate)
        probs=self.policy(inputstate).detach().numpy()
        action=np.random.choice(np.arange(action_number),p=probs)
        return action
    '''Step 3: store the data of each episode'''
    def store_transtion(self,s,a,r):
        self.obs.append(s)
        self.acts.append(a)
        self.rewards.append(r)
    '''Step 4: learn'''
    def learn(self):
        # pass
        discounted_ep_r =np.zeros_like(self.rewards)
        running_add=0
        for t in reversed(range(0,len(self.rewards))):
            running_add=running_add*Gamma+self.rewards[t]
            discounted_ep_r[t]=running_add#e.g. if discounted_ep_r has length 87, its first value might be around 58 and its last value 1
        #Subtracting the mean and dividing by the standard deviation normalizes the returns: the middle of the list sits near 0, the leftmost value around +2.1, the rightmost around -1.9
        discounted_ep_r-=np.mean(discounted_ep_r)
        discounted_ep_r/=np.std(discounted_ep_r)
        discounted_ep_rs_norm=discounted_ep_r
        self.optimizer.zero_grad()
        #convert the episode's lists of states, actions and rewards into tensors
        self.obs=np.array(self.obs)
        state_tensor = torch.FloatTensor(self.obs)
        reward_tensor = torch.FloatTensor(discounted_ep_rs_norm)
        action_tensor = torch.LongTensor(self.acts)
        #We could learn directly from the raw returns G, but training usually works better after the data is normalized
        log_prob=torch.log(self.policy(state_tensor))#log_prob holds the log-probabilities of both actions for each state: one for pushing left, one for pushing right
        selected_log_probs =reward_tensor * log_prob[np.arange(len(action_tensor)), action_tensor]#np.arange(len(action_tensor)) indexes the rows of log_prob,
        # and action_tensor consists of 0s and 1s, so log_prob[np.arange(len(action_tensor)), action_tensor] picks out the log-probability of the action that was actually taken (one value per step)
        loss=-selected_log_probs.mean()
        loss.backward()
        self.optimizer.step()
        self.obs,self.acts,self.rewards=[],[],[]
'''Training'''
if Switch==0:
    print("训练PG中...")
    f=PG()
    for i in range(2000):
        r=0
        observation=env.reset()
        while True:
            if f.renderflag: env.render()
            action=f.choose(observation)
            observation_,reward,done,info=env.step(action)
            #reshape the reward: r1 rewards keeping the cart near the centre of the track, r2 rewards keeping the pole close to upright
            x, x_dot, theta, theta_dot = observation_
            r1 = (env.x_threshold - abs(x)) / env.x_threshold - 0.8
            r2 = (env.theta_threshold_radians - abs(theta)) / env.theta_threshold_radians - 0.5
            r3=3*r1+r2
            #You can also skip the reward shaping and use reward directly; it converges either way
            f.store_transtion(observation,action,r3)
            r+=r3
            if done:
                f.learn()
                break
            observation=observation_
        print("\rEp: {} rewards: {}".format(i,r), end="")
        if i % 10 == 0 and i > 500:
            save_data = {'net': f.policy.state_dict(), 'opt': f.optimizer.state_dict(), 'i': i}
            torch.save(save_data, r"E:\model_PG.pth")
else:
    print("测试PG中...")
    c=PG()
    checkpoint = torch.load("E:\model_PG.pth")
    c.policy.load_state_dict(checkpoint['net'])
    for j in range(10):
        state = env.reset()
        total_rewards = 0
        while True:
            env.render()
            state = torch.FloatTensor(state)
            action=c.choose(state)
            new_state, reward, done, info = env.step(action)  # take the action
            total_rewards += reward
            if done:
                print("Score", total_rewards)
                break
            state = new_state
    env.close()

Code usage:
First set the Switch flag to 0 and train. After 5-10 minutes of training you can simply stop the program by hand (don't wait for the loop to finish on its own, or you'll be waiting forever), because the network parameters have already been saved to model_PG.pth. Then set the Switch flag to 1 and you can watch what the training achieved.
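If you'd rather not hard-code a drive letter at all, one option (my addition, not part of the original code) is to keep the checkpoint next to the script; and since the saved dict also contains the optimizer state and the episode index, it could even be used to resume training:

# Suggested helpers for a portable checkpoint path; 'agent' is a PG instance as defined above.
# Assumes this file is run as a script, so __file__ is defined.
import os
import torch

CKPT_PATH = os.path.join(os.path.dirname(os.path.abspath(__file__)), "model_PG.pth")

def save_checkpoint(agent, episode):
    save_data = {'net': agent.policy.state_dict(), 'opt': agent.optimizer.state_dict(), 'i': episode}
    torch.save(save_data, CKPT_PATH)

def load_checkpoint(agent):
    checkpoint = torch.load(CKPT_PATH)
    agent.policy.load_state_dict(checkpoint['net'])
    agent.optimizer.load_state_dict(checkpoint['opt'])  # restoring the optimizer lets training resume
    return checkpoint['i']                              # episode index at which the checkpoint was saved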

Origin blog.csdn.net/fangchenglia/article/details/124253981