DDPG reinforcement learning pytorch code

Following Morvan Zhou's reinforcement learning tutorial, I rewrote the TensorFlow code in PyTorch. The full code is below; you can also download it from my GitHub.
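
The script targets the old Gym API (Pendulum-v0, env.seed(), a four-value step() return) and roughly torch 0.4.x; newer PyTorch releases should still run it, possibly with deprecation warnings. A quick sanity check of the assumed environment (my addition, not part of the original post):

import torch
import gym

print(torch.__version__)  # the post assumes roughly 0.4.1
print(gym.__version__)    # env.seed() and the 4-tuple step() return require an older gym release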

'''
torch = 0.41
'''
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import gym
import time


#####################  hyper parameters  ####################

MAX_EPISODES = 200
MAX_EP_STEPS = 200
LR_A = 0.001    # learning rate for actor
LR_C = 0.002    # learning rate for critic
GAMMA = 0.9     # reward discount
TAU = 0.01      # soft replacement
MEMORY_CAPACITY = 10000
BATCH_SIZE = 32
RENDER = False
ENV_NAME = 'Pendulum-v0'

###############################  DDPG  ####################################

class ANet(nn.Module):   # ae(s)=a
    def __init__(self,s_dim,a_dim):
        super(ANet,self).__init__()
        self.fc1 = nn.Linear(s_dim,30)
        self.fc1.weight.data.normal_(0,0.1) # initialization
        self.out = nn.Linear(30,a_dim)
        self.out.weight.data.normal_(0,0.1) # initialization
    def forward(self,x):
        x = self.fc1(x)
        x = F.relu(x)
        x = self.out(x)
        x = torch.tanh(x)  # torch.tanh instead of the deprecated F.tanh
        actions_value = x*2
        return actions_value

class CNet(nn.Module):   # ce(s,a)=q
    def __init__(self,s_dim,a_dim):
        super(CNet,self).__init__()
        self.fcs = nn.Linear(s_dim,30)
        self.fcs.weight.data.normal_(0,0.1) # initialization
        self.fca = nn.Linear(a_dim,30)
        self.fca.weight.data.normal_(0,0.1) # initialization
        self.out = nn.Linear(30,1)
        self.out.weight.data.normal_(0, 0.1)  # initialization
    def forward(self,s,a):
        x = self.fcs(s)
        y = self.fca(a)
        net = F.relu(x+y)
        actions_value = self.out(net)
        return actions_value


class DDPG(object):
    def __init__(self, a_dim, s_dim, a_bound,):
        self.a_dim, self.s_dim, self.a_bound = a_dim, s_dim, a_bound,
        self.memory = np.zeros((MEMORY_CAPACITY, s_dim * 2 + a_dim + 1), dtype=np.float32)
        self.pointer = 0
        #self.sess = tf.Session()
        self.Actor_eval = ANet(s_dim,a_dim)
        self.Actor_target = ANet(s_dim,a_dim)
        self.Critic_eval = CNet(s_dim,a_dim)
        self.Critic_target = CNet(s_dim,a_dim)
        self.ctrain = torch.optim.Adam(self.Critic_eval.parameters(),lr=LR_C)
        self.atrain = torch.optim.Adam(self.Actor_eval.parameters(),lr=LR_A)
        self.loss_td = nn.MSELoss()

    def choose_action(self, s):
        s = torch.unsqueeze(torch.FloatTensor(s), 0)
        return self.Actor_eval(s)[0].detach() # ae(s)

    def learn(self):

        for x in self.Actor_target.state_dict().keys():
            eval('self.Actor_target.' + x + '.data.mul_((1-TAU))')
            eval('self.Actor_target.' + x + '.data.add_(TAU*self.Actor_eval.' + x + '.data)')
        for x in self.Critic_target.state_dict().keys():
            eval('self.Critic_target.' + x + '.data.mul_((1-TAU))')
            eval('self.Critic_target.' + x + '.data.add_(TAU*self.Critic_eval.' + x + '.data)')

        # soft target replacement (the loops above): update the target nets (at, ct)
        # from the eval nets (ae, ce); the TF original did this with self.sess.run(self.soft_replace)

        indices = np.random.choice(MEMORY_CAPACITY, size=BATCH_SIZE)
        bt = self.memory[indices, :]
        bs = torch.FloatTensor(bt[:, :self.s_dim])
        ba = torch.FloatTensor(bt[:, self.s_dim: self.s_dim + self.a_dim])
        br = torch.FloatTensor(bt[:, -self.s_dim - 1: -self.s_dim])
        bs_ = torch.FloatTensor(bt[:, -self.s_dim:])

        a = self.Actor_eval(bs)
        q = self.Critic_eval(bs,a)  # loss = -q = -ce(s, ae(s)) updates ae;   ae(s)=a   ae(s_)=a_
        # if a is a good action, its Q should be closer to 0 (Pendulum rewards are negative), so the actor maximizes Q by minimizing -Q
        loss_a = -torch.mean(q) 
        #print(q)
        #print(loss_a)
        self.atrain.zero_grad()
        loss_a.backward()
        self.atrain.step()

        a_ = self.Actor_target(bs_)  # the target actor is updated only slowly; it supplies the action inside the critic's Q_target
        q_ = self.Critic_target(bs_,a_)  # the target critic is updated only slowly; it supplies the bootstrapped value for Q_target
        q_target = br+GAMMA*q_  # q_target is negative here, since Pendulum rewards are <= 0
        #print(q_target)
        q_v = self.Critic_eval(bs,ba)
        #print(q_v)
        td_error = self.loss_td(q_target,q_v)
        # td_error = R + GAMMA * ct(bs_, at(bs_)) - ce(s, ba) updates ce; the action here is the stored ba from memory,
        # pushing the Q given by ce towards Q_target so that the critic's evaluation becomes more accurate
        #print(td_error)
        self.ctrain.zero_grad()
        td_error.backward()
        self.ctrain.step()

    def store_transition(self, s, a, r, s_):
        transition = np.hstack((s, a, [r], s_))
        index = self.pointer % MEMORY_CAPACITY  # replace the old memory with new memory
        self.memory[index, :] = transition
        self.pointer += 1

###############################  training  ####################################
env = gym.make(ENV_NAME)
env = env.unwrapped
env.seed(1)
s_dim = env.observation_space.shape[0]
a_dim = env.action_space.shape[0]
a_bound = env.action_space.high

ddpg = DDPG(a_dim, s_dim, a_bound)

var = 3  # control exploration
t1 = time.time()
for i in range(MAX_EPISODES):
    s = env.reset()
    ep_reward = 0
    for j in range(MAX_EP_STEPS):
        if RENDER:
            env.render()

        # Add exploration noise
        a = ddpg.choose_action(s)
        a = np.clip(np.random.normal(a, var), -2, 2)    # add randomness to action selection for exploration
        s_, r, done, info = env.step(a)

        ddpg.store_transition(s, a, r / 10, s_)

        if ddpg.pointer > MEMORY_CAPACITY:
            var *= .9995    # decay the action randomness
            ddpg.learn()

        s = s_
        ep_reward += r
        if j == MAX_EP_STEPS-1:
            print('Episode:', i, ' Reward: %i' % int(ep_reward), 'Explore: %.2f' % var, )
            if ep_reward > -300: RENDER = True
            break
print('Running time: ', time.time() - t1)

(Figure: training result, the agent balancing the inverted pendulum in Pendulum-v0.)
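
A side note on the soft target update in learn(): the eval()-over-state_dict trick works, but the same update can be written by iterating the parameters directly. A minimal alternative sketch (my own rewrite, not part of the original tutorial):

def soft_update(target_net, eval_net, tau=0.01):
    # target = (1 - tau) * target + tau * eval, applied parameter by parameter
    for t_param, e_param in zip(target_net.parameters(), eval_net.parameters()):
        t_param.data.mul_(1 - tau)
        t_param.data.add_(tau * e_param.data)

# usage inside DDPG.learn(), replacing the two eval() loops:
# soft_update(self.Actor_target, self.Actor_eval, TAU)
# soft_update(self.Critic_target, self.Critic_eval, TAU)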

Reinforcement Learning Based on Evolutionary Algorithms:

"""
According to https://morvanzhou.github.io/tutorials/
required pytorch=0.41

"""
import numpy as np
import gym
import multiprocessing as mp
import time
import torch
import torch.nn as nn
import torch.nn.functional as F

N_KID = 10                  # half of the training population
N_GENERATION = 5000         # training step
LR = .05                    # learning rate
SIGMA = .05                 # mutation strength or step size
N_CORE = mp.cpu_count()-1
CONFIG = [
    dict(game="CartPole-v0",
         n_feature=4, n_action=2, continuous_a=[False], ep_max_step=700, eval_threshold=500),
    dict(game="MountainCar-v0",
         n_feature=2, n_action=3, continuous_a=[False], ep_max_step=200, eval_threshold=-120),
    dict(game="Pendulum-v0",
         n_feature=3, n_action=1, continuous_a=[True, 2.], ep_max_step=200, eval_threshold=-180)
][0]    # choose your game


class net(nn.Module):
    def __init__(self,input_dim,output_dim):
        super(net,self).__init__()
        self.fc1 = nn.Linear(input_dim,30)
        self.fc1.weight.data.normal_(0,1)
        self.fc2 = nn.Linear(30,20)
        self.fc2.weight.data.normal_(0,1)
        self.fc3 = nn.Linear(20,output_dim)
        self.fc3.weight.data.normal_(0,1)
    def forward(self,x):
        x = torch.tanh(self.fc1(x))  # torch.tanh instead of the deprecated F.tanh
        x = torch.tanh(self.fc2(x))
        out = self.fc3(x)
        return out


def sign(k_id): return -1. if k_id % 2 == 0 else 1.  # mirrored sampling


class SGD(object):                      # optimizer with momentum
    def __init__(self, params, learning_rate, momentum=0.9):
        self.v = np.zeros(params).astype(np.float32)
        self.lr, self.momentum = learning_rate, momentum

    def get_gradients(self, gradients):
        self.v = self.momentum * self.v + (1. - self.momentum) * gradients
        return self.lr * self.v


def get_reward(network_param, num_p,env, ep_max_step, continuous_a, seed_and_id=None,):
    # perturb parameters using seed
    if seed_and_id is not None:
        seed, k_id = seed_and_id
        # for layer in network.children():
        #     np.random.seed(seed)
        #     layer.weight.data += torch.FloatTensor(sign(k_id) * SIGMA * np.random.randn(layer.weight.shape[0],layer.weight.shape[1]))
        #     np.random.seed(seed)
        #     layer.bias.data += torch.FloatTensor(sign(k_id) * SIGMA * np.random.randn(layer.bias.shape[0]))
        np.random.seed(seed)
        params = torch.FloatTensor(sign(k_id) * SIGMA * np.random.randn(num_p))
        Net = net(CONFIG['n_feature'],CONFIG['n_action'])
        Net.load_state_dict(network_param)
        for layer in Net.children():
            layer.weight.data += params[:layer.weight.shape[0]*layer.weight.shape[1]].view(layer.weight.shape[0],layer.weight.shape[1])
            layer.bias.data += params[layer.weight.shape[0]*layer.weight.shape[1]:layer.bias.shape[0]+layer.weight.shape[0]*layer.weight.shape[1]]
            params = params[layer.bias.shape[0]+layer.weight.shape[0]*layer.weight.shape[1]:]
    else:
        Net = net(CONFIG['n_feature'], CONFIG['n_action'])
        Net.load_state_dict(network_param)
    # run episode
    s = env.reset()
    ep_r = 0.
    for step in range(ep_max_step):
        a = get_action(Net, s, continuous_a)  # continuous_a indicates whether the action space is continuous
        s, r, done, _ = env.step(a)
        # mountain car's reward can be tricky
        if env.spec._env_name == 'MountainCar' and s[0] > -0.1: r = 0.
        ep_r += r
        if done: break
    return ep_r


def get_action(network, x, continuous_a):
    x = torch.unsqueeze(torch.FloatTensor(x), 0)
    x = network.forward(x)
    if not continuous_a[0]: return np.argmax(x.detach().numpy(), axis=1)[0]      # for discrete action
    else: return continuous_a[1] * np.tanh(x.detach().numpy())[0]                # for continuous action


def train(network_param, num_p,optimizer, utility, pool):
    # passing one random seed per kid (instead of the whole noise matrix) to the workers saves time
    noise_seed = np.random.randint(0, 2 ** 32 - 1, size=N_KID, dtype=np.uint32).repeat(2)    # mirrored sampling
    # generate mirrored noise: each mirrored pair of kids shares one noise seed
    # distribute training in parallel
    '''apply_async is asynchronous and non-blocking: it does not wait for the current task to finish, and the OS scheduler switches between worker processes as needed.'''
    jobs = [pool.apply_async(get_reward, (network_param, num_p,env, CONFIG['ep_max_step'], CONFIG['continuous_a'],
                                          [noise_seed[k_id], k_id], )) for k_id in range(N_KID*2)]
    # submit 2 * N_KID jobs, one per kid
    rewards = np.array([j.get() for j in jobs])
    # rank the kids by reward
    kids_rank = np.argsort(rewards)[::-1]               # rank kid id by reward
    #All_data = []

    # for layer in network.children():
    #     weight_data = 0
    #     bias_data = 0
    #     for ui, k_id in enumerate(kids_rank):
    #         np.random.seed(noise_seed[k_id])
    #         weight_data += utility[ui] * sign(k_id) * np.random.randn(layer.weight.shape[0],layer.weight.shape[1])
    #         np.random.seed(noise_seed[k_id])
    #         bias_data += utility[ui] * sign(k_id) * np.random.randn(layer.bias.shape[0])
    #     weight_data = weight_data.flatten()
    #     All_data.append(weight_data)
    #     All_data.append(bias_data)
    All_data = 0
    for ui, k_id in enumerate(kids_rank):
        np.random.seed(noise_seed[k_id])  # reconstruct noise using seed
        All_data += utility[ui] * sign(k_id) * np.random.randn(num_p)  # kids with larger reward are weighted by larger utility
        # multiply each reconstructed noise vector by its rank-based utility coefficient and accumulate
    '''utility is the rank transformation of the rewards: the largest reward corresponds to the first utility value, and the smallest reward to the last one.'''
    #All_data = [data/(2*N_KID*SIGMA) for data in All_data]
    #All_data = np.concatenate(All_data)
    gradients = optimizer.get_gradients(All_data/(2*N_KID*SIGMA))
    gradients = torch.FloatTensor(gradients)

    for layer in network_param.keys():
        if 'weight' in layer:
            network_param[layer] += gradients[:network_param[layer].shape[0]*network_param[layer].shape[1]].view(network_param[layer].shape[0],network_param[layer].shape[1])
            gradients = gradients[network_param[layer].shape[0] * network_param[layer].shape[1]:]
        if 'bias' in layer:
            network_param[layer] += gradients[:network_param[layer].shape[0]]
            gradients = gradients[network_param[layer].shape[0]:]
    return network_param, rewards


if __name__ == "__main__":
    # utility instead reward for update parameters (rank transformation)
    base = N_KID * 2    # *2 for mirrored sampling: total population size
    rank = np.arange(1, base + 1)
    util_ = np.maximum(0, np.log(base / 2 + 1) - np.log(rank))
    utility = util_ / util_.sum() - 1 / base

    # training
    Net_org = net(CONFIG['n_feature'],CONFIG['n_action']).state_dict()
    #print(Net.fc1.weight.data[0][0])
    num_params = 0
    for r in list(Net_org):
        num_params+=Net_org[r].numel()
    env = gym.make(CONFIG['game']).unwrapped
    optimizer = SGD(num_params, LR)
    pool = mp.Pool(processes=N_CORE)  # process pool for parallel rollouts
    mar = None      # moving average reward
    for g in range(N_GENERATION):
        t0 = time.time()
        Net_org, kid_rewards = train(Net_org, num_params,optimizer, utility, pool)
        # the ES step above has updated the network parameters
        # test trained net without noise
        net_r = get_reward(Net_org, num_params,env, CONFIG['ep_max_step'], CONFIG['continuous_a'], None,)
        mar = net_r if mar is None else 0.9 * mar + 0.1 * net_r       # moving average reward
        print(
            'Gen: ', g,
            '| Net_R: %.1f' % mar,
            '| Kid_avg_R: %.1f' % kid_rewards.mean(),
            '| Gen_T: %.2f' % (time.time() - t0),)
        if mar >= CONFIG['eval_threshold']: break

    # test
    print("\nTESTING....")
    #p = params_reshape(net_shapes, net_params)
    while True:
        s = env.reset()
        for _ in range(CONFIG['ep_max_step']):
            env.render()
            net_test = net(CONFIG['n_feature'],CONFIG['n_action'])
            net_test.load_state_dict(Net_org)
            a = get_action(net_test, s, CONFIG['continuous_a'])
            s, _, done, _ = env.step(a)
            if done: break
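
For intuition about the rank-based utility used in train(): with N_KID = 10 (so base = 20), the utility weights can be printed directly. A small standalone sketch, simply reusing the same formula as in __main__:

import numpy as np

base = 20  # N_KID * 2, as in the script above
rank = np.arange(1, base + 1)
util_ = np.maximum(0, np.log(base / 2 + 1) - np.log(rank))
utility = util_ / util_.sum() - 1 / base
print(utility)  # the best-ranked kid gets the largest positive weight; lower-ranked kids get small negative weights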

Origin: blog.csdn.net/blanokvaffy/article/details/86232658