Continuous control with deep reinforcement learning (DDPG, Deep Deterministic Policy Gradient) exercise

5. Deep Deterministic Policy Gradient; the paper is here:
http://xueshu.baidu.com/s?wd=paperuri%3A%283752bdb69e8a3f4849ecba38b2b0168f%29&filter=sc_long_sign&tn=SE_xueshusource_2kduw22v&sc_vurl=http%3A%2F%2Fwww.oalib.com%2Fpaper%2F4051743&ie=utf-8&sc_us=1138439324812222606

=======================experience_replay.py=============

from collections import deque
import numpy as np
import random


'''
flag = tf.app.flags
FLAG = flag.FLAGS
flag.DEFINE_string('size','5','size')
print flag.FLAGS.size
'''

class Experience_replay:
    def __init__(self, size, action_dim,state_dim):
        self.d = deque(maxlen=size)
        self.action_dim = action_dim
        self.state_dim = state_dim

    def experience_in(self, memory):
        self.d.append(memory)

    def experience_out(self, sample_size):
        s_list = random.sample(self.d, sample_size)

        rs = np.asarray([i[0] for i in s_list], dtype=np.float32).reshape((sample_size, self.state_dim))
        ra = np.asarray([i[1] for i in s_list], dtype=np.float32).reshape((sample_size, self.action_dim))
        rr = np.asarray([i[2] for i in s_list], dtype=np.float32).reshape((sample_size, 1))
        rss = np.asarray([i[3] for i in s_list], dtype=np.float32).reshape((sample_size, self.state_dim))
        rt = np.asarray([i[4] for i in s_list], dtype=bool).reshape((sample_size, 1))

        return rs, ra, rr, rss, rt

    def experience_out_partly(self, sample_size, part_experience_size):
        # sample states only from the first part_experience_size entries
        # (assumes the buffer already holds at least that many transitions)
        sample_index = np.random.randint(0, part_experience_size, sample_size).tolist()

        rs = np.asarray([self.d[i][0] for i in sample_index], dtype=np.float32).reshape((sample_size, self.state_dim))

        return rs


#############test###########
if __name__ == "__main__":
    pass
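    # A minimal smoke test (sketch): fill the buffer with dummy transitions for a
    # hypothetical environment with state_dim=3, action_dim=1, then sample a batch.
    er = Experience_replay(size=100, action_dim=1, state_dim=3)
    for _ in range(64):
        s = np.zeros(3, dtype=np.float32)
        er.experience_in((s, np.zeros(1, dtype=np.float32), 0.0, s, False))
    rs, ra, rr, rss, rt = er.experience_out(8)
    assert rs.shape == (8, 3) and ra.shape == (8, 1) and rr.shape == (8, 1)
    assert rss.shape == (8, 3) and rt.shape == (8, 1)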

============================Critic.py=========================

import tensorflow as tf
from tensorflow.contrib import layers
import math

class Critic:
    def __init__(self, sess,action_dim,state_dim):

        self.sess = sess
        self.state_dim = state_dim
        self.action_dim = action_dim

        self.batch_size = 32
        self.GAMMA = 0.9
        self.num_units_l1 = 50
        self.num_units_l2 = 40
        self.learning_rate = 0.001
        self.update_TDnet_rate = 0.2
        self.reg = layers.l2_regularizer(0.006)
        self.init_var = 0.01

        self.state_input = tf.placeholder(dtype=tf.float32, shape=[None, self.state_dim], name='state_input')
        self.actor_input = tf.placeholder(dtype=tf.float32, shape=[None, self.action_dim], name='actor_input')
        self.Q_value_input = tf.placeholder(dtype=tf.float32, shape=[None, 1], name='TD_Q_value_input')
        self.reward = tf.placeholder(dtype=tf.float32, shape=[None, 1], name='reward')
        self.terminal = tf.placeholder(dtype=tf.bool, shape=[None, 1], name='terminal')

        with tf.variable_scope('critic'):
            self.Q_output, self.Q_net_var_set = self.create_network(trainable=True)
        with tf.variable_scope('critic_T'):
            self.Q_T_output, self.QT_net_var_set = self.create_network(trainable=False)

        self.build_update_graph(rate=self.update_TDnet_rate)
        self.build_td_target_graph()
        self.build_cost_graph()
        self.build_gradient_graph()

        self.add_summary()
        self.merged = tf.summary.merge_all()
        self.writer = tf.summary.FileWriter('/home/wd/tf/summary')

    def create_network(self, trainable):
        l1_s_w = tf.get_variable('l1_s_w',
                                 shape=[self.state_dim, self.num_units_l1],
                                 dtype=tf.float32,
                                 initializer=tf.random_normal_initializer(-1/math.sqrt(self.state_dim), 1/math.sqrt(self.state_dim)),
                                 regularizer=self.reg,
                                 trainable=trainable)

        l1_a_w = tf.get_variable('l1_a_w',
                                 shape=[self.action_dim, self.num_units_l1],
                                 dtype=tf.float32,
                                 initializer=tf.random_normal_initializer(-1/math.sqrt(self.action_dim), 1/math.sqrt(self.action_dim)),
                                 regularizer=self.reg,
                                 trainable=trainable)

        l1_b = tf.get_variable('l1_b',
                               shape=[self.num_units_l1],
                               dtype=tf.float32,
                               initializer=tf.random_uniform_initializer(-self.init_var, self.init_var),
                               trainable=trainable)

        l2_w = tf.get_variable('l2_w',
                               shape=[self.num_units_l1, self.num_units_l2],
                               dtype=tf.float32,
                               initializer=tf.random_normal_initializer(-1/math.sqrt(self.num_units_l1), 1/math.sqrt(self.num_units_l1)),
                               regularizer=self.reg,
                               trainable=trainable)

        l2_b = tf.get_variable('l2_b',
                               shape=[self.num_units_l2],
                               dtype=tf.float32,
                               initializer=tf.random_uniform_initializer(-self.init_var, self.init_var),
                               trainable=trainable)

        l3_w = tf.get_variable('l3_w',
                               shape=[self.num_units_l2, 1],
                               dtype=tf.float32,
                               initializer=tf.random_normal_initializer(-1/math.sqrt(self.num_units_l2), 1/math.sqrt(self.num_units_l2)),
                               regularizer=self.reg,
                               trainable=trainable)

        l3_b = tf.get_variable('l3_b',
                               shape=[1],
                               dtype=tf.float32,
                               initializer=tf.random_uniform_initializer(-self.init_var, self.init_var),
                               trainable=trainable)

        l1 = tf.nn.tanh(tf.matmul(self.actor_input, l1_a_w) + tf.matmul(self.state_input, l1_s_w) + l1_b)
        l2 = tf.nn.tanh(tf.matmul(l1, l2_w) + l2_b)
        l3 = tf.matmul(l2, l3_w) + l3_b

        return l3,[l1_s_w,l1_a_w,l1_b,l2_w,l2_b,l3_w,l3_b]


    def build_update_graph(self, rate):
        self.update_T_net_completely_op_set = [tf.assign(i[1], i[0]) for i in
                                               zip(self.Q_net_var_set, self.QT_net_var_set)]  # QT = Q

        self.update_T_net_op_set = [tf.assign(i[1], i[0] * rate + i[1] * (1 - rate)) for i in
                                    zip(self.Q_net_var_set, self.QT_net_var_set)]  # QT = r*Q + (1-r)*QT

    def build_td_target_graph(self):
        self.td_target = tf.where(self.terminal,
                                  tf.constant(0, dtype=tf.float32, shape=[self.batch_size, 1]),
                                  self.Q_T_output * self.GAMMA) + self.reward

    def build_cost_graph(self):
        self.cost = tf.reduce_mean(tf.square(self.Q_value_input - self.Q_output))

        self.train = tf.train.AdamOptimizer(self.learning_rate).minimize(self.cost)

    def build_gradient_graph(self):
        # tf.gradients returns a list with one tensor per x, so the result has
        # shape [1, batch_size, action_dim]; reshape it to [batch_size, action_dim]
        gradient_temp = tf.gradients(self.Q_output, self.actor_input)
        self.gradient = tf.reshape(gradient_temp, (self.batch_size, self.action_dim))

    def add_summary(self):
        self.summary_cost = tf.summary.scalar('critic_cost', self.cost)
        self.summary_Q = tf.summary.scalar('critic_Q_value', tf.reduce_mean(self.Q_output))
        self.summary_gradient = tf.summary.scalar('gradient', tf.reduce_mean(self.gradient))

    def operation_get_TDtarget(self, action_next, state_next, reward, terminal):
        return self.sess.run(self.td_target, feed_dict={self.actor_input: action_next,
                                                        self.state_input: state_next,
                                                        self.reward: reward,
                                                        self.terminal: terminal})

    def operation_critic_learn(self, TDtarget, action, state):
        summary_cost, summary_Q, _ = self.sess.run([self.summary_cost, self.summary_Q, self.train],
                                                   feed_dict={self.Q_value_input: TDtarget,
                                                              self.actor_input: action,
                                                              self.state_input: state})
        return summary_cost, summary_Q

    def operation_get_gradient(self, action, state):
        return self.sess.run([self.summary_gradient, self.gradient], feed_dict={self.actor_input: action,
                                                                                self.state_input: state})

    def operation_update_TDnet_completely(self):
        self.sess.run(self.update_T_net_completely_op_set)

    def operation_update_TDnet(self):
        self.sess.run(self.update_T_net_op_set)




#############test###########
if __name__ == "__main__":
    pass

======================Actor.py=======================================

import tensorflow as tf
from tensorflow.contrib import layers
import math


class Actor:
    def __init__(self,sess,action_dim,state_dim):
        self.sess = sess
        self.action_dim = action_dim
        self.state_dim = state_dim

        self.batch_size = 32
        self.num_units_l1 = 30
        self.num_units_l2 = 20
        self.learning_rate = 0.001
        self.init_var = 0.01
        self.update_TDnet_rate = 0.2
        self.reg = layers.l2_regularizer(0.006)

        self.state_input = tf.placeholder(dtype=tf.float32, shape=[None, self.state_dim], name='state_input')
        self.action_gradient_input = tf.placeholder(dtype=tf.float32, shape=[None, self.action_dim], name='actor_input')

        with tf.name_scope('ACTOR_A_O'):
            with tf.variable_scope('ACTOR_A_V'):
                self.action_output = self.create_network(trainable=True)
        with tf.name_scope('ACTOR_AT_O'):
            with tf.variable_scope('ACTOR_AT_V'):
                self.action_T_output = self.create_network(trainable=False)

        self.gather_var()
        self.build_update_graph(self.update_TDnet_rate)
        self.build_cost_graph()

        self.add_summary()
        self.merged = tf.summary.merge_all()
        self.writer = tf.summary.FileWriter('/home/wd/tf/summary1')

    def create_network(self, trainable):
        l1 = tf.layers.dense(inputs=self.state_input,
                             units=self.num_units_l1,
                             activation=tf.nn.tanh,
                             use_bias=True,
                             kernel_initializer=tf.random_normal_initializer
                             (-1/math.sqrt(self.state_dim), 1/math.sqrt(self.state_dim)),
                             bias_initializer=tf.random_uniform_initializer(-self.init_var, self.init_var),
                             trainable=trainable,
                             name='l1',
                             kernel_regularizer=self.reg)

        l2 = tf.layers.dense(inputs=l1,
                             units=self.num_units_l2,
                             activation=tf.nn.tanh,
                             use_bias=True,
                             kernel_initializer=tf.random_normal_initializer
                             (-1/math.sqrt(self.num_units_l1), 1/math.sqrt(self.num_units_l1)),
                             bias_initializer=tf.random_uniform_initializer(-self.init_var, self.init_var),
                             trainable=trainable,
                             name='l2',
                             kernel_regularizer=self.reg)

        action_output = tf.layers.dense(inputs=l2,
                                        units=self.action_dim,
                                        activation=tf.nn.tanh,
                                        use_bias=True,
                                        # 1/sqrt(fan_in) initialization, as in the other layers
                                        kernel_initializer=tf.random_normal_initializer
                                        (-1/math.sqrt(self.num_units_l2), 1/math.sqrt(self.num_units_l2)),
                                        bias_initializer=tf.random_uniform_initializer(-self.init_var, self.init_var),
                                        trainable=trainable,
                                        name='action_output',
                                        kernel_regularizer=self.reg)

        return action_output

    def gather_var(self):
        graph = tf.get_default_graph()

        l1_w = graph.get_tensor_by_name("ACTOR_A_V/l1/kernel:0")
        l1_b = graph.get_tensor_by_name("ACTOR_A_V/l1/bias:0")
        l2_w = graph.get_tensor_by_name("ACTOR_A_V/l2/kernel:0")
        l2_b = graph.get_tensor_by_name("ACTOR_A_V/l2/bias:0")
        output_w = graph.get_tensor_by_name("ACTOR_A_V/action_output/kernel:0")
        output_b = graph.get_tensor_by_name("ACTOR_A_V/action_output/bias:0")

        l1_w_T = graph.get_tensor_by_name("ACTOR_AT_V/l1/kernel:0")
        l1_b_T = graph.get_tensor_by_name("ACTOR_AT_V/l1/bias:0")
        l2_w_T = graph.get_tensor_by_name("ACTOR_AT_V/l2/kernel:0")
        l2_b_T = graph.get_tensor_by_name("ACTOR_AT_V/l2/bias:0")
        output_w_T = graph.get_tensor_by_name("ACTOR_AT_V/action_output/kernel:0")
        output_b_T = graph.get_tensor_by_name("ACTOR_AT_V/action_output/bias:0")

        self.A_net_var_set = [l1_w, l1_b, l2_w, l2_b, output_w, output_b]
        self.AT_net_var_set = [l1_w_T, l1_b_T, l2_w_T, l2_b_T, output_w_T, output_b_T]

    def build_update_graph(self,rate):
        self.update_T_net_completely_op_set = [tf.assign(i[1], i[0]) for i in
                                               zip(self.A_net_var_set, self.AT_net_var_set)]  # AT = A

        self.update_T_net_op_set = [tf.assign(i[1], i[0] * rate + i[1] * (1 - rate)) for i in
                                    zip(self.A_net_var_set, self.AT_net_var_set)]  # AT = r*A + (1-r)*AT

    def build_cost_graph(self):
        # element-wise product of dQ/da and mu(s), both of shape [batch_size, action_dim]
        self.cost = tf.reduce_mean(self.action_gradient_input * self.action_output)

        # the learning rate must be negative: with it, "minimizing" this cost moves
        # mu(s) in the direction of dQ/da, i.e. gradient ascent on the critic's Q
        self.train = tf.train.AdamOptimizer(-self.learning_rate).minimize(self.cost)

    def add_summary(self):
        self.summary_c = tf.summary.scalar('actor_cost', self.cost)


    def operation_get_action_to_environment(self,state):
        return self.sess.run(self.action_output,feed_dict={self.state_input:state})

    def operation_get_action_to_TDtarget(self,state):
        return self.sess.run(self.action_T_output,feed_dict={self.state_input:state})

    def operation_actor_learn(self,gradient,state):
        self.sess.run(self.train,feed_dict={self.action_gradient_input:gradient,self.state_input:state})

    def operation_update_TDnet_completely(self):
        self.sess.run(self.update_T_net_completely_op_set)

    def operation_update_TDnet(self):
        self.sess.run(self.update_T_net_op_set)

=======================train.py=====================================

import gym
import tensorflow as tf
from Critic import Critic
from Actor import Actor
from experience_replay import Experience_replay
import numpy as np


class Train:
    def __init__(self, action_dim, state_dim):
        self.sess = tf.Session()
        self.action_dim = action_dim
        self.state_dim = state_dim
        self.batch_size = 32
        self.episodes = 0
        self.iterations = 0

        self.critic = Critic(self.sess, self.action_dim, self.state_dim)
        self.actor = Actor(self.sess, self.action_dim, self.state_dim)
        self.ER = Experience_replay(500000, self.action_dim, self.state_dim)
        self.sess.run(tf.global_variables_initializer())
        self.critic.operation_update_TDnet_completely()
        self.actor.operation_update_TDnet_completely()
        self.env = gym.make('Pendulum-v0')

    def add_noise_and_reshape(self, action, var):
        return np.clip(np.random.normal(action, var), -1., 1.).reshape(self.action_dim)

    def operation_add_memory_by_episode(self,episodes,max_iters,var):
        for i in range(episodes):
            observation = self.env.reset()
            for j in range(max_iters):
                # env.render()
                action = self.actor.operation_get_action_to_environment(np.reshape(observation, (1, self.state_dim)))
                action_noise = self.add_noise_and_reshape(action, var)
                observation_next, reward, done, _ = self.env.step(action_noise * 2)  # Pendulum-v0 expects torque in [-2, 2]
                self.ER.experience_in((observation, action_noise, (reward + 5) / 100., observation_next, done))  # shift/scale reward to keep Q values small
                observation = observation_next
                if done:
                    #print 'done'
                    break

    def operation_train_actor(self,s):
        action = self.actor.operation_get_action_to_environment(s)
        summary_g, gradient = self.critic.operation_get_gradient(action, s)
        self.actor.operation_actor_learn(gradient, s)
        return summary_g

    def operation_train_critic(self,s,a,r,ss,t):
        action_next = self.actor.operation_get_action_to_TDtarget(ss)
        td_target = self.critic.operation_get_TDtarget(action_next, ss, r, t)
        summary_c, summary_Q = self.critic.operation_critic_learn(td_target, a, s)
        return summary_c, summary_Q

    def operation_write_summary(self,summary_g,summary_c,summary_Q,iters):
        self.critic.writer.add_summary(summary_g, iters)
        self.critic.writer.add_summary(summary_c, iters)
        self.critic.writer.add_summary(summary_Q, iters)

    def update_T_net(self):
        self.actor.operation_update_TDnet()
        self.critic.operation_update_TDnet()


train = Train(1,3)
train.operation_add_memory_by_episode(500,200,0.4)#add memory
for train.episodes in range(10000000):
    train.operation_add_memory_by_episode(1, 200, 0.4)#perceive
    for i in range(25):
        s,a,r,ss,t = train.ER.experience_out(train.batch_size)#sample
        actor_s = train.ER.experience_out_partly(train.batch_size,100000)#sample
        summary_c,summary_Q = train.operation_train_critic(s,a,r,ss,t)#critic learn
        summary_g = train.operation_train_actor(actor_s)#actor learn
        train.update_T_net()#update t net
        if i % 9 == 0:  # write summaries (at i = 0, 9, 18)
            train.operation_write_summary(summary_g,summary_c,summary_Q,train.iterations)
        train.iterations += 1

The figure below is a summary of the Q value output by the critic network. The Q value rises from about -0.24 (its value before the actor network is trained) to about 0.4. The later policy improvement is very slow, most likely because I did not decay the exploration noise.
[Figure: critic Q-value summary]
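One way to address this, sketched below, is to decay the exploration noise after each episode instead of keeping it fixed at 0.4; the decay rate and floor here are illustrative assumptions, not values from the original experiment.

def decay_noise(var, rate=0.999, floor=0.05):
    # shrink the Gaussian exploration std-dev towards a small floor (assumed values)
    return max(floor, var * rate)

# inside the episode loop in train.py this would replace the fixed 0.4:
#     train.operation_add_memory_by_episode(1, 200, var)  # perceive with current noise
#     var = decay_noise(var)                               # decay after each episode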

The figure below is a summary of the critic network's loss. It suggests that training the critic more often might help (currently each episode is up to 200 steps of perception followed by 25 training iterations).
[Figure: critic loss summary]
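A minimal sketch of how the critic could be trained more often than the actor, by replacing the inner `for i in range(25)` loop in train.py; the 50 updates per episode and the 2:1 critic/actor ratio are assumptions, not tuned values.

    for i in range(50):  # more critic updates per episode (assumed count)
        s, a, r, ss, t = train.ER.experience_out(train.batch_size)
        summary_c, summary_Q = train.operation_train_critic(s, a, r, ss, t)
        if i % 2 == 0:  # update the actor only every other iteration
            actor_s = train.ER.experience_out_partly(train.batch_size, 100000)
            summary_g = train.operation_train_actor(actor_s)
        train.update_T_net()
        train.iterations += 1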

The figure below is a summary of the critic's gradient with respect to the action. This quantity reflects how the actor network is learning, because the direct goal of actor training is to move the actor's output actions toward points where the critic's gradient dQ/da is close to zero (an extremum of Q).
[Figure: critic action-gradient summary]
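For reference, the actor update above implements the deterministic policy gradient by feeding dQ/da in as `action_gradient_input` and minimizing with a negative learning rate. An equivalent TF1 formulation is sketched below using only the existing Actor attributes; `actor` stands for an Actor instance and `train_op` is a new name introduced here.

# Deterministic policy gradient (DDPG paper):
#   grad_theta J  ~=  mean over the batch of  [dQ(s, a)/da at a = mu(s)] * [dmu(s)/dtheta]
actor_vars = actor.A_net_var_set
actor_grads = tf.gradients(actor.action_output, actor_vars,
                           grad_ys=-actor.action_gradient_input)  # minus sign -> ascent on Q
# note: this sums over the batch; dividing grad_ys by the batch size gives the
# same scaling as the reduce_mean in build_cost_graph (for action_dim = 1)
train_op = tf.train.AdamOptimizer(0.001).apply_gradients(list(zip(actor_grads, actor_vars)))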

Note that training is not always smooth: it often hits 'bumps', but it generally converges in the end.
[Figure: training curve showing a temporary 'bump']

Original post: blog.csdn.net/qq_32231743/article/details/73615120