5 Deep Deterministic Policy Gradient (DDPG); the paper is here:
http://xueshu.baidu.com/s?wd=paperuri%3A%283752bdb69e8a3f4849ecba38b2b0168f%29&filter=sc_long_sign&tn=SE_xueshusource_2kduw22v&sc_vurl=http%3A%2F%2Fwww.oalib.com%2Fpaper%2F4051743&ie=utf-8&sc_us=1138439324812222606
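As a reading aid before the code: the updates it implements are the standard DDPG ones, where Q' and \mu' denote the target critic and target actor, \gamma the discount, d_i the terminal flag, and \tau the soft-update rate (update_TDnet_rate = 0.2 below):

y_i = r_i + \gamma\,(1 - d_i)\,Q'(s_{i+1}, \mu'(s_{i+1}))
L = \frac{1}{N}\sum_i \left(y_i - Q(s_i, a_i)\right)^2
\theta' \leftarrow \tau\,\theta + (1-\tau)\,\theta'

The critic minimizes L; the actor ascends \nabla_a Q(s,a) evaluated at a = \mu(s); the target networks track the online ones at rate \tau.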
=======================experience_replay.py=============
from collections import deque
import numpy as np
import random
'''
flag = tf.app.flags
FLAG = flag.FLAGS
flag.DEFINE_string('size','5','size')
print flag.FLAGS.size
'''
class Experience_replay:
    def __init__(self, size, action_dim, state_dim):
        # bounded FIFO buffer: the oldest memories are discarded once maxlen is reached
        self.d = deque(maxlen=size)
        self.action_dim = action_dim
        self.state_dim = state_dim

    def experience_in(self, memory):
        # memory is a (state, action, reward, next_state, terminal) tuple
        self.d.append(memory)

    def experience_out(self, sample_size):
        # sample a minibatch uniformly and unpack it into batched arrays
        s_list = random.sample(self.d, sample_size)
        rs = np.asarray([i[0] for i in s_list], dtype=np.float32).reshape((sample_size, self.state_dim))
        ra = np.asarray([i[1] for i in s_list], dtype=np.float32).reshape((sample_size, self.action_dim))
        rr = np.asarray([i[2] for i in s_list], dtype=np.float32).reshape((sample_size, 1))
        rss = np.asarray([i[3] for i in s_list], dtype=np.float32).reshape((sample_size, self.state_dim))
        rt = np.asarray([i[4] for i in s_list], dtype=np.bool).reshape((sample_size, 1))
        return rs, ra, rr, rss, rt

    def experience_out_partly(self, sample_size, part_experience_size):
        # sample states only from the oldest part_experience_size memories;
        # assumes the buffer already holds at least that many entries
        sample_index = np.random.randint(0, part_experience_size, sample_size).tolist()
        rs = np.asarray([self.d[i][0] for i in sample_index], dtype=np.float32).reshape((sample_size, self.state_dim))
        return rs
#############test###########
if __name__ == "__main__":
    pass
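A quick way to sanity-check the buffer (a minimal sketch of mine, run inside experience_replay.py; the dimensions action_dim=1, state_dim=3 match the Pendulum-v0 setup in train.py below):

    # fill the buffer with fake transitions, then draw one minibatch
    er = Experience_replay(1000, 1, 3)
    for _ in range(100):
        s = np.random.randn(3).astype(np.float32)
        a = np.random.uniform(-1., 1., 1).astype(np.float32)
        er.experience_in((s, a, 0.0, s, False))
    rs, ra, rr, rss, rt = er.experience_out(32)
    print rs.shape, ra.shape, rr.shape, rss.shape, rt.shape
    # (32, 3) (32, 1) (32, 1) (32, 3) (32, 1)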
============================Critic.py=========================
import tensorflow as tf
from tensorflow.contrib import layers
import math
class Critic:
    def __init__(self, sess, action_dim, state_dim):
        self.sess = sess
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.batch_size = 32
        self.GAMMA = 0.9
        self.num_units_l1 = 50
        self.num_units_l2 = 40
        self.learning_rate = 0.001
        self.update_TDnet_rate = 0.2
        self.reg = layers.l2_regularizer(0.006)
        self.init_var = 0.01
        self.state_input = tf.placeholder(dtype=tf.float32, shape=[None, self.state_dim], name='state_input')
        self.actor_input = tf.placeholder(dtype=tf.float32, shape=[None, self.action_dim], name='actor_input')
        self.Q_value_input = tf.placeholder(dtype=tf.float32, shape=[None, 1], name='TD_Q_value_input')
        self.reward = tf.placeholder(dtype=tf.float32, shape=[None, 1], name='reward')
        self.terminal = tf.placeholder(dtype=tf.bool, shape=[None, 1], name='terminal')
        # online Q network (trainable) and target Q network (frozen, softly updated)
        with tf.variable_scope('critic'):
            self.Q_output, self.Q_net_var_set = self.create_network(trainable=True)
        with tf.variable_scope('critic_T'):
            self.Q_T_output, self.QT_net_var_set = self.create_network(trainable=False)
        self.build_update_graph(rate=self.update_TDnet_rate)
        self.build_td_target_graph()
        self.build_cost_graph()
        self.build_gradient_graph()
        self.add_summary()
        self.merged = tf.summary.merge_all()
        self.writer = tf.summary.FileWriter('/home/wd/tf/summary')

    def create_network(self, trainable):
        # weights use fan-in uniform initialization, as in the DDPG paper
        l1_s_w = tf.get_variable('l1_s_w',
                                 shape=[self.state_dim, self.num_units_l1],
                                 dtype=tf.float32,
                                 initializer=tf.random_uniform_initializer(-1 / math.sqrt(self.state_dim),
                                                                           1 / math.sqrt(self.state_dim)),
                                 regularizer=self.reg,
                                 trainable=trainable)
        l1_a_w = tf.get_variable('l1_a_w',
                                 shape=[self.action_dim, self.num_units_l1],
                                 dtype=tf.float32,
                                 initializer=tf.random_uniform_initializer(-1 / math.sqrt(self.action_dim),
                                                                           1 / math.sqrt(self.action_dim)),
                                 regularizer=self.reg,
                                 trainable=trainable)
        l1_b = tf.get_variable('l1_b',
                               shape=[self.num_units_l1],
                               dtype=tf.float32,
                               initializer=tf.random_uniform_initializer(-self.init_var, self.init_var),
                               trainable=trainable)
        l2_w = tf.get_variable('l2_w',
                               shape=[self.num_units_l1, self.num_units_l2],
                               dtype=tf.float32,
                               initializer=tf.random_uniform_initializer(-1 / math.sqrt(self.num_units_l1),
                                                                         1 / math.sqrt(self.num_units_l1)),
                               regularizer=self.reg,
                               trainable=trainable)
        l2_b = tf.get_variable('l2_b',
                               shape=[self.num_units_l2],
                               dtype=tf.float32,
                               initializer=tf.random_uniform_initializer(-self.init_var, self.init_var),
                               trainable=trainable)
        l3_w = tf.get_variable('l3_w',
                               shape=[self.num_units_l2, 1],
                               dtype=tf.float32,
                               initializer=tf.random_uniform_initializer(-1 / math.sqrt(self.num_units_l2),
                                                                         1 / math.sqrt(self.num_units_l2)),
                               regularizer=self.reg,
                               trainable=trainable)
        l3_b = tf.get_variable('l3_b',
                               shape=[1],
                               dtype=tf.float32,
                               initializer=tf.random_uniform_initializer(-self.init_var, self.init_var),
                               trainable=trainable)
        # both the state and the action feed into the first hidden layer
        l1 = tf.nn.tanh(tf.matmul(self.actor_input, l1_a_w) + tf.matmul(self.state_input, l1_s_w) + l1_b)
        l2 = tf.nn.tanh(tf.matmul(l1, l2_w) + l2_b)
        l3 = tf.matmul(l2, l3_w) + l3_b
        return l3, [l1_s_w, l1_a_w, l1_b, l2_w, l2_b, l3_w, l3_b]

    def build_update_graph(self, rate):
        # hard copy: QT = Q ; soft update: QT = rate*Q + (1-rate)*QT
        self.update_T_net_completely_op_set = [tf.assign(i[1], i[0]) for i in
                                               zip(self.Q_net_var_set, self.QT_net_var_set)]
        self.update_T_net_op_set = [tf.assign(i[1], i[0] * rate + i[1] * (1 - rate)) for i in
                                    zip(self.Q_net_var_set, self.QT_net_var_set)]

    def build_td_target_graph(self):
        # y = r + GAMMA * Q_T(s', a'), with the bootstrap term zeroed at terminal states
        self.td_target = tf.where(self.terminal,
                                  tf.constant(0, dtype=tf.float32, shape=[self.batch_size, 1]),
                                  self.Q_T_output * self.GAMMA) + self.reward

    def build_cost_graph(self):
        # mean squared TD error between the fed target y and the online Q value
        self.cost = tf.reduce_mean(tf.square(self.Q_value_input - self.Q_output))
        self.train = tf.train.AdamOptimizer(self.learning_rate).minimize(self.cost)

    def build_gradient_graph(self):
        # tf.gradients returns a list with one entry per x, so the stacked result
        # has shape [1, batch_size, action_dim]; take the single entry and reshape
        gradient_temp = tf.gradients(self.Q_output, self.actor_input)
        self.gradient = tf.reshape(gradient_temp[0], (self.batch_size, self.action_dim))

    def add_summary(self):
        self.summary_cost = tf.summary.scalar('critic_cost', self.cost)
        self.summary_Q = tf.summary.scalar('critic_Q_value', tf.reduce_mean(self.Q_output))
        self.summary_gradient = tf.summary.scalar('gradient', tf.reduce_mean(self.gradient))

    def operation_get_TDtarget(self, action_next, state_next, reward, terminal):
        return self.sess.run(self.td_target, feed_dict={self.actor_input: action_next,
                                                        self.state_input: state_next,
                                                        self.reward: reward,
                                                        self.terminal: terminal})

    def operation_critic_learn(self, TDtarget, action, state):
        summary_cost, summary_Q, _ = self.sess.run([self.summary_cost, self.summary_Q, self.train],
                                                   feed_dict={self.Q_value_input: TDtarget,
                                                              self.actor_input: action,
                                                              self.state_input: state})
        return summary_cost, summary_Q

    def operation_get_gradient(self, action, state):
        return self.sess.run([self.summary_gradient, self.gradient], feed_dict={self.actor_input: action,
                                                                                self.state_input: state})

    def operation_update_TDnet_completely(self):
        self.sess.run(self.update_T_net_completely_op_set)

    def operation_update_TDnet(self):
        self.sess.run(self.update_T_net_op_set)
#############test###########
if __name__ == "__main__":
    pass
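To smoke-test the critic on its own (again a sketch of mine, not part of the original files; it just builds the graph and queries Q for a random batch):

    import numpy as np

    sess = tf.Session()
    critic = Critic(sess, action_dim=1, state_dim=3)
    sess.run(tf.global_variables_initializer())
    s = np.random.randn(32, 3).astype(np.float32)
    a = np.random.uniform(-1., 1., (32, 1)).astype(np.float32)
    q = sess.run(critic.Q_output, feed_dict={critic.state_input: s, critic.actor_input: a})
    print q.shape  # (32, 1)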
======================Actor.py=======================================
import tensorflow as tf
from tensorflow.contrib import layers
import math
class Actor:
    def __init__(self, sess, action_dim, state_dim):
        self.sess = sess
        self.action_dim = action_dim
        self.state_dim = state_dim
        self.batch_size = 32
        self.num_units_l1 = 30
        self.num_units_l2 = 20
        self.learning_rate = 0.001
        self.init_var = 0.01
        self.update_TDnet_rate = 0.2
        self.reg = layers.l2_regularizer(0.006)
        self.state_input = tf.placeholder(dtype=tf.float32, shape=[None, self.state_dim], name='state_input')
        self.action_gradient_input = tf.placeholder(dtype=tf.float32, shape=[None, self.action_dim], name='actor_input')
        # online policy network and target policy network
        with tf.name_scope('ACTOR_A_O'):
            with tf.variable_scope('ACTOR_A_V'):
                self.action_output = self.create_network(trainable=True)
        with tf.name_scope('ACTOR_AT_O'):
            with tf.variable_scope('ACTOR_AT_V'):
                self.action_T_output = self.create_network(trainable=False)
        self.gather_var()
        self.build_update_graph(self.update_TDnet_rate)
        self.build_cost_graph()
        self.add_summary()
        self.merged = tf.summary.merge_all()
        self.writer = tf.summary.FileWriter('/home/wd/tf/summary1')

    def create_network(self, trainable):
        # fan-in uniform initialization for the weights, as in the DDPG paper
        l1 = tf.layers.dense(inputs=self.state_input,
                             units=self.num_units_l1,
                             activation=tf.nn.tanh,
                             use_bias=True,
                             kernel_initializer=tf.random_uniform_initializer(
                                 -1 / math.sqrt(self.state_dim), 1 / math.sqrt(self.state_dim)),
                             bias_initializer=tf.random_uniform_initializer(-self.init_var, self.init_var),
                             trainable=trainable,
                             name='l1',
                             kernel_regularizer=self.reg)
        l2 = tf.layers.dense(inputs=l1,
                             units=self.num_units_l2,
                             activation=tf.nn.tanh,
                             use_bias=True,
                             kernel_initializer=tf.random_uniform_initializer(
                                 -1 / math.sqrt(self.num_units_l1), 1 / math.sqrt(self.num_units_l1)),
                             bias_initializer=tf.random_uniform_initializer(-self.init_var, self.init_var),
                             trainable=trainable,
                             name='l2',
                             kernel_regularizer=self.reg)
        # tanh output keeps actions in [-1, 1]; train.py rescales them for the environment
        action_output = tf.layers.dense(inputs=l2,
                                        units=self.action_dim,
                                        activation=tf.nn.tanh,
                                        use_bias=True,
                                        kernel_initializer=tf.random_uniform_initializer(
                                            -1 / math.sqrt(self.num_units_l2), 1 / math.sqrt(self.num_units_l2)),
                                        bias_initializer=tf.random_uniform_initializer(-self.init_var, self.init_var),
                                        trainable=trainable,
                                        name='action_output',
                                        kernel_regularizer=self.reg)
        return action_output

    def gather_var(self):
        # tf.layers.dense creates its variables internally, so fetch them by name
        graph = tf.get_default_graph()
        l1_w = graph.get_tensor_by_name("ACTOR_A_V/l1/kernel:0")
        l1_b = graph.get_tensor_by_name("ACTOR_A_V/l1/bias:0")
        l2_w = graph.get_tensor_by_name("ACTOR_A_V/l2/kernel:0")
        l2_b = graph.get_tensor_by_name("ACTOR_A_V/l2/bias:0")
        output_w = graph.get_tensor_by_name("ACTOR_A_V/action_output/kernel:0")
        output_b = graph.get_tensor_by_name("ACTOR_A_V/action_output/bias:0")
        l1_w_T = graph.get_tensor_by_name("ACTOR_AT_V/l1/kernel:0")
        l1_b_T = graph.get_tensor_by_name("ACTOR_AT_V/l1/bias:0")
        l2_w_T = graph.get_tensor_by_name("ACTOR_AT_V/l2/kernel:0")
        l2_b_T = graph.get_tensor_by_name("ACTOR_AT_V/l2/bias:0")
        output_w_T = graph.get_tensor_by_name("ACTOR_AT_V/action_output/kernel:0")
        output_b_T = graph.get_tensor_by_name("ACTOR_AT_V/action_output/bias:0")
        self.A_net_var_set = [l1_w, l1_b, l2_w, l2_b, output_w, output_b]
        self.AT_net_var_set = [l1_w_T, l1_b_T, l2_w_T, l2_b_T, output_w_T, output_b_T]

    def build_update_graph(self, rate):
        # hard copy: AT = A ; soft update: AT = rate*A + (1-rate)*AT
        self.update_T_net_completely_op_set = [tf.assign(i[1], i[0]) for i in
                                               zip(self.A_net_var_set, self.AT_net_var_set)]
        self.update_T_net_op_set = [tf.assign(i[1], i[0] * rate + i[1] * (1 - rate)) for i in
                                    zip(self.A_net_var_set, self.AT_net_var_set)]

    def build_cost_graph(self):
        # elementwise product of dQ/da (fed from the critic) and the actor's output,
        # both of shape [batch_size, action_dim]; its mean approximates the policy objective
        self.cost = tf.reduce_mean(self.action_gradient_input * self.action_output)
        # the learning rate must be negative: minimizing with Adam then performs
        # gradient ascent on the objective above
        self.train = tf.train.AdamOptimizer(-self.learning_rate).minimize(self.cost)

    def add_summary(self):
        self.summary_c = tf.summary.scalar('actor_cost', self.cost)

    def operation_get_action_to_environment(self, state):
        return self.sess.run(self.action_output, feed_dict={self.state_input: state})

    def operation_get_action_to_TDtarget(self, state):
        return self.sess.run(self.action_T_output, feed_dict={self.state_input: state})

    def operation_actor_learn(self, gradient, state):
        self.sess.run(self.train, feed_dict={self.action_gradient_input: gradient, self.state_input: state})

    def operation_update_TDnet_completely(self):
        self.sess.run(self.update_T_net_completely_op_set)

    def operation_update_TDnet(self):
        self.sess.run(self.update_T_net_op_set)
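A design note on build_cost_graph: handing Adam a negative learning rate works, but an equivalent and arguably clearer formulation negates the objective and keeps the learning rate positive:

    # equivalent: minimize the negated objective with a positive learning rate
    cost = -tf.reduce_mean(self.action_gradient_input * self.action_output)
    train = tf.train.AdamOptimizer(self.learning_rate).minimize(cost)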
=======================train.py=====================================
import gym
import tensorflow as tf
from Critic import Critic
from Actor import Actor
from experience_replay import Experience_replay
import numpy as np
class Train:
    def __init__(self, action_dim, state_dim):
        self.sess = tf.Session()
        self.action_dim = action_dim
        self.state_dim = state_dim
        self.batch_size = 32
        self.episodes = 0
        self.iterations = 0
        self.critic = Critic(self.sess, self.action_dim, self.state_dim)
        self.actor = Actor(self.sess, self.action_dim, self.state_dim)
        self.ER = Experience_replay(500000, self.action_dim, self.state_dim)
        self.sess.run(tf.global_variables_initializer())
        # start with the target networks equal to the online networks
        self.critic.operation_update_TDnet_completely()
        self.actor.operation_update_TDnet_completely()
        self.env = gym.make('Pendulum-v0')

    def add_noise_and_reshape(self, action, var):
        # Gaussian exploration noise, clipped to the tanh output range [-1, 1]
        return np.clip(np.random.normal(action, var), -1., 1.).reshape(self.action_dim)

    def operation_add_memory_by_episode(self, episodes, max_iters, var):
        for i in range(episodes):
            observation = self.env.reset()
            for j in range(max_iters):
                # self.env.render()
                action = self.actor.operation_get_action_to_environment(np.reshape(observation, (1, self.state_dim)))
                action_noise = self.add_noise_and_reshape(action, var)
                # scale the tanh output [-1, 1] to Pendulum's action range [-2, 2]
                observation_next, reward, done, _ = self.env.step(action_noise * 2)
                # shift and scale the reward (Pendulum's is roughly [-16, 0]) to a small range
                self.ER.experience_in((observation, action_noise, (reward + 5) / 100., observation_next, done))
                observation = observation_next
                if done:
                    # print 'done'
                    break

    def operation_train_actor(self, s):
        # policy update: feed dQ/da from the critic into the actor's cost
        action = self.actor.operation_get_action_to_environment(s)
        summary_g, gradient = self.critic.operation_get_gradient(action, s)
        self.actor.operation_actor_learn(gradient, s)
        return summary_g

    def operation_train_critic(self, s, a, r, ss, t):
        # the TD target uses the target actor and target critic on the next state
        action_next = self.actor.operation_get_action_to_TDtarget(ss)
        td_target = self.critic.operation_get_TDtarget(action_next, ss, r, t)
        summary_c, summary_Q = self.critic.operation_critic_learn(td_target, a, s)
        return summary_c, summary_Q

    def operation_write_summary(self, summary_g, summary_c, summary_Q, iters):
        self.critic.writer.add_summary(summary_g, iters)
        self.critic.writer.add_summary(summary_c, iters)
        self.critic.writer.add_summary(summary_Q, iters)

    def update_T_net(self):
        self.actor.operation_update_TDnet()
        self.critic.operation_update_TDnet()

train = Train(1, 3)
train.operation_add_memory_by_episode(500, 200, 0.4)  # pre-fill the replay memory (up to 500*200 = 100000 transitions)
for train.episodes in range(10000000):
    train.operation_add_memory_by_episode(1, 200, 0.4)  # perceive one episode
    for i in range(25):
        s, a, r, ss, t = train.ER.experience_out(train.batch_size)  # sample a minibatch
        actor_s = train.ER.experience_out_partly(train.batch_size, 100000)  # sample states for the actor
        summary_c, summary_Q = train.operation_train_critic(s, a, r, ss, t)  # critic learns
        summary_g = train.operation_train_actor(actor_s)  # actor learns
        train.update_T_net()  # soft-update the target nets
        if i % 9 == 0:  # write summaries at i = 0, 9, 18
            train.operation_write_summary(summary_g, summary_c, summary_Q, train.iterations)
        train.iterations += 1
The figure below is a summary of the Q value output by the critic network. The Q value rises from about -0.24 (its level before the actor network is trained) to about 0.4. Because I did not decay the action noise, later policy improvement is very slow.
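A straightforward remedy would be to decay the exploration noise over episodes. A minimal sketch against the training loop above (the 0.9995 decay factor and 0.05 floor are illustrative guesses, not tuned values):

    var = 0.4
    for train.episodes in range(10000000):
        train.operation_add_memory_by_episode(1, 200, var)  # perceive with decaying noise
        var = max(0.05, var * 0.9995)  # anneal exploration noise toward a small floor
        # ... the learning inner loop stays the same ...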
The figure below is a summary of the critic network's loss. It suggests that training the critic more often might help (currently 200 environment steps of perception for every 25 learning steps).
The figure below is a summary of the critic's action gradient, which reflects how the actor network is learning: the actor's training objective is to output actions at which the critic's gradient dQ/da is closer to 0, i.e. actions near an extremum (a maximum) of the critic's Q function.
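For reference, this is the deterministic policy gradient that the actor update approximates (standard DDPG notation: \mu is the actor with parameters \theta^\mu, Q the critic):

\nabla_{\theta^\mu} J \approx \frac{1}{N} \sum_i \nabla_a Q(s, a)\big|_{s=s_i,\, a=\mu(s_i)} \; \nabla_{\theta^\mu} \mu(s)\big|_{s=s_i}

In the code, \nabla_a Q comes from critic.operation_get_gradient, and the chain rule through \mu is realized by Actor.build_cost_graph together with its negative learning rate.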
It should be noted that training is not always smooth and often hits 'bumps', but it generally converges in the end.