Use policy gradients to solve a problem with a discrete action space.
One, import packages and define hyperparameters:
import gym
import tensorflow as tf
import numpy as np
from collections import deque

################# hyper parameters #################
# discount factor
GAMMA = 0.95
LEARNING_RATE = 0.01
Two, the Policy_Gradient agent constructor:
1. Record the state space dimension and the action space dimension;
2. Set up the storage structure for the sampled episode (states, actions, rewards);
3. Call a function to create the neural network that approximates the policy, create the TensorFlow session, and initialize the network weights and biases.
def __init__(self, env):
    #self.time_step = 0
    # state dimension
    self.state_dim = env.observation_space.shape[0]
    # action dimension
    self.action_dim = env.action_space.n
    # sample lists for the current episode
    self.ep_obs, self.ep_as, self.ep_rs = [], [], []
    # create the policy network
    self.create_softmax_network()
    self.session = tf.InteractiveSession()
    self.session.run(tf.global_variables_initializer())
Three, create the neural network:
Here we use a cross-entropy error function and let TensorFlow compute the gradient of the loss through the network. The output layer applies a softmax, producing the probability of each action.
tf.nn.sparse_softmax_cross_entropy_with_logits first applies a softmax to the logits to obtain normalized probabilities, treats the integer labels as one-hot vectors, and then computes the cross entropy between the labels and the softmaxed logits:
H = -Σ_i y'_i * log(y_i), where y'_i is the i-th value of the (one-hot) label and y_i is the corresponding component output by the softmax normalization. As the classification becomes more accurate, the component y_i corresponding to the true label gets closer to 1, and the value of H gets smaller.
Therefore, my own understanding here is: at time step i, if the action-probability vector output by the policy network is closer to the action actually sampled at time step i, the cross entropy is smaller. Minimizing the cross-entropy error therefore pushes the policy network's decisions closer to the sampled actions. Finally, the cross entropy at each time step is multiplied by the corresponding reward, which introduces the magnitude of the reward into the loss function: the larger entropy * reward is, the more the gradient computed during training adjusts the network parameters in that direction.
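To make this concrete, here is a small NumPy sketch (my own toy example with made-up numbers, not part of the original post) of what tf.nn.sparse_softmax_cross_entropy_with_logits computes for a single time step, and how multiplying by the return weights it:

import numpy as np

# one time step: network logits over 2 actions, the sampled action, and its return v_t
logits = np.array([2.0, 0.5])
action = 0
v_t = 1.5

# softmax-normalize the logits into action probabilities
probs = np.exp(logits - logits.max())
probs /= probs.sum()

# sparse softmax cross entropy with an integer label is just -log p(action)
neg_log_prob = -np.log(probs[action])

# the per-step loss term is weighted by the return: loss_t = -log pi(a_t | s_t) * v_t
loss_t = neg_log_prob * v_t
print(probs, neg_log_prob, loss_t)

The larger the probability the network assigns to the sampled action, the smaller neg_log_prob; a larger return v_t makes that step count more in the total loss.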
def create_softmax_network(self):
    W1 = self.weight_variable([self.state_dim, 20])
    b1 = self.bias_variable([20])
    W2 = self.weight_variable([20, self.action_dim])
    b2 = self.bias_variable([self.action_dim])
    # input layer
    self.state_input = tf.placeholder(tf.float32, [None, self.state_dim])
    self.tf_acts = tf.placeholder(tf.int32, [None, ], name='actions_num')
    self.tf_vt = tf.placeholder(tf.float32, [None, ], name="actions_value")
    # hidden layer
    h_layer = tf.nn.relu(tf.matmul(self.state_input, W1) + b1)
    # softmax layer
    self.softmax_input = tf.matmul(h_layer, W2) + b2
    # softmax output
    self.all_act_prob = tf.nn.softmax(self.softmax_input, name='act_prob')
    # cross entropy loss function
    self.neg_log_prob = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=self.softmax_input,
                                                                       labels=self.tf_acts)
    self.loss = tf.reduce_mean(self.neg_log_prob * self.tf_vt)  # reward guided loss
    self.train_op = tf.train.AdamOptimizer(LEARNING_RATE).minimize(self.loss)

def weight_variable(self, shape):
    initial = tf.truncated_normal(shape)  # truncated normal distribution
    return tf.Variable(initial)

def bias_variable(self, shape):
    initial = tf.constant(0.01, shape=shape)
    return tf.Variable(initial)
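The training loop in section Six calls agent.choose_action(state), which this walkthrough does not show explicitly. A minimal sketch, assuming the all_act_prob output defined above and following the sampling approach used in the referenced repository:

def choose_action(self, observation):
    # run the policy network on a single state to get the action probabilities
    prob_weights = self.session.run(self.all_act_prob,
                                    feed_dict={self.state_input: observation[np.newaxis, :]})
    # sample an action index according to those probabilities
    action = np.random.choice(range(prob_weights.shape[1]), p=prob_weights.ravel())
    return action

Sampling from the softmax (rather than always taking the argmax) is what gives the policy gradient method its exploration.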
Four, sampling an episode:
def store_transition(self, s, a, r):
    self.ep_obs.append(s)
    self.ep_as.append(a)
    self.ep_rs.append(r)
Five, model learning:
The neural network is updated from a complete episode obtained by Monte Carlo sampling.
def learn(self):
    # evaluate the discounted return of every state in the current episode
    discounted_ep_rs = np.zeros_like(self.ep_rs)
    running_add = 0
    for t in reversed(range(0, len(self.ep_rs))):
        running_add = running_add * GAMMA + self.ep_rs[t]
        discounted_ep_rs[t] = running_add
    # normalization
    discounted_ep_rs -= np.mean(discounted_ep_rs)
    discounted_ep_rs /= np.std(discounted_ep_rs)

    # train on the episode
    self.session.run(self.train_op, feed_dict={
        self.state_input: np.vstack(self.ep_obs),
        self.tf_acts: np.array(self.ep_as),
        self.tf_vt: discounted_ep_rs,
    })
    self.ep_obs, self.ep_as, self.ep_rs = [], [], []  # empty episode data
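As a quick check of the backward loop above, here is a toy example (my own, not from the original) with GAMMA = 0.95 and three unit rewards:

import numpy as np

GAMMA = 0.95
ep_rs = [1.0, 1.0, 1.0]  # toy per-step rewards

discounted = np.zeros_like(ep_rs)
running_add = 0.0
for t in reversed(range(len(ep_rs))):
    running_add = running_add * GAMMA + ep_rs[t]
    discounted[t] = running_add
print(discounted)  # [2.8525, 1.95, 1.0]: earlier steps accumulate more future reward

# the same standardization as in learn(), so returns are comparable across episodes
discounted = (discounted - np.mean(discounted)) / np.std(discounted)
print(discounted)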
Six, training:
# Hyper Parameters
ENV_NAME = 'CartPole-v0'
EPISODE = 3000  # Episode limitation
STEP = 3000     # Step limitation in an episode
TEST = 10       # The number of test experiments every 100 episodes

def main():
    # initialize the OpenAI Gym env and the agent
    env = gym.make(ENV_NAME)
    agent = Policy_Gradient(env)

    for episode in range(EPISODE):
        # initialize task
        state = env.reset()
        # Train
        for step in range(STEP):
            action = agent.choose_action(state)  # sample an action from the policy for training
            # take action
            next_state, reward, done, _ = env.step(action)
            # store the sample
            agent.store_transition(state, action, reward)
            state = next_state
            if done:
                # print("stick for ", step, " steps")
                # model learning after a complete episode has been sampled
                agent.learn()
                break

        # Test every 100 episodes
        if episode % 100 == 0:
            total_reward = 0
            for i in range(TEST):
                state = env.reset()
                for j in range(STEP):
                    env.render()
                    action = agent.choose_action(state)  # direct action for test
                    state, reward, done, _ = env.step(action)
                    total_reward += reward
                    if done:
                        break
            ave_reward = total_reward / TEST
            print('episode: ', episode, 'Evaluation Average Reward: ', ave_reward)

if __name__ == '__main__':
    main()
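One practical note (my addition, depending on your environment): the code above uses TensorFlow 1.x APIs (placeholders, InteractiveSession). If only TensorFlow 2.x is installed, it can typically still be run through the v1 compatibility layer:

import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()

The Gym calls also follow the older API (env.reset() returning only the state, env.step() returning four values), so an older gym release is assumed.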
References:
https://www.cnblogs.com/pinard/p/10137696.html
https://github.com/ljpzzz/machinelearning/blob/master/reinforcement-learning/policy_gradient.py