首先,导入所需的库:
import numpy as np
import gym
import tensorflow as tf
from tensorflow.contrib.layers import flatten, conv2d, fully_connected
from collections import deque, Counter
import random
from datetime import datetime
初始化游戏环境
# Create the Ms. Pac-Man environment through Gym's registry.
env = gym.make("MsPacman-v0")
# Size of the action space; used below as the width of the Q-network's output
# layer and as the upper bound when drawing a random action.
n_outputs = env.action_space.n
现在,定义一个preprocess_observation函数来对输入的游戏画面进行预处理。在此,缩减图像大小并将图像转换为灰度图像:
# Channel mean of the RGB colour (210, 164, 74). Grayscale pixels equal to this
# value are zeroed in preprocess_observation to raise contrast.
color = np.array([210, 164, 74]).mean()


def preprocess_observation(obs):
    """Convert a raw RGB frame into a small, normalised grayscale image.

    Args:
        obs: Array indexed as ``obs[row, col, channel]``. The crop below must
            yield 88 x 80 pixels for the final reshape to succeed (for
            example a 210 x 160 x 3 frame).

    Returns:
        Float array of shape ``(88, 80, 1)``. For 0..255 input the values lie
        between -1 and just under +1.
    """
    # Crop to rows 1..175 and keep every second row and column.
    img = obs[1:176:2, ::2]
    # Average the colour channels to obtain a grayscale image.
    img = img.mean(axis=2)
    # Improve contrast by blanking the pixels that match `color`.
    img[img == color] = 0
    # Normalise to -1..+1. The earlier expression subtracted an extra 1 here,
    # which shifted the range to -2..0.
    img = (img - 128) / 128
    return img.reshape(88, 80, 1)
好的,现在来定义一个q_network函数来构建Q网络。Q网络的输入为游戏状态 X。构建的Q网络包括3个采用相同填充(padding='SAME')的卷积层、1个128单元的全连接层,以及1个不带激活函数的全连接输出层:
tf.reset_default_graph()


def q_network(X, name_scope):
    """Build a convolutional Q-network inside the variable scope ``name_scope``.

    The network stacks three SAME-padded convolutions, flattens the result,
    applies a 128-unit fully connected layer and ends with a fully connected
    layer of ``n_outputs`` units without an activation function. A histogram
    summary is registered for every layer's output.

    Args:
        X: Input tensor fed to the first convolution.
        name_scope: Name of the variable scope that will own the variables.

    Returns:
        A ``(variables, output)`` pair: a dict mapping each trainable
        variable's name (with the scope name stripped from the front) to the
        variable, and the output tensor of the final layer.
    """
    weight_init = tf.contrib.layers.variance_scaling_initializer()

    with tf.variable_scope(name_scope) as scope:
        # Convolutional stack.
        conv_a = conv2d(X, num_outputs=32, kernel_size=(8, 8), stride=4,
                        padding='SAME', weights_initializer=weight_init)
        tf.summary.histogram('layer_1', conv_a)

        conv_b = conv2d(conv_a, num_outputs=64, kernel_size=(4, 4), stride=2,
                        padding='SAME', weights_initializer=weight_init)
        tf.summary.histogram('layer_2', conv_b)

        conv_c = conv2d(conv_b, num_outputs=64, kernel_size=(3, 3), stride=1,
                        padding='SAME', weights_initializer=weight_init)
        tf.summary.histogram('layer_3', conv_c)

        # Flatten the last convolution's output before the dense layers.
        flattened = flatten(conv_c)

        hidden = fully_connected(flattened, num_outputs=128,
                                 weights_initializer=weight_init)
        tf.summary.histogram('fc', hidden)

        q_values = fully_connected(hidden, num_outputs=n_outputs,
                                   activation_fn=None,
                                   weights_initializer=weight_init)
        tf.summary.histogram('output', q_values)

        # Collect this scope's trainable variables, keyed by their name
        # relative to the scope so two networks can be matched up by key.
        prefix_len = len(scope.name)
        trainable = tf.get_collection(key=tf.GraphKeys.TRAINABLE_VARIABLES,
                                      scope=scope.name)
        params = {}
        for variable in trainable:
            params[variable.name[prefix_len:]] = variable

    return params, q_values
接下来,定义一个epsilon_greedy函数来执行epsilon贪婪策略。在epsilon贪婪策略中,以概率 1-epsilon 选择最佳行为,并以概率epsilon选择随机行为。
由于并不希望智能体一直探索,因此采用一种衰减的epsilon贪婪策略:epsilon值会随着时间推移线性衰减(最低降到eps_min)。这样,随着训练的进行,该策略会越来越倾向于选择当前最佳的行为:
# NOTE: epsilon_greedy does not read this module-level value; it computes its
# own local `epsilon` from the schedule below. Kept so the name stays defined.
epsilon = 0.5
# Lower bound of the exploration rate.
eps_min = 0.05
# Exploration rate at step 0.
eps_max = 1.0
# Number of steps over which the rate falls linearly from eps_max to eps_min.
eps_decay_steps = 500000


def epsilon_greedy(action, step):
    """Return ``action`` or, with probability epsilon, a random action.

    Epsilon decays linearly from ``eps_max`` at step 0 to ``eps_min`` at
    ``eps_decay_steps`` and stays at ``eps_min`` afterwards.

    Args:
        action: The greedy action to return when not exploring.
        step: Current global step, used to position epsilon on the schedule.

    Returns:
        ``action`` unchanged, or an integer drawn uniformly from
        ``[0, n_outputs)``.
    """
    # An unused extra random draw was removed here: only one sample is needed
    # to decide between exploring and exploiting.
    epsilon = max(eps_min, eps_max - (eps_max - eps_min) * step / eps_decay_steps)
    if np.random.rand() < epsilon:
        return np.random.randint(n_outputs)
    return action
现在,初始化一个容量为20000的经验回放缓存,用于保存经验。
# Maximum number of transitions kept in the replay buffer.
buffer_len = 20000
# Bounded queue of transitions: once full, appending discards the oldest entry.
exp_buffer = deque([], buffer_len)
将智能体的所有经验(状态、行为、下一状态、奖励、是否终止)都保存在经验回放缓存中,并从中随机抽取小批量经验来训练网络:
def sample_memories(batch_size):
    """Draw a random mini-batch of transitions from ``exp_buffer``.

    Only the sampled rows are touched. The earlier version converted the whole
    buffer to a NumPy array on every call, which is slow and raises
    ``ValueError`` on NumPy >= 1.24 because each row mixes arrays and scalars.

    Args:
        batch_size: Maximum number of transitions to return. Fewer are
            returned when the buffer holds fewer than ``batch_size`` entries.

    Returns:
        Five 1-D object arrays, one per position of the stored transition
        (positions 0 to 4), each as long as the number of sampled rows.
    """
    # Same sampling scheme as before: a random permutation, truncated.
    picked = np.random.permutation(len(exp_buffer))[:batch_size]
    rows = [exp_buffer[index] for index in picked]

    columns = []
    for field in range(5):
        # Fill an object array element by element so NumPy never tries to
        # broadcast the per-row arrays into a single rectangular array.
        column = np.empty(len(rows), dtype=object)
        for position, row in enumerate(rows):
            column[position] = row[field]
        columns.append(column)
    return tuple(columns)
接着,定义所有的超参数:
# Number of episodes the training loop runs.
num_episodes = 800
# Number of transitions requested from the replay buffer per training step.
batch_size = 48
# Same value as X_shape; not referenced by the rest of the visible code.
input_shape = (None, 88, 80, 1)
# Learning rate passed to the Adam optimizer.
learning_rate = 0.001
# Shape of the state placeholder: any batch size of 88 x 80 x 1 frames, which
# is the shape preprocess_observation returns.
X_shape = (None, 88, 80, 1)
# Discount applied to the next-state value when building training targets.
discount_factor = 0.97
# Counter of environment steps across all episodes; drives the epsilon
# schedule and the train/copy intervals.
global_step = 0
# The weight-copy op runs when (global_step + 1) is a multiple of this.
copy_steps = 100
# A training step runs when global_step is a multiple of this.
steps_train = 4
# Neither training nor weight copying happens until global_step exceeds this.
start_steps = 2000
# Directory the TensorBoard summary writer writes to.
logdir = 'logs'
# Clear the default graph before the placeholders and networks are created.
tf.reset_default_graph()
这时,定义输入的placeholder,如游戏状态
# Placeholder for a batch of preprocessed game frames, shaped per X_shape.
X = tf.placeholder(tf.float32, shape=X_shape)
定义布尔型placeholder in_training_mode,用于切换训练模式:
# Boolean placeholder fed on every run call in the training loop. q_network as
# written in this file only takes X and a scope name, so no visible op reads it.
in_training_mode = tf.placeholder(tf.bool)
取输入X来构建Q网络,并生成该状态下所有行为的Q值:
扫描二维码关注公众号,回复:
10258708 查看本文章
# Build the network under scope 'mainQ': `mainQ` is its name -> variable dict
# and `mainQ_outputs` the tensor of per-action outputs. The training loop
# evaluates this tensor both to pick actions and to compute bootstrap targets.
mainQ, mainQ_outputs = q_network(X, 'mainQ')
同理,构建目标Q网络:
# Build a second network with the same architecture under scope 'targetQ'.
# NOTE(review): the loss below is computed from `targetQ_outputs`, so despite
# its name this appears to be the network the optimizer updates - confirm.
targetQ, targetQ_outputs = q_network(X, 'targetQ')
定义行为值的placeholder:
# Placeholder for the indices of the actions taken, one per sample.
X_action = tf.placeholder(tf.int32, shape=(None,))
# Output value of the taken action: the one-hot mask zeroes every other action
# and the sum over the last axis keeps that axis as a size-1 dimension.
# NOTE(review): this reads targetQ_outputs, so gradients of the loss flow into
# the 'targetQ' scope variables rather than the 'mainQ' ones.
Q_action = tf.reduce_sum(targetQ_outputs * tf.one_hot(X_action, n_outputs), axis=-1, keep_dims=True)
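定义参数复制操作。注意 tf.assign(ref, value) 是把 value 写入 ref,因此下面的操作是把 targetQ 网络各变量的值赋给 mainQ 网络中同名的变量(与操作名 copy_target_to_main 一致):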
# One assign op per variable, pairing the two networks by scope-relative name.
# tf.assign(ref, value) writes `value` into `ref`, so every 'mainQ' variable
# receives the current value of its 'targetQ' counterpart.
copy_op = [tf.assign(main_name, targetQ[var_name]) for var_name, main_name in mainQ.items()]
# Single op that runs all of the assignments.
copy_target_to_main = tf.group(*copy_op)
定义目标值 y 的placeholder(训练时传入由奖励和下一状态最大Q值计算得到的目标Q值):
# Placeholder for the regression targets, one value per sample. The training
# loop feeds reward + discount_factor * max next-state value * (1 - done).
y = tf.placeholder(tf.float32, shape=(None,1))
然后计算损失,即实际值与预测值之间的均方差:
# Mean squared error between the targets and the values of the taken actions.
loss = tf.reduce_mean(tf.square(y - Q_action))
采用 AdamOptimizer 来最小化损失:
# Adam optimizer configured with the learning rate defined above.
optimizer = tf.train.AdamOptimizer(learning_rate)
# Op that performs one optimisation step on `loss` each time it is run.
training_op = optimizer.minimize(loss)
在 TensorBoard中设置可视化的日志文件:
# Scalar summary of the loss, logged under the tag 'LOSS'.
loss_summary = tf.summary.scalar('LOSS', loss)
# Merge every summary registered so far: the loss scalar above and the
# histograms added inside q_network.
merge_summary = tf.summary.merge_all()
# Writer that stores the summaries and the graph definition under `logdir`.
file_writer = tf.summary.FileWriter(logdir, tf.get_default_graph())
这时,启动TensorFlow会话并运行模型:
# Run the training loop: play `num_episodes` episodes, store every transition
# in the replay buffer, and periodically train on sampled mini-batches.
init = tf.global_variables_initializer()
with tf.Session() as sess:
    init.run()

    # One iteration per episode.
    for i in range(num_episodes):
        done = False
        obs = env.reset()
        epoch = 0
        episodic_reward = 0
        actions_counter = Counter()
        episodic_loss = []

        # Step until the environment reports the end of the episode.
        while not done:
            # env.render()

            # Preprocess the current frame and score every action with mainQ.
            obs = preprocess_observation(obs)
            actions = mainQ_outputs.eval(feed_dict={X: [obs], in_training_mode: False})

            # Greedy action as a plain int. argmax over a batch of one returns
            # a length-1 array; using it directly stored a mix of arrays and
            # ints in the replay buffer and passed an array to env.step.
            action = int(np.argmax(actions, axis=-1)[0])
            actions_counter[str(action)] += 1

            # Possibly replace the greedy action with a random one.
            action = epsilon_greedy(action, global_step)

            # Apply the action and observe the next frame and reward.
            next_obs, reward, done, _ = env.step(action)

            # Store the transition in the replay buffer.
            exp_buffer.append([obs, action, preprocess_observation(next_obs), reward, done])

            # Train every `steps_train` steps once past the warm-up period.
            if global_step % steps_train == 0 and global_step > start_steps:
                # Sample a mini-batch of stored transitions.
                o_obs, o_act, o_next_obs, o_rew, o_done = sample_memories(batch_size)

                # Lists of frames are what the X placeholder is fed with.
                o_obs = list(o_obs)
                o_next_obs = list(o_next_obs)

                # Value estimates of the next states, taken from mainQ.
                next_act = mainQ_outputs.eval(feed_dict={X: o_next_obs, in_training_mode: False})

                # Bootstrapped target; (1 - done) removes the bootstrap term
                # for transitions that ended an episode.
                y_batch = o_rew + discount_factor * np.max(next_act, axis=-1) * (1 - o_done)
                y_column = np.expand_dims(y_batch, axis=-1)

                # Evaluate all merged summaries and write them to the log.
                mrg_summary = merge_summary.eval(feed_dict={X: o_obs, y: y_column, X_action: o_act, in_training_mode: False})
                file_writer.add_summary(mrg_summary, global_step)

                # One optimisation step; keep the loss for this episode.
                train_loss, _ = sess.run([loss, training_op], feed_dict={X: o_obs, y: y_column, X_action: o_act, in_training_mode: True})
                episodic_loss.append(train_loss)

            # Periodically run the copy op, which assigns the 'targetQ'
            # variables' values to the 'mainQ' variables.
            if (global_step + 1) % copy_steps == 0 and global_step > start_steps:
                copy_target_to_main.run()

            obs = next_obs
            epoch += 1
            global_step += 1
            episodic_reward += reward

        # Report the number of steps and total reward of the finished episode.
        print('Epoch', epoch, 'Reward', episodic_reward,)