Playing Atari Games with DQN

First, import the required libraries:

import numpy as np
import gym
import tensorflow as tf
from tensorflow.contrib.layers import flatten, conv2d, fully_connected
from collections import deque, Counter
import random
from datetime import datetime

Initialize the game environment:

env = gym.make("MsPacman-v0")
n_outputs = env.action_space.n
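
As a quick check (an illustrative snippet; the exact printed format depends on the gym version), MsPacman-v0 exposes raw 210x160 RGB frames and a discrete action space of 9 actions, so n_outputs is 9:

print(env.observation_space)   # Box(210, 160, 3) -- raw RGB game frames
print(env.action_space)        # Discrete(9)
print(n_outputs)               # 9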

Now, define a preprocess_observation function to preprocess the input game screen. Here we shrink the image and convert it to greyscale:

color = np.array([210, 164, 74]).mean()

def preprocess_observation(obs):

    # Crop and downsample the image
    img = obs[1:176:2, ::2]

    # Convert the image to greyscale
    img = img.mean(axis=2)

    # Improve the contrast of the image
    img[img==color] = 0

    # Normalize the image to the range -1 to +1
    img = (img - 128) / 128

    return img.reshape(88,80,1)
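
As a quick sanity check (a minimal sketch; it assumes the env defined above is available), the preprocessing turns a raw 210x160x3 RGB frame into an 88x80x1 greyscale array:

raw_frame = env.reset()                          # raw frame of shape (210, 160, 3)
processed = preprocess_observation(raw_frame)
print(raw_frame.shape, '->', processed.shape)    # expected: (210, 160, 3) -> (88, 80, 1)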

Now, define a q_network function to build the Q network. The network takes the game state X as input and consists of three convolutional layers with SAME padding, followed by a fully connected layer:

tf.reset_default_graph()

def q_network(X, name_scope):
    
    # Weight initializer for the layers
    initializer = tf.contrib.layers.variance_scaling_initializer()

    with tf.variable_scope(name_scope) as scope: 

        # Build the convolutional layers
        layer_1 = conv2d(X, num_outputs=32, kernel_size=(8,8), stride=4, padding='SAME', weights_initializer=initializer) 
        tf.summary.histogram('layer_1',layer_1)
        
        layer_2 = conv2d(layer_1, num_outputs=64, kernel_size=(4,4), stride=2, padding='SAME', weights_initializer=initializer)
        tf.summary.histogram('layer_2',layer_2)
        
        layer_3 = conv2d(layer_2, num_outputs=64, kernel_size=(3,3), stride=1, padding='SAME', weights_initializer=initializer)
        tf.summary.histogram('layer_3',layer_3)
        
        # Flatten the output of layer_3 before feeding it to the fully connected layer
        flat = flatten(layer_3)

        fc = fully_connected(flat, num_outputs=128, weights_initializer=initializer)
        tf.summary.histogram('fc',fc)
        
        output = fully_connected(fc, num_outputs=n_outputs, activation_fn=None, weights_initializer=initializer)
        tf.summary.histogram('output',output)
        

        # vars stores the network's parameters, i.e., the trainable weights
        vars = {v.name[len(scope.name):]: v for v in tf.get_collection(key=tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope.name)} 
        return vars, output

Next, define an epsilon_greedy function to implement the epsilon-greedy policy. Under this policy, we choose the best action with probability 1-epsilon and a random action with probability epsilon.
Since we do not want to explore forever, we use a decaying epsilon-greedy policy in which epsilon shrinks over time, so that as time passes the policy exploits the best action more and more:

epsilon = 0.5
eps_min = 0.05
eps_max = 1.0
eps_decay_steps = 500000

def epsilon_greedy(action, step):
    # Decay epsilon linearly with the number of steps taken
    epsilon = max(eps_min, eps_max - (eps_max - eps_min) * step/eps_decay_steps)
    if np.random.rand() < epsilon:
        return np.random.randint(n_outputs)
    else:
        return action
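
For intuition, the decay schedule can be inspected directly (an illustrative snippet only, not part of the training code): epsilon decays linearly from 1.0 down to the floor of 0.05 over the first 500,000 steps.

# Illustrative only: print the exploration rate at a few points of the schedule
for step in [0, 100000, 250000, 500000, 1000000]:
    eps = max(eps_min, eps_max - (eps_max - eps_min) * step / eps_decay_steps)
    print(step, round(eps, 3))   # 1.0, 0.81, 0.525, 0.05, 0.05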

Now, initialize the experience replay buffer with a length of 20,000, which holds the agent's experience.

buffer_len = 20000
exp_buffer = deque(maxlen=buffer_len)

All of the agent's experience (state, action, reward) is stored in the replay buffer, and mini-batches of experience are sampled from it to train the network:

def sample_memories(batch_size):
    perm_batch = np.random.permutation(len(exp_buffer))[:batch_size]
    mem = np.array(exp_buffer)[perm_batch]
    return mem[:,0], mem[:,1], mem[:,2], mem[:,3], mem[:,4]
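
The round trip through the buffer can be checked with dummy data (purely an illustrative sketch under the old NumPy versions this TF 1.x code targets; the dummy transitions are cleared again before real training):

# Illustrative only: append a few dummy transitions and draw a mini-batch
for _ in range(5):
    dummy_state = np.zeros((88, 80, 1))
    exp_buffer.append([dummy_state, 0, dummy_state, 0.0, False])
b_states, b_actions, b_next_states, b_rewards, b_dones = sample_memories(batch_size=3)
print(len(b_states), b_actions, b_rewards, b_dones)
exp_buffer.clear()   # remove the dummy data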

Next, define all the hyperparameters:

num_episodes = 800
batch_size = 48
input_shape = (None, 88, 80, 1)
learning_rate = 0.001
X_shape = (None, 88, 80, 1)
discount_factor = 0.97

global_step = 0
copy_steps = 100
steps_train = 4
start_steps = 2000
logdir = 'logs'
tf.reset_default_graph()
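
Here copy_steps controls how often the two networks are synchronized, steps_train means the network is trained once every 4 steps, start_steps is the number of warm-up steps collected before training begins, and discount_factor is the gamma used in the TD target; their roles can be seen in the training loop below.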

Now, define a placeholder for the input, i.e., the game state:

X = tf.placeholder(tf.float32, shape=X_shape)

Define a boolean placeholder, in_training_mode, to toggle training:

in_training_mode = tf.placeholder(tf.bool)

Build the main Q network, which takes the input X and produces the Q values for all actions in that state:

mainQ, mainQ_outputs = q_network(X, 'mainQ')

Similarly, build the target Q network:

targetQ, targetQ_outputs = q_network(X, 'targetQ')

Define a placeholder for the actions that were taken, and compute the Q value of the chosen action:

X_action = tf.placeholder(tf.int32, shape=(None,))
Q_action = tf.reduce_sum(targetQ_outputs * tf.one_hot(X_action, n_outputs), axis=-1, keep_dims=True)
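
To illustrate what this masking does, here is the same idea in plain NumPy (an illustrative sketch, not TensorFlow code):

# Illustrative only: select the Q value of the chosen action via a one-hot mask
q_values = np.array([[1.0, 3.5, -0.2]])   # Q values for 3 hypothetical actions
chosen = np.array([1])                     # the action that was actually taken
mask = np.eye(3)[chosen]                   # [[0., 1., 0.]]
print(np.sum(q_values * mask, axis=-1, keepdims=True))   # [[3.5]]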

Define an op to synchronize the two networks. Note that in this implementation the targetQ network is the one whose weights are actually trained below, and copy_target_to_main assigns its parameters to the mainQ network, which is used for action selection and for computing the bootstrap targets:

copy_op = [tf.assign(main_name, targetQ[var_name]) for var_name, main_name in mainQ.items()]
copy_target_to_main = tf.group(*copy_op)

Define a placeholder for the output, i.e., the target Q value y:

y = tf.placeholder(tf.float32, shape=(None,1))

Then compute the loss, which is the mean squared difference between the target value and the predicted value:

loss = tf.reduce_mean(tf.square(y - Q_action))
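
The target y fed into this loss is built later in the training loop as the one-step TD target, y = reward + discount_factor * max_a' Q(next_state, a') for non-terminal transitions, and simply y = reward when done is true.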

Use AdamOptimizer to minimize the loss:

optimizer = tf.train.AdamOptimizer(learning_rate)
training_op = optimizer.minimize(loss)

Set up the log file for visualizing training in TensorBoard:

loss_summary = tf.summary.scalar('LOSS', loss)
merge_summary = tf.summary.merge_all()
file_writer = tf.summary.FileWriter(logdir, tf.get_default_graph())
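
Once training is running, the logged summaries can be viewed by launching TensorBoard with the command tensorboard --logdir logs and opening the address it prints in a browser.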

Now, start the TensorFlow session and run the model:

init = tf.global_variables_initializer()
with tf.Session() as sess:
    init.run()
    
    # For each episode
    for i in range(num_episodes):
        done = False
        obs = env.reset()
        epoch = 0
        episodic_reward = 0
        actions_counter = Counter() 
        episodic_loss = []

        # While the state is not a terminal state
        while not done:

            # env.render()
        
            # Get the preprocessed game screen
            obs = preprocess_observation(obs)

            # Feed the game screen to the network and get the Q values for each action
            actions = mainQ_outputs.eval(feed_dict={X:[obs], in_training_mode:False})

            # Get the greedy action
            action = np.argmax(actions, axis=-1)[0]
            actions_counter[str(action)] += 1 

            # Select the action with the epsilon-greedy policy
            action = epsilon_greedy(action, global_step)
            
            # Perform the action and move to the next state, next_obs, receiving the reward
            next_obs, reward, done, _ = env.step(action)

            # Store this transition as an experience in the replay buffer
            exp_buffer.append([obs, action, preprocess_observation(next_obs), reward, done])
            
            # After the warm-up period, train the Q network every few steps with samples from the replay buffer
            if global_step % steps_train == 0 and global_step > start_steps:
                
                # Sample a mini-batch of experiences
                o_obs, o_act, o_next_obs, o_rew, o_done = sample_memories(batch_size)

                # States
                o_obs = [x for x in o_obs]

                # Next states
                o_next_obs = [x for x in o_next_obs]

                # Q values of the next states
                next_act = mainQ_outputs.eval(feed_dict={X:o_next_obs, in_training_mode:False})

                # Target values: reward plus the discounted max Q value of the next state (0 if terminal)
                y_batch = o_rew + discount_factor * np.max(next_act, axis=-1) * (1-o_done) 

                # Merge all the summaries and write them to the log file
                mrg_summary = merge_summary.eval(feed_dict={X:o_obs, y:np.expand_dims(y_batch, axis=-1), X_action:o_act, in_training_mode:False})
                file_writer.add_summary(mrg_summary, global_step)

                # Now train the network and compute the loss
                train_loss, _ = sess.run([loss, training_op], feed_dict={X:o_obs, y:np.expand_dims(y_batch, axis=-1), X_action:o_act, in_training_mode:True})
                episodic_loss.append(train_loss)
            
            # Periodically copy the weights of the trained (targetQ) network to the mainQ network
            if (global_step+1) % copy_steps == 0 and global_step > start_steps:
                copy_target_to_main.run()
                
            obs = next_obs
            epoch += 1
            global_step += 1
            episodic_reward += reward
        
        print('Epoch', epoch, 'Reward', episodic_reward)