Understanding and Implementing the Double Deep Q-Learning Network

Copyright notice: All posts on this blog are personal notes; feel free to leave a comment to discuss. Reproduction without permission is not allowed. https://blog.csdn.net/qq_35976351/article/details/89223000

Theory Overview

The underlying theory of the Double Deep Q-Learning Network (Double DQN) comes from this paper; for background, refer to this note and this note. The core reinforcement-learning formula is given below:
$$Y_{t}^{DoubleQ} = R_{t+1} + \gamma\,\hat{Q}\left(S_{t+1}, \mathop{argmax}_{a} Q\left(S_{t+1}, a\right)\right)$$
The algorithm uses two neural networks with the same architecture but different parameters.

First, the $Q$ network: this is the same $Q$ network as in DQN, and it is the network that gets trained. The $\hat{Q}$ network has the same architecture as $Q$, but its parameters are a copy taken several steps earlier; it is used to compute the evaluation score. The formula above means the following:

  • Use $Q$ to select, in state $S_{t+1}$, the index of the action with the highest score
  • Use $\hat{Q}$ to evaluate the score of that action
  • Compare this score with the online network's estimate for the action that was actually taken (note that these may not be the same action), then backpropagate the error; a minimal sketch of this target computation follows after this list
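
For reference, here is a minimal NumPy sketch of the target computation described above. It is an illustrative assumption, not code from the original post: the function name double_dqn_targets and the toy arrays are made up, with q_online_next playing the role of $Q(S_{t+1},\cdot)$ and q_target_next the role of $\hat{Q}(S_{t+1},\cdot)$.

import numpy as np

GAMMA = 0.95


def double_dqn_targets(rewards, dones, q_online_next, q_target_next, gamma=GAMMA):
    # Action selection with the online network: argmax_a Q(S_{t+1}, a)
    best_actions = np.argmax(q_online_next, axis=1)
    # Action evaluation with the target network: Q_hat(S_{t+1}, argmax_a Q(S_{t+1}, a))
    evaluated = q_target_next[np.arange(len(best_actions)), best_actions]
    # Terminal transitions keep only the immediate reward
    return rewards + gamma * evaluated * (1.0 - dones.astype(np.float32))


# Toy batch: 3 transitions, 2 actions
rewards = np.array([1.0, 1.0, -1.0])
dones = np.array([False, False, True])
q_online_next = np.array([[0.2, 0.5], [0.9, 0.1], [0.3, 0.4]])
q_target_next = np.array([[0.1, 0.6], [0.8, 0.2], [0.5, 0.5]])
print(double_dqn_targets(rewards, dones, q_online_next, q_target_next))
# e.g. first element: 1.0 + 0.95 * 0.6 = 1.57

The Agent class below performs exactly this computation, but one sample at a time inside train_from_buffer.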

Code Implementation

The basic code skeleton comes from this blog post.

Agent.py — the reinforcement-learning agent

import tensorflow as tf
from tensorflow import keras
from collections import deque
import numpy as np
import random

MAX_LEN = 10000
BATCH_SIZE = 64
GAMMA = 0.95
EXPLORATION_DECAY = 0.995
EXPLORATION_MIN = 0.1


class Agent(object):
    def __init__(self, input_space, output_space, lr=0.001, exploration=0.9, update_model_step=10):
        self._model = keras.Sequential()
        self._model.add(keras.layers.Dense(input_shape=(input_space,), units=24, activation=tf.nn.relu))
        self._model.add(keras.layers.Dense(units=24, activation=tf.nn.relu))
        self._model.add(keras.layers.Dense(units=output_space, activation='linear'))
        self._model.compile(loss='mse', optimizer=keras.optimizers.Adam(lr))

        self._replayBuffer = deque(maxlen=MAX_LEN)
        self._exploration = exploration
        self._output_space = output_space  # number of discrete actions

        # Target network: same architecture, weights synced from the online network
        self._target_model = keras.models.clone_model(self._model)
        self._target_model.set_weights(self._model.get_weights())

        self._update_model_step = update_model_step  # minimum number of steps between target updates (unused here; train.py updates once per episode)
        self._cur_step = 0  # number of forward passes made with the current model (unused)

    def update_target_model(self):
        self._target_model.set_weights(self._model.get_weights())

    @property
    def exploration(self):
        return self._exploration

    def add_data(self, state, action, reward, state_next, done):
        self._replayBuffer.append((state, action, reward, state_next, done))

    def act(self, state):
        # Epsilon-greedy: explore with probability self._exploration
        if np.random.uniform() <= self._exploration:
            return np.random.randint(0, self._output_space)
        q_values = self._model.predict(state)
        return np.argmax(q_values[0])

    def train_from_buffer(self):
        if len(self._replayBuffer) < BATCH_SIZE:
            return

        batch = random.sample(self._replayBuffer, BATCH_SIZE)
        for state, action, reward, state_next, done in batch:
            # Action selection with the online network
            new_action = np.argmax(self._model.predict(state_next)[0])
            q_update = reward
            if not done:
                # Double DQN target: evaluate the selected action with the target network
                q_update = reward + GAMMA * self._target_model.predict(state_next)[0][new_action]
                # q_update = reward + GAMMA * np.amax(self._model.predict(state_next)[0])  # vanilla DQN target
            q_values = self._model.predict(state)
            q_values[0][action] = q_update
            self._model.fit(state, q_values, verbose=0)
            # Decay the exploration rate
            self._exploration *= EXPLORATION_DECAY
            self._exploration = max(EXPLORATION_MIN, self._exploration)

train.py — the training script

import gym
from Agent import Agent
import numpy as np
import matplotlib.pyplot as plt


def train():
    env = gym.make("CartPole-v1")
    input_space = env.observation_space.shape[0]
    output_space = env.action_space.n
    print(input_space, output_space)
    agent = Agent(input_space, output_space)
    run = 0
    x = []
    y = []
    while run < 100:
        run += 1
        state = env.reset()
        state = np.reshape(state, [1, -1])
        step = 0
        while True:
            step += 1
            # env.render()
            action = agent.act(state)
            state_next, reward, done, _ = env.step(action)  # classic gym API: step returns (obs, reward, done, info)
            reward = reward if not done else -reward  # penalize the transition that ends the episode
            state_next = np.reshape(state_next, [1, -1])
            agent.add_data(state, action, reward, state_next, done)
            state = state_next
            if done:
                print("Run: " + str(run) + ", exploration: " +
                      str(agent.exploration) + ", score:" + str(step))
                # Update the target network once per episode; the frequency can be adjusted as needed
                agent.update_target_model()
                x.append(run)
                y.append(step)
                break
            agent.train_from_buffer()
    plt.plot(x, y)
    plt.show()


if __name__ == "__main__":
    train()

Training Results

The learning rate is 0.001, training runs for 100 episodes, and the batch size is 64. The target network's parameters are updated once per episode. Fluctuation in the score is normal; after roughly 15 episodes the agent generally starts to reach noticeably higher scores.
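
Since the per-episode score fluctuates, a simple moving average makes the trend easier to read. The helper below is a hypothetical addition (plot_scores is not part of the original code); it could replace the plain plt.plot(x, y) call at the end of train():

import numpy as np
import matplotlib.pyplot as plt


def plot_scores(x, y, window=10):
    # Raw per-episode scores plus a sliding-window mean that shows the trend
    smoothed = np.convolve(y, np.ones(window) / window, mode="valid")
    plt.plot(x, y, alpha=0.3, label="episode score")
    plt.plot(x[window - 1:], smoothed, label=str(window) + "-episode average")
    plt.legend()
    plt.show()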
