TensorFlow reinforcement learning

In a reinforcement learning problem, the object with perception and decision-making ability is called the agent. It may be a piece of algorithm code, or a hardware-and-software system with a mechanical structure, such as a robot. The agent completes a task by interacting with the external environment, where the environment is the sum of everything outside the agent that is affected by the agent's actions and gives the corresponding feedback. From the agent's point of view, it perceives the state of the environment and makes decisions that produce actions; from the environment's point of view, it starts from some initial state s1, dynamically changes its state as it receives the agent's actions, and gives the corresponding reward signal.

The reinforcement learning process involves five basic objects:

State s: reflects the features of the environment; the state at timestamp t is denoted st. It may be raw data such as a visual image or a speech waveform, or high-level features such as speed and position. All states constitute the state space S.

Action a: the behavior taken by the agent; the action at timestamp t is denoted at. It can be a discrete action such as moving left or right, or a continuous action such as a force or a position. All actions constitute the action space A.

Policy π(a|s): the decision-making model of the agent; it accepts the state s as input and outputs the probability distribution π(a|s) over the actions a.

R & lt reward (s, a) the expression of the feedback signal in the state of the environment is given after receiving a s is a scalar value, an operation expression of good and bad to some extent, obtained on excitation referred to rt timestamp t

State transition probability p(s'|s, a): the model that expresses how the state of the environment changes, i.e., the probability distribution over the next state s' after the environment receives action a in the current state s.
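These five objects come together in the agent-environment interaction loop: at each timestamp the agent observes the state st, chooses an action at according to its policy π(a|s), and the environment transitions to the next state and returns the reward rt. Below is a minimal sketch of this loop, using the Gym CartPole environment that both examples in this article rely on, with a purely illustrative random placeholder policy instead of a trained network:

import gym

env = gym.make('CartPole-v1')   # the environment used by both examples below
state = env.reset()             # initial state s1
total_reward, done = 0.0, False
while not done:
    # placeholder policy: sample a random action from the action space A;
    # the examples below replace this with a Q network / policy network
    action = env.action_space.sample()
    # the environment receives action a, transitions to the next state s', and gives reward r
    state, reward, done, info = env.step(action)
    total_reward += reward
print('episode return:', total_reward)
env.close()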

Example 1: the CartPole balancing game (DQN)
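The code below trains a Deep Q-Network (DQN) on the CartPole pole-balancing game: a small fully connected network estimates the Q value of the two actions, actions are chosen with an epsilon-greedy strategy whose exploration rate decays over the first num_exploration_episodes episodes, and transitions are stored in an experience replay buffer from which random mini-batches are sampled to train the network.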

import tensorflow as tf
import numpy as np
import gym
import random
from collections import deque

num_episodes = 500              # total number of training episodes
num_exploration_episodes = 100  # number of episodes used for exploration
max_len_episode = 1000          # maximum number of steps per episode
batch_size = 32                 # batch size
learning_rate = 1e-3            # learning rate
gamma = 1.                      # discount factor
initial_epsilon = 1.            # exploration rate at the start of exploration
final_epsilon = 0.01            # exploration rate at the end of exploration

class QNetwork(tf.keras.Model):
    def __init__(self):
        super().__init__()
        self.dense1 = tf.keras.layers.Dense(units=24, activation=tf.nn.relu)
        self.dense2 = tf.keras.layers.Dense(units=24, activation=tf.nn.relu)
        self.dense3 = tf.keras.layers.Dense(units=2)

    def call(self, inputs):
        x = self.dense1(inputs)
        x = self.dense2(x)
        x = self.dense3(x)
        return x

    def predict(self, inputs):
        q_values = self(inputs)
        return tf.argmax(q_values, axis=-1)

if __name__ == '__main__':
    env = gym.make('CartPole-v1')       # instantiate a game environment; the argument is the game name
    model = QNetwork()
    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
    replay_buffer = deque(maxlen=10000) # use a deque as the experience replay buffer for Q-learning
    epsilon = initial_epsilon
    for episode_id in range(num_episodes):
        state = env.reset()             # initialize the environment and obtain the initial state
        epsilon = max(                  # compute the current exploration rate
            initial_epsilon * (num_exploration_episodes - episode_id) / num_exploration_episodes,
            final_epsilon)
        for t in range(max_len_episode):
            env.render()                                # render the current frame to the screen
            if random.random() < epsilon:               # epsilon-greedy: choose a random action with probability epsilon
                action = env.action_space.sample()      # choose a random action (exploration)
            else:
                action = model.predict(np.expand_dims(state, axis=0)).numpy()   # choose the action with the largest Q value predicted by the model
                action = action[0]

            # let the environment execute the action; obtain the next state, the reward, whether the game is over, and extra info
            next_state, reward, done, info = env.step(action)
            # if the game is over, give a large negative reward
            reward = -10. if done else reward
            # put the (state, action, reward, next_state) tuple (plus a done flag marking termination) into the replay buffer
            replay_buffer.append((state, action, reward, next_state, 1 if done else 0))
            # update the current state
            state = next_state

            if done:                                    # if the game is over, exit this loop and start the next episode
                print("episode %d, epsilon %f, score %d" % (episode_id, epsilon, t))
                break

            if len(replay_buffer) >= batch_size:
                # randomly sample a batch of tuples from the replay buffer and convert each field to a NumPy array
                batch_state, batch_action, batch_reward, batch_next_state, batch_done = zip(
                    *random.sample(replay_buffer, batch_size))
                batch_state, batch_reward, batch_next_state, batch_done = \
                    [np.array(a, dtype=np.float32) for a in [batch_state, batch_reward, batch_next_state, batch_done]]
                batch_action = np.array(batch_action, dtype=np.int32)

                q_value = model(batch_next_state)
                y = batch_reward + (gamma * tf.reduce_max(q_value, axis=1)) * (1 - batch_done)  # compute the target y value
                with tf.GradientTape() as tape:
                    loss = tf.keras.losses.mean_squared_error(  # minimize the distance between y and the Q value
                        y_true=y,
                        y_pred=tf.reduce_sum(model(batch_state) * tf.one_hot(batch_action, depth=2), axis=1)
                    )
                grads = tape.gradient(loss, model.variables)
                optimizer.apply_gradients(grads_and_vars=zip(grads, model.variables))       # compute gradients and update parameters
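The training target follows the Q-learning update used in the code above: for each sampled transition the target is y = r + γ · max Q(s′, a′), where the second term is dropped when the episode has ended (the done flag), and the network is trained by minimizing the mean squared error between y and the Q value of the action that was actually taken.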

Example 2: the CartPole balancing game (policy gradient)
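This example solves the same CartPole game with a policy gradient (REINFORCE) method instead of Q-learning: a policy network outputs a probability distribution over the two actions, an action is sampled from that distribution at each step, and after every episode the discounted return R is computed backwards through the stored trajectory so that the parameters can be updated along the gradient of -log π(a|s) · R.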

import gym, os
import numpy as np
import matplotlib
from matplotlib import pyplot as plt
# Default parameters for plots
matplotlib.rcParams['font.size'] = 18
matplotlib.rcParams['figure.titlesize'] = 18
matplotlib.rcParams['figure.figsize'] = [9, 7]
matplotlib.rcParams['axes.unicode_minus'] = False

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, optimizers, losses
from PIL import Image
env = gym.make('CartPole-v1')  # create the game environment
env.seed(2333)
tf.random.set_seed(2333)
np.random.seed(2333)
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
assert tf.__version__.startswith('2.')
assert tf.__version__.startswith('2.')

learning_rate = 0.0002
gamma         = 0.98

class Policy(keras.Model):
    # Policy network that produces a probability distribution over actions
    def __init__(self):
        super(Policy, self).__init__()
        self.data = [] # store the trajectory
        # the input is a state vector of length 4; the output covers the 2 actions (left, right)
        self.fc1 = layers.Dense(128, kernel_initializer='he_normal')
        self.fc2 = layers.Dense(2, kernel_initializer='he_normal')
        # network optimizer
        self.optimizer = optimizers.Adam(learning_rate=learning_rate)

    def call(self, inputs, training=None):
        # the state input s is a vector of shape [4]
        x = tf.nn.relu(self.fc1(inputs))
        x = tf.nn.softmax(self.fc2(x), axis=1)
        return x

    def put_data(self, item):
        # record (r, log_P(a|s))
        self.data.append(item)

    def train_net(self, tape):
        # compute gradients and update the policy network parameters; tape is the gradient recorder
        R = 0 # the return of the terminal state is 0
        for r, log_prob in self.data[::-1]: # iterate over the trajectory in reverse order
            R = r + gamma * R # compute the return at each timestamp
            # compute the gradient once at each timestamp
            # grad_R = -log_P * R * grad_theta
            loss = -log_prob * R
            with tape.stop_recording():
                # optimize the policy network
                grads = tape.gradient(loss, self.trainable_variables)
                # print(grads)
                self.optimizer.apply_gradients(zip(grads, self.trainable_variables))
        self.data = [] # clear the trajectory

def main():
    pi = Policy() # create the policy network
    pi(tf.random.normal((4,4)))
    pi.summary()
    score = 0.0 # running score
    print_interval = 20 # print interval
    returns = []

    for n_epi in range(400):
        s = env.reset() # reset the game to the initial state and return s0
        with tf.GradientTape(persistent=True) as tape:
            for t in range(501): # CartPole-v1 is forced to terminate after 500 steps
                # feed in the state vector and obtain the policy
                s = tf.constant(s, dtype=tf.float32)
                # s: [4] => [1,4]
                s = tf.expand_dims(s, axis=0)
                prob = pi(s) # action distribution: [1,2]
                # sample 1 action from the categorical distribution, shape: [1]
                a = tf.random.categorical(tf.math.log(prob), 1)[0]
                a = int(a) # convert the Tensor to a Python number
                s_prime, r, done, info = env.step(a)
                # record the reward r and the log probability of action a
                # prob shape: [1,2]
                pi.put_data((r, tf.math.log(prob[0][a])))
                s = s_prime # update the state
                score += r # accumulate the reward

                env.render()

                # if n_epi >1000:
                #     env.render()
                #     # im = Image.fromarray(s)
                #     # im.save("res/%d.jpg" % info['frames'][0])

                if done:  # the current episode terminates
                    break
            # after the episode terminates, train the network once
            pi.train_net(tape)
        del tape

        if n_epi%print_interval==0 and n_epi!=0:
            returns.append(score/print_interval)
            print(f"# of episode :{n_epi}, avg score : {score/print_interval}")
            score = 0.0
    env.close() # close the environment

    plt.plot(np.arange(len(returns))*print_interval, returns)
    plt.plot(np.arange(len(returns))*print_interval, returns, 's')
    plt.xlabel('Episode')
    plt.ylabel('Total return')
    plt.savefig('reinforce-tf-cartpole.png')

if __name__ == '__main__':
    main()

 
