Introduction to Reinforcement Learning
Definition: Reinforcement learning (abbreviated RL) is a field of machine learning that studies how to act based on the environment so as to maximize the expected benefit (cumulative reward).
Core idea: The agent (`agent`) learns in the environment (`environment`): it executes actions (`action`) according to the state (`state`, or observation, `observation`) of the environment, and uses the feedback (`reward`) from the environment to guide it toward better actions. It simply boils down to the following figure:
Note: The state obtained from the environment is sometimes called `state` and sometimes called `observation`. Strictly speaking, `state` denotes the global state and `observation` denotes a local observation. The two differ in multi-agent environments, but the environments we encounter when just starting out are not that complicated, so you can treat the two concepts as equivalent for now.
The difference between reinforcement learning and supervised learning
- Reinforcement learning, supervised learning, and unsupervised learning are three different fields in machine learning, and they all overlap with deep learning.
- Supervised learning finds the mapping between input and output, such as classification and regression problems.
- Unsupervised learning mainly looks for hidden relationships between data, such as clustering problems.
- Reinforcement learning needs to learn and find the best decision-making plan in the interaction with the environment.
- Supervised learning deals with cognitive problems, and reinforcement learning deals with decision-making problems.
Introduction to Environmental Tools
Common reinforcement learning tools
OpenAI: Gym, Baselines, mujoco-py, Retro
Gym project address: https://github.com/openai/gym integrates the code of various environment configurations
Baselines project address: https://github.com/openai/baselines — integrates baseline implementations of various reinforcement learning algorithms
Mujoco-py project address: https://github.com/openai/mujoco-py a physics engine in the fields of simulated robots, biomechanics, graphics and animation
Retro project address: https://github.com/openai/retro integrates some small games with interface
DeepMind: pysc2, Lab
pysc2 project address: https://github.com/deepmind/pysc2 — a reinforcement learning environment for StarCraft II
Lab project address: https://github.com/deepmind/lab integrates a series of 3D game scenes
Introduction to PARL library
Project address: https://github.com/PaddlePaddle/PARL Document link: https://parl.readthedocs.io/en/latest/index.html
Reproducibility guarantee : We provide high-quality implementation of mainstream reinforcement learning algorithms, and strictly reproduce the indicators corresponding to the paper.
Large-scale parallel support : The framework can support simultaneous concurrent computing with up to tens of thousands of CPUs, and supports the training of multi-GPU reinforcement learning models.
Strong reusability : Users do not need to re-implement the algorithms themselves. Classic reinforcement learning algorithms can be easily applied to specific scenarios by reusing the algorithms provided by the framework.
Good scalability : When users want to investigate new algorithms, they can quickly implement their own reinforcement learning algorithms by inheriting the base class we provide.
Easy to use and simple interface
model interface
import parl
class Policy(parl.Model):
    """Example PARL model consisting of a single fully connected layer."""

    def __init__(self):
        # One fc layer of size 12 with a softmax activation.
        self.fc = parl.layers.fc(size=12, act='softmax')

    def policy(self, obs):
        """Return the output of the fc layer applied to ``obs``."""
        return self.fc(obs)
import copy  # deepcopy is used below but was never imported in this snippet

# Instantiate the policy, then clone it. The object to copy is the
# ``policy`` created on the previous line (``model`` is not defined here).
policy = Policy()
copied_policy = copy.deepcopy(policy)
Algorithm interface
# Wrap a model in PARL's built-in DQN algorithm with a learning rate of 1e-3.
# NOTE(review): ``Model`` is not defined in this snippet — presumably a
# user-defined ``parl.Model`` subclass; confirm.
model = Model()
dqn = parl.algorithms.DQN(model, lr=1e-3)
agent interface
class MyAgent(parl.Agent):
    """Example PARL agent that stores ``act_dim`` next to its algorithm."""

    def __init__(self, algorithm, act_dim):
        # The algorithm is handed to the parl.Agent base class first;
        # act_dim is recorded afterwards, in the same order as before.
        super(MyAgent, self).__init__(algorithm)
        self.act_dim = act_dim
Let's take a look at a simple demo
import paddle.fluid as fluid
import numpy as np
import paddle.fluid as fluid
import paddle.fluid.layers as layers
from parl.utils import machine_info
import gym
from parl.utils import logger
# Define the policy (the learning algorithm)
class PolicyGradient(object):
    """Policy-gradient style update rule wrapped around a dygraph model.

    Args:
        model: callable network; it is called on observations and its
            output is treated as action probabilities.
        lr: learning rate handed to the Adam optimizer.
    """

    def __init__(self, model, lr):
        self.model = model
        # Adam is used as the optimizer.
        self.optimizer = fluid.optimizer.Adam(learning_rate=lr)

    @staticmethod
    def _as_float32(obs):
        """Turn ``obs`` into a dygraph variable cast to float32."""
        return layers.cast(fluid.dygraph.to_variable(obs), dtype='float32')

    def predict(self, obs):
        """Feed the observation through the model and return its output."""
        return self.model(self._as_float32(obs))

    def learn(self, obs, action, reward):
        """Run one optimisation step on a batch and return the mean cost.

        Args:
            obs: batch of observations.
            action: actions that were taken for those observations.
            reward: per-step weights applied to the cross-entropy terms.
        """
        # Action probabilities given by the model for the observations.
        act_prob = self.model(self._as_float32(obs))
        action_var = fluid.dygraph.to_variable(action)
        reward_var = fluid.dygraph.to_variable(reward)
        # Loss: cross entropy between the predicted probabilities and the
        # actions taken, weighted by the reward and averaged.
        xent = layers.cross_entropy(act_prob, action_var)
        weighted = layers.cast(xent * reward_var, dtype='float32')
        cost = layers.reduce_mean(weighted)
        # Update the model from the loss, then reset its gradients.
        cost.backward()
        self.optimizer.minimize(cost)
        self.model.clear_gradients()
        return cost
# Define the model
# A simple DNN is used here
class CartpoleModel(fluid.dygraph.Layer):
    """Two fully connected layers: tanh hidden layer, softmax output.

    Args:
        name_scope: forwarded to ``fluid.dygraph.Layer``.
        act_dim: size of the output layer; the hidden layer has
            ``act_dim * 10`` units.
    """

    def __init__(self, name_scope, act_dim):
        super(CartpoleModel, self).__init__(name_scope)
        hidden_units = act_dim * 10
        self.fc1 = fluid.FC('fc1', hidden_units, act='tanh')
        self.fc2 = fluid.FC('fc2', act_dim, act='softmax')

    def forward(self, obs):
        """Apply ``fc1`` and then ``fc2`` to ``obs`` and return the result."""
        return self.fc2(self.fc1(obs))
# Define the agent
class CartpoleAgent(object):
    """Agent that chooses actions from an algorithm's predicted probabilities.

    Args:
        alg: learning algorithm exposing ``predict(obs)`` (whose result has
            a ``numpy()`` method) and ``learn(obs, act, reward)``.
        obs_dim: number of possible environment conditions (stored only).
        act_dim: number of possible actions; used as the sampling range.
    """

    def __init__(
            self,
            alg,
            obs_dim,
            act_dim,
    ):
        self.alg = alg          # learning algorithm
        self.obs_dim = obs_dim  # number of possible environment conditions
        self.act_dim = act_dim  # number of possible actions

    def _action_probs(self, obs):
        """Return the action probabilities for a single observation."""
        # Add a leading batch axis for the algorithm, drop it afterwards.
        batched = np.expand_dims(obs, axis=0)
        probs = self.alg.predict(batched).numpy()
        return np.squeeze(probs, axis=0)

    def sample(self, obs):
        """Randomly draw an action according to the predicted probabilities."""
        return np.random.choice(self.act_dim, p=self._action_probs(obs))

    def predict(self, obs):
        """Return the action with the highest predicted probability."""
        return np.argmax(self._action_probs(obs))

    def learn(self, obs, act, reward):
        """Re-optimise the model from the actions taken and their rewards.

        ``act`` and ``reward`` get a trailing axis before being passed to
        the algorithm; the algorithm's cost is returned unchanged.
        """
        act_column = np.expand_dims(act, axis=-1)
        reward_column = np.expand_dims(reward, axis=-1)
        return self.alg.learn(obs, act_column, reward_column)
# Training procedure
# Dimensions passed to CartpoleAgent / CartpoleModel in main().
# NOTE(review): presumably these match the observation and action spaces of
# the CartPole-v0 environment created in main() — confirm.
OBS_DIM = 4
ACT_DIM = 2
# Learning rate passed to PolicyGradient in main().
LEARNING_RATE = 1e-3
def run_episode(env, agent, train_or_test='train'):
    """Play one episode and record its trajectory.

    Args:
        env: environment exposing ``reset()`` and ``step(action)``, where
            ``step`` returns a 4-tuple ``(obs, reward, done, info)``.
        agent: object exposing ``sample(obs)`` and ``predict(obs)``.
        train_or_test: ``'train'`` picks actions with ``agent.sample``; any
            other value picks them with ``agent.predict``.

    Returns:
        ``(obs_list, action_list, reward_list)``: the observation the agent
        acted on, the action chosen and the reward received at each step.
        The observation returned by the final step is not recorded.
    """
    observations, actions, rewards = [], [], []
    current_obs = env.reset()  # initial environment state
    if train_or_test == 'train':
        choose_action = agent.sample
    else:
        choose_action = agent.predict
    finished = False
    while not finished:
        # Extend the observation, action and reward lists for this step.
        observations.append(current_obs)
        chosen = choose_action(current_obs)
        actions.append(chosen)
        current_obs, step_reward, finished, _ = env.step(chosen)
        rewards.append(step_reward)
    return observations, actions, rewards
def calc_reward_to_go(reward_list):
    """Return the reward-to-go for every step of an episode.

    Element ``i`` of the result is the sum of the rewards from step ``i``
    to the end of the episode.

    Args:
        reward_list: sequence of per-step rewards. It is not modified.

    Returns:
        ``numpy.ndarray`` with the same length as ``reward_list``.
    """
    # Accumulate on a copy so the caller's reward list keeps its raw values.
    returns = list(reward_list)
    # Walk backwards so each entry can add the already-summed next entry.
    for i in range(len(returns) - 2, -1, -1):
        returns[i] += returns[i + 1]
    return np.array(returns)
def main():
    """Train the agent on CartPole-v0 and periodically log its rewards.

    Runs 1000 training episodes. The reward sum of every 10th episode is
    logged, and after every 100th episode one extra episode is played with
    ``train_or_test='test'`` and its total reward is logged.
    """
    # Initialise the environment and the three parts: model, algorithm, agent.
    env = gym.make('CartPole-v0')
    model = CartpoleModel(name_scope='noIdeaWhyNeedThis', act_dim=ACT_DIM)
    alg = PolicyGradient(model, LEARNING_RATE)
    agent = CartpoleAgent(alg, OBS_DIM, ACT_DIM)

    with fluid.dygraph.guard():
        for episode in range(1000):  # 1000 training episodes
            obs_list, action_list, reward_list = run_episode(env, agent)
            if episode % 10 == 0:
                logger.info("Episode {}, Reward Sum {}.".format(
                    episode, sum(reward_list)))

            # One training step on the episode just collected.
            agent.learn(
                np.array(obs_list),
                np.array(action_list),
                calc_reward_to_go(reward_list),
            )

            if (episode + 1) % 100 == 0:
                _, _, test_rewards = run_episode(
                    env, agent, train_or_test='test')
                total_reward = np.sum(test_rewards)
                logger.info('Test reward: {}'.format(total_reward))
if __name__ == '__main__':
    # Start training only when this file is executed as a script.
    main()
Eventually the reward converges to 200.