Value-based and decision-based
In reinforcement learning, there are two broad categories of methods, one based on value (Value-based) and one based on policy (Policy-based)
- Typical representatives of Value-based algorithms are Q-learning and SARSA, which optimize the Q function to the best, and then take the optimal strategy according to the Q function.
- The typical representative of Policy-based algorithms is Policy Gradient, which directly optimizes the policy function.
The difference between the two is clear at a glance, that is, one is to make decisions based on the value-determined plan, and the other is to get the result of the decision directly in one step, instead of calculating the rewards of each state in turn to make the decision gradually.
Random strategy
In determining the strategy, the output of the neural network is the Q value calculated according to the state, and then we use a fixed strategy to make decisions based on this Q value.
In the random strategy, we input different states, and then directly calculate the probability of making a certain decision, and directly learn to make the decision in one step without generating Q values.
After doing this, we are faced with a problem that is how to optimize our network and how to evaluate the quality of the network?
The strategy-based model cannot backpropagate the neural network after one calculation, because its result is generated after multiple iterations, so its optimization is different from ordinary neural networks.
Trajectory expected return
Since our ultimate goal is to obtain the maximum expectation after decision-making, we can obtain the expected return based on the final return and the probability of the decision trajectory to represent the return as a trajectory. Then according to the size of the return, we can perform the network optimization.
So, we have the policy gradient, and use the policy gradient as Loss to optimize the neural network
There are two sampling methods for policy gradients:
- Monte Carlo updates the parameters after each round (reinforce)
- Timing difference is to update the parameters after each step, and its update frequency is higher (Actor-critic)
Reinforce
Algorithm core
Practice as code
Algorithm flow
Code practice
Refer to the picture above
model
class Model(parl.Model):
def __init__(self, act_dim):
self.fc1 = layers.fc(size = 256,act='tanh')
self.fc2 = layers.fc(size = act_dim,act='softmax')#输出个动作的概率
def forward(self, obs): # 可直接用 model = Model(5); model(obs)调用
out = self.fc1(obs)
out = self.fc2(out)
return out
Algorithm
class PolicyGradient(parl.Algorithm):
def __init__(self, model, lr=None):
""" Policy Gradient algorithm
Args:
model (parl.Model): policy的前向网络.
lr (float): 学习率.
"""
self.model = model
assert isinstance(lr, float)
self.lr = lr
def predict(self, obs):
""" 使用policy model预测输出的动作概率
"""
return self.model(obs)
def learn(self, obs, action, reward):
""" 用policy gradient 算法更新policy model
"""
act_prob = self.model(obs) # 获取输出动作概率
# log_prob = layers.cross_entropy(act_prob, action) # 交叉熵
log_prob = layers.reduce_sum(
-1.0 * layers.log(act_prob) * layers.one_hot(
action, act_prob.shape[1]),
dim=1)
cost = log_prob * reward
cost = layers.reduce_mean(cost)
optimizer = fluid.optimizer.Adam(self.lr)
optimizer.minimize(cost)
return cost
Agent
class Agent(parl.Agent):
def __init__(self, algorithm, obs_dim, act_dim):
self.obs_dim = obs_dim
self.act_dim = act_dim
super(Agent, self).__init__(algorithm)
def build_program(self):
self.pred_program = fluid.Program()
self.learn_program = fluid.Program()
with fluid.program_guard(self.pred_program): # 搭建计算图用于 预测动作,定义输入输出变量
obs = layers.data(
name='obs', shape=[self.obs_dim], dtype='float32')
self.act_prob = self.alg.predict(obs)
with fluid.program_guard(
self.learn_program): # 搭建计算图用于 更新policy网络,定义输入输出变量
obs = layers.data(
name='obs', shape=[self.obs_dim], dtype='float32')
act = layers.data(name='act', shape=[1], dtype='int64')
reward = layers.data(name='reward', shape=[], dtype='float32')
self.cost = self.alg.learn(obs, act, reward)
def sample(self, obs):
obs = np.expand_dims(obs, axis=0) # 增加一维维度
act_prob = self.fluid_executor.run(
self.pred_program,
feed={'obs': obs.astype('float32')},
fetch_list=[self.act_prob])[0]
act_prob = np.squeeze(act_prob, axis=0) # 减少一维维度
act = np.random.choice(range(self.act_dim), p=act_prob) # 根据动作概率选取动作
return act
def predict(self, obs):
obs = np.expand_dims(obs, axis=0)
act_prob = self.fluid_executor.run(
self.pred_program,
feed={'obs': obs.astype('float32')},
fetch_list=[self.act_prob])[0]
act_prob = np.squeeze(act_prob, axis=0)
act = np.argmax(act_prob) # 根据动作概率选择概率最高的动作
return act
def learn(self, obs, act, reward):
act = np.expand_dims(act, axis=-1)
feed = {
'obs': obs.astype('float32'),
'act': act.astype('int64'),
'reward': reward.astype('float32')
}
cost = self.fluid_executor.run(
self.learn_program, feed=feed, fetch_list=[self.cost])[0]
return cost
Training and testing
def run_episode(env, agent):
obs_list, action_list, reward_list = [], [], []
obs = env.reset()
while True:
obs = preprocess(obs) # from shape (210, 160, 3) to (100800,)
obs_list.append(obs)
action = agent.sample(obs) # 采样动作
action_list.append(action)
obs, reward, done, info = env.step(action)
reward_list.append(reward)
if done:
break
return obs_list, action_list, reward_list
# 评估 agent, 跑 5 个episode,求平均
def evaluate(env, agent, render=False):
eval_reward = []
for i in range(5):
obs = env.reset()
episode_reward = 0
while True:
obs = preprocess(obs) # from shape (210, 160, 3) to (100800,)
action = agent.predict(obs) # 选取最优动作
obs, reward, isOver, _ = env.step(action)
episode_reward += reward
if render:
env.render()
if isOver:
break
eval_reward.append(episode_reward)
return np.mean(eval_reward)
Preprocess image input and calculate step return
def preprocess(image):
""" 预处理 210x160x3 uint8 frame into 6400 (80x80) 1维 float vector """
image = image[35:195] # 裁剪
image = image[::2,::2,0] # 下采样,缩放2倍
image[image == 144] = 0 # 擦除背景 (background type 1)
image[image == 109] = 0 # 擦除背景 (background type 2)
image[image != 0] = 1 # 转为灰度图,除了黑色外其他都是白色
return image.astype(np.float).ravel()
# 根据一个episode的每个step的reward列表,计算每一个Step的Gt
def calc_reward_to_go(reward_list, gamma=0.99):
"""calculate discounted reward"""
reward_arr = np.array(reward_list)
for i in range(len(reward_arr) - 2, -1, -1):
# G_t = r_t + γ·r_t+1 + ... = r_t + γ·G_t+1
reward_arr[i] += gamma * reward_arr[i + 1]
# normalize episode rewards
reward_arr -= np.mean(reward_arr)
reward_arr /= np.std(reward_arr)
return reward_arr
Environment configuration and process control
# 创建环境
env = gym.make('Pong-v0')
obs_dim = 80 * 80
act_dim = env.action_space.n
logger.info('obs_dim {}, act_dim {}'.format(obs_dim, act_dim))
# 根据parl框架构建agent
model = Model(act_dim=act_dim)
alg = PolicyGradient(model,lr = LEARNING_RATE)
agent = Agent(alg,obs_dim = obs_dim,act_dim =act_dim)
# 加载模型
if os.path.exists('./model.ckpt'):
agent.restore('./model.ckpt')
for i in range(3000):
obs_list, action_list, reward_list = run_episode(env, agent)
if i % 10 == 0:
logger.info("Train Episode {}, Reward Sum {}.".format(i,
sum(reward_list)))
batch_obs = np.array(obs_list)
batch_action = np.array(action_list)
batch_reward = calc_reward_to_go(reward_list)
agent.learn(batch_obs, batch_action, batch_reward)
if (i + 1) % 100 == 0:
total_reward = evaluate(env, agent, render=False)
logger.info('Episode {}, Test reward: {}'.format(i + 1,
total_reward))
agent.save('./model.ckpt')
Result display