这个是学习强化学习过程中遇到的作业,记录一下。
目标:
- 实现下图的环境,需要实现环境中的动态转移函数。
- 实现一个 agent, 策略是随机的,通过仿真的方式,用回报值的经验平均去估计每个状态的值函数。验证仿真的结果和课件中计算的结果。(分别仿真 γ= 0.5, 1)
这是一个全观测马尔科夫决策过程,状态转移矩阵已知,动作执行后的回报也已知,这点在环境类中已有体现,并且环境类中有执行动作得到下一状态和当前回报的方法;智能体当前采用的是随机策略;定义了simulate_eval函数来执行多轮模拟过程,并返回平均值作为输入状态的状态值。
import numpy as np
class Env(object):
    """Student-MDP environment: 5 states with known transition matrix and rewards."""

    def __init__(self):
        # State space; "s5" is the terminal (sleep) state.
        self.S = ["s1", "s2", "s3", "s4", "s5"]
        # P[state][action] is a probability distribution over self.S.
        self.P = {
            "s1": {"phone": [1, 0, 0, 0, 0], "quit": [0, 1, 0, 0, 0]},
            "s2": {"phone": [1, 0, 0, 0, 0], "study": [0, 0, 1, 0, 0]},
            "s3": {"study": [0, 0, 0, 1, 0], "sleep": [0, 0, 0, 0, 1]},
            "s4": {"review": [0, 0, 0, 0, 1], "noreview": [0, 0.2, 0.4, 0.4, 0]},
        }
        # R[state][action] is the immediate reward for taking that action.
        self.R = {
            "s1": {"phone": -1, "quit": 0},
            "s2": {"phone": -1, "study": -2},
            "s3": {"sleep": 0, "study": -2},
            "s4": {"review": 10, "noreview": -5},
        }

    def step(self, s, a):
        """Execute action `a` in state `s`; return (next_state, reward, terminal)."""
        next_state = np.random.choice(self.S, p=self.P[s][a])
        reward = self.R[s][a]
        done = next_state == "s5"
        return next_state, reward, done
# Module-level environment instance used by the simulation below.
# (Removed commented-out debug call: print(env.step('s4', "review")).)
env = Env()
class Agent(object):
    """Agent that picks uniformly at random among each state's legal actions."""

    def __init__(self):
        # Legal actions per non-terminal state (terminal "s5" has none).
        self.AvailableActions = {
            "s1": ["phone", "quit"],
            "s2": ["phone", "study"],
            "s3": ["study", "sleep"],
            "s4": ["review", "noreview"],
        }
        # The agent follows the uniform-random policy.
        self.policy = self.random_policy

    def random_policy(self, s):
        """Return a uniformly random legal action for `s`, or None if `s` has none."""
        actions = self.AvailableActions.get(s)
        if actions is None:
            return None
        return np.random.choice(actions)
# Module-level agent instance; its random policy drives the simulation below.
agent=Agent()
def simulate_eval(s, gamma, max_step=100, N=10000):
    """Monte-Carlo estimate of V(s) under the agent's random policy.

    Runs N episodes starting from `s`, each capped at `max_step` steps,
    and returns the empirical mean of the discounted return
    G = R1 + gamma*R2 + gamma^2*R3 + ...

    Args:
        s: start state, e.g. "s1".
        gamma: discount factor in [0, 1].
        max_step: episode length cap to guard against non-terminating runs.
        N: number of simulated episodes.

    Returns:
        float: the average return over the N episodes.
    """
    returns = []
    for _ in range(N):
        gain = 0.0
        current_s = s
        # BUG FIX: the discount must start at 1 (gamma**0) so the FIRST
        # reward is undiscounted; the original started at gamma, scaling
        # every estimate by gamma (wrong for gamma=0.5).
        discount = 1.0
        for _step in range(max_step):
            a = agent.policy(current_s)
            current_s, r, done = env.step(current_s, a)
            gain += discount * r
            discount *= gamma
            if done:
                break
        returns.append(gain)
    return np.average(returns)
# Report the Monte-Carlo value estimate of every non-terminal state (gamma = 1).
for start_state in ["s1", "s2", "s3", "s4"]:
    print(f"V({start_state}):{simulate_eval(start_state, 1)}")