I know that in environments with a discrete action space the policy gradient is updated as

$$\theta \leftarrow \theta + \alpha \, \nabla_\theta \log \pi_\theta(a_t \mid s_t) \, G_t,$$

where $G_t$ is the discounted return, but I am not sure how to make this update when the action space is continuous.
I tried keeping the variance fixed and updating the network's output with a mean-squared-error loss whose target is the action that was taken. I figured this would eventually push the mean towards actions with larger total reward, but it made no progress on OpenAI's Pendulum environment.
It would also be very helpful if it could be described in terms of a loss function and a target, the way a policy gradient with a discrete action space can be updated with a cross-entropy loss. That is the way I understand it best, but if that is not possible, that is fine too.
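For concreteness, this is roughly what I mean by the cross-entropy view in the discrete case; the tensors are made up purely for illustration:

import torch
import torch.nn.functional as F

logits = torch.randn(5, 2, requires_grad=True)        # policy-network outputs for 5 states, 2 actions (made up)
actions = torch.tensor([0, 1, 1, 0, 1])               # actions that were actually taken
returns = torch.tensor([1.2, -0.3, 0.7, 0.1, -1.0])   # discounted returns used as per-step weights

neg_log_probs = F.cross_entropy(logits, actions, reduction='none')  # = -log pi(a_t | s_t)
loss = (neg_log_probs * returns).sum()                # return-weighted cross entropy = policy-gradient loss
loss.backward()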
Edit: for @Philipp. The way I understand it, the loss function stays the same for a continuous action space; the only thing that changes is the distribution we get the log-probabilities from. In PyTorch we can use a Normal distribution for a continuous action space and a Categorical distribution for a discrete one. David Ireland's answer goes into the math, but in PyTorch it comes down to log_prob = distribution.log_prob(action_taken), which works for any type of distribution. It makes sense that for a bad action we want to make it less likely that the action is taken. Below is working code for both types of action space so they can be compared. The continuous action-space code should be correct, but the agent does not learn, because learning the right actions in a continuous action space is hard and this simple approach is not enough; look into more advanced methods like PPO and DDPG.
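Here is a minimal sketch of the only part that actually differs between the two full scripts below (the tensor stands in for a network's output and is made up for illustration):

import torch
from torch.distributions import Categorical, Normal

net_output = torch.randn(2)               # stand-in for the policy network's output (made up)

# discrete action space: the output parameterizes a Categorical distribution
dist = Categorical(torch.softmax(net_output, dim=-1))
action = dist.sample()
log_prob = dist.log_prob(action)          # same call...

# continuous action space: the output is the mean of a Normal, sigma kept fixed
mu = net_output[0]
dist = Normal(mu, 0.2)
action = dist.sample()
log_prob = dist.log_prob(action)          # ...same call here; only the distribution changed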
import torch
import torch.nn as nn
import torch.optim as optim
from torch.distributions.categorical import Categorical #discrete distribution
import numpy as np
import gym
import math
import matplotlib.pyplot as plt
class Agent(nn.Module):
    def __init__(self, lr):
        super(Agent, self).__init__()
        self.fc1 = nn.Linear(4, 64)
        self.fc2 = nn.Linear(64, 32)
        self.fc3 = nn.Linear(32, 2)  # neural network with layers 4,64,32,2
        self.optimizer = optim.Adam(self.parameters(), lr=lr)

    def forward(self, x):
        x = torch.relu(self.fc1(x))  # relu for the hidden layers
        x = torch.relu(self.fc2(x))
        x = torch.sigmoid(self.fc3(x))  # sigmoid outputs; Categorical renormalizes them to probabilities
        return x
env = gym.make('CartPole-v0')
agent = Agent(0.001) #hyperparameters
DISCOUNT = 0.99
total = []
for e in range(500):
    log_probs, rewards = [], []
    done = False
    state = env.reset()
    while not done:
        # mu = agent.forward(torch.from_numpy(state).float())
        # distribution = Normal(mu, SIGMA)
        distribution = Categorical(agent.forward(torch.from_numpy(state).float()))
        action = distribution.sample()
        log_probs.append(distribution.log_prob(action))
        state, reward, done, info = env.step(action.item())
        rewards.append(reward)
    total.append(sum(rewards))

    cumulative = 0
    d_rewards = np.zeros(len(rewards))
    for t in reversed(range(len(rewards))):  # get discounted rewards
        cumulative = cumulative * DISCOUNT + rewards[t]
        d_rewards[t] = cumulative
    d_rewards -= np.mean(d_rewards)  # normalize
    d_rewards /= np.std(d_rewards)

    loss = 0
    for t in range(len(rewards)):
        loss += -log_probs[t] * d_rewards[t]  # loss is -log prob * discounted return
    agent.optimizer.zero_grad()
    loss.backward()  # update
    agent.optimizer.step()

    if e % 10 == 0:
        print(e, sum(rewards))
        plt.plot(total, color='blue')  # plot
        plt.pause(0.0001)
def run(i):  # to visualize performance
    for _ in range(i):
        done = False
        state = env.reset()
        while not done:
            env.render()
            distribution = Categorical(agent.forward(torch.from_numpy(state).float()))
            action = distribution.sample()
            state, reward, done, info = env.step(action.item())
    env.close()
Above is the discrete action-space code for CartPole, and the continuous action-space code for Pendulum follows below. Sigma (the standard deviation of the Normal distribution) is constant here, but it is easy to learn it as well: just give the last layer two output neurons and make sure sigma cannot go negative, as in the sketch after this paragraph. Again, the Pendulum code does not work well, because most environments with continuous action spaces are too complex for such a simple method; getting it to work would probably require a lot of hyperparameter tuning.
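A sketch of that learned-sigma variant (softplus is just one way to keep sigma positive; the class name is made up and it is not used in the script below):

import torch
import torch.nn as nn
import torch.nn.functional as F

class GaussianAgent(nn.Module):  # hypothetical variant with a learned sigma
    def __init__(self, lr):
        super(GaussianAgent, self).__init__()
        self.fc1 = nn.Linear(3, 64)
        self.fc2 = nn.Linear(64, 32)
        self.fc3 = nn.Linear(32, 2)  # two outputs: mean and pre-sigma
        self.optimizer = torch.optim.Adam(self.parameters(), lr=lr)

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        out = self.fc3(x)
        mu = torch.tanh(out[0]) * 2        # mean in [-2, 2] for Pendulum
        sigma = F.softplus(out[1]) + 1e-5  # softplus keeps sigma strictly positive
        return mu, sigma

# mu, sigma = GaussianAgent(0.01)(torch.randn(3)) would then feed Normal(mu, sigma)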
import torch
import torch.nn as nn
import torch.optim as optim
from torch.distributions.normal import Normal #continuous distribution
import numpy as np
import gym
import math
import matplotlib.pyplot as plt
import keyboard
class Agent(nn.Module):
    def __init__(self, lr):
        super(Agent, self).__init__()
        self.fc1 = nn.Linear(3, 64)
        self.fc2 = nn.Linear(64, 32)
        self.fc3 = nn.Linear(32, 1)  # neural network with layers 3,64,32,1
        self.optimizer = optim.Adam(self.parameters(), lr=lr)

    def forward(self, x):
        x = torch.relu(self.fc1(x))  # relu for the hidden layers, tanh for the output
        x = torch.relu(self.fc2(x))
        x = torch.tanh(self.fc3(x)) * 2  # scale to Pendulum's action range [-2, 2]
        return x
env = gym.make('Pendulum-v0')
agent = Agent(0.01) #hyperparameters
SIGMA = 0.2
DISCOUNT = 0.99
total = []
for e in range(1000):
    log_probs, rewards = [], []
    done = False
    state = env.reset()
    while not done:
        mu = agent.forward(torch.from_numpy(state).float())
        distribution = Normal(mu, SIGMA)
        action = distribution.sample().clamp(-2.0, 2.0)
        log_probs.append(distribution.log_prob(action))
        state, reward, done, info = env.step([action.item()])
        # reward = abs(state[1])
        rewards.append(reward)
    total.append(sum(rewards))

    cumulative = 0
    d_rewards = np.zeros(len(rewards))
    for t in reversed(range(len(rewards))):  # get discounted rewards
        cumulative = cumulative * DISCOUNT + rewards[t]
        d_rewards[t] = cumulative
    d_rewards -= np.mean(d_rewards)  # normalize
    d_rewards /= np.std(d_rewards)

    loss = 0
    for t in range(len(rewards)):
        loss += -log_probs[t] * d_rewards[t]  # loss is -log prob * discounted return
    agent.optimizer.zero_grad()
    loss.backward()  # update
    agent.optimizer.step()

    if e % 10 == 0:
        print(e, sum(rewards))
        plt.plot(total, color='blue')  # plot
        plt.pause(0.0001)
    if keyboard.is_pressed("space"):  # holding space exits training
        raise Exception("Exited")
def run(i):  # to visualize performance
    for _ in range(i):
        done = False
        state = env.reset()
        while not done:
            env.render()
            distribution = Normal(agent.forward(torch.from_numpy(state).float()), SIGMA)
            action = distribution.sample()
            state, reward, done, info = env.step([action.item()])
    env.close()
David Ireland also wrote the following about a different question of mine:

The algorithm doesn't change in this situation. Say your NN outputs the mean parameter of the Gaussian; then $\log \pi(a_t \mid s_t)$ is just the log of the normal density evaluated at the action you took, where the mean parameter in the density is the output of your NN. You can then backpropagate through this to update the weights of your network.
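To connect that to the code above, here is a small check (the numbers are made up) that Normal(...).log_prob gives exactly the log of that density:

import math
import torch
from torch.distributions.normal import Normal

mu = torch.tensor(0.5)   # pretend this is the network's output (made up)
sigma = 0.2              # the fixed standard deviation
a = torch.tensor(0.3)    # the action that was taken

# log of the normal density at the taken action, written out by hand
manual = -((a - mu) ** 2) / (2 * sigma ** 2) - math.log(sigma) - 0.5 * math.log(2 * math.pi)
auto = Normal(mu, sigma).log_prob(a)
print(manual.item(), auto.item())  # the two numbers agree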