I am trying to implement policy gradient for tic-tac-toe. I took code that works for generic environments (e.g. CartPole-v0) and adapted it to my tic-tac-toe game. But it is not learning. There are no errors, the results are just bad.
RandomPlayer("Player X") vs PolicyAgent("Player O")
As you can see, the policy agent has not learned anything after 500 battles. Each battle consists of 100 games against the random player, so 500 * 100 games in total.
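For context, the battle loop is roughly equivalent to the sketch below; RandomPlayer, play_game and the exact result convention are simplified stand-ins for my project code, not the real implementation.

opponent = RandomPlayer("Player X")
agent = PolicyAgent("Player O")
agent.start(learning_rate=0.001, gamma=0.1)

for battle in range(500):
    for game in range(100):
        # play_game calls agent.turn(board, availableMoves) for every move of O
        # and returns the final result in the same convention that learn() expects
        result = play_game(opponent, agent)
        agent.learn(result)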
Can someone point out the problem or mistake in my code, or tell me what I need to improve? I can't figure it out. That would be great.
Here is also a project that does the same thing I want to do, but successfully: https://github.com/fcarsten/tic-tac-toe/blob/master/tic_tac_toe/DirectPolicyAgent.py I don't see what I am doing differently.
Code:
Packages:
import torch
import torch as T
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
import gym
from gym import wrappers
Neural network:
class PolicyNetwork(nn.Module):
    def __init__(self, lr, input_dims, fc1_dims, fc2_dims, n_actions):
        super(PolicyNetwork, self).__init__()
        self.input_dims = input_dims
        self.lr = lr
        self.fc1_dims = fc1_dims
        self.fc2_dims = fc2_dims
        self.n_actions = n_actions
        self.fc1 = nn.Linear(self.input_dims, self.fc1_dims)
        self.fc2 = nn.Linear(self.fc1_dims, self.fc2_dims)
        self.fc3 = nn.Linear(self.fc2_dims, self.n_actions)
        self.optimizer = optim.Adam(self.parameters(), lr=lr)

    def forward(self, observation):
        state = T.Tensor(observation)
        x = F.relu(self.fc1(state))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x
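A quick shape check of the network on a dummy encoded board (not part of the training code, just how I verify the output):

net = PolicyNetwork(lr=0.001, input_dims=27, fc1_dims=243, fc2_dims=91, n_actions=9)
dummy = np.zeros((1, 27), dtype=np.float32)   # one encoded board
logits = net.forward(dummy)                   # shape (1, 9): one raw score per cell
probs = F.softmax(logits, dim=-1)             # probabilities over the 9 moves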
Policy agent:
class PolicyAgent:
    def __init__(self, player_name):
        self.name = player_name
        self.value = PLAYER[self.name]   # PLAYER maps the player name to its board value (defined elsewhere in my code)

    def board_to_input(self, board):
        # Encode the board as 27 binary features:
        # 9 for my pieces, 9 for the opponent's pieces, 9 for empty cells.
        input_ = np.array([0] * 27)
        for i, val in enumerate(board):
            if val == self.value:
                input_[i] = 1
            if val == self.value * -1:
                input_[i + 9] = 1
            if val == 0:
                input_[i + 18] = 1
        return np.reshape(input_, (1, -1))
    def start(self, learning_rate=0.001, gamma=0.1):
        self.lr = learning_rate
        self.gamma = gamma
        self.all_moves = list(range(0, 9))
        self.policy = PolicyNetwork(self.lr, 27, 243, 91, 9)
        self.reward_memory = []
        self.action_memory = []
    def turn(self, board, availableMoves):
        state = self.board_to_input(board.copy())
        prob = F.softmax(self.policy.forward(state))
        action_probs = torch.distributions.categorical.Categorical(prob)
        action = action_probs.sample()
        # Resample until the sampled cell is actually free.
        while action.item() not in availableMoves:
            state = self.board_to_input(board.copy())
            prob = F.softmax(self.policy.forward(state))
            action_probs = torch.distributions.categorical.Categorical(prob)
            action = action_probs.sample()
        log_probs = action_probs.log_prob(action)
        self.action_memory.append(log_probs)
        self.reward_memory.append(0)   # intermediate moves get reward 0
        return action.item()
    def learn(self, result):
        # Final reward: 0.5 for a draw, 1.0 for a win, 0 for a loss.
        if result == 0:
            reward = 0.5
        elif result == self.value:
            reward = 1.0
        else:
            reward = 0
        self.reward_memory.append(reward)
        #print(self.reward_memory)

        self.policy.optimizer.zero_grad()

        #G = np.zeros_like(self.action_memory, dtype=np.float64)
        G = np.zeros_like(self.reward_memory, dtype=np.float64)
        #running_add = reward
        #for t in reversed(range(0, len(self.action_memory))):
        #    G[t] = running_add
        #    running_add = running_add * self.gamma
        #'''
        running_add = 0
        for t in reversed(range(0, len(self.reward_memory))):
            if self.reward_memory[t] != 0:
                running_add = 0
            running_add = running_add * self.gamma + self.reward_memory[t]
            G[t] = running_add

        # This second loop overwrites G with the plain discounted returns.
        for t in range(len(self.reward_memory)):
            G_sum = 0
            discount = 1
            for k in range(t, len(self.reward_memory)):
                G_sum += self.reward_memory[k] * discount
                discount *= self.gamma
            G[t] = G_sum

        # Standardize the returns.
        mean = np.mean(G)
        std = np.std(G) if np.std(G) > 0 else 1
        G = (G - mean) / std
        #'''
        G = T.tensor(G, dtype=T.float)

        loss = 0
        for g, logprob in zip(G, self.action_memory):
            loss += -g * logprob
        loss.backward()
        self.policy.optimizer.step()

        self.reward_memory = []
        self.action_memory = []
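For concreteness, this is what the uncommented return loop in learn() computes for a game where I made four moves and then won (reward_memory ends up as [0, 0, 0, 0, 1.0] because the final reward is appended after the per-move zeros):

rewards = [0, 0, 0, 0, 1.0]
gamma = 0.1
G = [sum(r * gamma ** k for k, r in enumerate(rewards[t:])) for t in range(len(rewards))]
# G -> [0.0001, 0.001, 0.01, 0.1, 1.0] (up to float rounding), before the mean/std standardization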

