I've been trying to train a DDQN to play OpenAI Gym's CartPole-v1, but I've found that although it starts off well and begins repeatedly hitting the maximum score of 500 (at around episode 600 in the plot below), it then seems to go off the rails and performs worse the longer it trains.
I'm still fairly new to ML, so I'm not sure what's causing this or how to start debugging it (I've tried tweaking some of the hyperparameters, but nothing seems to stop the trend).
In case it helps, here are the (probably) relevant parts of my agent:
def _build_model(self):
    model = Sequential()
    model.add(Dense(24, input_dim=self.state_size, activation="relu"))
    model.add(Dense(24, activation="relu"))
    model.add(Dense(self.action_size, activation="linear"))
    model.compile(optimizer=Adam(lr=self.learning_rate), loss="mse")
    return model
def get_action(self, state):
    # Use random exploration for the current rate.
    if np.random.rand() < self.epsilon:
        return random.randrange(self.action_size)
    # Otherwise use the model to predict the rewards and select the max.
    q_values = self.model.predict(state)
    return np.argmax(q_values[0])
def replay(self, batch_size):
    # Don't train until there are enough samples for a full batch.
    if len(self.memory) < batch_size:
        return
    # Decay the exploration rate.
    self.epsilon *= self.epsilon_decay
    self.epsilon = max(self.epsilon_min, self.epsilon)
    minibatch = random.sample(self.memory, batch_size)
    state_batch, q_values_batch = [], []
    for state, action, reward, next_state, done in minibatch:
        # Get predictions for all actions for the current state.
        q_values = self.model.predict(state)
        # If we're not done, add on the future predicted reward at the discounted rate.
        if done:
            q_values[0][action] = reward
        else:
            future_reward = max(self.target_model.predict(next_state)[0])
            q_values[0][action] = reward + self.gamma * future_reward
        state_batch.append(state[0])
        q_values_batch.append(q_values[0])
    # Re-fit the model to move it closer to this newly calculated reward.
    self.model.fit(np.array(state_batch), np.array(q_values_batch), batch_size=batch_size, epochs=1, verbose=0)
    self.update_weights()
def update_weights(self):
    # Soft-update: blend the online network's weights into the target network.
    weights = self.model.get_weights()
    target_weights = self.target_model.get_weights()
    for i in range(len(target_weights)):
        target_weights[i] = weights[i] * self.tau + target_weights[i] * (1 - self.tau)
    self.target_model.set_weights(target_weights)
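For context, the loop that drives these methods looks roughly like this (a simplified sketch; the agent class name, episode count, batch size, and once-per-episode replay call here are placeholders, and it assumes the old gym API where reset() returns just the observation — the exact version is in the notebook):

import gym

env = gym.make("CartPole-v1")
state_size = env.observation_space.shape[0]
action_size = env.action_space.n
agent = DDQNAgent(state_size, action_size)  # placeholder name for the agent class above

for episode in range(1000):
    state = env.reset().reshape(1, state_size)
    for step in range(500):
        action = agent.get_action(state)
        next_state, reward, done, _ = env.step(action)
        next_state = next_state.reshape(1, state_size)
        # Store the transition for experience replay.
        agent.memory.append((state, action, reward, next_state, done))
        state = next_state
        if done:
            break
    # Train on a random minibatch of stored transitions.
    agent.replay(batch_size=64)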
The full notebook is here.
