TFLearn "nan" weight matrix

data-mining reinforcement-learning python q-learning
2022-03-13 05:51:52

I want to build a DQN, so I followed this code along with some videos about the idea behind DQNs. My code looks like this (mine is written in tflearn, while his is written in Keras):

import tflearn as tfl
import numpy as np
import gym
from collections import deque
import random

class DeepQ():
    def __init__(self,game="SpaceInvaders-v0"):
        self.game=game
        self.env=gym.make(game)
        self.storage=deque()                 # replay memory
        self.filter_size=[4,4]
        self.itertime=1000                   # number of observation steps
        self.random_move_prop=0.8            # probability of a random (exploration) move
        np.random.seed(1)
        self.minibatch_size=250
        self.discounted_future_reward=0.9    # discount factor (gamma)

    def Q_Network(self,learning_rate=0.0000001,load=False,model_path=None,checkpoint_path="X://xxx//xxx//Documents//GitHub//Deeplearning_for_starters//Atari_modells//checkpoint.ckpt"):

        if load==False:
            net=tfl.layers.core.input_data(shape=[None,210,160,3])# rework this stuff
            net=tfl.layers.conv.conv_2d(net,nb_filter=3,filter_size=self.filter_size,activation='relu')
            net=tfl.layers.conv.conv_2d(net,nb_filter=3,filter_size=self.filter_size,activation="relu")
            #net=tfl.layers.fully_connected(net,20,activation="relu")
            net=tfl.layers.flatten(net)
            #net=tfl.layers.fully_connected(net,18,activation="relu")
            net=tfl.layers.fully_connected(net,10,activation='relu')
            net=tfl.layers.fully_connected(net,self.env.action_space.n,activation="linear")
            net=tfl.layers.estimator.regression(net,learning_rate=learning_rate)
            self.modell=tfl.DNN(net,checkpoint_path=checkpoint_path)
        else:
            net=tfl.layers.core.input_data(shape=[None,210,160,3])
            net=tfl.layers.conv.conv_2d(net,nb_filter=3,filter_size=self.filter_size,activation='relu')
            net=tfl.layers.conv.conv_2d(net,nb_filter=3,filter_size=self.filter_size,activation="relu")
            #net=tfl.layers.fully_connected(net,20,activation="relu")
            net=tfl.layers.flatten(net)
            #net=tfl.layers.fully_connected(net,18,activation="relu")
            net=tfl.layers.fully_connected(net,10,activation='relu')
            net=tfl.layers.fully_connected(net,self.env.action_space.n,activation="linear")
            net=tfl.layers.estimator.regression(net,learning_rate=learning_rate)
            self.modell=tfl.DNN(net)
            self.modell.load(model_path,weights_only=True)
    def Q_Learning(self):
        observation=self.env.reset()
        for i in range(self.itertime):
            #self.env.render()
            observation=observation.reshape(1,210,160,3)
            if np.random.rand()<=self.random_move_prop:
                #print("Random step")
                action=np.random.randint(low=0,high=self.env.action_space.n)
            else:
                #print("Model prediction") # useful for debugging
                action=self.modell.predict(observation)
                action=np.argmax(action)
            new_observation, reward, done, info=self.env.step(action)
            self.storage.append((observation,action,reward,new_observation,done))
            observation=new_observation
            if done:
                observation=self.env.reset() # start the next episode from its first frame
        print("###############################################")
        print("Done with observing!")
        print("###############################################")
        minibatch=random.sample(self.storage,self.minibatch_size)# take random observations from our data
        x=np.zeros((self.minibatch_size,)+observation.shape)
        y=np.zeros((self.minibatch_size,self.env.action_space.n))
        for i in range(0,self.minibatch_size):
            Observation=minibatch[i][0]
            Action=minibatch[i][1]
            Reward=minibatch[i][2]
            New_observation=minibatch[i][3]
            done=minibatch[i][4]
            print("Processing batch data... (step: "+str(i)+" of "+str(self.minibatch_size)+")")
            x[i:i+1]=Observation.reshape((1,)+observation.shape)
            y[i]=self.modell.predict(Observation)
            # the Q-target uses the predicted values of the *next* state
            Q_sa=self.modell.predict(New_observation.reshape((1,)+observation.shape))
            if done:
                y[i,Action]=Reward
            else:
                y[i,Action]=Reward+self.discounted_future_reward*np.max(Q_sa)
        # train once on the assembled minibatch
        self.modell.fit_batch(x,y)
        self.modell.save("X://xxx//xxx//xxx//SpaceInvaders1.tfl")
        print("")
        print("Model fitting accomplished!")
        print("")
    def Q_predict(self,model_path="Your path here"):
        self.Q_Network(load=True,model_path=model_path)
        observation=self.env.reset()
        observation=observation.reshape((1,)+observation.shape)
        done=False
        total_reward=0.0
        while not done:
            self.env.render()
            Q=self.modell.predict(observation)
            print(Q)
            action=np.argmax(Q)
            print(action)
            new_observation,reward,done,info=self.env.step(action)
            observation=new_observation.reshape((1,)+new_observation.shape)
            total_reward+=reward
        print("Game ends with a score of: "+str(total_reward))
        print("")

The problem is that when I run the prediction function, the network does nothing. I found that all the weights are nan. I read that this can be caused by the learning rate, so I lowered it from 1e-3 down to the value you can see in the code, but that changed nothing.

1 Answer

So, I figured it out. The problem was the loss function. I found a similar question here. Since I am a tflearn newbie, I didn't know whether you can change the loss function to a custom one (I think you can). I switched to mean_square (mean squared error), and that solved my problem. I would appreciate it if someone could explain this issue so that I can understand it better.
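
For reference, here is a minimal sketch of what the fixed network definition could look like with the loss passed explicitly; the layer stack mirrors the question, and the learning rate is only illustrative. A likely explanation for the nan weights is that tflearn's regression layer defaults to loss='categorical_crossentropy', which assumes probability-like outputs (non-negative, summing to one); fed unbounded linear Q-values and raw reward targets, its normalization and log terms become invalid and the weights diverge to nan, whereas mean_square regresses the targets directly against the linear outputs.

import tflearn as tfl
import gym

env = gym.make("SpaceInvaders-v0")

# Same architecture as in the question, only the loss is set explicitly.
net = tfl.layers.core.input_data(shape=[None, 210, 160, 3])
net = tfl.layers.conv.conv_2d(net, nb_filter=3, filter_size=[4, 4], activation="relu")
net = tfl.layers.conv.conv_2d(net, nb_filter=3, filter_size=[4, 4], activation="relu")
net = tfl.layers.flatten(net)
net = tfl.layers.fully_connected(net, 10, activation="relu")
net = tfl.layers.fully_connected(net, env.action_space.n, activation="linear")

# tflearn's regression layer defaults to loss='categorical_crossentropy';
# 'mean_square' is the regression loss that matches Q-value targets.
net = tfl.layers.estimator.regression(net, learning_rate=1e-3, loss="mean_square")

model = tfl.DNN(net)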