数据挖掘 - Keras 的 fit_generator() 没有调用我的生成器 - 吾爱随笔录

当我调用 Keras'fit_generator()并传入我创建的自定义生成器类时，我在 spew 中看到“Epoch 1/1”，仅此而已。它就挂在那里，并且永远不会调用生成器。我知道这一点，因为我将打印语句放入getitem从未打印过的语句中。

此数据生成器是Shervine Amidi 的教程示例的修改版本，该示例生成器继承自 Keras 序列对象：

class DataGenerator(keras.utils.Sequence):
    'Generates data for Keras'
    def __init__(self, batchID, 
                 batch_size = 32, 
                 dim = (32,32,32)):

        self.dim = dim
        self.batch_size = batch_size
        self.datafile_IDs = []
        self.labelfile_IDs = []
        self.batchID = batchID    
        self.DataDir = "data/"
        self.BatchDir = ""

        DataDir = self.DataDir
        BatchDir = DataDir + batchID + "/"
        self.BatchDir = BatchDir

        path = BatchDir + "datafilenames_" + batchID + ".pkl"                                    
        fd = open(path, "rb")
        self.datafile_IDs = pkl.load(fd)
        fd.close()

        path = BatchDir + "labelfilenames_" + batchID + ".pkl"        
        fd = open(path, "rb")
        self.labelfile_IDs = pkl.load(fd)
        fd.close()

    def __len__(self):
        'Denotes the number of batches per epoch'
        return int(
            np.floor(len(self.datafile_IDs) / self.batch_size))

    def __getitem__(self, index):

        'Generate one batch of data'

        datafn = self.datafile_IDs[index]        
        labelfn = self.labelfile_IDs[index]  

        print("In getitem: index = %d, datafn = %s, labelfn = %s" % (
               index, datafn, labelfn))

        batch_size = self.batch_size

        # Initialize data arrays for this batch     
        X = np.empty((self.batch_size, *self.dim))
        y = np.empty((self.batch_size), dtype=int)         

        BatchDir = self.BatchDir

        # Load data
        datafn = BatchDir + datafn
        X = np.load(datafn)

        # Load label
        labelfn = BatchDir + labelfn
        y = np.load(labelfn)

        return X, y        

genbatchfiles(df_short, batchID = "short", batch_size = 20)
params = {'batchID': "short", 'batch_size': 20, 'dim': (100, 10088)} 
dg = DataGenerator(**params) 
time_series_length, input_dim, output_dim = 100, 10088, 1
model = Sequential()
model.add(LSTM(20, input_shape=(time_series_length, input_dim))) 
model.add(Dense(output_dim, activation='relu'))

model.compile(loss='mean_squared_error',
              optimizer='sgd',
              metrics=['accuracy'])
model.fit_generator(generator = dg,
                    steps_per_epoch = 5,
                    use_multiprocessing = True, 
                    workers = 6,
                    verbose = 2)

根据下面回复的人的请求，我正在为用于genbatchfiles生成输入数据的函数添加代码：

def genbatchfiles(df, batchID, batch_size = 5):

    DataDir = "data/"
    BatchDir = DataDir + batchID + "/"

    # If directory does not already exist
    # for this batch, create it.
    if not os.path.isdir(BatchDir):
        mkdir(BatchDir)

    # Column "sigccm" (signal cross correlation matrix) holds
    # a series each of whose elements is a 100 X 10088 array.
    # Column "Attack" holds labels with values 1's and 0's that
    # indicate whether a cell phone spoofing signal is present
    sigccm = df["sigccm"]
    attack = df["Attack"]

    idcnt = 0
    datafiles = []
    labelfiles = []

    # number of records to process
    nrecs = len(sigccm)

    for i in range(0, nrecs, batch_size):

        if i + batch_size > nrecs:
            upperbound = nrecs 
        else: 
            upperbound = i + batch_size

        x = np.stack(sigccm[i : upperbound]) 
        y = np.stack(attack[i : upperbound]) 

        fnm = 'data_{:s}{:02d}.npy'.format(batchID, idcnt)
        datafiles.append(fnm)        
        np.save(BatchDir + fnm, x)

        fnm = 'labels_{:s}{:02d}.npy'.format(batchID, idcnt)
        labelfiles.append(fnm)        
        np.save(BatchDir + fnm, y)

        idcnt = idcnt + 1

    path = BatchDir + "datafilenames_" + batchID + ".pkl"

    fd = open(path, "wb")
    pkl.dump(datafiles, fd)
    fd.close() 

    path = BatchDir + "labelfilenames_" + batchID + ".pkl"

    fd = open(path, "wb")
    pkl.dump(labelfiles, fd)
    fd.close()

以下是如何生成一些与我输入 Keras LSTM 的数据具有相同形状的虚假数据的方法：

x = np.random.rand(150, 100, 10088).tolist()
df = pd.DataFrame({"sigccm" : x})
bools = np.round(np.random.rand(150), decimals=0)
attack = pd.Series(bools)
df["Attack"] = attack

其次是：

genbatchfiles(df, batchID = "phony", batch_size = 20)
params = {'batchID': "phony", 'batch_size': 20, 'dim': (100, 10088)} 
dg = DataGenerator(**params)

其次是 LSTM 代码。

当我将这些虚假数据输入 LSTM 时，Keras 挂在生成器中的方式与挂在真实数据中的方式完全相同，因此任何希望这样做的人都应该能够重现我看到的错误。

警告：即使这个玩具数据集也很大。每条记录大约 1 兆字节，而我上面生成的虚假数据有 150 条记录，因此它包含大约 150MB。