为未来时间序列预测 LSTM(Keras)重新格式化数据

数据挖掘 keras 时间序列 lstm
2022-02-25 06:33:30

我正在参照这个笔记本(至少数据处理部分是照着它做的),并且已经得到了一个训练好的模型。

我想做的是真正的未来预测,因为 LSTM 能够为看不见的时间步生成数据(如果它们是有状态的)。

我正尝试按照这篇文章的思路来实现,因为这是我在这个概念上唯一能找到的资料。

这是原始的函数:

def load_data(filename, sequence_length):
    """Load a numeric CSV and build normalized sliding windows for training/testing.

    The file is read with pandas, every zero is replaced by the value one row
    earlier in the same column, and the rows are cut into overlapping windows
    of ``sequence_length`` consecutive rows.  Each window is normalized
    relative to its own first row (``value / first - 1``); the first timestep
    of every normalized window is left at 0.

    The first 90% of the windows are shuffled and used for training; the
    remaining windows stay in file order and are used for testing.  In both
    sets the input is every timestep but the last, and the target is column 7
    of the last timestep.

    Args:
        filename: CSV path (anything ``pd.read_csv`` accepts).  Every column
            is parsed as float and at least 8 columns are required, because
            column index 7 is used as the target.
        sequence_length: Number of consecutive rows per window, including the
            final row that supplies the target value.

    Returns:
        Tuple ``(X_train, Y_train, X_test, Y_test, Y_daybefore,
        unnormalized_bases, window_size)``:

        * ``X_train`` / ``X_test``: windows without their last timestep.
        * ``Y_train`` / ``Y_test``: column 7 of each window's last timestep.
        * ``Y_daybefore``: column 7 of each test window's second-to-last
          timestep.
        * ``unnormalized_bases``: un-normalized column 7 of the first row of
          the windows from index 2400 onward.
        * ``window_size``: ``sequence_length - 1``.
    """
    # NOTE(review): presumably the price column -- confirm against the CSV layout.
    target_col = 7

    # Read the data file
    raw_data = pd.read_csv(filename, dtype=float).values
    print(raw_data)

    # Change all zeros to the number before the zero occurs.
    # Rows are processed in order, so a run of zeros inherits the last
    # non-zero value above it.
    # NOTE(review): for row 0, ``row - 1`` is -1, so a zero in the first row
    # is filled from the last row of the file.
    n_rows, n_cols = raw_data.shape
    for row in range(n_rows):
        for col in range(n_cols):
            if raw_data[row, col] == 0:
                raw_data[row, col] = raw_data[row - 1, col]

    # Convert the data to a 3D array (a x b x c)
    # Where a is the number of windows, b is the window size, and c is the
    # number of features in the data file
    windows = [
        raw_data[first:first + sequence_length]
        for first in range(len(raw_data) - sequence_length)
    ]

    # Normalizing data by going through each window
    # Every value in the window is divided by the first value in the window,
    # and then 1 is subtracted
    d0 = np.array(windows)
    dr = np.zeros_like(d0)
    dr[:, 1:, :] = d0[:, 1:, :] / d0[:, 0:1, :] - 1

    # Keeping the unnormalized prices for Y_test
    # Useful when graphing bitcoin price over time later
    # NOTE(review): 2400 is a fixed offset, not derived from the split below;
    # it only lines up with the test windows if the split falls at 2400.
    start = 2400
    end = int(dr.shape[0] + 1)
    unnormalized_bases = d0[start:end, 0:1, target_col]

    # Splitting data set into training (first 90% of windows) and testing
    # data (last 10% of windows)
    split_line = int(round(0.9 * dr.shape[0]))
    training_data = dr[:split_line, :]

    # Shuffle the training windows in place.  ``training_data`` is a view, so
    # the first ``split_line`` rows of ``dr`` are reordered too; the test rows
    # are not touched.
    np.random.shuffle(training_data)

    # Training data: all but the last timestep as input, last timestep as target
    X_train = training_data[:, :-1]
    Y_train = training_data[:, -1, target_col]

    # Testing data.  Index from the end of the window so the target is always
    # the last timestep (matching Y_train) for any sequence_length; a fixed
    # index is only correct for one particular window length.
    X_test = dr[split_line:, :-1]
    Y_test = dr[split_line:, -1, target_col]

    # Get the day before Y_test's price
    Y_daybefore = dr[split_line:, -2, target_col]

    # The last value of each window is reserved as the y value
    window_size = sequence_length - 1

    return X_train, Y_train, X_test, Y_test, Y_daybefore, unnormalized_bases, window_size

下面是我根据那篇帖子尝试重构后的版本:

def load_data_future(filename, sequence_length):
    """Load a numeric CSV and build normalized windows using all data for training.

    The file is read with pandas, every zero is replaced by the value one row
    earlier in the same column, and the rows are cut into overlapping windows
    of ``sequence_length`` consecutive rows.  Each window is normalized
    relative to its own first row (``value / first - 1``); the first timestep
    of every normalized window is left at 0.

    All windows are used for training.  The training arrays are taken from a
    shuffled copy, so the returned ``entire_data`` keeps the windows in their
    original (file) order.

    Args:
        filename: CSV path (anything ``pd.read_csv`` accepts).  Every column
            is parsed as float and at least 8 columns are required, because
            column index 7 is read.
        sequence_length: Number of consecutive rows per window.

    Returns:
        Tuple ``(X_train, Y_train, Y_daybefore, unnormalized_bases,
        window_size, entire_data)``:

        * ``X_train``: shuffled windows without their last timestep.
        * ``Y_train``: timestep 1 (all features) of the same shuffled windows.
        * ``Y_daybefore``: always empty, because no windows are held out;
          kept so the return signature stays the same.
        * ``unnormalized_bases``: un-normalized column 7 of the first row of
          the windows from index 2400 onward.
        * ``window_size``: ``sequence_length - 1``.
        * ``entire_data``: all normalized windows, in file order.
    """
    # NOTE(review): presumably the price column -- confirm against the CSV layout.
    target_col = 7

    # Read the data file
    raw_data = pd.read_csv(filename, dtype=float).values
    print(raw_data)

    # Change all zeros to the number before the zero occurs.
    # Rows are processed in order, so a run of zeros inherits the last
    # non-zero value above it.
    # NOTE(review): for row 0, ``row - 1`` is -1, so a zero in the first row
    # is filled from the last row of the file.
    n_rows, n_cols = raw_data.shape
    for row in range(n_rows):
        for col in range(n_cols):
            if raw_data[row, col] == 0:
                raw_data[row, col] = raw_data[row - 1, col]

    # Convert the data to a 3D array (a x b x c)
    # Where a is the number of windows, b is the window size, and c is the
    # number of features in the data file
    windows = [
        raw_data[first:first + sequence_length]
        for first in range(len(raw_data) - sequence_length)
    ]

    # Normalizing data by going through each window
    # Every value in the window is divided by the first value in the window,
    # and then 1 is subtracted
    d0 = np.array(windows)
    dr = np.zeros_like(d0)
    dr[:, 1:, :] = d0[:, 1:, :] / d0[:, 0:1, :] - 1

    # Keeping the unnormalized prices
    # Useful when graphing bitcoin price over time later
    # NOTE(review): 2400 is a fixed offset unrelated to the data size here.
    start = 2400
    end = int(dr.shape[0] + 1)
    unnormalized_bases = d0[start:end, 0:1, target_col]

    # Every window is used for training (no held-out test split)
    split_line = dr.shape[0]

    # Shuffle a copy: slicing ``dr`` yields a view, and shuffling that view in
    # place would also scramble ``dr`` -- which is returned as ``entire_data``
    # and must stay in chronological order.
    training_data = dr[:split_line, :].copy()
    np.random.shuffle(training_data)

    # Reformatting training data
    X_train = training_data[:, :-1]
    # NOTE(review): this selects timestep 1 of each window (all features),
    # which is also contained in X_train.  If the goal is a next-step target
    # for every input timestep, ``training_data[:, 1:]`` would be the shifted
    # sequence -- left unchanged here because it alters the returned shape.
    Y_train = training_data[:, 1]

    # Nothing is held out, so this slice has zero rows.  The timestep is
    # indexed from the end so it is valid for any sequence_length >= 2.
    Y_daybefore = dr[split_line:, -2, target_col]

    window_size = sequence_length - 1
    entire_data = dr

    return X_train, Y_train, Y_daybefore, unnormalized_bases, window_size, entire_data

我究竟做错了什么??我可以看到每个特征的格式化样本、输出、时间步长。不知道如何操纵它来实现我的目标。

哪怕只是在评论里给一些大方向上的提示,也会很有帮助!

0个回答
没有发现任何回复~