Confusing offset in my Support Vector Regression and other models

data-mining regression svm prediction
2022-03-13 06:11:32

I am getting a strange offset in my Support Vector Regression predictions (code below).

Brief overview: I have a set of 10 .xls sheets as input data (each with 30 rows and 27 columns), plus an 11th .xls sheet that I use as the test dataset.

I am working with an electricity dataset and trying to predict the 11th month's consumption with SVR. I am using code written and published on GitHub by Luke Benning: https://github.com/lbenning/Load-Forecasting

All of my datasets have the same structure, but when I run the prediction model I get an offset (shown in the plot below). The actual and predicted series seem to follow a similar pattern, yet the two curves are shifted relative to each other.

Does anyone know what I might explore to figure out where this offset comes from?

[Figure: predicted vs. actual load curves, following the same pattern but shifted against each other]

#! /usr/bin/python

import math
import statistics   # the repo's statistics.py helper module, not the standard library
import visualizer   # the repo's plotting helper module
import numpy as np
from datagen import constructData
from sklearn import svm

# Applies Support Vector Regression to the electricity dataset,
# prints out the accuracy rate to the terminal and plots
# predictions against actual values
def suppVectorRegress():

    kernelList = ["linear","rbf",polyKernel]
    names = ["linear","radial basis","poly"]
    preds = []

    # Retrieve time series data & apply preprocessing
    data = constructData()

    # data is a (features, labels) tuple, so use len(data[0]) for the number
    # of examples; hold out the last 30 examples as the test set
    cutoff = len(data[0])-30
    xTrain = data[0][0:cutoff]
    yTrain = data[1][0:cutoff]
    xTest = data[0][cutoff:]
    yTest = data[1][cutoff:]

    # Fill in missing values denoted by zeroes as an average of
    # both neighbors
    statistics.estimateMissing(xTrain,0.0)
    statistics.estimateMissing(xTest,0.0)

    # Logarithmically scale the features and the training labels
    # (yTest stays in the original scale; predictions are exponentiated
    # back before scoring against it)
    xTrain = [[math.log(y) for y in x] for x in xTrain]
    xTest = [[math.log(y) for y in x] for x in xTest]
    yTrain = [math.log(x) for x in yTrain]

    # Detrend the time series
    indices = np.arange(len(data[1]))
    trainIndices = indices[0:cutoff]
    testIndices = indices[cutoff:]
    detrended,slope,intercept = statistics.detrend(trainIndices,yTrain)
    yTrain = detrended

    for gen in range(len(kernelList)):

        # Use SVR to predict test observations based upon training observations
        pred = svrPredictions(xTrain,yTrain,xTest,kernelList[gen])
        # Add the trend back into the predictions
        trendedPred = statistics.reapplyTrend(testIndices,pred,slope,intercept)
        # Reverse the normalization
        trendedPred = [math.exp(x) for x in trendedPred]
        # Compute the NRMSE
        err = statistics.normRmse(yTest,trendedPred)

        print ("The Normalized Root-Mean Square Error is " + str(err) + " using kernel " + names[gen] + "...")

        preds.append(trendedPred)

    names.append("actual")
    preds.append(yTest)

    # Change the parameters 2017,2,1 based on the month you want to predict.
    visualizer.comparisonPlot(2017,2,1,preds,names,plotName="Support Vector Regression Load Predictions vs. Actual", 
        yAxisName="Predicted Kilowatts")

# Construct a support vector machine and get predictions
# for the test set
# Returns a 1-d vector of predictions
def svrPredictions(xTrain,yTrain,xTest,k):
    clf = svm.SVR(C=2.0,kernel=k)
    clf.fit(xTrain,yTrain)
    return clf.predict(xTest)

# A scale-invariant kernel (note: only conditionally positive semi-definite)
def polyKernel(x,y):
    return (np.dot(x,y.T)+1.0)**0.95

if __name__=="__main__":
    suppVectorRegress()
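
For reference, the statistics module imported above is the repo's own statistics.py, not the standard library. Below is a minimal sketch of what its four helpers appear to do, paraphrased from how they are called here rather than copied from the repo:

import numpy as np

# Replace sentinel values (0.0 here) with the average of the nearest
# valid neighbors in each feature vector (in place).
def estimateMissing(xData, sentinel):
    for row in xData:
        for i, v in enumerate(row):
            if v == sentinel:
                left = next((row[j] for j in range(i - 1, -1, -1) if row[j] != sentinel), None)
                right = next((row[j] for j in range(i + 1, len(row)) if row[j] != sentinel), None)
                neighbors = [n for n in (left, right) if n is not None]
                if neighbors:
                    row[i] = sum(neighbors) / len(neighbors)

# Fit a linear trend y = slope*t + intercept and subtract it.
def detrend(indices, y):
    slope, intercept = np.polyfit(indices, y, 1)
    detrended = [y[i] - (slope * t + intercept) for i, t in enumerate(indices)]
    return detrended, slope, intercept

# Add the fitted trend back onto predictions made in detrended space.
def reapplyTrend(indices, preds, slope, intercept):
    return [preds[i] + (slope * t + intercept) for i, t in enumerate(indices)]

# RMSE normalized by the range of the actual values.
def normRmse(actual, predicted):
    actual = np.asarray(actual, dtype=float)
    predicted = np.asarray(predicted, dtype=float)
    rmse = np.sqrt(np.mean((actual - predicted) ** 2))
    return rmse / (actual.max() - actual.min())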

I am generating the data as follows:

'''
Functions for retrieving the Elia dataset
& forming training/testing datasets
'''

import xlrd
import numpy as np

# constructs dataset for simulations
# the last dataset in this file list is the one used as the test set
def constructData():
  files = ["data/jan_16_elec_scaled.xls", "data/feb_16_elec_scaled.xls", 
           "data/mar_16_elec_scaled.xls", "data/apr_16_elec_scaled.xls",
           "data/may_16_elec_scaled.xls","data/jun_16_elec_scaled.xls",
           "data/jul_16_elec_scaled.xls", "data/aug_16_elec_scaled.xls",
           "data/sep_16_elec_scaled.xls", "data/oct_16_elec_scaled.xls",
           "data/nov_16_elec_scaled.xls"]

#  files = ["data/jan_16_elec_NOscaled.xls", "data/feb_16_elec_NOscaled.xls", 
#           "data/mar_16_elec_NOscaled.xls", "data/apr_16_elec_NOscaled.xls",
#           "data/may_16_elec_NOscaled.xls","data/jun_16_elec_NOscaled.xls",
#           "data/jul_16_elec_NOscaled.xls", "data/aug_16_elec_NOscaled.xls",
#           "data/sep_16_elec_NOscaled.xls", "data/oct_16_elec_NOscaled.xls",
#           "data/nov_16_elec_NOscaled.xls"]         
  return labelSeries(loadSeries(files))

# constructs labelled data from a
# univariate time series
def labelSeries(series):
  xData = []
  yData = []
  for x in range(len(series)-1):
    xData.append(series[x]) # features: day x's readings
    yData.append(np.mean(series[x+1])) # label: the mean of the next day's readings
  return (xData,yData)

# arg1 : list of excel spreadsheets filenames
# returns : load univariate time series
def loadSeries(fileList):
  # Retrieve time series examples
  xData = []
  for fileName in fileList:
    book = xlrd.open_workbook(fileName)
    sheet = book.sheet_by_index(0)
    for rx in range(2,sheet.nrows):
      # skip the two header rows, the first 3 columns and the last 4 columns of each row
      row = sheet.row(rx)[3:]
      row = [row[x].value for x in range(0,len(row)-4)]
      xData.append(row)
  return xData
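
To make the supervised framing concrete: labelSeries pairs each day's row of readings (the features) with the mean of the following day's row (the label), so every example looks one step ahead. A quick sanity check with made-up numbers:

series = [[1.0, 2.0], [3.0, 5.0], [7.0, 9.0]]   # three "days" of readings
xData, yData = labelSeries(series)
print(xData)                      # [[1.0, 2.0], [3.0, 5.0]] -- days 0 and 1 as features
print([float(v) for v in yData])  # [4.0, 8.0] -- mean of day 1 and mean of day 2
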
1 Answer

See this answer by Xu Cui. As he says, the reason is simple: the model itself is just a very basic SVM. He suggests using a lag of 2 points to "improve" it and get a more powerful model. Tell me what you think of it. We may be overthinking this, and he may be right.
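
One way to test that idea before changing the model is to measure whether the predictions are essentially a delayed copy of the actuals, and whether they beat a naive persistence forecast (repeat yesterday's value). A minimal sketch, assuming yTest and trendedPred are the arrays from the code above:

import numpy as np

def best_lag(actual, predicted, max_lag=5):
    # Find the shift (in samples) that maximizes the correlation between
    # the two series; a clearly nonzero result means the model is mostly
    # reproducing a delayed copy of the signal.
    actual = np.asarray(actual, dtype=float)
    predicted = np.asarray(predicted, dtype=float)
    best, best_corr = 0, -np.inf
    for lag in range(-max_lag, max_lag + 1):
        if lag > 0:      # predicted trails actual by `lag` steps
            a, p = actual[:-lag], predicted[lag:]
        elif lag < 0:    # predicted leads actual
            a, p = actual[-lag:], predicted[:lag]
        else:
            a, p = actual, predicted
        corr = np.corrcoef(a, p)[0, 1]
        if corr > best_corr:
            best, best_corr = lag, corr
    return best, best_corr

def persistence_nrmse(actual):
    # Baseline that predicts each value as the previous observation. If the
    # SVR's NRMSE is no better than this, it has learned little beyond
    # "repeat the last value".
    actual = np.asarray(actual, dtype=float)
    rmse = np.sqrt(np.mean((actual[1:] - actual[:-1]) ** 2))
    return rmse / (actual.max() - actual.min())

lag, corr = best_lag(yTest, trendedPred)
print("Best alignment at lag %d (corr %.3f)" % (lag, corr))
print("Persistence NRMSE: %.3f" % persistence_nrmse(yTest))

If best_lag reports a nonzero shift and the SVR's NRMSE is close to the persistence baseline, that would support the answer's diagnosis: the model is effectively predicting the previous observation, which shows up as the offset in the plot.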