I am completely new to the topic of data science. With the help of the following resources, I think I have managed to do a very simple and basic linear regression on the train dataset:
The Python code with which I actually perform the calculation (written as an IPython notebook) looks like this:
### Stage 0: "Import some stuff"
%matplotlib inline
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn import datasets, linear_model
from sklearn.linear_model import LinearRegression
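# (Note: the 'datasets' and 'linear_model' imports end up unused below;
# only LinearRegression is actually called.)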
### Stage 1: "Prepare train dataset"
my_train_dataset = pd.read_csv("../train.csv")
### remove categorical cols
only_numerical_train_dataset = my_train_dataset.loc[:, my_train_dataset.dtypes!=object]
### remove 'Id' and 'SalePrice' columns
my_train_dataset_X = only_numerical_train_dataset.drop(['Id','SalePrice'], axis = 1)
### insert median into cells with missing values
print("Before: Number of cells with missing values in train data: " + str(np.sum(np.sum(my_train_dataset_X.isnull()))))
null_values_per_col = np.sum(my_train_dataset_X.isnull(), axis=0)
cols_to_impute = []
for key in null_values_per_col.keys():
    if null_values_per_col.get(key) != 0:
        cols_to_impute.append(key)
print("Before: Need to replace values in the columns in train data: " + str(cols_to_impute) + "\n")
imputation_val_for_na_cols = dict()
for col in cols_to_impute:
    if (my_train_dataset_X[col].dtype == 'float64') or (my_train_dataset_X[col].dtype == 'int64'):
        # numerical column: impute with the median
        imputation_val_for_na_cols[col] = np.nanmedian(my_train_dataset_X[col])
for key, val in imputation_val_for_na_cols.items():
    my_train_dataset_X[key].fillna(value=val, inplace=True)
print("After: Number of cells with missing values in train data: " + str(np.sum(np.sum(my_train_dataset_X.isnull()))))
null_values_per_col = np.sum(my_train_dataset_X.isnull(), axis=0)
cols_to_impute = []
for key in null_values_per_col.keys():
    if null_values_per_col.get(key) != 0:
        cols_to_impute.append(key)
print("After: Need to replace values in the columns in train data: " + str(cols_to_impute) + "\n")
### Stage 2: "Sanity Check - the better the quality, the higher the price?"
plt.scatter(my_train_dataset.OverallQual, my_train_dataset.SalePrice)
plt.xlabel("Overall Quality of the house")
plt.ylabel("Price of the house")
plt.title("Relationship between Price and Quality")
plt.show()
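# If the intuition behind this sanity check holds, the scatter above should
# trend upward: higher OverallQual going with higher SalePrice.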
### Stage 3: "Prepare the test dataset"
my_test_dataset = pd.read_csv("../test.csv")
### remove categorical cols
only_numerical_test_dataset = my_test_dataset.loc[:, my_test_dataset.dtypes!=object]
### remove 'Id' column
my_test_dataset_X = only_numerical_test_dataset.drop(['Id'], axis = 1)
### insert median into cells with missing values
print("Before: Number of cells with missing values in test data: " + str(np.sum(np.sum(my_test_dataset_X.isnull()))))
null_values_per_col = np.sum(my_test_dataset_X.isnull(), axis=0)
cols_to_impute = []
for key in null_values_per_col.keys():
    if null_values_per_col.get(key) != 0:
        cols_to_impute.append(key)
print("Before: Need to replace values in the columns in test data: " + str(cols_to_impute) + "\n")
imputation_val_for_na_cols = dict()
for col in cols_to_impute:
    if (my_test_dataset_X[col].dtype == 'float64') or (my_test_dataset_X[col].dtype == 'int64'):
        # numerical column: impute with the median
        imputation_val_for_na_cols[col] = np.nanmedian(my_test_dataset_X[col])
for key, val in imputation_val_for_na_cols.items():
    my_test_dataset_X[key].fillna(value=val, inplace=True)
print("After: Number of cells with missing values in test data: " + str(np.sum(np.sum(my_test_dataset_X.isnull()))))
null_values_per_col = np.sum(my_test_dataset_X.isnull(), axis=0)
cols_to_impute = []
for key in null_values_per_col.keys():
    if null_values_per_col.get(key) != 0:
        cols_to_impute.append(key)
print("After: Need to replace values in the columns in test data: " + str(cols_to_impute) + "\n")
### Stage 4: "Apply the model"
lm = LinearRegression()
lm.fit(my_train_dataset_X, my_train_dataset.SalePrice)
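# lm now holds one fitted coefficient per numerical feature column (36 of
# them, per the shapes below) plus an intercept, exposed as lm.coef_ and
# lm.intercept_.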
### Stage 5: "Sanity Check - the better the quality, the higher the predicted SalesPrice?"
plt.scatter(my_test_dataset.OverallQual, lm.predict(my_test_dataset_X))
plt.xlabel("Overall Quality of the house in test data")
plt.ylabel("Price of the house in test data")
plt.title("Relationship between Price and Quality in test data")
plt.show()
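# As in Stage 2, an upward trend here would suggest the fitted model at
# least preserves the quality/price relationship on the test data.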
### Stage 6: "Check the performance of the Prediction"
from sklearn.model_selection import cross_val_score
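# NOTE: the next line is the one that raises the ValueError quoted in
# question 1 below.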
scores = cross_val_score(lm, my_train_dataset_X, lm.predict(my_test_dataset_X), cv=10)
print("scores = " + str(scores))
My questions are:
1. Why do I get the error in Stage 6, and how can I fix it?
ValueError Traceback (most recent call last)
<ipython-input-2-700c31f0d410> in <module>()
85 ### test the performance of the model
86 from sklearn.model_selection import cross_val_score
---> 87 scores = cross_val_score(lm, my_train_dataset_X, lm.predict(my_test_dataset_X), cv=10)
88 print("scores = " + str(scores))
89
ValueError: Found input variables with inconsistent numbers of samples: [1460, 1459]
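From the shapes (see my edit for @CalZ below), I suspect the problem: I pass the 1460-row train matrix as X but the 1459-row test predictions as y, and cross_val_score presumably needs X and y with the same number of samples from the same dataset. My untested guess at what Stage 6 should look like, scoring against the known train labels instead:

from sklearn.model_selection import cross_val_score
scores = cross_val_score(lm, my_train_dataset_X, my_train_dataset.SalePrice, cv=10)
print("scores = " + str(scores))

Is that the right way to do it?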
2. Is there anything fundamentally wrong with my simple, basic linear regression approach?
Edit for the comments:
@CalZ - first comment:
my_test_dataset_X.shape = (1459, 36)
my_train_dataset_X.shape = (1460, 36)
@CalZ - second comment: Once I am sure that my approach is not fundamentally flawed, I will look into refactoring the code.