我知道 Python 有 4 种方法。在下文中,我复制了我为回归目的编写的代码。分类将非常相似:
第一:选择KBest:
from sklearn.feature_selection import SelectKBest, f_regression
train_data = train_data.apply(pd.to_numeric).astype('float32')
kb = SelectKBest(score_func=f_regression, k=70)
kb.fit(train_data.loc[:, train_data.columns != 'SalePriceLog'], train_data.SalePriceLog)
indices = np.argsort(kb.scores_)[::-1]
selected_features = []
for i in range(5):
selected_features.append(train_data.columns[indices[i]])
plt.figure()
plt.bar(selected_features, kb.scores_[indices[range(5)]], color='r', align='center')
plt.xticks(rotation=45)
结果:
二:RFE
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.feature_selection import RFE
model = LinearRegression()
rfe = RFE(model, 10)
fit_rfe = rfe.fit(train_data.loc[:, train_data.columns != 'SalePriceLog'], train_data.SalePriceLog)
indices_rfe = np.argsort(fit_rfe.ranking_)
selected_features_rfe = []
for i in range(10):
selected_features_rfe.append(train_data.columns[indices_rfe[i]])
selected_features_rfe
plt.figure()
plt.bar(selected_features_rfe, fit_rfe.ranking_[indices[range(10)]], color='r', align='center')
plt.xticks(rotation=45)
结果:
第三:PCA
from sklearn.decomposition import PCA
# pca = PCA(n_components=5)
pca = PCA(0.999)
fit = pca.fit(train_data.loc[:, train_data.columns != 'SalePriceLog'])
第四:ExtraTrees
from sklearn.ensemble import ExtraTreesRegressor
model_extra_tree = ExtraTreesRegressor()
model_extra_tree.fit(train_data.loc[:, train_data.columns != 'SalePriceLog'], train_data.SalePriceLog)
indices_extra_tree = np.argsort(model_extra_tree.feature_importances_)[::-1]
selected_feature_extra_tree = []
for i in range(10):
selected_feature_extra_tree.append(train_data.columns[indices_extra_tree[i]])
plt.figure
plt.bar(selected_feature_extra_tree, model_extra_tree.feature_importances_[indices_extra_tree[range(10)]])
plt.xticks(rotation=45)
结果: