我使用 Python 和 Weka 在我的数据集(91 个预测变量)上运行特征选择。不同算法给出的特征排名差异很大,而且这些结果与随机森林或梯度提升拟合得到的特征重要性也有很大不同。那么我该如何处理这种差异,或者说我应该信任哪种算法?是否有相应的性能评估方法或经验法则?
# Univariate selection: score each feature independently against the target.
import pandas
import numpy
from sklearn.feature_selection import SelectKBest, chi2

# Configure the selector to keep the 4 best features by chi-squared score.
# NOTE(review): chi2 expects non-negative feature values -- confirm X satisfies this.
test = SelectKBest(k=4, score_func=chi2)
fit = test.fit(X, y)

# Print the per-feature scores, limited to three decimal places.
numpy.set_printoptions(precision=3)
print(fit.scores_)
# Feature extraction with RFE (recursive feature elimination)
from sklearn.feature_selection import RFE
# LogisticRegression was used below without being imported anywhere in view.
from sklearn.linear_model import LogisticRegression

# Wrap a logistic-regression estimator in RFE and keep 15 features.
model = LogisticRegression()
# n_features_to_select is passed by keyword: scikit-learn 1.0+ rejects it as a
# positional argument, and the keyword form also works on older releases.
rfe = RFE(model, n_features_to_select=15)
fit = rfe.fit(X, y)

# Report how many features were kept, the boolean mask of kept features, and
# the ranking (rank 1 marks a selected feature).
print("Num Features: %d" % fit.n_features_)
print("Selected Features: %s" % fit.support_)
print("Feature Ranking: %s" % fit.ranking_)
# VarianceThreshold: drop features whose variance does not exceed the threshold.
from sklearn.feature_selection import VarianceThreshold

# .8 * (1 - .8) is the variance p * (1 - p) of a Bernoulli variable with p = 0.8.
# NOTE(review): this cutoff is only meaningful for boolean (0/1) features --
# confirm it suits the 91 predictors in X.
sel = VarianceThreshold(threshold=(.8 * (1 - .8)))
# Only the fitted state is needed here; the transformed output was discarded
# in the original, so fit() replaces fit_transform().
sel.fit(X)

# Column indices of the features that passed the threshold.
idxs = sel.get_support(indices=True)

# The file imports `numpy` without an alias, so `np` was undefined here.
# The result is bound to a name so that it is usable after this section.
X_high_variance = numpy.array(X)[:, idxs]