import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectFromModel
from scipy.io import arff
# Load the ARFF file and wrap its record array in a DataFrame.
data = arff.loadarff("C:\\Users\\manib\\Desktop\\Python Job\\Project Work\\Breast\\Breast.arff")
df = pd.DataFrame(data[0])
df.head()
df["Class"].value_counts()

# Keep X as a DataFrame (no `.values`): train_test_split preserves the input
# container type, so X_train stays a DataFrame and its column labels are
# still available after feature selection. Converting to a numpy array here
# is what caused "'numpy.ndarray' object has no attribute 'columns'" below.
X = df.iloc[:, :24481]
y = df.iloc[:, -1].values

# Encode the class labels as integers; they are cast to str first so the
# encoder receives a uniform string dtype.
from sklearn import preprocessing
label_encoder = preprocessing.LabelEncoder()
y = y.astype('str')
y = label_encoder.fit_transform(y)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=0)

# Fit a random forest and let SelectFromModel keep the features it retains.
sel = SelectFromModel(RandomForestClassifier(n_estimators=100))
sel.fit(X_train, y_train)
sel.get_support()

# get_support() is a boolean mask over the training columns; index the
# DataFrame's column labels with it to get the names of the kept features.
selected_feat = X_train.columns[sel.get_support()]
len(selected_feat)
print(selected_feat)
AttributeError: 'numpy.ndarray' object has no attribute 'columns'
数据挖掘
scikit-learn
pandas
numpy
2021-10-11 03:24:05
2个回答
问题在于 train_test_split(X, y, ...) 返回的是 numpy 数组,而不是 pandas DataFrame。
numpy 数组没有名为 columns 的属性。
如果您想查看 SelectFromModel 保留了哪些特征,您需要将 X_train(它是一个 numpy.array)替换为 X(它是一个 pandas.DataFrame)。
selected_feat= X.columns[(sel.get_support())]
这将返回特征选择器保留的列列表。
如果您想查看保留了多少个特征,可以运行以下命令:
sel.get_support().sum() # by default this will count 'True' as 1 and 'False' as 0
原因在于这两行:
X = df.iloc[:,:24481].values
y = df.iloc[:, -1].values
你应该删除 .values,或者另外创建 X_col、y_col,像这样:
X_col = df.iloc[:,:24481]
y_col = df.iloc[:, -1]