我正在使用 pandas 和 scikit-learn 做二分类文本分类,文本特征是在 DataFrame 上用 TfidfVectorizer 编码得到的。下面是一段示例代码,说明了我的做法:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import TfidfVectorizer
# Toy dataset: ten short documents, each with an id, its text and a label.
data_dict = {
    'tid': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
    'text': [
        'This is the first.',
        'This is the second.',
        'This is the third.',
        'This is the fourth.',
        'This is the fourth.',
        'This is the fourth.',
        'This is the nintieth.',
        'This is the fourth.',
        'This is the fourth.',
        'This is the first.',
    ],
    # Placeholder binary labels. The snippet selects df[['cat']] below but
    # never defined this column (and never closed the dict literal), so it
    # could not run as posted.
    'cat': [0, 0, 1, 1, 1, 1, 1, 0, 0, 0],
}
df = pd.DataFrame(data_dict)

# Encode the raw text as word-level TF-IDF features.
tfidf = TfidfVectorizer(analyzer='word')
# NOTE(review): fit_transform returns a 2-D sparse matrix (documents x terms),
# not one scalar per row. Storing it in a single DataFrame column leaves
# non-scalar objects in 'text'; this is presumably what the estimator later
# fails to convert to float64. Passing the matrix itself as X (optionally
# stacked with other numeric columns) avoids that.
df['text'] = tfidf.fit_transform(df['text'])

X_train, X_test, y_train, y_test = train_test_split(df[['tid', 'text']], df[['cat']])
clf = LinearSVC()
# This is the call that raises
# "ValueError: setting an array element with a sequence."
clf.fit(X_train, y_train)
运行上面的代码时,会出现如下错误:
Traceback (most recent call last):
File "<ipython-input-151-b0953fbb1d6e>", line 1, in <module>
clf.fit(X, y)
File "C:\Users\Me\AppData\Local\Continuum\anaconda3\lib\site-packages\sklearn\svm\classes.py", line 227, in fit
dtype=np.float64, order="C")
File "C:\Users\Me\AppData\Local\Continuum\anaconda3\lib\site-packages\sklearn\utils\validation.py", line 573, in check_X_y
ensure_min_features, warn_on_dtype, estimator)
File "C:\Users\Me\AppData\Local\Continuum\anaconda3\lib\site-packages\sklearn\utils\validation.py", line 433, in check_array
array = np.array(array, dtype=dtype, order=order, copy=copy)
ValueError: setting an array element with a sequence.