我已经看到了这个、这个和这个问题,但是这些建议似乎都没有解决我的问题(所以我已经恢复了它们)。
我有以下代码:
nlp = spacy.load('en_core_web_sm')
parser = English()
class CleanTextTransformer(TransformerMixin):
def transform(self, X, **transform_params):
return [cleanText(text) for text in X]
def fit(self, X, y=None, **fit_params):
return self
def get_params(self, deep=True):
return {}
def cleanText(text):
text = text.strip().replace("\n", " ").replace("\r", " ")
text = text.lower()
return text
def tokenizeText(sample):
tokens = parser(sample)
lemmas = []
for tok in tokens:
lemmas.append(tok.lemma_.lower().strip() if tok.lemma_ != "-PRON-" else tok.lower_)
tokens = lemmas
tokens = [tok for tok in tokens if tok not in STOPLIST]
tokens = [nlp(tok)[0].lemma_ for tok in tokens if tok not in SYMBOLS]
return tokens
class multilabelbin(TransformerMixin):
def __init__(self, *args, **kwargs):
self.encoder = MultiLabelBinarizer(*args, **kwargs)
def fit(self, x, y=0):
self.encoder.fit(x)
return self
def transform(self, x, y=0):
return self.encoder.transform(x)
def represent(rd, ed, number, category, text):
doc_train = rd
doc_test = ed
for column in category:
doc_train[column] = [tuple(doc.split(",")) for doc in rd[column]]
doc_test[column] = [tuple(doc.split(",")) for doc in ed[column]]
print("columns split")
mlb = multilabelbin(sparse_output=False)
mlb.fit(doc_train)
transformed_r = mlb.transform(doc_train)
for row in range(len(doc_train[column])):
print(doc_train[column][row])
doc_train[column][row] = transformed_r[row]
transformed_e = mlb.transform(doc_test)
for row in range(len(doc_test[column])):
print(doc_test[column][row])
doc_test[column][row] = transformed_e[row]
print("categorical columns encoded using MultiLabelBinarizer()")
for column in number:
ss = StandardScaler()
ss.fit(doc_train[column].values.reshape(-1, 1))
doc_train[column] = ss.transform(doc_train[column].values.reshape(-1, 1))
doc_test[column] = ss.transform(doc_test[column].values.reshape(-1, 1))
print("numbers scaled using StandardScaler()")
for column in text:
cleaner = CleanTextTransformer()
cleaner.fit(doc_train[column].tolist())
doc_train[column] = cleaner.transform(doc_train[column])
doc_test[column] = cleaner.transform(doc_test[column])
print(doc_train[column])
vec = TfidfVectorizer(tokenizer=tokenizeText, ngram_range=(1, 1))
vec.fit(doc_train[column].tolist())
doc_train[column] = vec.transform(doc_train[column]).todense()
doc_test[column] = vec.transform(doc_test[column]).todense()
print(doc_train[column])
print("text vectorized")
print("preprocessing completed successfully")
return doc_train, doc_test
def train_classifier(train_docs, classAxis):
clf = OneVsRestClassifier(LogisticRegression(solver='saga'))
X = [list(train_docs[list(train_docs)[i]]) for i in range(1, len(train_docs))]
y = list(train_docs[classAxis])
classifier = clf.fit(X, y)
return classifier
df = pd.DataFrame(pd.read_csv("testdata.csv", header=0))
test_data = pd.DataFrame(pd.read_csv("test.csv", header=0))
train, test = represent(df, test_data, ["Cat2", "Cat5"], ["Cat6"], ["Cat1", "Cat3", "Cat4", "Cat7"])
print(train, test)
model = train_classifier(train, "Class")
train.csv包含以下格式的数据:
test.csv是相同的格式。
如您所见,有文本值、数字值和分类值。我的代码首先拆分分类值(以逗号分隔),然后通过MultiLabelBinarizer(). 然后,我只是简单地缩放数字。最后,我使用本教程spaCy中的设置处理文本。我确保也对测试数据应用转换,所以那里不会有不一致的地方。最后,我-enise函数中的所有内容,这应该会有所帮助......但它没有。在该行中,我收到以下错误:listtrain_classifierclassifier = clf.fit(list(X), y)
Traceback (most recent call last):
File "<input>", line 1, in <module>
File "C:\Users\User\AppData\Local\JetBrains\Toolbox\apps\PyCharm-P\ch-0\191.7141.48\helpers\pydev\_pydev_bundle\pydev_umd.py", line 197, in runfile
pydev_imports.execfile(filename, global_vars, local_vars) # execute the script
File "C:\Users\User\AppData\Local\JetBrains\Toolbox\apps\PyCharm-P\ch-0\191.7141.48\helpers\pydev\_pydev_imps\_pydev_execfile.py", line 18, in execfile
exec(compile(contents+"\n", file, 'exec'), glob, loc)
File "C:/Users/User/PycharmProjects/ml/ml.py", line 148, in <module>
model = train_classifier(train, "Class")
File "C:/Users/User/PycharmProjects/ml/ml.py", line 124, in train_classifier
classifier = clf.fit(list(X), y)
File "C:\Users\User\AppData\Local\Programs\Python\Python36\lib\site-packages\sklearn\multiclass.py", line 215, in fit
for i, column in enumerate(columns))
File "C:\Users\User\AppData\Local\Programs\Python\Python36\lib\site-packages\sklearn\externals\joblib\parallel.py", line 917, in __call__
if self.dispatch_one_batch(iterator):
File "C:\Users\User\AppData\Local\Programs\Python\Python36\lib\site-packages\sklearn\externals\joblib\parallel.py", line 759, in dispatch_one_batch
self._dispatch(tasks)
File "C:\Users\User\AppData\Local\Programs\Python\Python36\lib\site-packages\sklearn\externals\joblib\parallel.py", line 716, in _dispatch
job = self._backend.apply_async(batch, callback=cb)
File "C:\Users\User\AppData\Local\Programs\Python\Python36\lib\site-packages\sklearn\externals\joblib\_parallel_backends.py", line 182, in apply_async
result = ImmediateResult(func)
File "C:\Users\User\AppData\Local\Programs\Python\Python36\lib\site-packages\sklearn\externals\joblib\_parallel_backends.py", line 549, in __init__
self.results = batch()
File "C:\Users\User\AppData\Local\Programs\Python\Python36\lib\site-packages\sklearn\externals\joblib\parallel.py", line 225, in __call__
for func, args, kwargs in self.items]
File "C:\Users\User\AppData\Local\Programs\Python\Python36\lib\site-packages\sklearn\externals\joblib\parallel.py", line 225, in <listcomp>
for func, args, kwargs in self.items]
File "C:\Users\User\AppData\Local\Programs\Python\Python36\lib\site-packages\sklearn\multiclass.py", line 80, in _fit_binary
estimator.fit(X, y)
File "C:\Users\User\AppData\Local\Programs\Python\Python36\lib\site-packages\sklearn\linear_model\logistic.py", line 1288, in fit
accept_large_sparse=solver != 'liblinear')
File "C:\Users\User\AppData\Local\Programs\Python\Python36\lib\site-packages\sklearn\utils\validation.py", line 756, in check_X_y
estimator=estimator)
File "C:\Users\User\AppData\Local\Programs\Python\Python36\lib\site-packages\sklearn\utils\validation.py", line 527, in check_array
array = np.asarray(array, dtype=dtype, order=order)
File "C:\Users\User\AppData\Local\Programs\Python\Python36\lib\site-packages\numpy\core\numeric.py", line 538, in asarray
return array(a, dtype, copy=False, order=order)
ValueError: setting an array element with a sequence.
我曾尝试通读文档,并且不是回避阅读源代码的人(PyCharm 帮助我查明了错误的根源),但离修复它还差得远。我觉得我诚实地尝试了 Google 前 3 页上的所有内容,但没有成功。
我该如何解决这个错误?为什么会这样?我的预处理错误吗?我知道它在某些地方有点狡猾,但这是否不起作用?如果是这样,我如何在预处理器中解决这些问题?这会修复ValueError: setting an array element with a sequence.错误吗?
一些注意事项:
- 出于某种原因,
spaCy每列中的大多数值似乎都返回 0.0。 - 我不确定我是否可以
MultiLabelVectorizer()像这样将我的输出插入到 DataFrame 中(只是作为 2D 数组) - 可以吗?是否需要更多步骤? - 我尝试过 Pipelines 以获得更多语义代码,以及对不同数据类型使用不同的分类器(例如,对文本使用 Chi^2,对其他类型使用其他东西),但它似乎总是导致无穷无尽的错误。
- 我什至无法确定引发此错误的原因:是列数据、文本数据还是数字数据?我不知道。
