获取“ValueError:使用序列设置数组元素。” 尝试拟合混合类型数据时

数据挖掘 机器学习 Python scikit-学习 向量空间模型
2022-03-05 22:55:04

我已经看到了这个这个这个问题,但是这些建议似乎都没有解决我的问题(所以我已经恢复了它们)。

我有以下代码:

nlp = spacy.load('en_core_web_sm')
parser = English()

class CleanTextTransformer(TransformerMixin):
    def transform(self, X, **transform_params):
        return [cleanText(text) for text in X]

    def fit(self, X, y=None, **fit_params):
        return self

    def get_params(self, deep=True):
        return {}


def cleanText(text):
    text = text.strip().replace("\n", " ").replace("\r", " ")
    text = text.lower()
    return text


def tokenizeText(sample):
    tokens = parser(sample)
    lemmas = []
    for tok in tokens:
        lemmas.append(tok.lemma_.lower().strip() if tok.lemma_ != "-PRON-" else tok.lower_)
    tokens = lemmas
    tokens = [tok for tok in tokens if tok not in STOPLIST]
    tokens = [nlp(tok)[0].lemma_ for tok in tokens if tok not in SYMBOLS]
    return tokens

class multilabelbin(TransformerMixin):
    def __init__(self, *args, **kwargs):
        self.encoder = MultiLabelBinarizer(*args, **kwargs)

    def fit(self, x, y=0):
        self.encoder.fit(x)
        return self

    def transform(self, x, y=0):
        return self.encoder.transform(x)


def represent(rd, ed, number, category, text):
    doc_train = rd
    doc_test = ed

    for column in category:
        doc_train[column] = [tuple(doc.split(",")) for doc in rd[column]]
        doc_test[column] = [tuple(doc.split(",")) for doc in ed[column]]

        print("columns split")

        mlb = multilabelbin(sparse_output=False)
        mlb.fit(doc_train)

        transformed_r = mlb.transform(doc_train)
        for row in range(len(doc_train[column])):
            print(doc_train[column][row])
            doc_train[column][row] = transformed_r[row]

        transformed_e = mlb.transform(doc_test)
        for row in range(len(doc_test[column])):
            print(doc_test[column][row])
            doc_test[column][row] = transformed_e[row]

        print("categorical columns encoded using MultiLabelBinarizer()")

    for column in number:
        ss = StandardScaler()
        ss.fit(doc_train[column].values.reshape(-1, 1))

        doc_train[column] = ss.transform(doc_train[column].values.reshape(-1, 1))
        doc_test[column] = ss.transform(doc_test[column].values.reshape(-1, 1))
        print("numbers scaled using StandardScaler()")

    for column in text:
        cleaner = CleanTextTransformer()
        cleaner.fit(doc_train[column].tolist())

        doc_train[column] = cleaner.transform(doc_train[column])
        doc_test[column] = cleaner.transform(doc_test[column])

        print(doc_train[column])

        vec = TfidfVectorizer(tokenizer=tokenizeText, ngram_range=(1, 1))
        vec.fit(doc_train[column].tolist())

        doc_train[column] = vec.transform(doc_train[column]).todense()
        doc_test[column] = vec.transform(doc_test[column]).todense()

        print(doc_train[column])

        print("text vectorized")

    print("preprocessing completed successfully")

    return doc_train, doc_test


def train_classifier(train_docs, classAxis):
    clf = OneVsRestClassifier(LogisticRegression(solver='saga'))

    X = [list(train_docs[list(train_docs)[i]]) for i in range(1, len(train_docs))]
    y = list(train_docs[classAxis])

    classifier = clf.fit(X, y)
    return classifier

df = pd.DataFrame(pd.read_csv("testdata.csv", header=0))
test_data = pd.DataFrame(pd.read_csv("test.csv", header=0))

train, test = represent(df, test_data, ["Cat2", "Cat5"], ["Cat6"], ["Cat1", "Cat3", "Cat4", "Cat7"])

print(train, test)

model = train_classifier(train, "Class")

train.csv包含以下格式的数据:

数据格式

test.csv是相同的格式。

如您所见,有文本值、数字值和分类值。我的代码首先拆分分类值(以逗号分隔),然后通过MultiLabelBinarizer(). 然后,我只是简单地缩放数字。最后,我使用本教程spaCy中的设置处理文本我确保也对测试数据应用转换,所以那里不会有不一致的地方。最后,我-enise函数中的所有内容,这应该会有所帮助......但它没有。在该行中,我收到以下错误:listtrain_classifierclassifier = clf.fit(list(X), y)

Traceback (most recent call last):
  File "<input>", line 1, in <module>
  File "C:\Users\User\AppData\Local\JetBrains\Toolbox\apps\PyCharm-P\ch-0\191.7141.48\helpers\pydev\_pydev_bundle\pydev_umd.py", line 197, in runfile
    pydev_imports.execfile(filename, global_vars, local_vars)  # execute the script
  File "C:\Users\User\AppData\Local\JetBrains\Toolbox\apps\PyCharm-P\ch-0\191.7141.48\helpers\pydev\_pydev_imps\_pydev_execfile.py", line 18, in execfile
    exec(compile(contents+"\n", file, 'exec'), glob, loc)
  File "C:/Users/User/PycharmProjects/ml/ml.py", line 148, in <module>
    model = train_classifier(train, "Class")
  File "C:/Users/User/PycharmProjects/ml/ml.py", line 124, in train_classifier
    classifier = clf.fit(list(X), y)
  File "C:\Users\User\AppData\Local\Programs\Python\Python36\lib\site-packages\sklearn\multiclass.py", line 215, in fit
    for i, column in enumerate(columns))
  File "C:\Users\User\AppData\Local\Programs\Python\Python36\lib\site-packages\sklearn\externals\joblib\parallel.py", line 917, in __call__
    if self.dispatch_one_batch(iterator):
  File "C:\Users\User\AppData\Local\Programs\Python\Python36\lib\site-packages\sklearn\externals\joblib\parallel.py", line 759, in dispatch_one_batch
    self._dispatch(tasks)
  File "C:\Users\User\AppData\Local\Programs\Python\Python36\lib\site-packages\sklearn\externals\joblib\parallel.py", line 716, in _dispatch
    job = self._backend.apply_async(batch, callback=cb)
  File "C:\Users\User\AppData\Local\Programs\Python\Python36\lib\site-packages\sklearn\externals\joblib\_parallel_backends.py", line 182, in apply_async
    result = ImmediateResult(func)
  File "C:\Users\User\AppData\Local\Programs\Python\Python36\lib\site-packages\sklearn\externals\joblib\_parallel_backends.py", line 549, in __init__
    self.results = batch()
  File "C:\Users\User\AppData\Local\Programs\Python\Python36\lib\site-packages\sklearn\externals\joblib\parallel.py", line 225, in __call__
    for func, args, kwargs in self.items]
  File "C:\Users\User\AppData\Local\Programs\Python\Python36\lib\site-packages\sklearn\externals\joblib\parallel.py", line 225, in <listcomp>
    for func, args, kwargs in self.items]
  File "C:\Users\User\AppData\Local\Programs\Python\Python36\lib\site-packages\sklearn\multiclass.py", line 80, in _fit_binary
    estimator.fit(X, y)
  File "C:\Users\User\AppData\Local\Programs\Python\Python36\lib\site-packages\sklearn\linear_model\logistic.py", line 1288, in fit
    accept_large_sparse=solver != 'liblinear')
  File "C:\Users\User\AppData\Local\Programs\Python\Python36\lib\site-packages\sklearn\utils\validation.py", line 756, in check_X_y
    estimator=estimator)
  File "C:\Users\User\AppData\Local\Programs\Python\Python36\lib\site-packages\sklearn\utils\validation.py", line 527, in check_array
    array = np.asarray(array, dtype=dtype, order=order)
  File "C:\Users\User\AppData\Local\Programs\Python\Python36\lib\site-packages\numpy\core\numeric.py", line 538, in asarray
    return array(a, dtype, copy=False, order=order)
ValueError: setting an array element with a sequence.

我曾尝试通读文档,并且不是回避阅读源代码的人(PyCharm 帮助我查明了错误的根源),但离修复它还差得远。我觉得我诚实地尝试了 Google 前 3 页上的所有内容,但没有成功。

我该如何解决这个错误?为什么会这样?我的预处理错误吗?我知道它在某些地方有点狡猾,但这是否不起作用?如果是这样,我如何在预处理器中解决这些问题?这会修复ValueError: setting an array element with a sequence.错误吗?

一些注意事项:

  • 出于某种原因,spaCy每列中的大多数值似乎都返回 0.0。
  • 我不确定我是否可以MultiLabelVectorizer()像这样将我的输出插入到 DataFrame 中(只是作为 2D 数组) - 可以吗?是否需要更多步骤?
  • 我尝试过 Pipelines 以获得更多语义代码,以及对不同数据类型使用不同的分类器(例如,对文本使用 Chi^2,对其他类型使用其他东西),但它似乎总是导致无穷无尽的错误。
  • 我什至无法确定引发此错误的原因:是列数据、文本数据还是数字数据?我不知道。
1个回答

您正在使用的软件包旨在以非常特定的方式工作,并且数据可能不是每个阶段的预期数据。

NumPy 数组必须具有一致的 dtype/datatype。对于必须是数值的机器学习,通常是浮点数。如果尝试将对象类型传递给 scikit-learn,它将不起作用。

Scikit-learn 期望 NumPy 数组作为输入,而不是 Python 列表。如果数据保留 NumPy 数组,则代码更有可能工作。

您当前正在手动循环数据以对其进行转换。如果重构您的代码以主要使用 scikit-learn 管道,它将更加自动化,并且可能有更多信息性错误消息。

由于您拥有异构数据,Feature Union是为 scikit-learn 处理此类数据的最佳实践。