如何在 scikit-learn 管道(Pipeline)的自定义 Transformer 中删除列?

数据挖掘 Python scikit-learn
2021-09-15 16:18:59

我在 scikit-learn 的管道(Pipeline)中使用了一个自定义转换器。这个转换器只是向数据框中添加特征(新列),运行良好。

下面是一个工作示例:

import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.datasets import make_blobs

# Toy data: 300 two-feature samples drawn around a single centre.
# The first 150 rows become the training frame, the remaining rows the test frame.
x, y = make_blobs(n_samples=300, n_features=2, centers=1)
x_train, x_test = (
  pd.DataFrame(data=part, columns=['x1','x2'])
  for part in (x[:150,:], x[150:,:])
)

class myTransformation(object):
  """Pipeline step that appends two columns derived from ``colname``.

  ``transform`` returns a copy of its input with two extra columns:
  ``<colname>_sqre`` (the element-wise square of the source column) and
  ``<colname>_2`` (an exact copy of the source column). The input frame
  itself is left untouched.
  """

  def __init__(self, colname):
    # Label of the column the derived features are computed from.
    self.colname = colname

  def fit(self, dat, y=None):
    """Learn nothing; return ``self`` so the step can be chained."""
    return self

  def transform(self, x):
    """Return a copy of ``x`` extended with the squared and duplicated column."""
    source = self.colname
    out = x.copy()
    # Assigning through .loc with a new label appends the column.
    out.loc[:, "%s_sqre"%source] = out.loc[:, source]**2
    out.loc[:, source+'_2'] = out.loc[:, source]
    return out

# One-step pipeline: derive the extra columns from x2, fit on the training
# frame, then apply the same fitted pipeline to both frames.
makePipe = Pipeline(steps=[
  ('makeTransfo', myTransformation(colname="x2")),
])
fittedPipe = makePipe.fit(x_train)
x_1, x_2 = (fittedPipe.transform(frame) for frame in (x_train, x_test))

现在我希望增加一个功能:删除数据框中内容完全相同的重复列。目前我写了下面这个函数:

def delSameCols(df) :
  """Drop every column whose values duplicate an earlier column of ``df``.

  Columns are compared pairwise by position with ``np.array_equal``. A pair
  is only compared when at least one of the two columns has a non-object
  dtype, so two object-dtype columns are never compared with each other.

  Parameters
  ----------
  df : pandas.DataFrame
    Frame to inspect; it is not modified.

  Returns
  -------
  (pandas.DataFrame, list)
    A new frame without the duplicated columns, and the list of removed
    column labels. The labels are returned in column order, so the result
    is reproducible from run to run (a plain ``list(set(...))`` is not).
  """
  ncols = df.shape[1]
  # Hoisted out of the O(n^2) loop: dtype flags and raw arrays, once per column.
  not_object = [dtype != 'O' for dtype in df.dtypes]
  values = [np.asarray(df.iloc[:, k]) for k in range(ncols)]

  dup_positions = set()
  for i in range(ncols) :
    for j in range(i + 1, ncols) :
      if (not_object[i] or not_object[j]) and np.array_equal(values[i], values[j]) :
        # Always flag the later column so the first occurrence survives.
        dup_positions.add(j)

  # Translate positions to labels in column order, keeping each label once.
  cols = []
  for pos in sorted(dup_positions) :
    label = df.columns[pos]
    if label not in cols :
      cols.append(label)

  print( u'      -%s features removed'%len(cols) )
  return df.drop(cols, axis=1), cols

但我不知道该如何把它用起来:是应该把它作为一个新的步骤加入管道,还是直接合并到现有的转换器中?有人有什么建议吗?

1个回答

我找到了一个令人满意的解决方案,下面是完整的可运行脚本。大家觉得这种做法如何?尤其是创建了一个未在 __init__ 函数中初始化的属性(self.lstRemCols)这一点。

import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.datasets import make_blobs

class myTransfo(object):
  """Feature-adding pipeline step driven by a single column label.

  Given ``colname``, ``transform`` produces a copy of the incoming frame
  with ``<colname>_sqre`` (squared values) followed by ``<colname>_2``
  (unchanged values) appended. ``fit`` has nothing to learn.
  """

  def __init__(self, colname):
    self.colname = colname

  def fit(self, dat, y=None):
    """No-op fit; returns the step itself."""
    return self

  def transform(self, x):
    """Append the two derived columns to a copy of ``x`` and return it."""
    base = self.colname
    result = x.copy()
    values = result.loc[:, base]
    # Order matters for the output layout: squared column first, copy second.
    additions = (
      ("%s_sqre"%base, values**2),
      (base+'_2', values),
    )
    for new_label, new_values in additions:
      result.loc[:, new_label] = new_values
    return result

class removeSameCols(object) :
  """Pipeline step that drops columns found to be duplicates during ``fit``.

  ``fit`` inspects the training frame and stores the labels of duplicated
  columns in ``lstRemCols``. ``transform`` then drops exactly those labels
  from any frame it receives, regardless of whether other columns happen to
  be equal in that frame. The remaining columns keep their original order,
  so downstream steps always see the same feature layout.
  """

  def __init__(self) :
    # Nothing to configure. ``lstRemCols`` is state learned from data, so it
    # is created by fit() rather than here.
    pass

  def _delSameCols(self, df) :
    """Return the labels of columns that duplicate an earlier column.

    Columns are compared pairwise by position with ``np.array_equal``; a
    pair is only compared when at least one of the two columns has a
    non-object dtype. Labels are returned in column order, each one once.
    """
    ncols = df.shape[1]
    # Hoisted out of the O(n^2) loop: dtype flags and raw arrays, once per column.
    not_object = [dtype != 'O' for dtype in df.dtypes]
    values = [np.asarray(df.iloc[:, k]) for k in range(ncols)]

    dup_positions = set()
    for i in range(ncols) :
      for j in range(i + 1, ncols) :
        if (not_object[i] or not_object[j]) and np.array_equal(values[i], values[j]) :
          # Flag the later column so the first occurrence is the one kept.
          dup_positions.add(j)

    cols = []
    for pos in sorted(dup_positions) :
      label = df.columns[pos]
      if label not in cols :
        cols.append(label)

    print( u'      - %s features to be removed'%len(cols) )
    return cols

  def transform(self, x) :
    """Return a new frame holding the columns of ``x`` not flagged by ``fit``.

    Column order of ``x`` is preserved. Raises ``AttributeError`` when
    called before ``fit``.
    """
    if not hasattr(self, 'lstRemCols') :
      raise AttributeError(
        "removeSameCols is not fitted yet: call fit() before transform()")
    removed = set(self.lstRemCols)
    # An ordered list (not a set difference) keeps the feature layout stable.
    kept = [label for label in x.columns if label not in removed]
    return x.loc[:, kept]

  def fit(self, x, y=None) :
    """Record the duplicated columns of ``x`` in ``lstRemCols``; return self."""
    # _delSameCols only reads the frame, so no defensive copy is needed.
    self.lstRemCols = self._delSameCols(x)
    return self


# Five-feature toy data, split into two 150-row frames.
x, y = make_blobs(n_samples=300, n_features=5)
x_train, x_test = (
  pd.DataFrame(data=part, columns=['x1','x2','x3','x4','x5'])
  for part in (x[:150,:], x[150:,:])
)

# Step 1 adds columns derived from x2; step 2 removes the columns that were
# found to be duplicates when the pipeline was fitted on the training frame.
makePipe2 = Pipeline(steps=[
  ('makeCols', myTransfo(colname="x2")),
  ('remCols', removeSameCols()),
])
makePipe2.fit(x_train)
x_1 = makePipe2.transform(x_train)

# Check that only columns duplicated in x_train are removed: make x4 equal to
# x5 in the test frame alone and transform it with the already-fitted pipeline.
x_test['x4'] = x_test['x5']
x_2 = makePipe2.transform(x_test)