数据挖掘 - 如何使用 pandas 执行 PCA（主成分分析） - 吾爱随笔录

这是我的数据集：

    A1        A2   A3  Class
3    1  0.440671  2.3      1
14   1  0.868410  1.5      1
29   1  0.587487  2.6      1
34   1  0.652936  3.0      1
45   1  0.953230  3.0      1
..  ..       ...  ...    ...
92   2  0.775842  2.3      2
95   2  0.844920  2.2      2
96   2  0.428071  2.2      2
97   2  0.356044  2.2      3
99   2  0.815400  2.2      3

我的目标是对数据集应用主成分分析，将其转换为新数据集，并保存新数据集。我创建了一个示例程序，对三个向量执行此操作：

import numpy as np
from numpy import linalg as LA

# Three sample variables with five observations each.
A = [45,37,42,35,39]
B = [38,31,26,28,33]
C = [10,15,17,21,12]

# np.array takes ONE sequence; passing A, B, C as separate positional
# arguments would make B the dtype argument and raise a TypeError.
# Rows are variables, columns are observations.
dataset = np.array([A, B, C])

# Scale every variable (row) to unit Euclidean length.
data = dataset / np.linalg.norm(dataset, axis=1, keepdims=True)

# determine covariance (np.cov treats each row as one variable)
covMatrix = np.cov(data,bias=True)
print (covMatrix)

# compute eigenvectors and eigenvalues
# The covariance matrix is symmetric, so eigh is the appropriate solver:
# it returns real eigenvalues and orthonormal eigenvectors.
w, v = LA.eigh(covMatrix)

# eigh returns eigenvalues in ascending order; rank them in descending order
# so the most significant component comes first. The eigenvectors are the
# COLUMNS of v, so the columns are reordered to match.
order = np.argsort(w)[::-1]
w = w[order]
v = v[:, order]

print("eigen vectors")
print(v)

print("eigen values")
print(w)

# calculate variances: share of the total variance carried by each component
varianceV = w / w.sum()

for idx, ratio in enumerate(varianceV, start=1):
    print(f' variance of v{idx} : {ratio}')

# calculate feature vector: the eigenvector (column of v) whose component
# explains the largest share of the variance
featureVector = v[:, np.argmax(varianceV)]

print(f'feature vector: {featureVector}')

# Final PCA step: project the mean-centred data onto the feature vector.
# This is a dot product, not a concatenation; the result holds one value
# per observation.
centered = data - data.mean(axis=1, keepdims=True)
resolved_dataset = featureVector @ centered
print(f'dataset = {resolved_dataset}')

我正在尝试参照上面的示例，来补全（替换）下面这个方法的实现：

def pca(s):
    """Standardize ``s``, fit a PCA on it, report the fit, and project the data.

    Args:
        s: Table of numeric columns; it must provide ``mean()``, ``std()``,
            ``shape`` and ``columns`` (a pandas DataFrame does).

    Returns:
        The standardized data projected onto the fitted components, exactly
        as returned by ``PCA.transform``.
    """
    # Column-wise z-score: subtract each column's mean, divide by its std.
    centered = s - s.mean()
    s_normalized = centered / s.std()

    # Keep as many components as there are columns.
    model = PCA(n_components=s.shape[1])
    model.fit(s_normalized)

    # Loadings table: one row per original column, one column per component.
    component_labels = [f'PC{i}' for i in range(s_normalized.shape[1])]
    loadings = pd.DataFrame(
        model.components_.T,
        columns=component_labels,
        index=s.columns,
    )
    print(loadings)

    # Plot the explained-variance ratio of every component.
    plot.plot(model.explained_variance_ratio_)
    plot.ylabel('Explained Variance')
    plot.xlabel('Components')
    plot.show()

    # Project the standardized data onto the fitted components.
    return model.transform(s_normalized)

最小的可重现示例

数据

A1,A2,A3,Class
2,0.4631338,1.5,3
8,0.7460648,3.0,3
6,0.264391038,2.5,2
5,0.4406713,2.3,1
2,0.410438159,1.5,3
2,0.302901816,1.5,2
6,0.275869396,2.5,3
8,0.084782428,3.0,3
2,0.53226533,1.5,2
8,0.070034818,2.9,1
2,0.668631847,1.5,2
2,0.215622639,1.5,2
2,0.148916231,1.5,3
2,0.51335434,1.5,3

程序


from numpy.core.defchararray import count
import pandas as pd
import numpy as np
import numpy as np
from math import ceil, floor, log2
from sklearn.decomposition import PCA
import matplotlib.pyplot as plot

def print_full(x):
    """Print ``x`` with the pandas row limit raised to ``len(x)``.

    The ``display.max_rows`` option is overridden only while printing and is
    then restored to whatever value it had before the call, even if printing
    raises.

    Args:
        x: Object to print; it must support ``len()``.
    """
    # option_context restores the caller's previous setting on exit, instead
    # of resetting the option to the pandas default.
    with pd.option_context('display.max_rows', len(x)):
        print(x)

def main():
    """Read ``A1-dm.csv`` and pass its contents through ``pca``."""
    frame = pd.read_csv('A1-dm.csv')
    # The projected data is only bound to a local name here; it is neither
    # returned nor written anywhere.
    frame = pca(frame)


def pca(s):
    """Standardize ``s``, fit a PCA on it, report the fit, and project the data.

    Args:
        s: Table of numeric columns; it must provide ``mean()``, ``std()``,
            ``shape`` and ``columns`` (a pandas DataFrame does).

    Returns:
        The standardized data projected onto the fitted components, exactly
        as returned by ``PCA.transform``.
    """
    # Column-wise z-score: subtract each column's mean, divide by its std.
    centered = s - s.mean()
    s_normalized = centered / s.std()

    # Keep as many components as there are columns.
    model = PCA(n_components=s.shape[1])
    model.fit(s_normalized)

    # Loadings table: one row per original column, one column per component.
    component_labels = [f'PC{i}' for i in range(s_normalized.shape[1])]
    loadings = pd.DataFrame(
        model.components_.T,
        columns=component_labels,
        index=s.columns,
    )
    print(loadings)

    # Plot the explained-variance ratio of every component.
    plot.plot(model.explained_variance_ratio_)
    plot.ylabel('Explained Variance')
    plot.xlabel('Components')
    plot.show()

    # Project the standardized data onto the fitted components.
    return model.transform(s_normalized)
    
# Run the pipeline only when this file is executed as a script, so that
# importing it does not trigger file I/O and plotting.
if __name__ == "__main__":
    main()

我遇到的问题出在最后一步：不知道如何将数据集与特征向量相乘，从而得到转换后的新数据集。