我正在使用 TSNE 来可视化我的集群,但输出看起来有点奇怪。应该有 3 个集群,但实际上有 4 行。我如何可视化它们有什么问题还是kmeans方法本身有问题?
import pandas as pd
import numpy as np
import ast
from sklearn import metrics
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
from sklearn.decomposition import TruncatedSVD
colNames = ['unixTime', 'sampleAmount','Time','samplingRate', 'Data']
data = pd.read_csv("project_fan.csv", sep = ';', error_bad_lines = False, names = colNames)
# changing data into list
data['Data'] = data.Data.transform(ast.literal_eval)
# Selecting the average value from the list and replacing the list with it
data['Data'] = data.Data.apply(np.mean)
kmeanModel = KMeans(n_clusters = 3)
kmeanModel.fit(data)
y = kmeanModel.labels_
X_train, X_test, y_train, y_test = train_test_split(data, y, test_size = 0.2, random_state = 1)
k = 3
tfs_reduced = TruncatedSVD(n_components=k, random_state=0).fit_transform(data)
tfs_embedded = TSNE(n_components=2, perplexity=40, verbose=2).fit_transform(tfs_reduced)
fig = plt.figure(figsize = (10, 10))
ax = plt.axes()
plt.scatter(tfs_embedded[:, 0], tfs_embedded[:, 1], marker = "x", c = km.labels_)
plt.show()
样本数据集:
unixTime sampleAmount Time samplingRate Data
0 1.556891e+09 16384 340 48188.235294 1620.242170
1 1.556891e+09 16384 341 48046.920821 1620.237716
2 1.556891e+09 16384 340 48188.235294 1620.236340
3 1.556891e+09 16384 340 48188.235294 1620.229289
4 1.556891e+09 16384 340 48188.235294 1620.227541
