Sklearn: unsupervised k-NN vs. k-means
data-mining
machine-learning
scikit-learn
k-means
k-nn
2021-10-01 18:17:38
1 Answer
Unsupervised k-NN
Unlike k-means, unsupervised k-NN does not associate labels with instances. All it can do is tell you which instances in the training data are closest to the point you query.
For example:
import numpy as np
from sklearn.neighbors import NearestNeighbors

samples = [[0, 0, 2], [1, 0, 0], [0, 0, 1]]

# recent scikit-learn versions require keyword arguments here
neigh = NearestNeighbors(n_neighbors=2, radius=0.4)
neigh.fit(samples)

neigh.kneighbors([[0, 0, 1.3]], 2, return_distance=False)
array([[2, 0]]...)
You can see that this returns the indices of the k nearest points, not labels.
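If your data does happen to come with labels, you can map the returned indices back to them yourself. A minimal sketch building on the snippet above (the labels list is made up for illustration):

labels = ['a', 'b', 'a']  # hypothetical labels, one per sample above

# return_distance=True gives the distances alongside the indices
dist, ind = neigh.kneighbors([[0, 0, 1.3]], 2, return_distance=True)
print(dist)                         # [[0.3 0.7]]
print([labels[i] for i in ind[0]])  # ['a', 'a']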
k-means
This algorithm is completely different. Here, k denotes the number of classes you assume are present in the dataset. For example, if you have unlabeled pictures of red and green apples, you know k = 2. The algorithm then moves the centroids (the means of the cluster distributions) until it reaches a stable solution.
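Scikit-learn ships this as sklearn.cluster.KMeans; a minimal sketch on made-up toy data (two obvious clusters, so k = 2):

import numpy as np
from sklearn.cluster import KMeans

X = np.array([[0, 0], [0.2, 0.1], [5, 5], [5.1, 4.9]])  # toy data for illustration
km = KMeans(n_clusters=2, n_init=10, random_state=0).fit(X)
print(km.labels_)           # one cluster index per sample, e.g. [1 1 0 0]
print(km.cluster_centers_)  # the learned centroids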
And here is a from-scratch example:
First, let's generate some artificial Gaussian-distributed data.
import numpy as np
import matplotlib.pyplot as plt

# each entry is [[mean, std], [mean, std]] for the two dimensions of one cluster
params = [[[ 0, 1], [ 0, 1]],
          [[ 5, 1], [ 5, 1]],
          [[-2, 5], [ 2, 5]],
          [[ 2, 1], [ 2, 1]],
          [[-5, 1], [-5, 1]]]

n = 300  # samples per cluster
dims = len(params[0])

data = []
y = []
for ix, i in enumerate(params):
    # standard normal samples, scaled and shifted per dimension
    inst = np.random.randn(n, dims)
    for dim in range(dims):
        inst[:, dim] = params[ix][dim][0] + params[ix][dim][1] * inst[:, dim]
    label = ix + np.zeros(n)

    if len(data) == 0: data = inst
    else: data = np.append(data, inst, axis=0)
    if len(y) == 0: y = label
    else: y = np.append(y, label)

num_clusters = len(params)

print(y.shape)
print(data.shape)
(1500,)
(1500, 2)
plt.scatter(data[:,0], data[:,1])
plt.show()
The k-means algorithm from scratch
class Kmeans(object):
    def __init__(self, k=1):
        self.k = k

    def train(self, data, verbose=1):
        shape = data.shape

        # find the min/max range of each dimension
        ranges = np.zeros((shape[1], 2))
        for dim in range(shape[1]):
            ranges[dim, 0] = np.min(data[:, dim])
            ranges[dim, 1] = np.max(data[:, dim])
        if verbose == 1:
            print('Ranges: ')
            print(ranges)

        # initialize k centroids uniformly at random within the data ranges
        centroids = np.zeros((self.k, shape[1]))
        for i in range(self.k):
            for dim in range(shape[1]):
                centroids[i, dim] = np.random.uniform(ranges[dim, 0], ranges[dim, 1], 1)
        if verbose == 1:
            print('Centroids: ')
            print(centroids)
            plt.scatter(data[:, 0], data[:, 1])
            plt.scatter(centroids[:, 0], centroids[:, 1], c='r')
            plt.show()

        count = 0
        while count < 100:
            count += 1
            if verbose == 1:
                print('-----------------------------------------------')
                print('Iteration: ', count)

            # assignment step: label each point with its nearest centroid
            distances = np.zeros((shape[0], self.k))
            for ix, i in enumerate(data):
                for ic, c in enumerate(centroids):
                    distances[ix, ic] = np.sqrt(np.sum((i - c)**2))
            labels = np.argmin(distances, axis=1)

            # update step: move each centroid to the mean of its cluster
            new_centroids = np.zeros((self.k, shape[1]))
            for centroid in range(self.k):
                temp = data[labels == centroid]
                if len(temp) == 0:
                    return 0  # a cluster went empty; signal failure
                for dim in range(shape[1]):
                    new_centroids[centroid, dim] = np.mean(temp[:, dim])

            if verbose == 1:
                plt.scatter(data[:, 0], data[:, 1], c=labels)
                plt.scatter(new_centroids[:, 0], new_centroids[:, 1], c='r')
                plt.show()

            # stop once the centroids no longer move
            if np.linalg.norm(new_centroids - centroids) < np.finfo(float).eps:
                print("DONE!")
                break

            centroids = new_centroids

        self.centroids = centroids
        self.labels = labels
        if verbose == 1:
            print(labels)
            print(centroids)
        return 1

    def getAverageDistance(self, data):
        # mean distance from each centroid to the points assigned to it
        dists = np.zeros((len(self.centroids),))
        for ix, centroid in enumerate(self.centroids):
            temp = data[self.labels == ix]
            dist = 0
            for i in temp:
                dist += np.linalg.norm(i - centroid)
            dists[ix] = dist / len(temp)
        return dists

    def getLabels(self):
        return self.labels
And the results:
kmeans = Kmeans(5)
kmeans.train(data)
Ranges: 
[[-15.42553872  14.88894099]
 [-13.33192554  16.15415347]]
Centroids: 
[[-11.39200726 -10.71208054]
 [  3.73634888  -8.9230959 ]
 [  6.17589734 -10.66376228]
 [  0.78973744  -0.44245535]
 [  9.29524253   9.59127574]]
Initialization; the red dots are the random centroids
Iteration 1
After a few iterations
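The class above stops at training; to label new points afterwards, you would assign each one to its nearest learned centroid. A hypothetical helper sketch (predict is not part of the class above):

def predict(kmeans, points):
    # assign each point to the index of the nearest learned centroid
    points = np.atleast_2d(points)
    dists = np.linalg.norm(points[:, None, :] - kmeans.centroids[None, :, :], axis=2)
    return np.argmin(dists, axis=1)

print(predict(kmeans, [[0, 0], [5, 5]]))  # cluster indices for two new points
print(kmeans.getAverageDistance(data))    # mean within-cluster distance per centroid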