[初学者]
如果这是一个愚蠢的问题,先说声抱歉。
我正在按照一篇文章的做法,使用下面的 k-medoids 聚类模型。
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from sklearn import datasets
from sklearn.decomposition import PCA
# Load the iris measurements into a frame; keep class names and class ids.
iris = datasets.load_iris()
labels = iris.target
target = iris.target_names
data = pd.DataFrame(iris.data, columns=iris.feature_names)

# Rescale every feature with MinMaxScaler, keeping the column names.
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(data)
data = pd.DataFrame(scaled_values, columns=data.columns)

# Project the scaled features onto three principal components.
pca = PCA(n_components=3)
principalComponents = pca.fit_transform(data)
component_names = ['principal component {}'.format(n) for n in (1, 2, 3)]
PCAdf = pd.DataFrame(data=principalComponents, columns=component_names)

# Plain array of the projected points, its dimensions, and the cluster count.
datapoints = PCAdf.values
m, f = datapoints.shape
k = 3
# Visualization: 3-D scatter of the three principal components, colored by
# the `labels` array.
fig = plt.figure(1, figsize=(8, 6))
# Create the 3-D axes through the figure so they are registered with it;
# constructing Axes3D(fig, ...) directly leaves the figure empty on recent
# Matplotlib releases.
ax = fig.add_subplot(111, projection='3d')
ax.view_init(elev=-150, azim=110)
X_reduced = datapoints
ax.scatter(X_reduced[:, 0], X_reduced[:, 1], X_reduced[:, 2], c=labels,
           cmap=plt.cm.Set1, edgecolor='k', s=40)
ax.set_title("First three PCA directions")
# Each axis is labeled with the component it actually shows (all three were
# previously labeled "principal component 1"). Tick labels are hidden via the
# standard xaxis/yaxis/zaxis attributes rather than the w_* aliases, which
# are not available on recent Matplotlib releases.
ax.set_xlabel("principal component 1")
ax.xaxis.set_ticklabels([])
ax.set_ylabel("principal component 2")
ax.yaxis.set_ticklabels([])
ax.set_zlabel("principal component 3")
ax.zaxis.set_ticklabels([])
plt.show()
def init_medoids(X, k):
    """Return k distinct rows of X to use as the starting medoids.

    The global NumPy random generator is seeded with 1 before drawing, so
    the same rows are selected on every call for a given len(X) and k.
    """
    np.random.seed(1)
    row_ids = np.random.choice(len(X), size=k, replace=False)
    return X[row_ids, :]
# Draw three starting medoids from the PCA-projected points.
medoids_initial = init_medoids(X=datapoints, k=3)
def compute_d_p(X, medoids, p):
    """Return the dissimilarity of every row of X to every medoid.

    Entry [i, j] of the result is the p-norm of (X[i] - medoids[j]) raised
    to the power p. `medoids` may be a 2-D array (one medoid per row) or a
    single 1-D medoid, which is treated as a one-row 2-D array. The result
    has shape (len(X), number of medoids).
    """
    n_points = len(X)
    # A lone medoid given as a 1-D vector becomes a single-row matrix.
    if medoids.ndim == 1:
        medoids = medoids[np.newaxis, :]
    n_medoids = len(medoids)
    dissimilarity = np.empty((n_points, n_medoids))
    for row in range(n_points):
        offsets = X[row, :] - medoids
        dissimilarity[row, :] = np.linalg.norm(offsets, ord=p, axis=1) ** p
    return dissimilarity
# Dissimilarity (p = 2) of every projected point to each initial medoid.
S = compute_d_p(X=datapoints, medoids=medoids_initial, p=2)
def assign_labels(S):
    """Return, for each row of S, the column index holding its smallest value."""
    nearest = np.argmin(S, axis=1)
    return nearest
# Store the assignment for the initial medoids under its own name. Reusing
# `labels` here would overwrite the iris.target array assigned earlier in the
# script, which is later passed to count_matches as the reference labeling.
initial_labels = assign_labels(S)
def update_medoids(X, medoids, p):
    """Return medoids improved by one update step.

    Every row of X is assigned to its nearest medoid. Then, for each
    non-empty cluster, the member with the smallest total dissimilarity to
    the other members of that cluster replaces the current medoid, but only
    when it is strictly better than the current one.

    Parameters:
        X: 2-D array of data points, one per row.
        medoids: 2-D array of current medoids, one per row.
        p: order of the norm used by compute_d_p.

    Returns:
        A new array with the same shape as `medoids`; the input array is
        left unmodified.
    """
    labels = assign_labels(compute_d_p(X, medoids, p))
    # Copy so the caller's array is not changed as a side effect.
    out_medoids = medoids.copy()
    for i in set(labels):
        cluster_points = X[labels == i]
        # Cost of keeping the current medoid, measured over the cluster's
        # own members (not over the whole data set).
        best_cost = np.sum(compute_d_p(cluster_points, medoids[i], p))
        # Column j of the pairwise matrix holds the dissimilarity of every
        # member to member j, so the column sums are the candidate costs.
        candidate_costs = compute_d_p(cluster_points, cluster_points, p).sum(axis=0)
        best = np.argmin(candidate_costs)
        if candidate_costs[best] < best_cost:
            out_medoids[i] = cluster_points[best]
    return out_medoids
def has_converged(old_medoids, medoids):
    """Return True when both collections contain the same medoid rows.

    Rows are compared as tuples inside sets, so ordering and duplicates are
    ignored.
    """
    previous = {tuple(row) for row in old_medoids}
    current = {tuple(row) for row in medoids}
    return previous == current
#Full algorithm
def kmedoids(X, k, p, starting_medoids=None, max_steps=np.inf):
    """Cluster the rows of X around k medoids.

    Parameters:
        X: 2-D array of data points, one per row.
        k: number of medoids to draw when `starting_medoids` is None.
        p: order of the norm used for the dissimilarity.
        starting_medoids: optional 2-D array of initial medoids; it is
            copied and never modified.
        max_steps: maximum number of update steps to run.

    Returns:
        A tuple (medoids, labels). `labels` is always the assignment of each
        row of X to the returned medoids, including when the loop stops
        because `max_steps` was reached.
    """
    if starting_medoids is None:
        medoids = init_medoids(X, k)
    else:
        # Work on a private copy in case the update step writes in place.
        medoids = np.array(starting_medoids, copy=True)
    converged = False
    step = 1
    while (not converged) and (step <= max_steps):
        old_medoids = medoids.copy()
        medoids = update_medoids(X, medoids, p)
        converged = has_converged(old_medoids, medoids)
        step += 1
    # Label against the final medoids so the two returned values agree.
    labels = assign_labels(compute_d_p(X, medoids, p))
    return (medoids, labels)
# Run the clustering with k = 3 and p = 2, then keep the final medoids and
# store each point's cluster id in a new 'clusters' column.
results = kmedoids(datapoints, 3, 2)
final_medoids, data['clusters'] = results[0], results[1]
#Count
def mark_matches(a, b, exact=False):
    """
    Given two arrays of {0, 1, 2} labels, returns a boolean array that is
    True at the locations where the two inputs carry the same label.

    This function can consider "inexact" matches. That is, if `exact`
    is False, the labels are regarded as the same up to a renaming
    (permutation) of the label ids: `b` is relabeled with every
    permutation of {0, 1, 2} and the relabeling that agrees with `a` in
    the most positions is used. Ties keep the earliest permutation, so
    the identity labeling wins a tie. This allows

      a == [0, 0, 1, 1, 0, 1, 1]
      b == [1, 1, 0, 0, 1, 0, 0]

    to be regarded as equal. (That is, use `exact=False` when you
    only care about "relative" labeling.)

    Raises AssertionError when the shapes differ or a label falls outside
    {0, 1, 2}.
    """
    from itertools import permutations

    a_int = np.asarray(a).astype(dtype=int)
    b_int = np.asarray(b).astype(dtype=int)
    assert a_int.shape == b_int.shape
    assert ((a_int == 0) | (a_int == 1) | (a_int == 2)).all()
    assert ((b_int == 0) | (b_int == 1) | (b_int == 2)).all()

    exact_matches = (a_int == b_int)
    if exact:
        return exact_matches

    # Simply inverting the mask is only valid for two labels. With three,
    # search all relabelings of b; the first permutation is the identity.
    best_matches = exact_matches
    best_count = np.sum(exact_matches)
    for mapping in permutations(range(3)):
        relabeled = np.array(mapping)[b_int]
        candidate = (a_int == relabeled)
        candidate_count = np.sum(candidate)
        if candidate_count > best_count:
            best_matches, best_count = candidate, candidate_count
    return best_matches
def count_matches(a, b, exact=False):
    """
    Given two sets of labels, returns the number of positions at which
    mark_matches reports a match.

    The `exact` flag is forwarded to mark_matches unchanged: with
    `exact=False` the labels may be regarded as similar up to a swapping
    of the labels, so that

      a == [0, 0, 1, 1, 0, 1, 1]
      b == [1, 1, 0, 0, 1, 0, 0]

    count as equal. (That is, use `exact=False` when you only care about
    "relative" labeling.)
    """
    return np.sum(mark_matches(a, b, exact=exact))
# Report how many points received a cluster id that agrees with `labels`.
n_matches = count_matches(labels, data['clusters'])
match_percent = 100.0 * n_matches / len(labels)
print(n_matches, "matches out of", len(data), "data points",
      "(~ {:.1f}%)".format(match_percent))
训练完成后,如何保存上述模型,以便每次向数据集中添加尚未分配簇的新记录时,不必重新运行上述全部代码?
我还在本地计算机上简化了代码:注释掉了所有可视化部分以及 `#Count` 之后的所有内容,仍然能够在我的数据集上得到簇分配结果。只是不想每次获得新记录时都重新运行上述代码。
我知道可以用 Keras/TensorFlow 在训练后保存和加载模型,但不确定是否只能借助这些工具才能实现我想做的事。