如何从无监督学习中保存和加载模型?

数据挖掘 聚类 机器学习模型
2022-02-19 07:06:49

[初学者]

对不起,如果这是一个愚蠢的问题。

我正在按照这篇文章操作,使用的模型代码如下。

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from sklearn import datasets
from sklearn.decomposition import PCA

# Dataset: load iris into a DataFrame with one column per feature.
iris = datasets.load_iris()
data = pd.DataFrame(iris.data, columns=iris.feature_names)

# Class names and the per-sample class index shipped with the dataset.
target = iris.target_names
labels = iris.target

# Scaling: rescale every feature column with MinMaxScaler, keeping the
# original column names on the resulting DataFrame.
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(data)
data = pd.DataFrame(scaled_values, columns=data.columns)

# PCA transformation: project the scaled features onto three components.
pca = PCA(n_components=3)
principalComponents = pca.fit_transform(data)
pca_columns = ['principal component 1', 'principal component 2','principal component 3']
PCAdf = pd.DataFrame(data=principalComponents, columns=pca_columns)

# Plain NumPy view of the projected points, its dimensions (rows, columns),
# and the number of clusters.
datapoints = PCAdf.values
m, f = datapoints.shape
k = 3

#Visualization
# 3-D scatter of the three principal components, coloured by `labels`.
fig = plt.figure(1, figsize=(8, 6))
# Create the 3-D axes through the figure so they are registered with it;
# constructing Axes3D(fig, ...) directly does not attach the axes to the
# figure in current Matplotlib releases, which leaves the plot empty.
ax = fig.add_subplot(111, projection='3d', elev=-150, azim=110)
X_reduced = datapoints
ax.scatter(X_reduced[:, 0], X_reduced[:, 1], X_reduced[:, 2], c=labels,
          cmap=plt.cm.Set1, edgecolor='k', s=40)
ax.set_title("First three PCA directions")
# Each axis is labelled with the component it actually shows; tick labels
# are hidden through the xaxis/yaxis/zaxis attributes (the w_xaxis-style
# aliases are deprecated).
ax.set_xlabel("principal component 1")
ax.xaxis.set_ticklabels([])
ax.set_ylabel("principal component 2")
ax.yaxis.set_ticklabels([])
ax.set_zlabel("principal component 3")
ax.zaxis.set_ticklabels([])
plt.show()

def init_medoids(X, k):
   """Pick ``k`` distinct rows of ``X`` to serve as the initial medoids.

   The draw uses a private ``RandomState`` seeded with 1. The selection is
   therefore reproducible and identical to what seeding the global NumPy
   generator with 1 would produce, but the global ``numpy.random`` state
   of the calling program is left untouched.

   Parameters
   ----------
   X : numpy.ndarray
       2-D array with one data point per row.
   k : int
       Number of medoids to draw.

   Returns
   -------
   numpy.ndarray
       The ``k`` selected rows of ``X`` (a new array, not a view).

   Raises
   ------
   ValueError
       Raised by ``choice`` when ``k`` exceeds ``len(X)``, because rows are
       sampled without replacement.
   """
   from numpy.random import RandomState

   rng = RandomState(1)
   samples = rng.choice(len(X), size=k, replace=False)
   return X[samples, :]

# Draw the three starting medoids from the projected data points.
medoids_initial = init_medoids(X=datapoints, k=3)

def compute_d_p(X, medoids, p):
   """Return the point-to-medoid dissimilarity matrix.

   Entry ``[i, j]`` of the result is the p-norm distance between ``X[i]``
   and ``medoids[j]``, raised to the power ``p``. A 1-D ``medoids`` array
   is treated as a single medoid, giving a one-column result.
   """
   # Promote a single medoid passed as a flat vector to a one-row matrix.
   if len(medoids.shape) == 1:
       medoids = medoids.reshape((1, len(medoids)))

   n_points = len(X)
   n_medoids = len(medoids)
   dissimilarity = np.empty((n_points, n_medoids))

   for row in range(n_points):
       # Broadcasting subtracts the point from every medoid at once.
       offsets = X[row, :] - medoids
       dissimilarity[row, :] = np.linalg.norm(offsets, ord=p, axis=1)**p

   return dissimilarity

# Dissimilarities (p = 2) between every data point and each initial medoid.
S = compute_d_p(datapoints, medoids_initial, p=2)


def assign_labels(S):
   """Return, for every row of ``S``, the column index of its smallest entry.

   When ``S`` is a point-by-medoid dissimilarity matrix, this is the index
   of the closest medoid for each point.
   """
   nearest = np.argmin(S, axis=1)
   return nearest

# Cluster assignment under the initial medoids. It is stored under its own
# name so that `labels` keeps holding the ground-truth classes (iris.target)
# assigned in the dataset section, which the final match count relies on.
initial_labels = assign_labels(S)

def update_medoids(X, medoids, p):
   """Run one medoid-update step and return the updated medoids.

   Every point of ``X`` is assigned to its closest medoid. Then, for each
   non-empty cluster, the member with the smallest total dissimilarity to
   the other members of that cluster replaces the current medoid, provided
   it is strictly better than the current one.

   Parameters
   ----------
   X : numpy.ndarray
       2-D array with one data point per row.
   medoids : array-like
       Current medoids, one per row. The input is not modified.
   p : int or float
       Order of the norm; dissimilarity is the p-norm distance raised to
       the power ``p`` (as computed by ``compute_d_p``).

   Returns
   -------
   numpy.ndarray
       A new float array with the same shape as ``medoids``.
   """
   # Work on a float copy: the caller's array must not change underneath it,
   # and an integer input would otherwise truncate the assigned coordinates.
   medoids = np.array(medoids, dtype=float)
   out_medoids = medoids.copy()

   # Use the data passed in as X, not a module-level dataset.
   labels = assign_labels(compute_d_p(X, medoids, p))

   for i in np.unique(labels):
       cluster_points = X[labels == i]

       # Cost of a medoid is measured against the members of its own
       # cluster only; summing over the whole dataset would pull every
       # medoid towards the global centre of the data.
       best_cost = np.sum(compute_d_p(cluster_points, medoids[i], p))

       for candidate in cluster_points:
           candidate_cost = np.sum(compute_d_p(cluster_points, candidate, p))

           # Strict comparison keeps the current medoid on ties.
           if candidate_cost < best_cost:
               best_cost = candidate_cost
               out_medoids[i] = candidate

   return out_medoids

def has_converged(old_medoids, medoids):
   """Return True when both collections contain the same set of medoids.

   Rows are compared as tuples inside sets, so neither the order of the
   medoids nor repeated rows affect the result.
   """
   previous = {tuple(row) for row in old_medoids}
   current = {tuple(row) for row in medoids}
   return previous == current

#Full algorithm
def kmedoids(X, k, p, starting_medoids=None, max_steps=np.inf):
   """Cluster ``X`` around ``k`` medoids and return ``(medoids, labels)``.

   Parameters
   ----------
   X : numpy.ndarray
       2-D array with one data point per row.
   k : int
       Number of clusters; only used to draw initial medoids when
       ``starting_medoids`` is None.
   p : int or float
       Order of the norm passed on to ``compute_d_p``.
   starting_medoids : array-like, optional
       Medoids to start from (for example medoids saved from an earlier
       run). The input is copied and never modified.
   max_steps : int or float, optional
       Maximum number of update steps; unlimited by default.

   Returns
   -------
   tuple
       ``(medoids, labels)`` where ``labels[i]`` is the index of the
       returned medoid closest to ``X[i]``.
   """
   if starting_medoids is None:
       medoids = init_medoids(X, k)
   else:
       # Float copy: later updates must not write into the caller's array.
       medoids = np.array(starting_medoids, dtype=float)

   converged = False
   step = 1
   while (not converged) and (step <= max_steps):
       old_medoids = medoids.copy()
       medoids = update_medoids(X, medoids, p)
       converged = has_converged(old_medoids, medoids)
       step += 1

   # Label against the medoids that are actually returned. Labelling inside
   # the loop (before the update) leaves the labels one step behind whenever
   # max_steps ends the loop before convergence.
   labels = assign_labels(compute_d_p(X, medoids, p))
   return (medoids, labels)

# Run the full algorithm (3 clusters, p = 2); keep the raw result tuple and
# unpack it into the final medoids and a per-row cluster column on `data`.
results = kmedoids(datapoints, 3, 2)
final_medoids, data['clusters'] = results

#Count
def mark_matches(a, b, exact=False):
   """
   Given two arrays of {0, 1, 2} labels, return a boolean NumPy array that
   is True at the positions where the two labelings agree.

   With ``exact=True`` the labels are compared as they are.

   With ``exact=False`` the labels are compared up to a renaming: every
   permutation of the label values {0, 1, 2} is applied to ``b`` and the
   one that agrees with ``a`` in the most positions is used. The identity
   wins ties. For two-label inputs this is the same as allowing the labels
   to be swapped, so

     a == [0, 0, 1, 1, 0, 1, 1]
     b == [1, 1, 0, 0, 1, 0, 0]

   are regarded as equal. (That is, use ``exact=False`` when you only care
   about "relative" labeling, e.g. cluster ids versus class ids.)

   Raises ``AssertionError`` if the shapes differ or a label is outside
   {0, 1, 2} after conversion to int.
   """
   from itertools import permutations

   # np.asarray lets pandas Series and lists through as well as ndarrays.
   a_int = np.asarray(a).astype(dtype=int)
   b_int = np.asarray(b).astype(dtype=int)
   assert a_int.shape == b_int.shape
   assert ((a_int == 0) | (a_int == 1) | (a_int == 2)).all()
   assert ((b_int == 0) | (b_int == 1) | (b_int == 2)).all()

   exact_matches = (a_int == b_int)
   if exact:
       return exact_matches

   # Inverting the mask is only a valid "relabeling" for two labels; with
   # three labels the best renaming has to be searched for (3! = 6 cases).
   best_matches = exact_matches
   best_count = np.sum(exact_matches)
   for perm in permutations(range(3)):
       relabelled = np.array(perm)[b_int]  # perm[v] is the new name of label v
       candidate = (a_int == relabelled)
       count = np.sum(candidate)
       # Strict '>' keeps the earliest best candidate, identity first.
       if count > best_count:
           best_matches, best_count = candidate, count
   return best_matches

def count_matches(a, b, exact=False):
   """
   Return how many positions of the label arrays ``a`` and ``b`` are
   marked as matching by ``mark_matches``.

   ``exact`` is forwarded unchanged to ``mark_matches``: pass
   ``exact=False`` when only the "relative" labeling matters, i.e. when
   the two arrays may use different names for the same groups, as in

     a == [0, 0, 1, 1, 0, 1, 1]
     b == [1, 1, 0, 0, 1, 0, 0]
   """
   return np.sum(mark_matches(a, b, exact=exact))

# Compare the final clusters with the ground-truth classes taken straight
# from the dataset, rather than through the reusable module-level name
# `labels`, so the count always measures clusters against true classes.
n_matches = count_matches(iris.target, data['clusters'])
# Count and percentage both refer to the same denominator, len(data).
print(n_matches,
     "matches out of",
     len(data), "data points",
     "(~ {:.1f}%)".format(100.0 * n_matches / len(data)))

如何在训练后保存上述模型,这样每次向数据集中添加尚未分配簇的新记录时,就不必重新运行上述全部代码?

我还在本地计算机上简化了代码:注释掉了所有可视化部分以及 `#Count` 之后的全部内容,仍然能够在我的数据集上得到聚类分配。我只是不想每次拿到新记录时都重新运行上述代码。

我知道使用 Keras/TensorFlow 可以在训练后保存和加载模型,但不确定是否只能借助这些工具才能实现我想做的事。

1个回答

我对 k-medoids 不太熟悉,但我想它类似于 k-means,对吧?如果是这样,整个模型中最耗时的部分就是更新 medoids:我们先随机选择初始中心点,然后不断更新它们,以获得更好的聚类结果。

我建议你用 pickle 保存 `final_medoids`。当有新数据时,先计算 PCA,再把结果传给 `kmedoids` 函数,并以保存下来的 `final_medoids` 作为起始 medoids。之后你可以使用后面的函数来计算得分或其他指标。这个做法可能还存在一些问题,但主要思路是保存已经稳定的中心点,这样就不需要花很多时间去更新它们。