数据挖掘 - 如何从无监督学习中保存和加载模型？ - 吾爱随笔录

[初学者]

对不起，如果这是一个愚蠢的问题。

我正在按照这篇文章的做法，实现下面的模型。

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from sklearn import datasets
from sklearn.decomposition import PCA

# Dataset: load iris and wrap the feature matrix in a DataFrame.
iris = datasets.load_iris()
data = pd.DataFrame(iris.data, columns=iris.feature_names)

labels = iris.target
target = iris.target_names

# Scaling: fit a MinMaxScaler on the features and replace `data` with the
# scaled values, keeping the original column names.
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
data = pd.DataFrame(scaler.fit_transform(data), columns=data.columns)

# PCA transformation: keep three components and store them in a DataFrame.
pca = PCA(n_components=3)
principalComponents = pca.fit_transform(data)
PCAdf = pd.DataFrame(
    data=principalComponents,
    columns=['principal component %d' % i for i in (1, 2, 3)],
)

# Plain numpy view of the projected points, plus its row/column counts.
datapoints = PCAdf.values
m, f = datapoints.shape
k = 3  # number of clusters

# Visualization: 3-D scatter of the three principal components, coloured by
# the current contents of `labels`.
fig = plt.figure(1, figsize=(8, 6))
# Create the 3-D axes through the figure so they are actually attached to it;
# constructing Axes3D(fig, ...) directly no longer adds the axes on recent
# matplotlib releases, which leaves the window empty.
ax = fig.add_subplot(111, projection='3d', elev=-150, azim=110)
X_reduced = datapoints
ax.scatter(X_reduced[:, 0], X_reduced[:, 1], X_reduced[:, 2], c=labels,
           cmap=plt.cm.Set1, edgecolor='k', s=40)
ax.set_title("First three PCA directions")
# Each axis is labelled with the component it shows (the y and z labels used
# to repeat "principal component 1"). `xaxis`/`yaxis`/`zaxis` replace the
# deprecated `w_xaxis`/`w_yaxis`/`w_zaxis` aliases.
ax.set_xlabel("principal component 1")
ax.xaxis.set_ticklabels([])
ax.set_ylabel("principal component 2")
ax.yaxis.set_ticklabels([])
ax.set_zlabel("principal component 3")
ax.zaxis.set_ticklabels([])
plt.show()

def init_medoids(X, k, seed=1):
    """Pick `k` distinct rows of `X` to serve as the initial medoids.

    Parameters
    ----------
    X : 2-D numpy array
        One row per data point.
    k : int
        Number of medoids to draw.
    seed : int or None, optional
        Seed for the private random generator. The default of 1 reproduces
        the selection made by seeding the global generator with 1; pass
        ``None`` to get a fresh draw on every call.

    Returns
    -------
    numpy array
        The `k` selected rows of `X` (a copy, not a view).

    Raises
    ------
    ValueError
        If `k` exceeds ``len(X)`` (raised by ``choice`` because sampling is
        done without replacement).
    """
    # A private RandomState gives the same draw as `seed(1); choice(...)` on
    # the global generator, but leaves numpy's global random state untouched
    # for any other code that relies on it.
    rng = np.random.RandomState(seed)
    samples = rng.choice(len(X), size=k, replace=False)
    return X[samples, :]

# Draw the starting medoids from the projected points (k is 3, set above).
medoids_initial = init_medoids(datapoints, k)

def compute_d_p(X, medoids, p):
    """Return the dissimilarity of every point to every medoid.

    The dissimilarity is the p-norm of the difference vector raised to the
    power `p` (for ``p=2`` this is the squared Euclidean distance).

    Parameters
    ----------
    X : 2-D numpy array
        One row per data point.
    medoids : numpy array
        Either a 2-D array with one medoid per row, or a single medoid given
        as a 1-D vector.
    p : number
        Order passed to ``np.linalg.norm`` and exponent applied afterwards.

    Returns
    -------
    numpy array of shape ``(len(X), number of medoids)``
        Entry ``[i, j]`` is the dissimilarity between point `i` and medoid `j`.
    """
    # A lone medoid passed as a flat vector becomes a one-row matrix.
    if medoids.ndim == 1:
        medoids = medoids[np.newaxis, :]

    n_points, n_medoids = len(X), len(medoids)
    S = np.empty((n_points, n_medoids))

    # Fill the result one column (one medoid) at a time.
    for j in range(n_medoids):
        S[:, j] = np.linalg.norm(X - medoids[j], ord=p, axis=1) ** p

    return S

# Dissimilarity (p = 2) of every point to each initial medoid.
S = compute_d_p(datapoints, medoids_initial, p=2)


def assign_labels(S):
    """Return, for each row of `S`, the column index of its smallest entry.

    With `S` holding point-to-medoid dissimilarities (one row per point, one
    column per medoid) this is the index of the nearest medoid per point.
    """
    return np.asarray(S).argmin(axis=1)

# Assign every point to the initial medoid with the smallest dissimilarity.
# NOTE: this rebinds `labels`, which until here held `iris.target`.
labels = assign_labels(S)

def update_medoids(X, medoids, p):
    """Move each medoid to the cheapest member of its own cluster.

    Points in `X` are first assigned to their nearest medoid. For every
    non-empty cluster, the member with the smallest summed dissimilarity to
    the other members of that cluster becomes the new medoid, provided it is
    strictly cheaper than the current medoid. Medoids of empty clusters are
    left as they are.

    Parameters
    ----------
    X : 2-D numpy array
        One row per data point.
    medoids : 2-D numpy array
        Current medoids, one per row. Not modified.
    p : number
        Norm order / exponent forwarded to ``compute_d_p``.

    Returns
    -------
    numpy array
        A new array of the same shape as `medoids` with the updated medoids.
    """
    # Use the `X` argument (not a module-level array) so the function works
    # on whatever data the caller passes in.
    S = compute_d_p(X, medoids, p)
    labels = assign_labels(S)

    # Work on a copy so the caller's medoid array is never changed in place.
    out_medoids = medoids.copy()

    for i in np.unique(labels):
        in_cluster = (labels == i)
        cluster_points = X[in_cluster]

        # Cost of keeping the current medoid: summed dissimilarity to the
        # members of its own cluster only (already available in column i of S).
        best_cost = np.sum(S[in_cluster, i])

        for candidate in cluster_points:
            # Cost of promoting this member, measured on the same cluster.
            candidate_cost = np.sum(compute_d_p(cluster_points, candidate, p))

            if candidate_cost < best_cost:
                best_cost = candidate_cost
                out_medoids[i] = candidate

    return out_medoids

def has_converged(old_medoids, medoids):
    """Return True when both arguments contain the same set of medoid rows.

    Rows are compared as tuples inside sets, so row order and duplicates are
    ignored.
    """
    before = {tuple(row) for row in old_medoids}
    after = {tuple(row) for row in medoids}
    return before == after

#Full algorithm
def kmedoids(X, k, p, starting_medoids=None, max_steps=np.inf):
    """Cluster the rows of `X` around medoids by alternating update steps.

    Parameters
    ----------
    X : 2-D numpy array
        One row per data point.
    k : int
        Number of medoids to draw with ``init_medoids``. Only used when
        `starting_medoids` is None.
    p : number
        Norm order / exponent forwarded to ``compute_d_p``.
    starting_medoids : 2-D numpy array, optional
        Initial medoids, one per row. The array is copied, never modified.
    max_steps : number, optional
        Upper bound on the number of update steps; unlimited by default.

    Returns
    -------
    (medoids, labels) : tuple
        `medoids` is the final medoid array and `labels` holds, for every row
        of `X`, the index of its nearest medoid in `medoids`.
    """
    if starting_medoids is None:
        medoids = init_medoids(X, k)
    else:
        # Copy so the caller's array cannot be altered by the update steps.
        medoids = np.array(starting_medoids)

    converged = False
    step = 1
    while (not converged) and (step <= max_steps):
        old_medoids = medoids.copy()
        medoids = update_medoids(X, medoids, p)
        converged = has_converged(old_medoids, medoids)
        step += 1

    # Label against the medoids that are actually returned. This keeps the
    # two outputs consistent even when the loop stops because of `max_steps`
    # (or never runs at all) rather than because it converged.
    labels = assign_labels(compute_d_p(X, medoids, p))
    return (medoids, labels)

# Run the full algorithm (k clusters, p = 2) and keep both outputs: the final
# medoids and the per-point cluster index, stored as a new `clusters` column.
results = kmedoids(datapoints, k, 2)
final_medoids, data['clusters'] = results

#Count
def mark_matches(a, b, exact=False):
    """
    Given two Numpy arrays of {0, 1, 2} labels, returns a new boolean
    array indicating at which locations the input arrays have the
    same label (i.e., the corresponding entry is True).

    This function can consider "inexact" matches. That is, if `exact`
    is False, then the labels are regarded as the same up to a
    renaming (permutation) of the label values. Every permutation of
    the labels that occur in the inputs is tried on `b`, and the one
    that agrees with `a` in the most positions is used; the identity
    wins ties. This feature allows

      a == [0, 0, 1, 1, 0, 1, 1]
      b == [1, 1, 0, 0, 1, 0, 0]

    to be regarded as equal. (That is, use `exact=False` when you
    only care about "relative" labeling.) For inputs that only use
    the labels {0, 1} this reduces to "keep or swap", whichever gives
    more matches.

    Raises AssertionError if the shapes differ or a label is outside
    {0, 1, 2}.
    """
    from itertools import permutations

    a_int = np.asarray(a).astype(dtype=int)
    b_int = np.asarray(b).astype(dtype=int)
    assert a_int.shape == b_int.shape
    assert ((a_int == 0) | (a_int == 1) | (a_int == 2)).all()
    assert ((b_int == 0) | (b_int == 1) | (b_int == 2)).all()

    exact_matches = (a_int == b_int)
    if exact:
        return exact_matches

    present = sorted(set(a_int.ravel().tolist()) | set(b_int.ravel().tolist()))
    if not present:
        # Empty input: nothing to relabel.
        return exact_matches

    # Simply inverting the mask is only meaningful for two labels, so search
    # the (at most 3! = 6) relabelings explicitly instead.
    best_matches = exact_matches
    best_count = np.sum(exact_matches)
    for relabeled in permutations(present):
        lookup = np.arange(3)
        lookup[present] = relabeled  # old label -> new label
        candidate = (a_int == lookup[b_int])
        count = np.sum(candidate)
        # Strict comparison keeps the identity relabeling on ties.
        if count > best_count:
            best_matches, best_count = candidate, count
    return best_matches

def count_matches(a, b, exact=False):
    """
    Count the positions at which two label arrays are marked as matching.

    The comparison itself is delegated to `mark_matches(a, b, exact=exact)`;
    this function returns the sum of the boolean array it produces, i.e.
    the number of matches (not mismatches).

    `exact` is forwarded unchanged: with `exact=True` only identical labels
    count, while `exact=False` lets `mark_matches` tolerate a relabeling,
    so that for example

      a == [0, 0, 1, 1, 0, 1, 1]
      b == [1, 1, 0, 0, 1, 0, 0]

    are regarded as equal.
    """
    return np.sum(mark_matches(a, b, exact=exact))

# Compare the final cluster assignment with the true iris classes.
# `labels` was rebound earlier to the assignment for the *initial* medoids,
# so the ground truth is read from `iris.target` directly.
true_labels = iris.target
n_matches = count_matches(true_labels, data['clusters'].values)
# Use len(data) for both the count and the percentage so they always agree.
print(n_matches,
      "matches out of",
      len(data), "data points",
      "(~ {:.1f}%)".format(100.0 * n_matches / len(data)))

如何在训练后保存上述模型，而不必每次将未分配集群的新记录添加到数据集中时都重新运行上述代码？

我还简化了本地计算机上的代码，注释掉了所有可视化部分以及 #Count 之后的所有内容，仍然能够在我的数据集上得到集群分配。只是不想每次获得新记录时都重新运行上述代码。

我知道可以使用 Keras/Tensorflow 在训练后保存和加载模型，但不确定是否只能用这些工具来实现我想做的事。