A very simple approach is to find some kind of centroid for each cluster (for example, by averaging the distributions of the documents belonging to that cluster) and then compute the cosine distance between each document in the cluster and its centroid. The documents with the smallest distances are closest to the centroid and are therefore the most "representative".
Continuing the previous example:
import pandas as pd
import numpy as np
from sklearn.metrics import pairwise_distances
from scipy.spatial.distance import cosine
from sklearn.cluster import DBSCAN
# Initialize some documents
doc1 = {'Science':0.8, 'History':0.05, 'Politics':0.15, 'Sports':0.1}
doc2 = {'News':0.2, 'Art':0.8, 'Politics':0.1, 'Sports':0.1}
doc3 = {'Science':0.8, 'History':0.1, 'Politics':0.05, 'News':0.1}
doc4 = {'Science':0.1, 'Weather':0.2, 'Art':0.7, 'Sports':0.1}
collection = [doc1, doc2, doc3, doc4]
df = pd.DataFrame(collection)
# Fill missing values with zeros
df.fillna(0, inplace=True)
# Get Feature Vectors
feature_matrix = df.to_numpy()
# Fit DBSCAN
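# metric='precomputed' tells DBSCAN its input is already a distance matrix,
# and min_samples=1 makes every document a core point, so none is labelled as noise (-1)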
distance_matrix = pairwise_distances(feature_matrix, metric='cosine')
db = DBSCAN(min_samples=1, metric='precomputed').fit(distance_matrix)
labels = db.labels_
n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
print('Estimated number of clusters: %d' % n_clusters_)
# Find the representatives
representatives = {}
for label in set(labels):
    # Find indices of documents belonging to the same cluster
    ind = np.argwhere(labels == label).reshape(-1,)
    # Select these specific documents
    cluster_samples = feature_matrix[ind, :]
    # Calculate their centroid as an average
    centroid = np.average(cluster_samples, axis=0)
    # Find the distance of each document from the centroid
    distances = [cosine(sample_doc, centroid) for sample_doc in cluster_samples]
    # Keep the document closest to the centroid as the representative
    representatives[label] = cluster_samples[np.argmin(distances), :]
for label, doc in representatives.items():
    print("Label : %d -- Representative : %s" % (label, str(doc)))