I have two parts of a multidimensional dataset, let's call them train and test. I want to build a model based on the train part and then validate it on the test part. The number of clusters is known.
I tried to apply k-means clustering in R, which gives me an object containing the cluster centers:
kClust <- kmeans(train, centers=N, nstart=M)
Is there a function in R that takes the centers of the found clusters and assigns clusters to my test dataset?
What other approaches/algorithms can I try?
You can compute the cluster assignment for a new dataset with the following function:
clusters <- function(x, centers) {
  # compute squared euclidean distance from each sample to each cluster center
  tmp <- sapply(seq_len(nrow(x)),
                function(i) apply(centers, 1,
                                  function(v) sum((x[i, ] - v)^2)))
  max.col(-t(tmp))  # find index of min distance
}
# create a simple data set with two clusters
set.seed(1)
x <- rbind(matrix(rnorm(100, sd = 0.3), ncol = 2),
           matrix(rnorm(100, mean = 1, sd = 0.3), ncol = 2))
colnames(x) <- c("x", "y")
x_new <- rbind(matrix(rnorm(10, sd = 0.3), ncol = 2),
               matrix(rnorm(10, mean = 1, sd = 0.3), ncol = 2))
colnames(x_new) <- c("x", "y")
cl <- kmeans(x, centers=2)
all.equal(cl[["cluster"]], clusters(x, cl[["centers"]]))
# [1] TRUE
clusters(x_new, cl[["centers"]])
# [1] 2 2 2 2 2 1 1 1 1 1
plot(x, col=cl$cluster, pch=3)
points(x_new, col= clusters(x_new, cl[["centers"]]), pch=19)
points(cl[["centers"]], pch=4, cex=2, col="blue")
Alternatively, you can use the flexclust package, which provides a predict method for k-means:
library("flexclust")
data("Nclus")
set.seed(1)
dat <- as.data.frame(Nclus)
ind <- sample(nrow(dat), 50)
dat[["train"]] <- TRUE
dat[["train"]][ind] <- FALSE
cl1 = kcca(dat[dat[["train"]]==TRUE, 1:2], k=4, kccaFamily("kmeans"))
cl1
#
# call:
# kcca(x = dat[dat[["train"]] == TRUE, 1:2], k = 4)
#
# cluster sizes:
#
# 1 2 3 4
#130 181 98 91
pred_train <- predict(cl1)
pred_test <- predict(cl1, newdata=dat[dat[["train"]]==FALSE, 1:2])
image(cl1)
points(dat[dat[["train"]]==TRUE, 1:2], col=pred_train, pch=19, cex=0.3)
points(dat[dat[["train"]]==FALSE, 1:2], col=pred_test, pch=22, bg="orange")
There are also conversion methods to turn the results of clustering functions such as stats::kmeans or cluster::pam into objects of class kcca, and vice versa:
as.kcca(cl, data=x)
# kcca object of family ‘kmeans’
#
# call:
# as.kcca(object = cl, data = x)
#
# cluster sizes:
#
# 1 2
# 50 50
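Once converted, flexclust's predict method can be used on the result just like on a fitted kcca object; for example, with the cl and x_new objects from the k-means example above:
cl_kcca <- as.kcca(cl, data=x)
predict(cl_kcca, newdata=x_new)
This should match the assignments from the manual clusters() function above.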
Step 1: a function that computes the distance between a vector and each row of a matrix
calc_vec2mat_dist = function(x, ref_mat) {
  # compute row-wise vec2vec distance
  apply(ref_mat, 1, function(r) sum((r - x)^2))
}
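A quick sanity check with hand-picked values (my own toy example, not from the original post):
calc_vec2mat_dist(c(0, 0), rbind(c(1, 0), c(3, 4)))
# [1]  1 25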
Step 2: a function that applies the vec2mat computation to every row of input_mat
calc_mat2mat_dist = function(input_mat, ref_mat) {
  dist_mat = apply(input_mat, 1, function(r) calc_vec2mat_dist(r, ref_mat))
  # transpose so that each row corresponds to an input datapoint
  # and each column to a centroid
  cbind(t(dist_mat), max.col(-t(dist_mat)))
}
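For instance, on a hand-made toy input (again my own illustration), the first K columns are the squared distances to the K centroids and the last column is the index of the nearest one:
toy_input <- rbind(c(0, 0), c(1, 1), c(3, 4))  # 3 points, 2 features
toy_centers <- rbind(c(0, 0), c(3, 4))         # 2 centroids
calc_mat2mat_dist(toy_input, toy_centers)
#      [,1] [,2] [,3]
# [1,]    0   25    1
# [2,]    2   13    1
# [3,]   25    0    2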
Step 3: apply the mat2mat function
calc_mat2mat_dist(my_input_mat, kmeans_model$centers)
Step 4: optionally, use plyr::ddply and doMC to parallelize mat2mat for large datasets
library(doMC)
library(plyr)
pred_cluster_para = function(input_df, center_mat, cl_feat, id_cols, use_ncore = 8) {
  # assign cluster labels to each individual (row) in input_df
  # input: input_df - dataframe with all features used in clustering, plus some id/indicator columns
  # input: center_mat - matrix of centroids, K rows by M features
  # input: cl_feat - list of features (col names)
  # input: id_cols - list of index cols (e.g. id) to include in the output
  # output: output_df - dataframe with the same number of rows as the input,
  #                     K columns of distances to each cluster,
  #                     1 column of cluster labels,
  #                     plus the columns named in id_cols
  n_cluster = nrow(center_mat)
  n_feat = ncol(center_mat)
  n_input = nrow(input_df)
  if(!(typeof(center_mat) %in% c('double', 'integer') & is.matrix(center_mat))) {
    stop('The argument "center_mat" must be a numeric matrix')
  } else if(length(cl_feat) != n_feat) {
    stop(sprintf('cl_feat size: %d, center_mat n_col: %d, they have to match!',
                 length(cl_feat), n_feat))
  } else {
    # register the multicore backend through the doMC and foreach packages
    doMC::registerDoMC(cores = use_ncore)
    # create job_key for mapping/splitting the input data
    input_df[, 'job_idx'] = sample(1:use_ncore, n_input, replace = TRUE)
    # create row_key for restoring the original row order, which the split/combine shuffles
    input_df[, 'row_idx'] = seq(n_input)
    # use ddply (df input, df output) to split-process-combine
    output_df = ddply(
      input_df[, c('job_idx', 'row_idx', cl_feat, id_cols)],  # input big data
      'job_idx',                      # map/split by job_idx
      function(chunk) {               # work on each chunk
        dist = data.frame(calc_mat2mat_dist(chunk[, cl_feat], center_mat))
        names(dist) = c(paste0('dist2c_', seq(n_cluster)), 'pred_cluster')
        dist[, id_cols] = chunk[, id_cols]
        dist[, 'row_idx'] = chunk[, 'row_idx']
        dist                          # product of the mapper
      }, .parallel = TRUE)            # end of ddply
    # sort back to the original row order
    output_df = output_df[order(output_df$row_idx), ]
    output_df[c('job_idx')] = NULL
    return(output_df)
  }
}
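A hypothetical end-to-end call could look like the sketch below; my_df, f1, f2, and id are made-up names for illustration, and doMC requires a fork-capable (non-Windows) platform:
# hypothetical data and model, for illustration only
set.seed(1)
my_df <- data.frame(id = 1:1000, f1 = rnorm(1000), f2 = rnorm(1000))
km <- kmeans(my_df[, c('f1', 'f2')], centers = 3)
scored <- pred_cluster_para(my_df, km$centers,
                            cl_feat = c('f1', 'f2'),
                            id_cols = 'id', use_ncore = 2)
head(scored)  # columns: dist2c_1..dist2c_3, pred_cluster, id, row_idx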