答案就在您链接的文章中。以其中第一篇为例,下面是余弦相似度和相关系数的公式(为简洁清晰起见略有改动):
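$$
\begin{aligned}
\operatorname{CosSim}(x, y) &= \frac{\sum_i x_i y_i}{\sqrt{\sum_i x_i^2}\,\sqrt{\sum_i y_i^2}} \\
\operatorname{Corr}(x, y) &= \frac{\sum_i (x_i - \bar{x})(y_i - \bar{y})}{\sqrt{\sum_i (x_i - \bar{x})^2}\,\sqrt{\sum_i (y_i - \bar{y})^2}} \\
\operatorname{Corr}(x, y) &= \operatorname{CosSim}(x - \bar{x},\; y - \bar{y})
\end{aligned}
$$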
因此,最简单的调整就是从输入向量中减去均值:
library(MASS) # provides mvrnorm(), used below to generate correlated data
set.seed(2641) # fix the RNG seed so the example is exactly reproducible
# Draw 1000 rows of two correlated variables: means 100 and 150, with
# covariance matrix Sigma (variances 30 and 50, covariance 17).
X <- mvrnorm(1000, mu=c(100, 150), Sigma=rbind(c(30, 17),
c(17, 50) ) )
# I adapted the function somewhat, as the original was keyed to its context
# Cosine similarity between the first two columns of X.
#
# Args:
#   X:    matrix-like object with at least two columns; only columns 1 and 2
#         are used.
#   corr: if TRUE, each column has its mean subtracted first, so the result
#         is the cosine similarity of the centered columns.
#
# Returns:
#   A 1 x 1 matrix holding the similarity.
cos.sim <- function(X, corr=FALSE){
  u <- X[, 1]
  v <- X[, 2]
  if (corr) {
    # center both vectors before comparing them
    u <- u - mean(u)
    v <- v - mean(v)
  }
  numer <- t(u) %*% v                   # inner product, as a 1 x 1 matrix
  denom <- sqrt(sum(u^2) * sum(v^2))    # product of the two Euclidean norms
  numer / denom
}
# Plain cosine similarity of the two columns of X (no centering):
cos.sim(X)
#           [,1]
# [1,] 0.9985756
# With centering, the value equals the off-diagonal entry of cor(X) below:
cos.sim(X, corr=TRUE)
#           [,1]
# [1,] 0.4604822
cor(X)
#           [,1]      [,2]
# [1,] 1.0000000 0.4604822
# [2,] 0.4604822 1.0000000
这是一个矩阵版本:
set.seed(6616) # fix the RNG seed so the printed results are reproducible
# Draw 1000 rows of three correlated variables (means 100, 150, 175) with
# the given 3 x 3 covariance matrix; mvrnorm() requires library(MASS).
X3 <- mvrnorm(1000, mu=c(100, 150, 175), Sigma=rbind(c(30, 17, 12),
c(17, 50, 29),
c(12, 29, 46) ))
# Pairwise cosine similarity between the columns of X.
#
# Args:
#   X:    numeric matrix, or an object as.matrix() can convert to one; each
#         column is one of the vectors being compared.
#   corr: if TRUE, every column is mean-centered first, so the result is the
#         cosine similarity of the centered columns (compare with cor(X)).
#
# Returns:
#   A p x p numeric matrix without dimnames, where p = ncol(X). A column whose
#   norm is zero (e.g. all zeros) produces NaN in its row and column instead
#   of an error.
cos.sim.mat <- function(X, corr=FALSE){
  X <- as.matrix(X)
  if (corr) {
    # sweep() keeps the matrix shape even when X has a single row
    X <- sweep(X, 2, colMeans(X))
  }
  xtx <- crossprod(X)          # same as t(X) %*% X
  norms <- sqrt(diag(xtx))     # Euclidean norm of each column
  # Scale entry (i, j) by norms[i] * norms[j]. Building a diagonal matrix with
  # diag(norms) breaks for a single column, because diag() of a length-1
  # vector returns an identity matrix; element-wise division has no such case
  # and needs no matrix inversion.
  sim <- xtx / outer(norms, norms)
  dimnames(sim) <- NULL
  sim
}
# Plain cosine similarity between every pair of columns of X3:
cos.sim.mat(X3)
#           [,1]      [,2]      [,3]
# [1,] 1.0000000 0.9984552 0.9983700
# [2,] 0.9984552 1.0000000 0.9992154
# [3,] 0.9983700 0.9992154 1.0000000
# With centering, the result matches cor(X3) shown below:
cos.sim.mat(X3, corr=TRUE)
#           [,1]      [,2]      [,3]
# [1,] 1.0000000 0.3990872 0.2584569
# [2,] 0.3990872 1.0000000 0.5900067
# [3,] 0.2584569 0.5900067 1.0000000
cor(X3)
#           [,1]      [,2]      [,3]
# [1,] 1.0000000 0.3990872 0.2584569
# [2,] 0.3990872 1.0000000 0.5900067
# [3,] 0.2584569 0.5900067 1.0000000