我正在尝试计算 2 个向量的互信息。我做了一个通用函数来识别数据是分类的还是连续的。很难找到这种计算的简单例子,我只找到了理论实现(例如如何计算互信息?)。 我计算了已归一化的数据(不再是整数),我想计算两行之间的互信息。我正在查看 R 文档https://cran.r-project.org/web/packages/entropy/entropy.pdf并将连续数据离散化到 bin 中。为什么需要这样做?
我的实现是否正确?如果不是,为什么以及如何固定才能准确计算互信息?
def shannon_entropy(A, mode="auto", verbose=False):
"""
https://stackoverflow.com/questions/42683287/python-numpy-shannon-entropy-array
"""
A = np.asarray(A)
# Determine distribution type
if mode == "auto":
condition = np.all(A.astype(float) == A.astype(int))
if condition:
mode = "discrete"
else:
mode = "continuous"
if verbose:
print(mode, file=sys.stderr)
# Compute shannon entropy
pA = A / A.sum()
# Remove zeros
pA = pA[np.nonzero(pA)[0]]
if mode == "continuous":
return -np.sum(pA*np.log2(A))
if mode == "discrete":
return -np.sum(pA*np.log2(pA))
def mutual_information(x,y, mode="auto", normalized=False):
"""
I(X, Y) = H(X) + H(Y) - H(X,Y)
https://stackoverflow.com/questions/20491028/optimal-way-to-compute-pairwise-mutual-information-using-numpy
"""
x = np.asarray(x)
y = np.asarray(y)
# Determine distribution type
if mode == "auto":
condition_1 = np.all(x.astype(float) == x.astype(int))
condition_2 = np.all(y.astype(float) == y.astype(int))
if all([condition_1, condition_2]):
mode = "discrete"
else:
mode = "continuous"
H_x = shannon_entropy(x, mode=mode)
H_y = shannon_entropy(y, mode=mode)
H_xy = shannon_entropy(np.concatenate([x,y]), mode=mode)
# Mutual Information
I_xy = H_x + H_y - H_xy
if normalized:
return I_xy/np.sqrt(H_x*H_y)
else:
return I_xy