我正在研究重叠聚类(即各聚类之间并非互不相交)。我发现 NEO-K-Means 可能是目前最先进的方法。但是,当我在多标签数据集(音乐情感 / 场景)上实现该算法时,并没有得到论文中报告的高结果:我的 F-measure 约为 0.4,而论文报告的结果是音乐情感 0.55、场景 0.626。而且我已经按 K-Means 的最佳情况来初始化实验(质心种子取每个类的均值,总的簇分配数等于真实标签中的分配数)。我想知道我的实现有什么问题,或者我是否还需要做一些额外的处理才能得到更高的结果?
PS:我找到了关于 NEO-K-Means 的后续研究,但我认为在继续深入之前,应该先把这个问题弄清楚。
这是我的代码:
// Runs TIMES passes of an overlapping (NEO-K-Means style) clustering step.
// Each pass:
//   1. computes the distance from every data point to every centroid,
//   2. sorts all (point, cluster) pairs with DistanceCollectionComparator,
//   3. walks the sorted pairs once so that every point gets one cluster,
//   4. walks them again, handing out further memberships until the total
//      number of memberships reaches totalAssignment,
//   5. replaces each non-empty cluster's centroid with the mean of its members.
while (count < TIMES) {
    final int pointCount = dataList.size();

    DC = new ArrayList<DistanceCollection>();
    for (int c = 0; c < K; c++) {
        cluster[c] = new Cluster();
    }

    for (int i = 0; i < pointCount; i++) {
        for (int j = 0; j < K; j++) {
            DistanceCollection dist = new DistanceCollection();
            // NOTE(review): dataNum is used below as an index into dataList and
            // into the bookkeeping arrays; this presumes dataNum equals the
            // point's position in dataList - confirm against the loader.
            dist.dataNum = dataList.get(i).dataNum;
            dist.clusterNum = j;
            dist.distanceFromCluster = euclidean(centroids[j], dataList.get(i));
            DC.add(dist);
        }
    }

    // Sort the distances for argmin(i,j) checking.
    // NOTE(review): presumably ascending by distanceFromCluster; the comparator
    // is defined elsewhere - confirm.
    Collections.sort(DC, new DistanceCollectionComparator());

    // Total number of (point, cluster) memberships to hand out per pass.
    // NOTE(review): hard-coded; looks data-set specific - consider deriving it
    // from the data or making it configurable.
    int totalAssignment = 2585;
    int assignmentCount = 0;

    // Bookkeeping sized from the actual data instead of fixed dimensions.
    boolean[] hasPrimaryCluster = new boolean[pointCount];
    boolean[][] isMember = new boolean[K][pointCount];

    // Phase 1: every point receives exactly one membership - the first pair in
    // sorted order that mentions it. The DC.size() bound prevents running off
    // the end of the list if some point can never be matched.
    for (int i = 0; i < DC.size() && assignmentCount < pointCount; i++) {
        int clusterNum = DC.get(i).clusterNum;
        int dataNum = DC.get(i).dataNum;
        if (!hasPrimaryCluster[dataNum]) {
            cluster[clusterNum].dataMembers.add(dataList.get(dataNum));
            isMember[clusterNum][dataNum] = true;
            hasPrimaryCluster[dataNum] = true;
            assignmentCount++;
        }
    }

    // Phase 2: additional (overlapping) memberships, taken in sorted order from
    // the pairs not used yet. At most K * pointCount distinct pairs exist, so
    // the DC.size() bound stops the loop when totalAssignment is unreachable.
    for (int i = 0; i < DC.size() && assignmentCount < totalAssignment; i++) {
        int clusterNum = DC.get(i).clusterNum;
        int dataNum = DC.get(i).dataNum;
        if (!isMember[clusterNum][dataNum]) {
            cluster[clusterNum].dataMembers.add(dataList.get(dataNum));
            isMember[clusterNum][dataNum] = true;
            assignmentCount++;
        }
    }

    // Centroid update: per-feature mean over the cluster's members. A cluster
    // with no members keeps its previous centroid.
    for (int c = 0; c < K; c++) {
        int memberCount = cluster[c].dataMembers.size();
        if (memberCount > 0) {
            for (int f = 0; f < centroids[c].features.length; f++) {
                double accumFeaturesValue = 0;
                for (int m = 0; m < memberCount; m++) {
                    accumFeaturesValue += cluster[c].dataMembers.get(m).features[f];
                }
                centroids[c].features[f] = accumFeaturesValue / memberCount;
            }
        }
    }

    count++;
}