我正在尝试使用 MPI 的共享内存功能。我有若干个 SMP 节点,每个节点有四个核心。我需要在每个节点上有一个大小为 N 的数组,并且该节点上的四个核心都能访问它。我的计划是让每个进程用 MPI_Win_allocate_shared 分配一个大小为 N/4 的共享窗口,这样每个节点的内存使用量应为 N。在下面的示例中,N 是 4×10^9 字节,但每个节点的内存使用量不是 4 GB,而是 16 GB。我是不是遗漏了什么?
#include <iostream>
#include <mpi.h>
int main(int argc, char** argv) {
MPI_Init(&argc, &argv);
int rank_all;
int rank_sm;
int size_sm;
// all communicator
MPI_Comm comm_sm;
MPI_Comm_rank(MPI_COMM_WORLD, &rank_all);
// shared memory communicator
MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL, &comm_sm);
MPI_Comm_rank(comm_sm, &rank_sm);
MPI_Comm_size(comm_sm, &size_sm);
std::size_t local_window_count(1000000000);
char* base_ptr;
MPI_Win win_sm;
int disp_unit(sizeof(char));
MPI_Win_allocate_shared(local_window_count * disp_unit, disp_unit, MPI_INFO_NULL, comm_sm, &base_ptr, &win_sm);
// write
char buffer;
if (rank_sm == 0) {
buffer = 'A';
}
else if (rank_sm == 1) {
buffer = 'C';
}
else if (rank_sm == 2) {
buffer = 'G';
}
else {
buffer = 'T';
}
MPI_Win_fence(0, win_sm);
for (std::size_t it = 0; it < local_window_count; it++) {
base_ptr[it] = buffer;
}
MPI_Win_fence(0, win_sm);
// read
long long int index_start(-1 * rank_sm * local_window_count);
long long int index_end((size_sm - rank_sm) * local_window_count - 1);
for (long long int it_rel = index_start; it_rel < index_end; it_rel++) {
buffer = base_ptr[it_rel];
if (it_rel == index_start) {
std::cout << rank_sm << " start: " << buffer << std::endl;
}
else if (it_rel == (index_end - 1)) {
std::cout << rank_sm << " end: " << buffer << std::endl;
}
}
MPI_Finalize();
return 0;
}