Why is my parallel MPI code so much slower than my serial code?

computational-science parallel-computing iterative-method mpi
2021-12-23 17:47:06

I know this is not the first time someone has asked this question, but I am really confused. I am new to MPI and I tried to implement a Jacobi solver for the linear system Ax=b. I want to compare the run time of the parallel and the serial version, but for large N my MPI code is much slower. I don't know whether the problem is in my code or in my computer. By the way, I am using 2 processes on a dual-core CPU. Any help would be useful!

Here is my parallel code:

#include <iostream>
#include "mpi.h"
#include <cmath>
#include <cstdlib>
using namespace std;

#define n 10000

void CreateMatrix(double *A){

     double value;
     for (int i=0;i<n*n;i++){
            //cout<<"Enter matrix value row i="<<i<<endl;
            //cin>>value;
            value=(rand()%1000)+1;//random off-diagonal entries

            A[i]=value;
     }

     for(int i=0;i<n;i++){
         A[i*n+i]=(1000*n+128.5)+rand()%8984;//dummy values on the diagonal...
     }                                       //...so that A is diagonally dominant
};

void CreateVector(double *b){


    double value;

        for(int i=0;i<n;i++){
            //cout<<"Enter vector b value row i="<<i<<endl;
            value=(rand()%974)-i*(rand()%20)+(rand()%1000);//random values
            b[i]=value;
        }
};

int JacobiMethod(double *A,double *b,double *x_out,int size,int rank,double tol,int max_iter){

    int n_local=n/size;
    double *xold;
    double *xnew;
    double *temp1=new double [n];
    double *temp2=new double [n];
    double *swap;
    int iterations=0;
    int k;
    double sum,sum1;
    double error=1;
    int flag;


    MPI_Allgather(b,n_local,MPI_DOUBLE,temp1,n_local,MPI_DOUBLE,MPI_COMM_WORLD);//gather the local pieces of b from every process into temp1
    xold=temp2;
    xnew=temp1;//after the first swap below, xold holds the gathered b (initial guess)

    while(abs(error)>tol && iterations<max_iter){
        swap=xnew;
        xnew=xold;
        xold=swap;
        iterations+=1;

        for(int i=0;i<n_local;i++){

            k=i+rank*n_local;//tells us the location of diagonal element


            sum=0;
            for(int j=0;j<n;j++){
                if (j!=k){
                    sum+=A[j+n*i]*xold[j];//A is a contiguous array, so n*i gives us the next local row

                }
            }

            x_out[i]=(b[i]-sum)/A[n*i+k];

        }
        MPI_Allgather(x_out,n_local,MPI_DOUBLE,xnew,n_local,MPI_DOUBLE,MPI_COMM_WORLD);
        sum1=0;
        for(int i=0;i<n;i++){
            sum1=sum1+pow((xnew[i]-xold[i]),2);
        }
        error=sqrt(sum1);
    }


    if (iterations>=max_iter){
        flag= -1;
    }
    else{
        flag= 0;

    }
    delete []A;
    delete []b;
    delete []temp1;
    delete  [] temp2;
    return flag;
};
void printResults( double *x_out,int size,int rank){
    int n_local=n/size;
    double *answ=new double [n];
    MPI_Gather(x_out, n_local, MPI_DOUBLE, answ, n_local, MPI_DOUBLE,0,MPI_COMM_WORLD);//gather all data to answ-->in one process(process 0)
    if (rank==0){
        cout<<"The algorith converges"<<endl;
        cout<<"The results are: "<<endl;
        for(int i=0;i<n;i++){
            //cout<<answ[i]<<endl;
        }
    }
    delete[] answ;
};

int main(){


double tol;//tolerance
int max_iter;

int converged;
//Mpi initialization
MPI_Init(NULL,NULL);
double *b_local=NULL;//allocated on rank 0 only
double *A_local=NULL;
int size;
MPI_Comm_size(MPI_COMM_WORLD, &size);   
int rank;
MPI_Comm_rank(MPI_COMM_WORLD,&rank);    
if (rank==0){
    b_local=new double[n];
    A_local=new double[n*n];
    cout<<"Enter talerance,number of iterations"<<endl;
    cin>>tol;
    cin>>max_iter;
    //Create A and scatter it to all process
    CreateMatrix(A_local);
    //Create b and scatter it to all process    
    CreateVector(b_local);

}
//data init
double *A=new double[n*n/size];
double *b=new double[n/size];
double *x_out=new double[n/size];
//broadcast tol,max_iter to all processes
MPI_Bcast(&tol,1,MPI_DOUBLE,0,MPI_COMM_WORLD);//send to all processes
MPI_Bcast(&max_iter,1,MPI_INT,0,MPI_COMM_WORLD);    
//scatter vector b: each process takes n/size entries
MPI_Scatter(b_local,n/size,MPI_DOUBLE,b,n/size,MPI_DOUBLE,0,MPI_COMM_WORLD);
//scatter the matrix A: (n/size)*n elements = n/size rows per process
MPI_Scatter(A_local,(n/size)*n,MPI_DOUBLE,A,(n/size)*n,MPI_DOUBLE,0,MPI_COMM_WORLD);
if (rank==0){
 delete [] b_local;
 delete [] A_local;
}
double time0,time1;
time0=MPI_Wtime();
converged=JacobiMethod(A,b,x_out,size,rank,tol,max_iter);
time1=MPI_Wtime();

if (converged==0){
    cout<<"Time needed for the jacobi algorith to be executed is :"<<time1-time0<<endl;
    printResults(x_out,size,rank);
}
else{
    cout<<"Jacobi doesnt converge"<<endl;
}
MPI_Finalize(); 
return 0;   

}
1 Answer

The first thing you need to ask yourself: is your problem big enough that the overhead of MPI message passing is smaller than the work you save? Your problem size of 10k is small, but on the other hand you seem to have a dense matrix, so there is plenty of work.
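A rough back-of-envelope check of that trade-off (using n = 10000 and 2 processes, as in the posted code): each Jacobi sweep costs roughly (n/2)·n ≈ 5·10^7 multiply-adds per process, while the Allgather at the end of a sweep moves only about n doubles ≈ 80 KB. These numbers are only an estimate, but they give a feel for whether message-passing overhead can plausibly account for the slowdown.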

Next: parallelizing a numerical method can change the math. It looks like you are computing the location of the diagonal element correctly, but it never hurts to print out the residual norms: they should be identical between the sequential and the parallel run (for Jacobi the row-distributed sweep does the same arithmetic as the serial one, apart from rounding). That is not true in general.
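As a concrete way to do that check, here is a minimal sketch (not the poster's code; the names A, b, x, n_local are assumed to follow the same row-wise layout as above) that computes the global residual norm ||b - Ax||_2 with an Allreduce, so it can be printed every iteration and compared with the sequential run:

#include <cmath>
#include "mpi.h"

// Sketch: global residual norm ||b - A x||_2 for a row-distributed dense matrix.
// A holds this rank's n_local rows of the full n x n matrix, b the matching
// right-hand-side entries, and x the full current iterate (e.g. the vector
// produced by the Allgather).
double ResidualNorm(const double *A, const double *b, const double *x,
                    int n, int n_local) {
    double local = 0.0;
    for (int i = 0; i < n_local; ++i) {
        double r = b[i];
        for (int j = 0; j < n; ++j)
            r -= A[i * n + j] * x[j];   // residual of one local row
        local += r * r;
    }
    double global = 0.0;
    // sum the squared local contributions over all ranks
    MPI_Allreduce(&local, &global, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
    return std::sqrt(global);
}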

The only thing I think is really wrong is your use of Allgather. It goes against the philosophy of MPI, namely that everything should be distributed: you should never have fully replicated data.
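For illustration only, one way the gather could be avoided is to keep x distributed and rotate its blocks around a ring, so each rank multiplies its rows of A against one block of x at a time. This is a sketch under the same row-wise layout assumed above (n divisible by the number of processes), not a drop-in replacement for the posted code; a Jacobi update would additionally skip the diagonal term and divide by it.

#include "mpi.h"

// Sketch: y_local = A_local * x for a row-distributed dense matrix, without ever
// assembling the full x on any rank.  A_local holds this rank's n_local rows of
// the n x n matrix; x_block holds this rank's n_local entries of x and is rotated
// around the ring (it is back in place after the last exchange).
void DistributedMatVec(const double *A_local, double *x_block, double *y_local,
                       int n, int n_local, int rank, int size) {
    for (int i = 0; i < n_local; ++i) y_local[i] = 0.0;
    for (int step = 0; step < size; ++step) {
        int owner = (rank + step) % size;   // whose block of x we hold right now
        for (int i = 0; i < n_local; ++i)
            for (int j = 0; j < n_local; ++j)
                y_local[i] += A_local[i * n + owner * n_local + j] * x_block[j];
        // send our current block to the previous rank, receive the next block from the next rank
        MPI_Sendrecv_replace(x_block, n_local, MPI_DOUBLE,
                             (rank + size - 1) % size, 0,
                             (rank + 1) % size, 0,
                             MPI_COMM_WORLD, MPI_STATUS_IGNORE);
    }
}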

Sorry, these are just some thoughts. Maybe the problem is something else entirely. Start by checking that the sequential and the parallel run take the same number of iterations.
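For that comparison, a minimal serial Jacobi baseline could look like the sketch below (assumed helper, not the poster's serial code), using the same initial guess x0 = b and the same stopping test ||x_new - x_old||_2 < tol as the posted MPI code, and returning the iteration count so it can be checked against the parallel run.

#include <cmath>

// Sketch: serial Jacobi with the same stopping criterion as the MPI version.
// A is the dense n x n matrix in row-major order, b the right-hand side,
// x the output vector.  Returns the number of iterations performed.
int SerialJacobi(const double *A, const double *b, double *x, int n,
                 double tol, int max_iter) {
    double *xold = new double[n];
    for (int i = 0; i < n; ++i) xold[i] = b[i];   // same initial guess as the MPI code
    int iterations = 0;
    double error = tol + 1.0;
    while (error > tol && iterations < max_iter) {
        ++iterations;
        for (int i = 0; i < n; ++i) {
            double sum = 0.0;
            for (int j = 0; j < n; ++j)
                if (j != i) sum += A[i * n + j] * xold[j];
            x[i] = (b[i] - sum) / A[i * n + i];
        }
        error = 0.0;                               // ||x_new - x_old||_2
        for (int i = 0; i < n; ++i)
            error += (x[i] - xold[i]) * (x[i] - xold[i]);
        error = std::sqrt(error);
        for (int i = 0; i < n; ++i) xold[i] = x[i];
    }
    delete[] xold;
    return iterations;
}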