How to get Elemental's Gemm to run fast?

computational-science linear-algebra matrix parallel-computing mpi blas
2021-12-08 02:17:46


I have the following code:

#include "elemental.hpp"
using namespace std;
using namespace elem;

extern "C" {
    void openblas_set_num_threads(int num_threads);
}

int main( int argc, char *argv[] ) {
    Initialize( argc, argv );
    const mpi::Comm comm = mpi::COMM_WORLD;
    const int commRank = mpi::CommRank( comm );
    const int commSize = mpi::CommSize( comm );

    try {
        const int n = Input("--n","size of matrices",1000);
        const int nb = Input("--nb","algorithmic blocksize",128);
        int r = Input("--r","process grid height",0);
        ProcessInput();

        SetBlocksize( nb );
        // If no process grid height was specified, try for a square
        if( r == 0 )
            r = Grid::FindFactor( commSize );
        Grid g( comm, r );

        Matrix<double> A, B, C;
        Uniform( A, n, n );
        Uniform( B, n, n );
        Uniform( C, n, n );
        mpi::Barrier( comm );
        Timer timer;
        if( commRank == 0 ) {
            timer.Start();
            Gemm( NORMAL, NORMAL, 1., A, B, 0., C );
            std::cout << "Multithreaded time: " << timer.Stop() << " secs"
                      << std::endl;  

            openblas_set_num_threads(1);
            timer.Start();
            Gemm( NORMAL, NORMAL, 1., A, B, 0., C );
            std::cout << "Sequential time: " << timer.Stop() << " secs"
                      << std::endl;  
        }
        openblas_set_num_threads(1);

        if( commRank == 0 )
            timer.Start();
        DistMatrix<double,CIRC,CIRC> ARoot(n,n,g), BRoot(n,n,g); 
        if( commRank == 0 ) {
            ARoot.Matrix() = A;
            BRoot.Matrix() = B;
            std::cout << "Population time: " << timer.Stop() << " secs"
                      << std::endl;
        }

        if( commRank == 0 )
            timer.Start();
        DistMatrix<double> ADist( ARoot ), BDist( BRoot ), CDist(g);
        Zeros( CDist, n, n );
        mpi::Barrier( comm );
        if( commRank == 0 ) 
            std::cout << "Scatter from root: " << timer.Stop() << " secs"
                      << std::endl;

        if( commRank == 0 )
            timer.Start();
        Gemm( NORMAL, NORMAL, 1., ADist, BDist, 0., CDist ); 
        mpi::Barrier( comm );   
        if( commRank == 0 ) 
            std::cout << "Distributed: " << timer.Stop() << " secs"
                      << std::endl;

        if( commRank == 0 )
            timer.Start();
        DistMatrix<double,CIRC,CIRC> CRoot( CDist );
        mpi::Barrier( comm );
        if( commRank == 0 ) 
            std::cout << "Gather to root: " << timer.Stop() << " secs"
                      << std::endl;
    } catch( std::exception& e ) { ReportException(e); }
    Finalize();
    return 0;
}

I am running on 4 nodes, each with 12 cores. The BLAS is OpenBLAS, compiled from source, so it supports 12 threads; affinity is turned off. For gemm with n=4000 I get the following results:

  • Multithreaded BLAS, via Elemental Matrix: 1.6 secs
  • Single-threaded BLAS, via Elemental Matrix: 11.9 secs
  • DistMatrix, 1 MPI process, single-threaded BLAS: 13.3 secs
  • DistMatrix, 4 MPI processes, single-threaded BLAS: 6.6 secs
  • DistMatrix, 48 MPI processes, single-threaded BLAS: 30.2 secs

I also tried multithreaded BLAS together with DistMatrix, by commenting out the openblas_set_num_threads(1) lines in the code above, and got the following result:

  • DistMatrix, 4 MPI processes, multithreaded BLAS: 4.5 secs

Why are the DistMatrix timings I get on 4 nodes not competitive with the multithreaded BLAS time on a single compute node?

Edit: I also tried setting A to [MC,STAR] and B to [STAR,MR], following page 27 of Jack Poulson's 2012 "Rethinking Distributed Dense Linear Algebra", but the results were no better for me (a sketch of what I tried follows the timings below):

  • DistMatrix, 4 MPI processes, multithreaded BLAS: 6.2 secs
  • DistMatrix, 48 MPI processes, single-threaded BLAS: 35.3 secs
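
Concretely, the redistribution I tried looked roughly like this (a minimal sketch, assuming Elemental's DistMatrix assignment operators perform the redistribution and that LocalGemm accepts [MC,STAR] x [STAR,MR] -> [MC,MR] operands, as in the "stationary C" Gemm variant):

// Sketch of the [MC,* ] / [* ,MR] redistribution (applied to the whole
// matrices at once rather than panel by panel):
DistMatrix<double,MC,STAR> A_MC_STAR( g );
DistMatrix<double,STAR,MR> B_STAR_MR( g );
A_MC_STAR = ADist;   // allgather A within each process row
B_STAR_MR = BDist;   // allgather B within each process column
LocalGemm( NORMAL, NORMAL, 1., A_MC_STAR, B_STAR_MR, 0., CDist );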

Edit 2: I added code to prevent anything from being optimized away, i.e. a readMatrix function and a cout << sum << endl at the end (roughly as sketched after these timings), but the timings are unchanged:

  • single-threaded BLAS: 12.2 secs
  • multithreaded BLAS: 1.6 secs
  • DistMatrix, 4 nodes, 4 processes, multithreaded BLAS: 4.5 secs
  • DistMatrix, 4 nodes, 48 processes, single-threaded BLAS: 29.8 secs
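
The anti-optimization code looked roughly like this (a sketch of the idea only; the actual readMatrix in my code may differ, and it assumes the LockedMatrix()/Get() accessors of the same Elemental API version used above):

// Sum the locally owned entries of C and print the result, so the compiler
// cannot discard the Gemm as dead code.
double sum = 0.;
const Matrix<double>& CLocal = CDist.LockedMatrix();
for( int j = 0; j < CLocal.Width(); ++j )
    for( int i = 0; i < CLocal.Height(); ++i )
        sum += CLocal.Get( i, j );
if( commRank == 0 )
    cout << sum << endl;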

Edit 3: Note that for me ScaLAPACK pdgemm also seems to run slower than multithreaded BLAS, although it is faster than Elemental:

  • scalapack pdgemm, 4 nodes, 4 processes, multithreaded BLAS: 2.11 secs
  • scalapack pdgemm, 4 nodes, 48 processes, single-threaded BLAS: 29.1 secs

Note that for both ScaLAPACK and Elemental, using one MPI process per node with OpenBLAS multithreading enabled seems to be faster than using one MPI process per core with OpenBLAS multithreading disabled. Presumably that makes sense because shared memory can be exploited, reducing the amount of communication needed?
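
For concreteness, the two launch configurations being compared look roughly like this (a sketch only; the hostnames and the mpirun.mpich2 flavour are copied from the commands further down, the binary name is a placeholder, and the exact flags depend on the MPI installation):

OPENBLAS_NUM_THREADS=12 mpirun.mpich2 -hosts host3,host1,host2,host4 -np 4  ./gemmTest --n 4000   # 1 process per node, 12 BLAS threads
OPENBLAS_NUM_THREADS=1  mpirun.mpich2 -hosts host3,host1,host2,host4 -np 48 ./gemmTest --n 4000   # 1 process per core, 1 BLAS thread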

The code for the ScaLAPACK test:

#pragma once

// includes needed for strlen, runtime_error, and string below
#include <cstring>
#include <stdexcept>
#include <string>

extern "C" {
    struct DESC{
        int DTYPE_;
        int CTXT_;
        int M_;
        int N_;
        int MB_;
        int NB_;
        int RSRC_;
        int CSRC_;
        int LLD_;
    } ;

    void blacs_pinfo_( int *iam, int *nprocs );
    void blacs_get_( int *icontxt, int *what, int *val );
    void blacs_gridinit_( int *icontxt, char *order, int *nprow, int *npcol );
    void blacs_gridinfo_( int *context, int *nprow, int *npcol, int *myrow, int *mycol );
    void blacs_gridexit_( int *context );
    void blacs_exit_( int *code );

    int numroc_( int *n, int *nb, int *iproc, int *isrcproc, int *nprocs );
    void descinit_( struct DESC *desc, int *m, int *n, int *mb, int *nb, int *irsrc, int *icsrc, int *ictxt, int *lld, int *info );
    void pdlaprnt_( int *m, int *n, double *a, int *ia, int *ja, struct DESC *desca, int *irprnt,
        int *icprnt, const char *cmatnm, int *nout, double *work, int cmtnmlen );
    void pdgemm_( char *transa, char *transb, int *m, int *n, int *k, double *alpha,
         double *a, int *ia, int *ja, struct DESC *desca, double *b, int *ib, int *jb,
        struct DESC *descb, double *beta, double *c, int *ic, int *jc, struct DESC *descc );
}

void blacs_pinfo( int *p, int *P ) {
    blacs_pinfo_( p, P );
}

int blacs_get( int icontxt, int what ) {
    int val;
    blacs_get_( &icontxt, &what, &val );
    return val;
}

int blacs_gridinit( int icontxt, bool isColumnMajor, int nprow, int npcol ) {
    int newcontext = icontxt;
    char order = isColumnMajor ? 'C' : 'R';
    blacs_gridinit_( &newcontext, &order, &nprow, &npcol );
    return newcontext;
}

void blacs_gridinfo( int context, int nprow, int npcol, int *myrow, int *mycol ) {
    blacs_gridinfo_( &context, &nprow, &npcol, myrow, mycol );
}

void blacs_gridexit( int context ) {
    blacs_gridexit_( &context );
}

void blacs_exit( int code ) {
    blacs_exit_( &code );
}

int numroc( int n, int nb, int iproc, int isrcproc, int nprocs ) {
    return numroc_( &n, &nb, &iproc, &isrcproc, &nprocs );
}

void descinit( struct DESC *desc, int m, int n, int mb, int nb, int irsrc, int icsrc, int ictxt, int lld ) {
    int info;
    descinit_( desc, &m, &n, &mb, &nb, &irsrc, &icsrc, &ictxt, &lld, &info );
    if( info != 0 ) {
        throw runtime_error( "non zero info: " + toString( info ) );
    }
//    return info;
}

void pdlaprnt( int m, int n, double *A, int ia, int ja, struct DESC *desc, int irprnt,
    int icprnt, const char *cmatnm, int nout, double *work ) {
    int cmatnmlen = strlen(cmatnm);
    pdlaprnt_( &m, &n, A, &ia, &ja, desc, &irprnt, &icprnt, cmatnm, &nout, work, cmatnmlen );
}

void pdgemm( bool isTransA, bool isTransB, int m, int n, int k, double alpha,
     double *a, int ia, int ja, struct DESC *desca, double *b, int ib, int jb,
    struct DESC *descb, double beta, double *c, int ic, int jc, struct DESC *descc ) {
    char transa = isTransA ? 'T' : 'N';
    char transb = isTransB ? 'T' : 'N';
    pdgemm_( &transa, &transb, &m, &n, &k, &alpha, a, &ia, &ja, desca, b, &ib, &jb,
        descb, &beta, c, &ic, &jc, descc );
}

#include <iostream>
#include <cmath>
#include <algorithm>   // min, max
#include <mpi.h>       // MPI_Barrier, MPI_COMM_WORLD
using namespace std;

#include "cpputils.h"

#include "args.h"
#include "scalapack.h"

extern "C" {
    void openblas_set_num_threads(int num_threads);
}

int getRootFactor( int n ) {
    for( int t = sqrt(n); t > 0; t-- ) {
        if( n % t == 0 ) {
            return t;
        }
    }
    return 1;
}

// conventions:
// M_ by N_ matrix block-partitioned into MB_ by NB_ blocks, then
// distributed according to 2d block-cyclic scheme

// based on http://acts.nersc.gov/scalapack/hands-on/exercise3/pspblasdriver.f.html

int main( int argc, char *argv[] ) {
    int p, P;
    blacs_pinfo( &p, &P );
    mpi_print( toString(p) + " / " + toString(P) );

    int n;
    int numthreads;
    Args( argc, argv ).arg("N", &n ).arg("numthreads", &numthreads ).go();
    openblas_set_num_threads( numthreads );

    int nprows = getRootFactor(P);
    int npcols = P / nprows;
    if( p == 0 ) cout << "grid: " << nprows << " x " << npcols << endl;

    int system = blacs_get( -1, 0 );
    int grid = blacs_gridinit( system, true, nprows, npcols );
    if( p == 0 ) cout << "system context " << system << " grid context: " << grid << endl;

    int myrow, mycol;
    blacs_gridinfo( grid, nprows, npcols, &myrow, &mycol );
    mpi_print("grid, me: " + toString(myrow) + ", " + toString(mycol) );

    if( myrow >= nprows || mycol >= npcols ) {
        mpi_print("not needed, exiting");
        blacs_gridexit( grid );
        blacs_exit(0);
        exit(0);
    }

    // A     B       C
    // m x k k x n = m x n
    // nb: blocksize

    // nprows: process grid, number rows
    // npcols: process grid, number cols
    // myrow: process grid, our row
    // mycol: process grid, our col
    int m = n;
    int k = n;
//    int nb = min(n,128); // nb is column block size for A, and row blocks size for B
    int nb=min(n/P,128);

    int mp = numroc( m, nb, myrow, 0, nprows ); // mp number rows A owned by this process
    int kp = numroc( k, nb, myrow, 0, nprows ); // kp number rows B owned by this process
    int kq = numroc( k, nb, mycol, 0, npcols ); // kq number cols A owned by this process
    int nq = numroc( n, nb, mycol, 0, npcols ); // nq number cols B owned by this process
    mpi_print( "mp " + toString(mp) + " kp " + toString(kp) + " kq " + toString(kq) + " nq " + toString(nq) );

    struct DESC desca, descb, descc;
    descinit( (&desca), m, k, nb, nb, 0, 0, grid, max(1, mp) );
    descinit( (&descb), k, n, nb, nb, 0, 0, grid, max(1, kp) );
    descinit( (&descc), m, n, nb, nb, 0, 0, grid, max(1, mp) );
    mpi_print( "desca.LLD_ " + toString(desca.LLD_) + " kq " + toString(kq) );
    double *ipa = new double[desca.LLD_ * kq];
    double *ipb = new double[descb.LLD_ * nq];
    double *ipc = new double[descc.LLD_ * nq];

    for( int i = 0; i < desca.LLD_ * kq; i++ ) {
        ipa[i] = p;
    }
    for( int i = 0; i < descb.LLD_ * nq; i++ ) {
        ipb[i] = p;
    }

    if( p == 0 ) cout << "created matrices" << endl;
    double *work = new double[nb];
    if( n <=5 ) {
        pdlaprnt( n, n, ipa, 1, 1, &desca, 0, 0, "A", 6, work );
        pdlaprnt( n, n, ipb, 1, 1, &descb, 0, 0, "B", 6, work );
    }

    NanoTimer timer;
    pdgemm( false, false, m, n, k, 1,
                  ipa, 1, 1, &desca, ipb, 1, 1, &descb,
                  1, ipc, 1, 1, &descc );
    MPI_Barrier( MPI_COMM_WORLD );
    if( p == 0 ) timer.toc("pdgemm");

    blacs_gridexit( grid );
    blacs_exit(0);

    return 0;
}

Here is the output of Jack Poulson's elemental/examples/blas-like/Gemm.cpp program, run with OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 mpirun.mpich2 -hosts host3,host1,host2,host4 -np 48 ./ElementalExampleGemm --m 2000 --n 2000 --k 2000 --nb 128, i.e. 48 processes with 1 thread each:

g: 6 x 8
Sequential: 1.74586 secs and 9.16452 GFLops
Populate root node: 0.0340741 secs
Spread from root: 0.443471 secs
[MC,* ] AllGather: 0.0068028 secs, 50.2758 MB/s for 334 x 128 local matrix
[* ,MR] AllGather: 0.224466 secs, 1.14048 MB/s for 128 x 250 local matrix
Local gemm: 0.00301719 secs and 7.08474 GFlops for 334 x 250 x 128 product
[MC,* ] AllGather: 0.00582409 secs, 58.7244 MB/s for 334 x 128 local matrix
[* ,MR] AllGather: 0.221785 secs, 1.15427 MB/s for 128 x 250 local matrix
Local gemm: 0.00290704 secs and 7.35319 GFlops for 334 x 250 x 128 product
[MC,* ] AllGather: 0.00559711 secs, 61.1058 MB/s for 334 x 128 local matrix
[* ,MR] AllGather: 0.258585 secs, 0.990002 MB/s for 128 x 250 local matrix
Local gemm: 0.00293088 secs and 7.29337 GFlops for 334 x 250 x 128 product
[MC,* ] AllGather: 0.00562692 secs, 60.7821 MB/s for 334 x 128 local matrix
[* ,MR] AllGather: 0.266162 secs, 0.961821 MB/s for 128 x 250 local matrix
Local gemm: 0.00652504 secs and 3.276 GFlops for 334 x 250 x 128 product
[MC,* ] AllGather: 0.00574803 secs, 59.5014 MB/s for 334 x 128 local matrix
[* ,MR] AllGather: 0.253986 secs, 1.00793 MB/s for 128 x 250 local matrix
Local gemm: 0.00300407 secs and 7.11567 GFlops for 334 x 250 x 128 product
[MC,* ] AllGather: 0.00567889 secs, 60.2258 MB/s for 334 x 128 local matrix
[* ,MR] AllGather: 0.263011 secs, 0.973343 MB/s for 128 x 250 local matrix
Local gemm: 0.00289297 secs and 7.38894 GFlops for 334 x 250 x 128 product
[MC,* ] AllGather: 0.00561213 secs, 60.9422 MB/s for 334 x 128 local matrix
[* ,MR] AllGather: 0.0310259 secs, 8.25117 MB/s for 128 x 250 local matrix
Local gemm: 0.00288296 secs and 7.41461 GFlops for 334 x 250 x 128 product
[MC,* ] AllGather: 0.00552988 secs, 61.8487 MB/s for 334 x 128 local matrix
[* ,MR] AllGather: 0.229407 secs, 1.11592 MB/s for 128 x 250 local matrix
Local gemm: 0.00290298 secs and 7.36346 GFlops for 334 x 250 x 128 product
[MC,* ] AllGather: 0.00556993 secs, 61.4039 MB/s for 334 x 128 local matrix
[* ,MR] AllGather: 0.259156 secs, 0.987822 MB/s for 128 x 250 local matrix
Local gemm: 0.00277686 secs and 7.6979 GFlops for 334 x 250 x 128 product
[MC,* ] AllGather: 0.00564504 secs, 60.587 MB/s for 334 x 128 local matrix
[* ,MR] AllGather: 0.260839 secs, 0.981448 MB/s for 128 x 250 local matrix
Local gemm: 0.00277185 secs and 7.7118 GFlops for 334 x 250 x 128 product
[MC,* ] AllGather: 0.00549412 secs, 62.2513 MB/s for 334 x 128 local matrix
[* ,MR] AllGather: 0.224814 secs, 1.13872 MB/s for 128 x 250 local matrix
Local gemm: 0.00276208 secs and 7.7391 GFlops for 334 x 250 x 128 product
[MC,* ] AllGather: 0.00556684 secs, 61.4381 MB/s for 334 x 128 local matrix
[* ,MR] AllGather: 0.216236 secs, 1.18389 MB/s for 128 x 250 local matrix
Local gemm: 0.00276899 secs and 7.71977 GFlops for 334 x 250 x 128 product
[MC,* ] AllGather: 0.00551414 secs, 62.0252 MB/s for 334 x 128 local matrix
[* ,MR] AllGather: 0.22506 secs, 1.13747 MB/s for 128 x 250 local matrix
Local gemm: 0.00276208 secs and 7.7391 GFlops for 334 x 250 x 128 product
[MC,* ] AllGather: 0.005409 secs, 63.2309 MB/s for 334 x 128 local matrix
[* ,MR] AllGather: 0.255941 secs, 1.00023 MB/s for 128 x 250 local matrix
Local gemm: 0.00276995 secs and 7.71712 GFlops for 334 x 250 x 128 product
[MC,* ] AllGather: 0.00536704 secs, 63.7252 MB/s for 334 x 128 local matrix
[* ,MR] AllGather: 0.225583 secs, 1.13484 MB/s for 128 x 250 local matrix
Local gemm: 0.00295305 secs and 7.23861 GFlops for 334 x 250 x 128 product
[MC,* ] AllGather: 0.00358391 secs, 59.6444 MB/s for 334 x 80 local matrix
[* ,MR] AllGather: 0.251425 secs, 0.636373 MB/s for 80 x 250 local matrix
Local gemm: 0.00167489 secs and 7.97664 GFlops for 334 x 250 x 80 product
Distributed Gemm: 3.80641 secs
Gathered to root: 0.399101 secs

Edit: and the results for 4 MPI processes (on the 4 nodes of 12 cores each), run with mpirun.mpich2 -hosts host3,host1,host2,host4 -np 4 ./ElementalExampleGemm --m 2000 --n 2000 --k 2000 --nb 128, i.e. with OpenBLAS multithreading enabled:

g: 2 x 2
Sequential: 0.219173 secs and 73.0017 GFLops
Populate root node: 0.035708 secs
Spread from root: 0.482837 secs
[MC,* ] AllGather: 0.0147331 secs, 69.5035 MB/s for 1000 x 128 local matrix
[* ,MR] AllGather: 0.010592 secs, 96.6769 MB/s for 128 x 1000 local matrix
Local gemm: 0.00869703 secs and 29.4353 GFlops for 1000 x 1000 x 128 product
[MC,* ] AllGather: 0.00796413 secs, 128.576 MB/s for 1000 x 128 local matrix
[* ,MR] AllGather: 0.00883698 secs, 115.877 MB/s for 128 x 1000 local matrix
Local gemm: 0.00752282 secs and 34.0298 GFlops for 1000 x 1000 x 128 product
[MC,* ] AllGather: 0.00717402 secs, 142.737 MB/s for 1000 x 128 local matrix
[* ,MR] AllGather: 0.0083642 secs, 122.427 MB/s for 128 x 1000 local matrix
Local gemm: 0.00796413 secs and 32.1441 GFlops for 1000 x 1000 x 128 product
[MC,* ] AllGather: 0.00718212 secs, 142.576 MB/s for 1000 x 128 local matrix
[* ,MR] AllGather: 0.0080409 secs, 127.349 MB/s for 128 x 1000 local matrix
Local gemm: 0.00650787 secs and 39.337 GFlops for 1000 x 1000 x 128 product
[MC,* ] AllGather: 0.00641584 secs, 159.605 MB/s for 1000 x 128 local matrix
[* ,MR] AllGather: 0.00688195 secs, 148.795 MB/s for 128 x 1000 local matrix
Local gemm: 0.00576997 secs and 44.3677 GFlops for 1000 x 1000 x 128 product
[MC,* ] AllGather: 0.00597095 secs, 171.497 MB/s for 1000 x 128 local matrix
[* ,MR] AllGather: 0.00652695 secs, 156.888 MB/s for 128 x 1000 local matrix
Local gemm: 0.00652885 secs and 39.2106 GFlops for 1000 x 1000 x 128 product
[MC,* ] AllGather: 0.00575399 secs, 177.963 MB/s for 1000 x 128 local matrix
[* ,MR] AllGather: 0.00634193 secs, 161.465 MB/s for 128 x 1000 local matrix
Local gemm: 0.00574183 secs and 44.5851 GFlops for 1000 x 1000 x 128 product
[MC,* ] AllGather: 0.00563598 secs, 181.69 MB/s for 1000 x 128 local matrix
[* ,MR] AllGather: 0.00627708 secs, 163.133 MB/s for 128 x 1000 local matrix
Local gemm: 0.00576282 secs and 44.4227 GFlops for 1000 x 1000 x 128 product
[MC,* ] AllGather: 0.005867 secs, 174.535 MB/s for 1000 x 128 local matrix
[* ,MR] AllGather: 0.00619698 secs, 165.242 MB/s for 128 x 1000 local matrix
Local gemm: 0.01108 secs and 23.1046 GFlops for 1000 x 1000 x 128 product
[MC,* ] AllGather: 0.00589108 secs, 173.822 MB/s for 1000 x 128 local matrix
[* ,MR] AllGather: 0.00623584 secs, 164.212 MB/s for 128 x 1000 local matrix
Local gemm: 0.00559402 secs and 45.7632 GFlops for 1000 x 1000 x 128 product
[MC,* ] AllGather: 0.00580001 secs, 176.551 MB/s for 1000 x 128 local matrix
[* ,MR] AllGather: 0.00648689 secs, 157.857 MB/s for 128 x 1000 local matrix
Local gemm: 0.00570703 secs and 44.857 GFlops for 1000 x 1000 x 128 product
[MC,* ] AllGather: 0.00566101 secs, 180.886 MB/s for 1000 x 128 local matrix
[* ,MR] AllGather: 0.00638199 secs, 160.452 MB/s for 128 x 1000 local matrix
Local gemm: 0.00575209 secs and 44.5056 GFlops for 1000 x 1000 x 128 product
[MC,* ] AllGather: 0.00572801 secs, 178.771 MB/s for 1000 x 128 local matrix
[* ,MR] AllGather: 0.00630784 secs, 162.338 MB/s for 128 x 1000 local matrix
Local gemm: 0.005795 secs and 44.176 GFlops for 1000 x 1000 x 128 product
[MC,* ] AllGather: 0.00581098 secs, 176.218 MB/s for 1000 x 128 local matrix
[* ,MR] AllGather: 0.00644612 secs, 158.855 MB/s for 128 x 1000 local matrix
Local gemm: 0.00571203 secs and 44.8177 GFlops for 1000 x 1000 x 128 product
[MC,* ] AllGather: 0.00570321 secs, 179.548 MB/s for 1000 x 128 local matrix
[* ,MR] AllGather: 0.00653315 secs, 156.739 MB/s for 128 x 1000 local matrix
Local gemm: 0.00570703 secs and 44.857 GFlops for 1000 x 1000 x 128 product
[MC,* ] AllGather: 0.00371313 secs, 172.361 MB/s for 1000 x 80 local matrix
[* ,MR] AllGather: 0.00396085 secs, 161.582 MB/s for 80 x 1000 local matrix
Local gemm: 0.00396299 secs and 40.3735 GFlops for 1000 x 1000 x 80 product
Distributed Gemm: 0.327859 secs
Gathered to root: 0.237197 secs
2 Answers

If you are interested in a more precise account of where the time is spent in Gemm, you may want to take a look at the following example driver that I just created.

For example, mpirun -np 48 ./Gemm --m 2000 --n 2000 --k 2000 --nb 128 will run an instrumented version of your example code, which reports the local MB/s and GFlop/s achieved by the MPI_Allgather and dgemm calls that occur within each stage of the algorithm. If the local GFlop/s achieved within the distributed Gemm is much lower than that of the sequential Gemm, then you may want to look at your BLAS stack. If the AllGathers perform very poorly, then you may want to look at your MPI configuration. Roughly speaking, ScaLAPACK's pdgemm works in a manner similar to Elemental's, but it uses MPI_Bcast where Elemental uses MPI_Allgather, so you may want to measure the MB/s your machine achieves for MPI_Bcast versus MPI_Allgather for results of the same size.
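
If you want a quick way to compare the two collectives, a stand-alone micro-benchmark along the following lines could be used (a minimal sketch, not part of the instrumented driver; the message size and repetition count are arbitrary illustrative choices):

#include <mpi.h>
#include <cstdio>
#include <vector>

int main( int argc, char *argv[] ) {
    MPI_Init( &argc, &argv );
    int rank, size;
    MPI_Comm_rank( MPI_COMM_WORLD, &rank );
    MPI_Comm_size( MPI_COMM_WORLD, &size );

    const int perProc = 4096;          // doubles contributed by each process
    const int total = perProc*size;    // size of the gathered/broadcast result
    const int reps = 20;
    std::vector<double> sendBuf( perProc, rank ), recvBuf( total, 0. );

    // Allgather: each process contributes perProc doubles and every process
    // ends up with the full result of 'total' doubles (what Elemental uses).
    MPI_Barrier( MPI_COMM_WORLD );
    double t0 = MPI_Wtime();
    for( int i = 0; i < reps; ++i )
        MPI_Allgather( sendBuf.data(), perProc, MPI_DOUBLE,
                       recvBuf.data(), perProc, MPI_DOUBLE, MPI_COMM_WORLD );
    const double tAllgather = ( MPI_Wtime() - t0 ) / reps;

    // Bcast: the root sends a result of the same total size to every process
    // (what ScaLAPACK's pdgemm relies on).
    MPI_Barrier( MPI_COMM_WORLD );
    t0 = MPI_Wtime();
    for( int i = 0; i < reps; ++i )
        MPI_Bcast( recvBuf.data(), total, MPI_DOUBLE, 0, MPI_COMM_WORLD );
    const double tBcast = ( MPI_Wtime() - t0 ) / reps;

    if( rank == 0 ) {
        const double mb = total*sizeof(double)/1.e6;
        std::printf( "Allgather: %g secs, %g MB/s\n", tAllgather, mb/tAllgather );
        std::printf( "Bcast:     %g secs, %g MB/s\n", tBcast, mb/tBcast );
    }
    MPI_Finalize();
    return 0;
}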

My best guess is that your problem is simply too small. With MPI you can indeed throw more processors at your problem, but they have to exchange data over the comparatively slow external network. With matrices this small, you can use enough processors via MPI to push the execution time down to a very small number, but then the network communication time dominates, and that is what gets reported as the overall runtime.

To find out what this really means, you need to do a parameter study: run your tests with matrices of various sizes, say 100, 500, 1000, 2000, 4000, 1e4, and plot the runtimes. You will see that the different implementations behave differently as the problem gets larger and larger.
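
A minimal sketch of such a study, reusing the Elemental calls from the code in the question (an illustration only, not the original poster's code; it assumes the same Grid g, comm, and commRank variables are in scope):

// Time the distributed Gemm for a range of problem sizes so that the
// crossover against single-node multithreaded BLAS becomes visible.
const int sizes[] = { 100, 500, 1000, 2000, 4000, 10000 };
for( int i = 0; i < 6; ++i ) {
    const int n = sizes[i];
    DistMatrix<double> ADist(g), BDist(g), CDist(g);
    Uniform( ADist, n, n );
    Uniform( BDist, n, n );
    Zeros( CDist, n, n );
    mpi::Barrier( comm );
    Timer timer;
    if( commRank == 0 ) timer.Start();
    Gemm( NORMAL, NORMAL, 1., ADist, BDist, 0., CDist );
    mpi::Barrier( comm );
    if( commRank == 0 )
        std::cout << "n=" << n << ": " << timer.Stop() << " secs" << std::endl;
}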