How can I get Elemental's Gemm to run fast?
I have the following code:
#include "elemental.hpp"
using namespace std;
using namespace elem;
extern "C" {
void openblas_set_num_threads(int num_threads);
}
int main( int argc, char *argv[] ) {
Initialize( argc, argv );
const mpi::Comm comm = mpi::COMM_WORLD;
const int commRank = mpi::CommRank( comm );
const int commSize = mpi::CommSize( comm );
try {
const int n = Input("--n","size of matrices",1000);
const int nb = Input("--nb","algorithmic blocksize",128);
int r = Input("--r","process grid height",0);
ProcessInput();
SetBlocksize( nb );
// If no process grid height was specified, try for a square
if( r == 0 )
r = Grid::FindFactor( commSize );
Grid g( comm, r );
Matrix<double> A, B, C;
Uniform( A, n, n );
Uniform( B, n, n );
Uniform( C, n, n );
mpi::Barrier( comm );
Timer timer;
if( commRank == 0 ) {
timer.Start();
Gemm( NORMAL, NORMAL, 1., A, B, 0., C );
std::cout << "Multithreaded time: " << timer.Stop() << " secs"
<< std::endl;
openblas_set_num_threads(1);
timer.Start();
Gemm( NORMAL, NORMAL, 1., A, B, 0., C );
std::cout << "Sequential time: " << timer.Stop() << " secs"
<< std::endl;
}
openblas_set_num_threads(1);
if( commRank == 0 )
timer.Start();
DistMatrix<double,CIRC,CIRC> ARoot(n,n,g), BRoot(n,n,g);
if( commRank == 0 ) {
ARoot.Matrix() = A;
BRoot.Matrix() = B;
std::cout << "Population time: " << timer.Stop() << " secs"
<< std::endl;
}
if( commRank == 0 )
timer.Start();
DistMatrix<double> ADist( ARoot ), BDist( BRoot ), CDist(g);
Zeros( CDist, n, n );
mpi::Barrier( comm );
if( commRank == 0 )
std::cout << "Scatter from root: " << timer.Stop() << " secs"
<< std::endl;
if( commRank == 0 )
timer.Start();
Gemm( NORMAL, NORMAL, 1., ADist, BDist, 0., CDist );
mpi::Barrier( comm );
if( commRank == 0 )
std::cout << "Distributed: " << timer.Stop() << " secs"
<< std::endl;

        if( commRank == 0 )
            timer.Start();   // restart the timer, otherwise the gather measurement also includes the Gemm above
        DistMatrix<double,CIRC,CIRC> CRoot( CDist );
        mpi::Barrier( comm );
        if( commRank == 0 )
            std::cout << "Gather to root: " << timer.Stop() << " secs"
                      << std::endl;
    } catch( std::exception& e ) { ReportException(e); }
    Finalize();
    return 0;
}
I am running on 4 nodes, each with 12 cores. The BLAS is OpenBLAS, built from source, so it supports 12 threads, and affinity is turned off. For gemm with n=4000 I get the following results:
- multithreaded BLAS, via Elemental Matrix: 1.6 secs
- single-threaded BLAS, via Elemental Matrix: 11.9 secs
- DistMatrix, 1 MPI process, single-threaded BLAS: 13.3 secs
- DistMatrix, 4 MPI processes, single-threaded BLAS: 6.6 secs
- DistMatrix, 48 MPI processes, single-threaded BLAS: 30.2 secs
I also tried multithreaded BLAS with DistMatrix, by commenting out the openblas_set_num_threads(1) lines in the code above, and got:
- DistMatrix, 4 MPI processes, multithreaded BLAS: 4.5 secs
Why are the DistMatrix times I get across 4 nodes not competitive with the multithreaded BLAS time on a single compute node? (A rough GFLOP/s tally of these numbers is sketched just below.)
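To put those timings in perspective, converting them to GFLOP/s is just arithmetic: a dense n x n x n gemm costs about 2*n^3 flops, so n=4000 is roughly 1.28e11 flops. The little program below only restates the timings listed above; nothing in it comes from Elemental itself:

#include <cstdio>

// Convert the measured gemm times above into GFLOP/s.
// A dense n x n x n gemm costs roughly 2*n^3 floating point operations.
int main() {
    const double n = 4000.0;
    const double flops = 2.0 * n * n * n;          // ~1.28e11 flops
    const double secs[]  = { 1.6, 11.9, 13.3, 6.6, 30.2, 4.5 };
    const char  *label[] = {
        "multithreaded BLAS, Matrix",
        "single-threaded BLAS, Matrix",
        "DistMatrix, 1 process, 1 thread",
        "DistMatrix, 4 processes, 1 thread",
        "DistMatrix, 48 processes, 1 thread",
        "DistMatrix, 4 processes, multithreaded BLAS" };
    for( int i = 0; i < 6; ++i )
        std::printf( "%-45s %6.1f GFLOP/s\n", label[i], flops / secs[i] / 1e9 );
    return 0;
}

In other words, one node with threaded OpenBLAS reaches about 80 GFLOP/s, while the 48-process single-threaded run gets only about 4 GFLOP/s out of 48 cores.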
Edit: I also tried distributing A as [MC,STAR] and B as [STAR,MR], following page 27 of Jack Poulson's 2012 "Rethinking Distributed Dense Linear Algebra", but the results were no better for me (a sketch of what I mean is below the timings):
- DistMatrix, 4 MPI processes, multithreaded BLAS: 6.2 secs
- DistMatrix, 48 MPI processes, single-threaded BLAS: 35.3 secs
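Concretely, what I tried looks roughly like the sketch below (written against the ADist/BDist/CDist variables from the code above, and assuming Elemental's redistribution-by-assignment and its LocalGemm routine; this is an illustration, not my exact code):

// Redistribute the [MC,MR] operands once; then C[MC,MR] := A[MC,*] B[*,MR]
// needs no further communication, just one local gemm per process.
DistMatrix<double,MC,STAR> A_MC_STAR( g );
DistMatrix<double,STAR,MR> B_STAR_MR( g );
A_MC_STAR = ADist;   // [MC,MR] -> [MC,* ]
B_STAR_MR = BDist;   // [MC,MR] -> [* ,MR]
LocalGemm( NORMAL, NORMAL, 1., A_MC_STAR, B_STAR_MR, 0., CDist );

The [MC,*] and [*,MR] copies are where the AllGathers happen, so in principle this mainly moves the communication up front rather than eliminating it.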
Edit 2: I added code to make sure nothing is being optimized away, namely a readMatrix function and a cout << sum << endl at the end, but the timings did not change (a sketch of the sum check is below the timings):
- single-threaded BLAS: 12.2 secs
- multithreaded BLAS: 1.6 secs
- DistMatrix, 4 nodes, 4 processes, multithreaded BLAS: 4.5 secs
- DistMatrix, 4 nodes, 48 processes, single-threaded BLAS: 29.8 secs
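The sum check is nothing fancier than reading back every entry of the result so that the multiply cannot be elided; something along these lines (readMatrix is my own helper and not shown here; the loop below is only an illustrative sketch over the local Matrix C):

// Touch every entry of C and print the total, so the Gemm cannot be optimized away.
double sum = 0;
for( int j = 0; j < C.Width(); ++j )
    for( int i = 0; i < C.Height(); ++i )
        sum += C.Get( i, j );
cout << sum << endl;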
Edit 3: Note that scalapack's pdgemm also seems to run slower than multithreaded BLAS for me, though faster than Elemental:
- scalapack pdgemm, 4 nodes, 4 processes, multithreaded BLAS: 2.11 secs
- scalapack pdgemm, 4 nodes, 48 processes, single-threaded BLAS: 29.1 secs
Note that for both scalapack and Elemental, using one MPI process per node with OpenBLAS multithreading turned on seems to be faster than using one MPI process per core with OpenBLAS multithreading turned off. Does that make sense because the shared memory within a node can be exploited, reducing the amount of communication needed? (Some back-of-the-envelope panel sizes for the two configurations are sketched below.)
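To make that intuition concrete, here are the per-panel sizes I would expect for Elemental's gemm on the two process grids that show up in the logs further down (n=2000, nb=128). The helper below is purely illustrative; it just reproduces the "local matrix" sizes reported in those logs, ignoring the narrower final panel:

#include <cstdio>

// Per-panel AllGather sizes for Elemental's gemm on a pr x pc process grid.
static void panels( const char *name, int n, int nb, int pr, int pc ) {
    const int mLocal = ( n + pr - 1 ) / pr;    // rows of A owned per process
    const int nLocal = ( n + pc - 1 ) / pc;    // cols of B owned per process
    std::printf( "%s: [MC,* ] panel %d x %d, [* ,MR] panel %d x %d per process\n",
                 name, mLocal, nb, nb, nLocal );
}

int main() {
    panels( " 4 procs, 2x2 grid", 2000, 128, 2, 2 );   // 1000 x 128 and 128 x 1000
    panels( "48 procs, 6x8 grid", 2000, 128, 6, 8 );   // 334 x 128 and 128 x 250
    return 0;
}

With only 4 processes each panel stays large (good for the local gemm) and every AllGather involves just two processes, while with 48 processes the same data moves in many more, much smaller pieces across all four nodes.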
The code for the scalapack test:
// scalapack.h: thin C++ wrappers around the Fortran BLACS / PBLAS interface.
#pragma once

#include <cstring>    // std::strlen
#include <stdexcept>  // std::runtime_error
// Note: toString(...) presumably comes from cpputils.h / args.h, included before this header.

extern "C" {
    struct DESC {
        int DTYPE_;
        int CTXT_;
        int M_;
        int N_;
        int MB_;
        int NB_;
        int RSRC_;
        int CSRC_;
        int LLD_;
    };
    void blacs_pinfo_( int *iam, int *nprocs );
    void blacs_get_( int *icontxt, int *what, int *val );
    void blacs_gridinit_( int *icontxt, char *order, int *nprow, int *npcol );
    void blacs_gridinfo_( int *context, int *nprow, int *npcol, int *myrow, int *mycol );
    void blacs_gridexit_( int *context );
    void blacs_exit_( int *code );
    int numroc_( int *n, int *nb, int *iproc, int *isrcproc, int *nprocs );
    void descinit_( struct DESC *desc, int *m, int *n, int *mb, int *nb, int *irsrc, int *icsrc, int *ictxt, int *lld, int *info );
    void pdlaprnt_( int *m, int *n, double *a, int *ia, int *ja, struct DESC *desca, int *irprnt,
                    int *icprnt, const char *cmatnm, int *nout, double *work, int cmtnmlen );
    void pdgemm_( char *transa, char *transb, int *m, int *n, int *k, double *alpha,
                  double *a, int *ia, int *ja, struct DESC *desca, double *b, int *ib, int *jb,
                  struct DESC *descb, double *beta, double *c, int *ic, int *jc, struct DESC *descc );
}

void blacs_pinfo( int *p, int *P ) {
    blacs_pinfo_( p, P );
}
int blacs_get( int icontxt, int what ) {
    int val;
    blacs_get_( &icontxt, &what, &val );
    return val;
}
int blacs_gridinit( int icontxt, bool isColumnMajor, int nprow, int npcol ) {
    int newcontext = icontxt;
    char order = isColumnMajor ? 'C' : 'R';
    blacs_gridinit_( &newcontext, &order, &nprow, &npcol );
    return newcontext;
}
void blacs_gridinfo( int context, int nprow, int npcol, int *myrow, int *mycol ) {
    blacs_gridinfo_( &context, &nprow, &npcol, myrow, mycol );
}
void blacs_gridexit( int context ) {
    blacs_gridexit_( &context );
}
void blacs_exit( int code ) {
    blacs_exit_( &code );
}
int numroc( int n, int nb, int iproc, int isrcproc, int nprocs ) {
    return numroc_( &n, &nb, &iproc, &isrcproc, &nprocs );
}
void descinit( struct DESC *desc, int m, int n, int mb, int nb, int irsrc, int icsrc, int ictxt, int lld ) {
    int info;
    descinit_( desc, &m, &n, &mb, &nb, &irsrc, &icsrc, &ictxt, &lld, &info );
    if( info != 0 ) {
        throw std::runtime_error( "non zero info: " + toString( info ) );
    }
    // return info;
}
void pdlaprnt( int m, int n, double *A, int ia, int ja, struct DESC *desc, int irprnt,
               int icprnt, const char *cmatnm, int nout, double *work ) {
    int cmatnmlen = std::strlen(cmatnm);
    pdlaprnt_( &m, &n, A, &ia, &ja, desc, &irprnt, &icprnt, cmatnm, &nout, work, cmatnmlen );
}
void pdgemm( bool isTransA, bool isTransB, int m, int n, int k, double alpha,
             double *a, int ia, int ja, struct DESC *desca, double *b, int ib, int jb,
             struct DESC *descb, double beta, double *c, int ic, int jc, struct DESC *descc ) {
    char transa = isTransA ? 'T' : 'N';
    char transb = isTransB ? 'T' : 'N';
    pdgemm_( &transa, &transb, &m, &n, &k, &alpha, a, &ia, &ja, desca, b, &ib, &jb,
             descb, &beta, c, &ic, &jc, descc );
}
// Main scalapack pdgemm test program.
#include <iostream>
#include <cmath>
#include <mpi.h>     // MPI_Barrier / MPI_COMM_WORLD
using namespace std;

#include "cpputils.h"
#include "args.h"
#include "scalapack.h"

extern "C" {
    void openblas_set_num_threads(int num_threads);
}

// Largest factor of n that is <= sqrt(n), used to pick a near-square process grid.
int getRootFactor( int n ) {
    for( int t = sqrt(n); t > 0; t-- ) {
        if( n % t == 0 ) {
            return t;
        }
    }
    return 1;
}

// conventions:
// M_ by N_ matrix block-partitioned into MB_ by NB_ blocks, then
// distributed according to 2d block-cyclic scheme
// based on http://acts.nersc.gov/scalapack/hands-on/exercise3/pspblasdriver.f.html
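// For example (illustrative note, not part of the original driver): with 0-based
// block indices, global block (ib, jb) of size MB_ x NB_ is owned by process row
// (RSRC_ + ib) % nprow and process column (CSRC_ + jb) % npcol, and numroc_
// reports how many rows (or columns) of the global matrix land on a given
// process row (or column) under that cyclic assignment.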
int main( int argc, char *argv[] ) {
    int p, P;
    blacs_pinfo( &p, &P );
    mpi_print( toString(p) + " / " + toString(P) );

    int n;
    int numthreads;
    Args( argc, argv ).arg("N", &n ).arg("numthreads", &numthreads ).go();
    openblas_set_num_threads( numthreads );

    int nprows = getRootFactor(P);
    int npcols = P / nprows;
    if( p == 0 ) cout << "grid: " << nprows << " x " << npcols << endl;

    int system = blacs_get( -1, 0 );
    int grid = blacs_gridinit( system, true, nprows, npcols );
    if( p == 0 ) cout << "system context " << system << " grid context: " << grid << endl;

    int myrow, mycol;
    blacs_gridinfo( grid, nprows, npcols, &myrow, &mycol );
    mpi_print("grid, me: " + toString(myrow) + ", " + toString(mycol) );
    if( myrow >= nprows || mycol >= npcols ) {
        mpi_print("not needed, exiting");
        blacs_gridexit( grid );
        blacs_exit(0);
        exit(0);
    }

    // A       B       C
    // m x k   k x n = m x n
    //
    // nb:     blocksize
    // nprows: process grid, number rows
    // npcols: process grid, number cols
    // myrow:  process grid, our row
    // mycol:  process grid, our col
    int m = n;
    int k = n;
    // int nb = min(n,128); // nb is column block size for A, and row block size for B
    int nb = min( n/P, 128 );
    int mp = numroc( m, nb, myrow, 0, nprows ); // mp: number of rows of A owned by this process
    int kp = numroc( k, nb, myrow, 0, nprows ); // kp: number of rows of B owned by this process
    int kq = numroc( k, nb, mycol, 0, npcols ); // kq: number of cols of A owned by this process
    int nq = numroc( n, nb, mycol, 0, npcols ); // nq: number of cols of B owned by this process
    mpi_print( "mp " + toString(mp) + " kp " + toString(kp) + " kq " + toString(kq) + " nq " + toString(nq) );

    struct DESC desca, descb, descc;
    descinit( &desca, m, k, nb, nb, 0, 0, grid, max(1, mp) );
    descinit( &descb, k, n, nb, nb, 0, 0, grid, max(1, kp) );
    descinit( &descc, m, n, nb, nb, 0, 0, grid, max(1, mp) );
    mpi_print( "desca.LLD_ " + toString(desca.LLD_) + " kq " + toString(kq) );

    double *ipa = new double[desca.LLD_ * kq];
    double *ipb = new double[descb.LLD_ * nq];
    double *ipc = new double[descc.LLD_ * nq](); // zero-initialized, since pdgemm below is called with beta=1
    for( int i = 0; i < desca.LLD_ * kq; i++ ) {
        ipa[i] = p;
    }
    for( int i = 0; i < descb.LLD_ * nq; i++ ) {
        ipb[i] = p;
    }
    if( p == 0 ) cout << "created matrices" << endl;

    double *work = new double[nb];
    if( n <= 5 ) {
        pdlaprnt( n, n, ipa, 1, 1, &desca, 0, 0, "A", 6, work );
        pdlaprnt( n, n, ipb, 1, 1, &descb, 0, 0, "B", 6, work );
    }

    NanoTimer timer;
    pdgemm( false, false, m, n, k, 1,
            ipa, 1, 1, &desca, ipb, 1, 1, &descb,
            1, ipc, 1, 1, &descc );
    MPI_Barrier( MPI_COMM_WORLD );
    if( p == 0 ) timer.toc("pdgemm");

    blacs_gridexit( grid );
    blacs_exit(0);
    return 0;
}
This is the output of Jack Poulson's elemental/examples/blas-like/Gemm.cpp program, run with OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 mpirun.mpich2 -hosts host3,host1,host2,host4 48 ./ElementalExampleGemm --m 2000 --n 2000 --k 2000 --nb 128, i.e. 48 processes with 1 thread each:
g: 6 x 8
Sequential: 1.74586 secs and 9.16452 GFLops
Populate root node: 0.0340741 secs
Spread from root: 0.443471 secs
[MC,* ] AllGather: 0.0068028 secs, 50.2758 MB/s for 334 x 128 local matrix
[* ,MR] AllGather: 0.224466 secs, 1.14048 MB/s for 128 x 250 local matrix
Local gemm: 0.00301719 secs and 7.08474 GFlops for 334 x 250 x 128 product
[MC,* ] AllGather: 0.00582409 secs, 58.7244 MB/s for 334 x 128 local matrix
[* ,MR] AllGather: 0.221785 secs, 1.15427 MB/s for 128 x 250 local matrix
Local gemm: 0.00290704 secs and 7.35319 GFlops for 334 x 250 x 128 product
[MC,* ] AllGather: 0.00559711 secs, 61.1058 MB/s for 334 x 128 local matrix
[* ,MR] AllGather: 0.258585 secs, 0.990002 MB/s for 128 x 250 local matrix
Local gemm: 0.00293088 secs and 7.29337 GFlops for 334 x 250 x 128 product
[MC,* ] AllGather: 0.00562692 secs, 60.7821 MB/s for 334 x 128 local matrix
[* ,MR] AllGather: 0.266162 secs, 0.961821 MB/s for 128 x 250 local matrix
Local gemm: 0.00652504 secs and 3.276 GFlops for 334 x 250 x 128 product
[MC,* ] AllGather: 0.00574803 secs, 59.5014 MB/s for 334 x 128 local matrix
[* ,MR] AllGather: 0.253986 secs, 1.00793 MB/s for 128 x 250 local matrix
Local gemm: 0.00300407 secs and 7.11567 GFlops for 334 x 250 x 128 product
[MC,* ] AllGather: 0.00567889 secs, 60.2258 MB/s for 334 x 128 local matrix
[* ,MR] AllGather: 0.263011 secs, 0.973343 MB/s for 128 x 250 local matrix
Local gemm: 0.00289297 secs and 7.38894 GFlops for 334 x 250 x 128 product
[MC,* ] AllGather: 0.00561213 secs, 60.9422 MB/s for 334 x 128 local matrix
[* ,MR] AllGather: 0.0310259 secs, 8.25117 MB/s for 128 x 250 local matrix
Local gemm: 0.00288296 secs and 7.41461 GFlops for 334 x 250 x 128 product
[MC,* ] AllGather: 0.00552988 secs, 61.8487 MB/s for 334 x 128 local matrix
[* ,MR] AllGather: 0.229407 secs, 1.11592 MB/s for 128 x 250 local matrix
Local gemm: 0.00290298 secs and 7.36346 GFlops for 334 x 250 x 128 product
[MC,* ] AllGather: 0.00556993 secs, 61.4039 MB/s for 334 x 128 local matrix
[* ,MR] AllGather: 0.259156 secs, 0.987822 MB/s for 128 x 250 local matrix
Local gemm: 0.00277686 secs and 7.6979 GFlops for 334 x 250 x 128 product
[MC,* ] AllGather: 0.00564504 secs, 60.587 MB/s for 334 x 128 local matrix
[* ,MR] AllGather: 0.260839 secs, 0.981448 MB/s for 128 x 250 local matrix
Local gemm: 0.00277185 secs and 7.7118 GFlops for 334 x 250 x 128 product
[MC,* ] AllGather: 0.00549412 secs, 62.2513 MB/s for 334 x 128 local matrix
[* ,MR] AllGather: 0.224814 secs, 1.13872 MB/s for 128 x 250 local matrix
Local gemm: 0.00276208 secs and 7.7391 GFlops for 334 x 250 x 128 product
[MC,* ] AllGather: 0.00556684 secs, 61.4381 MB/s for 334 x 128 local matrix
[* ,MR] AllGather: 0.216236 secs, 1.18389 MB/s for 128 x 250 local matrix
Local gemm: 0.00276899 secs and 7.71977 GFlops for 334 x 250 x 128 product
[MC,* ] AllGather: 0.00551414 secs, 62.0252 MB/s for 334 x 128 local matrix
[* ,MR] AllGather: 0.22506 secs, 1.13747 MB/s for 128 x 250 local matrix
Local gemm: 0.00276208 secs and 7.7391 GFlops for 334 x 250 x 128 product
[MC,* ] AllGather: 0.005409 secs, 63.2309 MB/s for 334 x 128 local matrix
[* ,MR] AllGather: 0.255941 secs, 1.00023 MB/s for 128 x 250 local matrix
Local gemm: 0.00276995 secs and 7.71712 GFlops for 334 x 250 x 128 product
[MC,* ] AllGather: 0.00536704 secs, 63.7252 MB/s for 334 x 128 local matrix
[* ,MR] AllGather: 0.225583 secs, 1.13484 MB/s for 128 x 250 local matrix
Local gemm: 0.00295305 secs and 7.23861 GFlops for 334 x 250 x 128 product
[MC,* ] AllGather: 0.00358391 secs, 59.6444 MB/s for 334 x 80 local matrix
[* ,MR] AllGather: 0.251425 secs, 0.636373 MB/s for 80 x 250 local matrix
Local gemm: 0.00167489 secs and 7.97664 GFlops for 334 x 250 x 80 product
Distributed Gemm: 3.80641 secs
Gathered to root: 0.399101 secs
Edit: and here are the results for 4 MPI processes (on the 4 twelve-core nodes), run with mpirun.mpich2 -hosts host3,host1,host2,host4 -np 4 ./ElementalExampleGemm --m 2000 --n 2000 --k 2000 --nb 128, i.e. with OpenBLAS multithreading active:
g: 2 x 2
Sequential: 0.219173 secs and 73.0017 GFLops
Populate root node: 0.035708 secs
Spread from root: 0.482837 secs
[MC,* ] AllGather: 0.0147331 secs, 69.5035 MB/s for 1000 x 128 local matrix
[* ,MR] AllGather: 0.010592 secs, 96.6769 MB/s for 128 x 1000 local matrix
Local gemm: 0.00869703 secs and 29.4353 GFlops for 1000 x 1000 x 128 product
[MC,* ] AllGather: 0.00796413 secs, 128.576 MB/s for 1000 x 128 local matrix
[* ,MR] AllGather: 0.00883698 secs, 115.877 MB/s for 128 x 1000 local matrix
Local gemm: 0.00752282 secs and 34.0298 GFlops for 1000 x 1000 x 128 product
[MC,* ] AllGather: 0.00717402 secs, 142.737 MB/s for 1000 x 128 local matrix
[* ,MR] AllGather: 0.0083642 secs, 122.427 MB/s for 128 x 1000 local matrix
Local gemm: 0.00796413 secs and 32.1441 GFlops for 1000 x 1000 x 128 product
[MC,* ] AllGather: 0.00718212 secs, 142.576 MB/s for 1000 x 128 local matrix
[* ,MR] AllGather: 0.0080409 secs, 127.349 MB/s for 128 x 1000 local matrix
Local gemm: 0.00650787 secs and 39.337 GFlops for 1000 x 1000 x 128 product
[MC,* ] AllGather: 0.00641584 secs, 159.605 MB/s for 1000 x 128 local matrix
[* ,MR] AllGather: 0.00688195 secs, 148.795 MB/s for 128 x 1000 local matrix
Local gemm: 0.00576997 secs and 44.3677 GFlops for 1000 x 1000 x 128 product
[MC,* ] AllGather: 0.00597095 secs, 171.497 MB/s for 1000 x 128 local matrix
[* ,MR] AllGather: 0.00652695 secs, 156.888 MB/s for 128 x 1000 local matrix
Local gemm: 0.00652885 secs and 39.2106 GFlops for 1000 x 1000 x 128 product
[MC,* ] AllGather: 0.00575399 secs, 177.963 MB/s for 1000 x 128 local matrix
[* ,MR] AllGather: 0.00634193 secs, 161.465 MB/s for 128 x 1000 local matrix
Local gemm: 0.00574183 secs and 44.5851 GFlops for 1000 x 1000 x 128 product
[MC,* ] AllGather: 0.00563598 secs, 181.69 MB/s for 1000 x 128 local matrix
[* ,MR] AllGather: 0.00627708 secs, 163.133 MB/s for 128 x 1000 local matrix
Local gemm: 0.00576282 secs and 44.4227 GFlops for 1000 x 1000 x 128 product
[MC,* ] AllGather: 0.005867 secs, 174.535 MB/s for 1000 x 128 local matrix
[* ,MR] AllGather: 0.00619698 secs, 165.242 MB/s for 128 x 1000 local matrix
Local gemm: 0.01108 secs and 23.1046 GFlops for 1000 x 1000 x 128 product
[MC,* ] AllGather: 0.00589108 secs, 173.822 MB/s for 1000 x 128 local matrix
[* ,MR] AllGather: 0.00623584 secs, 164.212 MB/s for 128 x 1000 local matrix
Local gemm: 0.00559402 secs and 45.7632 GFlops for 1000 x 1000 x 128 product
[MC,* ] AllGather: 0.00580001 secs, 176.551 MB/s for 1000 x 128 local matrix
[* ,MR] AllGather: 0.00648689 secs, 157.857 MB/s for 128 x 1000 local matrix
Local gemm: 0.00570703 secs and 44.857 GFlops for 1000 x 1000 x 128 product
[MC,* ] AllGather: 0.00566101 secs, 180.886 MB/s for 1000 x 128 local matrix
[* ,MR] AllGather: 0.00638199 secs, 160.452 MB/s for 128 x 1000 local matrix
Local gemm: 0.00575209 secs and 44.5056 GFlops for 1000 x 1000 x 128 product
[MC,* ] AllGather: 0.00572801 secs, 178.771 MB/s for 1000 x 128 local matrix
[* ,MR] AllGather: 0.00630784 secs, 162.338 MB/s for 128 x 1000 local matrix
Local gemm: 0.005795 secs and 44.176 GFlops for 1000 x 1000 x 128 product
[MC,* ] AllGather: 0.00581098 secs, 176.218 MB/s for 1000 x 128 local matrix
[* ,MR] AllGather: 0.00644612 secs, 158.855 MB/s for 128 x 1000 local matrix
Local gemm: 0.00571203 secs and 44.8177 GFlops for 1000 x 1000 x 128 product
[MC,* ] AllGather: 0.00570321 secs, 179.548 MB/s for 1000 x 128 local matrix
[* ,MR] AllGather: 0.00653315 secs, 156.739 MB/s for 128 x 1000 local matrix
Local gemm: 0.00570703 secs and 44.857 GFlops for 1000 x 1000 x 128 product
[MC,* ] AllGather: 0.00371313 secs, 172.361 MB/s for 1000 x 80 local matrix
[* ,MR] AllGather: 0.00396085 secs, 161.582 MB/s for 80 x 1000 local matrix
Local gemm: 0.00396299 secs and 40.3735 GFlops for 1000 x 1000 x 80 product
Distributed Gemm: 0.327859 secs
Gathered to root: 0.237197 secs
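For what it's worth, here is a rough tally of the per-panel lines in the two logs above (16 panels for k=2000 with nb=128). The totals below are approximate sums read off the output by hand, so treat this as a sketch:

#include <cstdio>

// Approximate totals summed from the per-panel lines of the two logs above.
struct Run {
    const char *name;
    double mcAllGather;   // total time in [MC,* ] AllGathers
    double mrAllGather;   // total time in [* ,MR] AllGathers
    double localGemm;     // total time in local gemms
    double distributed;   // the reported "Distributed Gemm" time
};

int main() {
    const Run runs[] = {
        { "48 procs, 6x8 grid, 1 thread each ", 0.09, 3.67, 0.05, 3.81 },
        { " 4 procs, 2x2 grid, 12 threads each", 0.11, 0.11, 0.10, 0.33 },
    };
    for( const Run &r : runs )
        std::printf( "%s: %.2f + %.2f s communicating, %.2f s computing, of %.2f s total\n",
                     r.name, r.mcAllGather, r.mrAllGather, r.localGemm, r.distributed );
    return 0;
}

So in the 48-process run nearly all of the 3.8 s goes into the [* ,MR] AllGathers (which the log reports at roughly 1 MB/s), while in the 4-process run communication and local gemm time are about the same size.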