如何使用 ScaLapack/PBLAS 进行矩阵向量乘法?

计算科学 拉帕克 布拉斯
2021-12-11 22:05:31

在浏览了 ScaLapack 的所有可能的“介绍”之后,我仍然无法理解如何PDGEMV使用它进行简单的操作。

这是我必须做的:

我必须使用生成矩阵

do i=1,n
  x(i) = i*i*1.0D+00
  do j=1,n
    A(i,j) = (i+j)*j*1.00D+00
  end do
end do

然后简单地相乘Matrix A by Vector x (b=Ax)

对于 Lapack,我只是使用 CALL DGEMV('N',n,n,1.0D+00,a,n,x,1,0.0D+00,b,1)

我如何在 PBLAS 中实现这一目标?我已经运行了示例(example1.f来自网站)程序并且它可以工作。

什么是矩阵的划分?我该怎么做?

每个人(在教程中)似乎都在专注于DGESV我想要的时候,DGEMV/DGEMMBLAS

2个回答

scalapack 需要的数据分布是块循环 分布(这里有一个小的交互式计算器)。一旦你掌握了窍门,Scalapack 就非常简单了。

对于给定的块大小(这是一个自由参数)并将处理器分解为网格(例如,4 个处理器 -> 2 行,2 列处理器),您可以从矩阵块的本地索引转到全局索引使用此 NERSC 示例中的 l2g 或 g2l 的矩阵索引这是一种非常简单的入门方式。行或列向量被视为 nx1 或 1xn 矩阵。

下面是对角矩阵 A(1,2,3...) 乘以单位向量 X 并将结果放入 Y 的示例;这里唯一真正的技巧是并非所有处理器都有一大块 X 或 Y 向量。

program gemv1
      use mpi
      implicit none

      integer :: n, nb    ! problem size and block size
      integer :: myArows, myAcols   ! size of local subset of global matrix
      integer :: myXrows, myXcols   ! size of local subset of global vector
      integer :: i,j, myi, myj
      real, dimension(:,:), allocatable :: myA,myX,myY
      integer :: ierr

      integer, external :: numroc   ! blacs routine
      integer :: me, procs, icontxt, prow, pcol, myrow, mycol  ! blacs data
      integer :: info    ! scalapack return value
      integer, dimension(9)   :: ides_a, ides_x, ides_y ! matrix descriptors
      integer, dimension(2) :: dims
      real :: error, globerror

! Initialize blacs processor grid

      call blacs_pinfo   (me,procs)

! create as square as possible a grid of processors

      dims = 0
      call MPI_Dims_create(procs, 2, dims, ierr)
      prow = dims(1)
      pcol = dims(2)

! create the BLACS context

      call blacs_get     (0, 0, icontxt)
      call blacs_gridinit(icontxt, 'R', prow, pcol)
      call blacs_gridinfo(icontxt, prow, pcol, myrow, mycol)

! Construct local arrays
! Global structure:  matrix A of n rows and n columns

      n = int(25000.*sqrt(dble(procs)))

! blocksize - a free parameter.

      nb = 100

! how big is "my" chunk of matrix A?

      myArows = numroc(n, nb, myrow, 0, prow)
      myAcols = numroc(n, nb, mycol, 0, pcol)

! how big is "my" chunk of vector x?

      myXrows = numroc(n, nb, myrow, 0, prow)
      myXcols = 1

! Initialize local arrays    

      allocate(myA(myArows,myAcols)) 
      allocate(myX(myXrows,myXcols)) 
      allocate(myY(myXrows,myXcols)) 

      myA = 0.
      do myj=1,myAcols
          ! get global index from local index
          call l2g(myj,mycol,n,pcol,nb,j)
          do myi=1,myArows
              ! get global index from local index
              call l2g(myi,myrow,n,prow,nb,i)
              if (i == j) myA(myi,myj) = i
          enddo
      enddo

      myX = 0.
      call l2g(1,mycol,n,pcol,nb,j)
      if (j == 1) then
          do myi=1,myXrows
              call l2g(myi,myrow,n,prow,nb,i)
              myX(myi,1) = 1.
          enddo
      endif

      myY = 0.

! Prepare array descriptors for ScaLAPACK 

      call descinit( ides_a, n, n, nb, nb, 0, 0, icontxt, myArows, info )
      call descinit( ides_x, n, 1, nb, nb, 0, 0, icontxt, myXrows, info )
      call descinit( ides_y, n, 1, nb, nb, 0, 0, icontxt, myXrows, info )

! Call ScaLAPACK library routine

      call psgemv('N',n,n,1.,mya,1,1,ides_a,myx,1,1,ides_x,1,0.,myy,1,1,ides_y,1)
      if (me == 0) then
        if (info /= 0) then
             print *, 'Error -- info = ', info
        endif
      endif

! Deallocate the local arrays

      deallocate(myA, myX)

! Print results - Y should be 1,2,3...

      error = 0.
      call l2g(1,mycol,n,pcol,nb,j)
      if (j == 1) then
          do myi=1,myXrows
              call l2g(myi,myrow,n,prow,nb,i)
              error = error + (myY(myi,1)-1.*i)**2.
          enddo
      endif

      call MPI_Reduce(error, globerror, 1, MPI_REAL, MPI_SUM, 0, MPI_COMM_WORLD, ierr)

      if (me == 0) then
        print *,'Y l2 error = ', sqrt(globerror/n)
      endif

      deallocate(myY)

! End blacs for processors that are used

      call blacs_gridexit(icontxt)
      call blacs_exit(0)

contains

! convert global index to local index in block-cyclic distribution

   subroutine g2l(i,n,np,nb,p,il)

   implicit none
   integer, intent(in) :: i    ! global array index, input
   integer, intent(in) :: n    ! global array dimension, input
   integer, intent(in) :: np   ! processor array dimension, input
   integer, intent(in) :: nb   ! block size, input
   integer, intent(out):: p    ! processor array index, output
   integer, intent(out):: il   ! local array index, output
   integer :: im1   

   im1 = i-1
   p   = mod((im1/nb),np)
   il  = (im1/(np*nb))*nb + mod(im1,nb) + 1

   return
   end subroutine g2l

! convert local index to global index in block-cyclic distribution

   subroutine l2g(il,p,n,np,nb,i)

   implicit none
   integer :: il   ! local array index, input
   integer :: p    ! processor array index, input
   integer :: n    ! global array dimension, input
   integer :: np   ! processor array dimension, input
   integer :: nb   ! block size, input
   integer :: i    ! global array index, output
   integer :: ilm1   

   ilm1 = il-1
   i    = (((ilm1/nb) * np) + p)*nb + mod(ilm1,nb) + 1

   return
   end subroutine l2g

end program gemv1