我有一个我想快速组装的矩阵,它是块状的:
# pragma omp parallel for
loop from k = 1 to N
v_k = buildv(k); // v_k and G_k are stored in thread-specific storage
G_k = v_k v_k^T; // using DSYR - this step is fast
loop i = 1 to 3 // this part is really SLOW
loop j = 1 to 3
alpha_ijk = ...; // this is fast
A[thread_id](i,j) += alpha_ijk * G_k; // I wrote a matrix DAXPY routine for this
sum A[thread_id] over all threads to build final A
当我写代码的时候,我以为计算 v_k 和 G_k 将是缓慢的部分,因为它们涉及计算(BLAS 调用)。但是令我惊讶的是,缓慢的部分只是将已经计算的结果复制到矩阵 A 中(即使每个线程都有自己的 A)。所以我想我需要专注于以某种方式并行化复制操作。有没有人有什么建议?
注意 N 相当大(大约 500 万),这就是为什么我专注于并行计算。
编辑
根据要求,我编写了一个最小的工作示例(可在此处获得)。我编译了下面的代码:
gcc -g -Wall -O2 -fopenmp -o matrix matrix.c -lm -lgsl -lcblas -latlas
请注意,运行此程序的先决条件是 GSL 库和 ATLAS BLAS - 尽管您可以将 ATLAS 替换为任何 cblas 库(例如 -lgslcblas)。
我添加了一个 #if 标志来启用/禁用矩阵复制操作,这是缓慢的部分。当矩阵被复制到更大的线程特定矩阵中时,我机器上的总运行时间是 81 秒。禁用此代码时,总运行时间为 4.1 秒。我的机器上有 24 个内核。
为简单起见,我使用随机数生成器来指定向量。
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <omp.h>
#include <sys/time.h>
#include <gsl/gsl_math.h>
#include <gsl/gsl_matrix.h>
#include <gsl/gsl_blas.h>
#include <gsl/gsl_rng.h>
/* Fill every element of v with an independent uniform draw from [a, b)
 * using the generator r.  Purely a test-fixture helper. */
static void
random_vector(const double a, const double b, gsl_vector * v, gsl_rng * r)
{
  const size_t n = v->size;
  const double span = b - a;
  size_t idx;

  for (idx = 0; idx < n; ++idx)
    gsl_vector_set(v, idx, span * gsl_rng_uniform(r) + a);
}
/* Y := Y + alpha * X for two equally-sized dense matrices (a matrix
 * analogue of BLAS daxpy).  X and Y may have different strides (tda).
 *
 * Returns GSL_SUCCESS, or GSL_EBADLEN (via GSL_ERROR) when the
 * dimensions of X and Y disagree.  alpha == 0 is a fast no-op. */
static int
matrix_daxpy(const double alpha, const gsl_matrix * X, gsl_matrix * Y)
{
  const size_t nrows = Y->size1;
  const size_t ncols = Y->size2;
  size_t i, j;

  if (X->size1 != nrows || X->size2 != ncols)
    {
      GSL_ERROR ("matrices must have same dimensions", GSL_EBADLEN);
    }

  if (alpha != 0.0)
    {
      for (i = 0; i < nrows; ++i)
        {
          /* walk each row through a pointer pair instead of recomputing
           * tda*i + j inside the inner loop */
          const double *xrow = X->data + i * X->tda;
          double *yrow = Y->data + i * Y->tda;

          for (j = 0; j < ncols; ++j)
            yrow[j] += alpha * xrow[j];
        }
    }

  return GSL_SUCCESS;
}
/*
 * Benchmark driver: accumulate N rank-1 updates GTG += v v^T and copy
 * the result into the blocks of a larger per-thread matrix A, in
 * parallel with OpenMP.  Only the parallel loop is timed; the block
 * copy guarded by "#if 1" is the part the question identifies as slow.
 */
int
main(int argc, char *argv[])
{
  const size_t max_threads = (size_t) omp_get_max_threads();
  const size_t N = 500000; /* my real N is around 5 million */
  const size_t m = 300;
  const size_t nblocks = 3; /* number of block matrices in A */
  const double a = -10.0;
  const double b = 10.0;
  gsl_vector **v = malloc(max_threads * sizeof(gsl_vector *));
  gsl_matrix **GTG = malloc(max_threads * sizeof(gsl_matrix *));
  gsl_rng **r = malloc(max_threads * sizeof(gsl_rng *));
  gsl_matrix **A = malloc(max_threads * sizeof(gsl_matrix *));
  size_t k;
  struct timeval tv0, tv1;

  (void) argc; /* unused; silences -Wall */
  (void) argv;

  /* fix: the original never checked these allocations */
  if (v == NULL || GTG == NULL || r == NULL || A == NULL)
    {
      fprintf(stderr, "out of memory\n");
      return EXIT_FAILURE;
    }

  /* fix: gsl_rng_default is only initialized by gsl_rng_env_setup();
   * the GSL manual requires calling it before gsl_rng_alloc(gsl_rng_default) */
  gsl_rng_env_setup();

  for (k = 0; k < max_threads; ++k)
    {
      v[k] = gsl_vector_alloc(m);
      GTG[k] = gsl_matrix_calloc(m, m);
      A[k] = gsl_matrix_calloc(nblocks * m, nblocks * m);
      r[k] = gsl_rng_alloc(gsl_rng_default);
      /* fix: give each per-thread generator a distinct seed; with the
       * shared default seed every thread drew the identical sequence */
      gsl_rng_set(r[k], (unsigned long) (k + 1));
    }

  gettimeofday(&tv0, NULL);

#pragma omp parallel for
  for (k = 0; k < N; ++k)
    {
      int thread_id = omp_get_thread_num();
      size_t i, j;

      /* construct v_k */
      random_vector(a, b, v[thread_id], r[thread_id]);

      /* GTG += v_k v_k^T (lower triangle only) */
      gsl_blas_dsyr(CblasLower, 1.0, v[thread_id], GTG[thread_id]);

      /* copy lower triangle to upper triangle
       * NOTE(review): ('L', 0) is the pre-2.6 GSL signature; GSL >= 2.6
       * takes CBLAS enums instead -- confirm against installed version */
      gsl_matrix_transpose_tricpy('L', 0, GTG[thread_id], GTG[thread_id]);

#if 1
      /* SLOW PART: now copy alpha_{ijk} v_k v_k^T into thread-specific final A */
      for (i = 0; i < nblocks; ++i)
        {
          for (j = 0; j <= i; ++j)
            {
              double alpha_ijk = 1.0; /* for simplicity */
              /* A_{ij} m-by-m block of the thread-local A */
              gsl_matrix_view Av =
                gsl_matrix_submatrix(A[thread_id], i * m, j * m, m, m);
              /* copy alpha_ijk v_k v_k^T into appropriate spot */
              matrix_daxpy(alpha_ijk, GTG[thread_id], &Av.matrix);
            }
        }
#endif
    }

  gettimeofday(&tv1, NULL);
  fprintf(stderr, "elapsed time = %.4f [sec]\n",
          (tv1.tv_sec + tv1.tv_usec * 1.0e-6) - (tv0.tv_sec + tv0.tv_usec * 1.0e-6));

  /* fix: release everything -- the original leaked all allocations */
  for (k = 0; k < max_threads; ++k)
    {
      gsl_vector_free(v[k]);
      gsl_matrix_free(GTG[k]);
      gsl_matrix_free(A[k]);
      gsl_rng_free(r[k]);
    }
  free(v);
  free(GTG);
  free(r);
  free(A);

  return EXIT_SUCCESS;
}