我正在尝试编写一个矩阵乘法例程,因为我需要在其中进行一些分析,CUDA
并且我想用 CPU 代码对其进行验证。我正在尝试dgemm
这样做,但我似乎做错了。
我的矩阵被构造为1D
数组,因为在内核中完成了同样的事情CUDA
,所以我试图dgemm
用1D
矩阵调用。
当我使用随机数作为输入矩阵时,我得到:
A = [ ...
0.840187717154710 0.394382926819093 0.783099223758606
0.798440033476073 0.911647357936784 0.197551369293384
0.335222755714889 0.768229594811904 0.277774710803188
];
B = [ ...
0.553969955795431 0.477397051862160 0.628870924761924
0.364784472791843 0.513400910195616 0.952229725174713
0.916195068003701 0.635711727959901 0.717296929432683
];
myC = [ ...
1.057423514989923 1.136811309272234 0.702808322919262
1.035616345918396 1.343436407318801 0.651590826816703
1.517807789358121 1.491925339118482 1.042304316032681
];
作为我的输出,正如我发现的那样,它实际上是B*A
. 这种情况始终如一地发生 - 当我尝试将平方矩阵相乘时A
,B
我实际上得到了B*A
. 我正在测试这个,因为当我做非方阵时,它只是一团糟——我什至无法追踪正在发生的事情!
我的C
代码如下:
void test_host_matmult() {
fprintf(stderr, "Inside test_host_matmult\n");
int nrA = 3;
int ncA = 3;
int nrB = 3;
int ncB = 3;
double* A;
double* B;
double* C;
A = (double*)malloc(nrA*ncA*sizeof(double));
B = (double*)malloc(nrB*ncB*sizeof(double));
C = (double*)malloc(nrA*ncB*sizeof(double));
//int icount = 0;
for(int ir = 0; ir < nrA; ir++) {
for(int ic = 0; ic < ncA; ic++) {
//A[ir*ncA + ic] = 10*(ir+1) + (ic+1);
A[ir*ncA + ic] = rand_double(0.0, 1.0);
//fprintf(stderr, "%i ... %i\n", icount++, 10*(ir+1) + (ic+1));
}
}
for(int ir = 0; ir < nrB; ir++) {
for(int ic = 0; ic < ncB; ic++) {
//B[ir*ncB + ic] = 10*(ir+1) + (ic+1);
B[ir*ncB + ic] = rand_double(0.0, 1.0);
}
}
matmult(A, nrA, ncA, B, nrB, ncB, C, nrA, ncB);
fprintf(stderr, "A = [ ...\n");
for(int ir = 0; ir < nrA; ir++) {
for(int ic = 0; ic < ncA; ic++) {
fprintf(stderr, "%25.15f", A[ir*ncA + ic]);
}
fprintf(stderr, "\n");
}
fprintf(stderr, "];\n");
fprintf(stderr, "B = [ ...\n");
for(int ir = 0; ir < nrB; ir++) {
for(int ic = 0; ic < ncB; ic++) {
fprintf(stderr, "%25.15f", B[ir*ncB + ic]);
}
fprintf(stderr, "\n");
}
fprintf(stderr, "];\n");
fprintf(stderr, "myC = [ ...\n");
for(int ir = 0; ir < nrA; ir++) {
for(int ic = 0; ic < ncB; ic++) {
fprintf(stderr, "%25.15f", C[ir*ncB + ic]);
}
fprintf(stderr, "\n");
}
fprintf(stderr, "];\n");
}
这是matmult
功能:
void matmult( double* A, int nrA, int ncA, double* B, int nrB, int ncB, double* C, int nrC, int ncC) {
// (nrA x ncA) . (nrB x ncB ) = (nrA x ncB)
if( !(nrA == nrC && ncB == ncC) ) {
fprintf(stderr, "ERROR: incorrect matrix sizes!!\n");
}
char TRANS = 'N';
int M = nrA;
int N = ncB;
int K = ncA;
double alpha = 1.0;
double beta = 0.0;
int lda = nrA;
int ldb = nrB;
int ldc = nrC;
dgemm_(&TRANS, &TRANS, &M, &N, &K, &alpha, A, &lda, B, &ldb, &beta, C, &ldc);
}