我重新发布这个问题,因为上次的提法不够清楚:
我已经实现了一个简单的 C++ CUDA 示例:它接收 A [M x 1] 和 B [1 x N],并按照我习惯用 MATLAB 术语称为“隐式数组扩展”(implicit expansion)的方式将它们逐元素相加。我有两个问题:
- 当两个线程同时读取 a 或 b 的同一个元素时,为什么不会出错?
- 对于这类逐元素操作——这里的“逐元素相加”可以换成任意自定义函数,它接收一个行向量和一个列向量,并对所有元素组合进行计算——这是“正确”的实现方式吗?
#include <stdio.h>
#define Row 500
#define Col 60000
// Computes the full "outer" element-wise sum of a row vector and a column
// vector: c[ro*Col + co] = a[ro] + b[co] for every (ro, co) pair.
//
// Expected launch: a 2D grid/block where the x dimension spans rows
// (0..Row-1) and the y dimension spans columns (0..Col-1); c must hold
// Row*Col ints.
//
// Why concurrent reads of a[ro] / b[co] are safe: a data race requires at
// least one WRITE to the shared location. Here a and b are only read, and
// each thread writes a distinct element of c, so no race exists.
// a and b are marked const __restrict__ so the compiler may serve them
// through the read-only data cache.
__global__ void elementadd( const int * __restrict__ a,
                            const int * __restrict__ b,
                            int * __restrict__ c )
{
int ro = blockIdx.x * blockDim.x + threadIdx.x;
int co = blockIdx.y * blockDim.y + threadIdx.y;
// Bounds guard: the grid is rounded up, so edge blocks have idle threads.
if (ro < Row && co < Col)
{
// NOTE(review): adjacent threads of a warp vary threadIdx.x, i.e. ro, so
// their stores to c are Col ints apart — uncoalesced. Mapping columns to
// the x dimension (with a matching launch config in the caller) would
// make warp stores contiguous and is the usual layout for this pattern.
c[ro*Col + co] = a[ro] + b[co];
}
}
// Host driver: allocates managed memory, fills a (row vector) and b (column
// vector), launches the element-wise-sum kernel, prints the result, frees.
int main()
{
int *a, *b, *c_gpu;

// Byte sizes, computed in size_t. The original used float arithmetic for
// Row*Col, which silently loses precision above 2^24 elements — and the
// CUDA allocators take size_t anyway.
size_t sizeR  = (size_t)Row * sizeof(int);
size_t sizeC  = (size_t)Col * sizeof(int);
size_t sizeRC = (size_t)Row * (size_t)Col * sizeof(int);

// Allocate unified (managed) memory, accessible from both host and device.
// Every CUDA call returns a status — unchecked errors stay silent and make
// later calls fail mysteriously. (The original also allocated an unused
// c_cpu buffer of the same Row*Col size; removed here.)
if (cudaMallocManaged(&a, sizeR)      != cudaSuccess ||
    cudaMallocManaged(&b, sizeC)      != cudaSuccess ||
    cudaMallocManaged(&c_gpu, sizeRC) != cudaSuccess)
{
    fprintf(stderr, "cudaMallocManaged failed\n");
    return 1;
}

// Initialize inputs on the host and zero the result buffer.
// (The original zeroed each c_gpu entry twice — duplicated statement.)
for (int i = 0; i < Row; ++i) a[i] = i * 2;
for (int i = 0; i < Col; ++i) b[i] = i + 2;
for (int ro = 0; ro < Row; ++ro)
    for (int co = 0; co < Col; ++co)
        c_gpu[(size_t)ro * Col + co] = 0;

// 32 x 32 = 1024 threads per block (the per-block maximum on current GPUs;
// the original comment said "16 x 16", which did not match the code).
// Ceil-divide so edge tiles are covered without a spurious extra block
// when the extent divides evenly.
dim3 threads_per_block(32, 32, 1);
dim3 number_of_blocks((Row + threads_per_block.x - 1) / threads_per_block.x,
                      (Col + threads_per_block.y - 1) / threads_per_block.y,
                      1);

elementadd<<<number_of_blocks, threads_per_block>>>(a, b, c_gpu);

// A kernel launch returns no status directly: configuration errors must be
// fetched with cudaGetLastError(), and execution errors surface at the next
// synchronizing call. The sync is also required before the host touches
// managed memory the kernel wrote.
cudaError_t err = cudaGetLastError();
if (err == cudaSuccess)
    err = cudaDeviceSynchronize();
if (err != cudaSuccess)
{
    fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err));
    return 1;
}

// Debug dump: prints all Row*Col entries, as the original did.
for (int ro = 0; ro < Row; ++ro)
    for (int co = 0; co < Col; ++co)
        printf("entry[%d][%d] is %d a:%d b:%d \n",
               ro, co, c_gpu[(size_t)ro * Col + co], a[ro], b[co]);

// Release the managed allocations.
cudaFree(a);
cudaFree(b);
cudaFree(c_gpu);
return 0;
}
```