发送3D数组到CUDA内核

我把给出的代码作为答案我怎样才能使用嵌套for循环加起来两个2d（pitched）数组？并试图使用它的3D而不是2D和稍微改变其他部分，现在看起来如下：

__global__ void doSmth(int*** a) { for(int i=0; i<2; i++) for(int j=0; j<2; j++) for(int k=0; k<2; k++) a[i][j][k]=i+j+k; } int main() { int*** h_c = (int***) malloc(2*sizeof(int**)); for(int i=0; i<2; i++) { h_c[i] = (int**) malloc(2*sizeof(int*)); for(int j=0; j<2; j++) GPUerrchk(cudaMalloc((void**)&h_c[i][j],2*sizeof(int))); } int*** d_c; GPUerrchk(cudaMalloc((void****)&d_c,2*sizeof(int**))); GPUerrchk(cudaMemcpy(d_c,h_c,2*sizeof(int**),cudaMemcpyHostToDevice)); doSmth<<<1,1>>>(d_c); GPUerrchk(cudaPeekAtLastError()); int res[2][2][2]; for(int i=0; i<2; i++) for(int j=0; j<2; j++) GPUerrchk(cudaMemcpy(&res[i][j][0], h_c[i][j],2*sizeof(int),cudaMemcpyDeviceToHost)); for(int i=0; i<2; i++) for(int j=0; j<2; j++) for(int k=0; k<2; k++) printf("[%d][%d][%d]=%d\n",i,j,k,res[i][j][k]); }

在上面的代码中，我使用2作为h_c的每个维度的大小，在实际的实现中，我将在“int ***”或更多维度的子数组的每个部分以非常大的数量和不同的大小。内核调用后，我试图将结果复制回res数组。你能帮我解决这个问题吗？你可以用我上面写的方式显示解决scheme。谢谢！

首先，当他发表你对上一个问题的回应时，我认为他是个翘首以待的人，并不是要成为好的编码的代表。因此，搞清楚如何将其扩展到3D可能不是您最好的时间。例如，为什么我们要编写只使用一个线程的程序？虽然这样的内核可能有合法的用途，但这不是其中之一。你的内核有可能并行地做一堆独立的工作，但是你可以把它全部强制到一个线程上，然后序列化它。平行工作的定义是：

 a[i][j][k]=i+j+k;

让我们弄清楚如何在GPU上并行处理。

我会做的另一个介绍性的观察是，由于我们正在处理提前知道的大小的问题，我们使用C来处理它们，并从语言中获得尽可能多的好处。在某些情况下可能需要嵌套循环做cudaMalloc，但我不认为这是其中之一。

以下是并行完成工作的代码：

 #include <stdio.h> #include <stdlib.h> // set a 3D volume // To compile it with nvcc execute: nvcc -O2 -o set3d set3d.cu //define the data set size (cubic volume) #define DATAXSIZE 100 #define DATAYSIZE 100 #define DATAZSIZE 20 //define the chunk sizes that each threadblock will work on #define BLKXSIZE 32 #define BLKYSIZE 4 #define BLKZSIZE 4 // for cuda error checking #define cudaCheckErrors(msg) \ do { \ cudaError_t __err = cudaGetLastError(); \ if (__err != cudaSuccess) { \ fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \ msg, cudaGetErrorString(__err), \ __FILE__, __LINE__); \ fprintf(stderr, "*** FAILED - ABORTING\n"); \ return 1; \ } \ } while (0) // device function to set the 3D volume __global__ void set(int a[][DATAYSIZE][DATAXSIZE]) { unsigned idx = blockIdx.x*blockDim.x + threadIdx.x; unsigned idy = blockIdx.y*blockDim.y + threadIdx.y; unsigned idz = blockIdx.z*blockDim.z + threadIdx.z; if ((idx < (DATAXSIZE)) && (idy < (DATAYSIZE)) && (idz < (DATAZSIZE))){ a[idz][idy][idx] = idz+idy+idx; } } int main(int argc, char *argv[]) { typedef int nRarray[DATAYSIZE][DATAXSIZE]; const dim3 blockSize(BLKXSIZE, BLKYSIZE, BLKZSIZE); const dim3 gridSize(((DATAXSIZE+BLKXSIZE-1)/BLKXSIZE), ((DATAYSIZE+BLKYSIZE-1)/BLKYSIZE), ((DATAZSIZE+BLKZSIZE-1)/BLKZSIZE)); // overall data set sizes const int nx = DATAXSIZE; const int ny = DATAYSIZE; const int nz = DATAZSIZE; // pointers for data set storage via malloc nRarray *c; // storage for result stored on host nRarray *d_c; // storage for result computed on device // allocate storage for data set if ((c = (nRarray *)malloc((nx*ny*nz)*sizeof(int))) == 0) {fprintf(stderr,"malloc1 Fail \n"); return 1;} // allocate GPU device buffers cudaMalloc((void **) &d_c, (nx*ny*nz)*sizeof(int)); cudaCheckErrors("Failed to allocate device buffer"); // compute result set<<<gridSize,blockSize>>>(d_c); cudaCheckErrors("Kernel launch failure"); // copy output data back to host cudaMemcpy(c, d_c, ((nx*ny*nz)*sizeof(int)), cudaMemcpyDeviceToHost); cudaCheckErrors("CUDA memcpy failure"); // and check for accuracy for (unsigned i=0; i<nz; i++) for (unsigned j=0; j<ny; j++) for (unsigned k=0; k<nx; k++) if (c[i][j][k] != (i+j+k)) { printf("Mismatch at x= %d, y= %d, z= %d Host= %d, Device = %d\n", i, j, k, (i+j+k), c[i][j][k]); return 1; } printf("Results check!\n"); free(c); cudaFree(d_c); cudaCheckErrors("cudaFree fail"); return 0; }

既然你已经在评论中提出了要求，下面是我为了让你的代码运行起来所能做的最less的改变。让我们也提醒一下你前面提到的一些问题，

“对于代码复杂性和性能的原因，你真的不希望这样做，在CUDA代码中使用指针数组比使用线性内存的替代方法更难和更慢。

“与使用线性内存相比，这是一个糟糕的主意。”

我不得不在纸上画出来，以确保我的指针复制正确。

 #include <cstdio> inline void GPUassert(cudaError_t code, char * file, int line, bool Abort=true) { if (code != 0) { fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code),file,line); if (Abort) exit(code); } } #define GPUerrchk(ans) { GPUassert((ans), __FILE__, __LINE__); } __global__ void doSmth(int*** a) { for(int i=0; i<2; i++) for(int j=0; j<2; j++) for(int k=0; k<2; k++) a[i][j][k]=i+j+k; } int main() { int*** h_c = (int***) malloc(2*sizeof(int**)); for(int i=0; i<2; i++) { h_c[i] = (int**) malloc(2*sizeof(int*)); for(int j=0; j<2; j++) GPUerrchk(cudaMalloc((void**)&h_c[i][j],2*sizeof(int))); } int ***h_c1 = (int ***) malloc(2*sizeof(int **)); for (int i=0; i<2; i++){ GPUerrchk(cudaMalloc((void***)&(h_c1[i]), 2*sizeof(int*))); GPUerrchk(cudaMemcpy(h_c1[i], h_c[i], 2*sizeof(int*), cudaMemcpyHostToDevice)); } int*** d_c; GPUerrchk(cudaMalloc((void****)&d_c,2*sizeof(int**))); GPUerrchk(cudaMemcpy(d_c,h_c1,2*sizeof(int**),cudaMemcpyHostToDevice)); doSmth<<<1,1>>>(d_c); GPUerrchk(cudaPeekAtLastError()); int res[2][2][2]; for(int i=0; i<2; i++) for(int j=0; j<2; j++) GPUerrchk(cudaMemcpy(&res[i][j][0], h_c[i][j],2*sizeof(int),cudaMemcpyDeviceToHost)); for(int i=0; i<2; i++) for(int j=0; j<2; j++) for(int k=0; k<2; k++) printf("[%d][%d][%d]=%d\n",i,j,k,res[i][j][k]); }

简而言之，我们必须做一个连续的序列：

malloc指针的multidimensional array（在主机上），一维小于问题的大小，最后一个维度是指向区域的一组指针，而不是主机。
创build与上一步创build的类相同的类的另一个多维指针数组，但是一个维度小于上一步创build的维度。这个数组还必须在设备上有最终的等级cudaMalloc。
将上一个第二步中的最后一组主机指针复制到上一步中设备上的cudaMalloced区域。
重复步骤2-3，直到我们得到一个指向多维指针数组的单个（主机）指针，所有指针都驻留在设备上。

发送3D数组到CUDA内核

是否可以*在分配的内存上使用free（）？

C＃：DateTime.Now月份输出格式

什么是在C + +中不平凡的构造函数？

有一个简单的方法来创build在C＃中的序数？

内置.Netalgorithm将值舍入到最近的10个区间

什么是无符号数据types？

辛格尔顿：应该如何使用它

在使用C ++进行multithreading编程时，有什么“要知道的事情”

如何实现C ++exception处理运行时？

如何编写使用临时容器的范围pipe道？