/*****************************************************************************
          C-DAC Tech Workshop : hyPACK-2013
                 October 15-18, 2013

  Example   : cuda-matrix-matrix-multiplication-mgpu.cu

  Objective : Write CUDA program to compute Matrix-Matrix multiplication
              to be executed on multiple GPUs. (using global memory)

  Input     : None

  Output    : Execution time in seconds , Gflops achieved

  Created   : August-2013

  E-mail    : hpcfte@cdac.in
****************************************************************************/

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <cuda_runtime.h>

#define SIZE      128        /* default square-matrix dimension            */
#define EPS       1.0e-12    /* machine-precision tolerance for verify     */
#define GRIDSIZE  10         /* fallback grid dim for tiny matrices        */
#define BLOCKSIZE 16         /* threads per block in each dimension        */
#define MAX_GPU   2          /* use at most this many devices             */

/* Abort with file/line context on any CUDA runtime error.
   (The original relied on the legacy cutil CUDA_SAFE_CALL; defined here
   so the file is self-contained.) */
#define CUDA_SAFE_CALL(call)                                               \
    do {                                                                   \
        cudaError_t err_ = (call);                                         \
        if (err_ != cudaSuccess) {                                         \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,  \
                    cudaGetErrorString(err_));                             \
            exit(EXIT_FAILURE);                                            \
        }                                                                  \
    } while (0)

/* Per-GPU work descriptor: each device multiplies a horizontal slice of A
   (plan.hA rows) by the full matrix B, producing the matching slice of C. */
typedef struct {
    int hA;               /* rows of A (and C) assigned to this GPU        */
    int wA;               /* columns of A == rows of B                     */
    int wB;               /* columns of B (and C)                          */
    double *hMatA;        /* host pointer to this GPU's slice of A         */
    double *hMatB;        /* host pointer to B (whole matrix, shared)      */
    double *hMatC;        /* host pointer to this GPU's slice of C         */
    double *dMatA;        /* device copies of the above                    */
    double *dMatB;
    double *dMatC;
    cudaStream_t stream;  /* per-device stream for async copy + compute    */
} TGPUPlan;

/* Legacy file-scope variables kept for source compatibility; main() uses
   its own locals and these are not read anywhere in this file. */
int hA, wA, wB;
double *hMatA, *hMatB, *hMatC, *dMatA, *dMatB, *dMatC;

void checkResult(double *InMatA, double *InMatB, double *outMatC,
                 int m, int n, int k);

/*
 * Naive global-memory matrix multiply: dres(r x c) = dm1(r x m) * dm2(m x c).
 * Launch layout: 2D grid of BLOCKSIZE x BLOCKSIZE blocks; thread (tx,ty)
 * computes one element of dres. Bounds guard handles partial edge blocks.
 */
__global__ void mmmul(double *dm1, double *dm2, double *dres,
                      int r, int m, int c)
{
    int tx = blockIdx.x * blockDim.x + threadIdx.x;  /* column of dres */
    int ty = blockIdx.y * blockDim.y + threadIdx.y;  /* row of dres    */

    if (tx < c && ty < r) {
        double sum = 0.0;
        for (int k = 0; k < m; k++)
            sum += dm1[ty * m + k] * dm2[k * c + tx];
        dres[ty * c + tx] = sum;
    }
}

/*
 * Verify the GPU result against a CPU reference multiply.
 *   InMatA  : m x n host matrix
 *   InMatB  : n x k host matrix
 *   outMatC : m x k GPU-computed product to be checked
 * Prints success, or the largest relative error found when it exceeds EPS.
 */
void checkResult(double *InMatA, double *InMatB, double *outMatC,
                 int m, int n, int k)
{
    double *hRes = (double *)malloc(m * k * sizeof(double));
    double *dRes = outMatC;
    double relativeError = 0.0, errorNorm = 0.0;
    int flag = 0;

    /* CPU reference product */
    for (int i = 0; i < m; i++)
        for (int j = 0; j < k; j++) {
            double sum = 0.0;
            for (int p = 0; p < n; p++)
                sum += InMatA[i * n + p] * InMatB[p * k + j];
            hRes[i * k + j] = sum;
        }

    /* element-wise relative error, normalised by the larger magnitude */
    for (int i = 0; i < m * k; i++) {
        if (fabs(hRes[i]) > fabs(dRes[i]))
            relativeError = fabs((hRes[i] - dRes[i]) / hRes[i]);
        else
            relativeError = fabs((dRes[i] - hRes[i]) / dRes[i]);

        if (relativeError > EPS && relativeError != 0.0e+00) {
            if (errorNorm < relativeError) {
                errorNorm = relativeError;
                flag = 1;
            }
        }
    }

    if (flag == 1) {
        printf(" \n Results verfication : Failed");
        printf(" \n Considered machine precision : %e", EPS);
        printf(" \n Relative Error : %e\n", errorNorm);
    } else
        printf("\n Results verfication : Success\n");

    free(hRes);
}

int main(int argc, char **argv)
{
    int numGPU;
    int hA, wA, wB;                 /* shadow the unused file-scope names */
    double *host_A, *host_B, *host_C;
    int gpuBase;
    int i;

    /* ----- MULTI DEVICE COUNT -------- */
    CUDA_SAFE_CALL(cudaGetDeviceCount(&numGPU));
    if (numGPU > MAX_GPU)
        numGPU = MAX_GPU;
    printf("CUDA CAPABLE DEVICE COUNT: %d\n", numGPU);

    hA = SIZE;
    wA = SIZE;
    wB = SIZE;

    /* --------- FILLING HOST MATRICES --------- */
    host_A = (double *)malloc(hA * wA * sizeof(double));
    host_B = (double *)malloc(wA * wB * sizeof(double));
    host_C = (double *)malloc(hA * wB * sizeof(double));
    if (!host_A || !host_B || !host_C) {
        fprintf(stderr, "host allocation failed\n");
        return EXIT_FAILURE;
    }
    for (i = 0; i < hA * wA; i++)
        host_A[i] = drand48();
    for (i = 0; i < wA * wB; i++)
        host_B[i] = drand48();

    /* ------- INITIATING THE DATA FOR EACH DEVICE ---- */
    TGPUPlan plan[MAX_GPU];         /* numGPU <= MAX_GPU guaranteed above */
    for (i = 0; i < numGPU; i++) {
        plan[i].hA = hA / numGPU;
        plan[i].wA = wA;
        plan[i].wB = wB;
    }
    /* .........To handle odd number of rows: spread the remainder........ */
    for (i = 0; i < hA % numGPU; i++)
        plan[i].hA++;

    /* Point each plan's host slices directly into host_A / host_C so the
       device-to-host copies land in place (no separate gather loop). */
    gpuBase = 0;
    for (i = 0; i < numGPU; i++) {
        plan[i].hMatA = host_A + (size_t)gpuBase * wA;
        plan[i].hMatB = host_B;
        plan[i].hMatC = host_C + (size_t)gpuBase * wB;
        gpuBase += plan[i].hA;
    }

    /* ------- PER-DEVICE: allocate, copy in, launch, copy out ------- */
    for (i = 0; i < numGPU; i++) {
        CUDA_SAFE_CALL(cudaSetDevice(i));
        CUDA_SAFE_CALL(cudaStreamCreate(&plan[i].stream));

        CUDA_SAFE_CALL(cudaMalloc((void **)&plan[i].dMatA,
                                  (size_t)plan[i].hA * plan[i].wA * sizeof(double)));
        CUDA_SAFE_CALL(cudaMalloc((void **)&plan[i].dMatB,
                                  (size_t)plan[i].wA * plan[i].wB * sizeof(double)));
        CUDA_SAFE_CALL(cudaMalloc((void **)&plan[i].dMatC,
                                  (size_t)plan[i].hA * plan[i].wB * sizeof(double)));

        CUDA_SAFE_CALL(cudaMemcpyAsync(plan[i].dMatA, plan[i].hMatA,
                                       (size_t)plan[i].hA * plan[i].wA * sizeof(double),
                                       cudaMemcpyHostToDevice, plan[i].stream));
        CUDA_SAFE_CALL(cudaMemcpyAsync(plan[i].dMatB, plan[i].hMatB,
                                       (size_t)plan[i].wA * plan[i].wB * sizeof(double),
                                       cudaMemcpyHostToDevice, plan[i].stream));

        /* ceil-div grid so partial edge tiles are still covered; the
           original truncating division dropped tail rows/columns when the
           slice size was not a multiple of BLOCKSIZE */
        dim3 dimBlock(BLOCKSIZE, BLOCKSIZE);
        int gridX = (plan[i].wB + BLOCKSIZE - 1) / BLOCKSIZE;
        int gridY = (plan[i].hA + BLOCKSIZE - 1) / BLOCKSIZE;
        dim3 dimGrid(gridX, gridY);

        /* NOTE: pass this device's slice height plan[i].hA, not the full
           matrix height — the original launch passed hA/wA/wB, which made
           every GPU index past the end of its slice. */
        mmmul<<<dimGrid, dimBlock, 0, plan[i].stream>>>(
            plan[i].dMatA, plan[i].dMatB, plan[i].dMatC,
            plan[i].hA, plan[i].wA, plan[i].wB);
        CUDA_SAFE_CALL(cudaGetLastError());   /* catch bad launch config */

        CUDA_SAFE_CALL(cudaMemcpyAsync(plan[i].hMatC, plan[i].dMatC,
                                       (size_t)plan[i].hA * plan[i].wB * sizeof(double),
                                       cudaMemcpyDeviceToHost, plan[i].stream));
    }

    /* --------- WAIT FOR ALL GPUS AND RELEASE DEVICE RESOURCES ---------- */
    for (i = 0; i < numGPU; i++) {
        CUDA_SAFE_CALL(cudaSetDevice(i));
        CUDA_SAFE_CALL(cudaStreamSynchronize(plan[i].stream));
        CUDA_SAFE_CALL(cudaFree(plan[i].dMatA));
        CUDA_SAFE_CALL(cudaFree(plan[i].dMatB));
        CUDA_SAFE_CALL(cudaFree(plan[i].dMatC));
        CUDA_SAFE_CALL(cudaStreamDestroy(plan[i].stream));
    }

    /* --------- VERIFY AGAINST CPU REFERENCE ---------- */
    checkResult(host_A, host_B, host_C, hA, wA, wB);

    free(host_A);
    free(host_B);
    free(host_C);
    return 0;
}