/******************************************************************** C-DAC Tech Workshop : hyPACK-2013 October 15-18, 2013 Example : cuda-matrix-matrix-multiplication.cu Objective : Write a CUDA Program to perform Matrix Matrix multiplication. Input : None Output : Execution time in seconds , Gflops achieved Created : August-2013 E-mail : hpcfte@cdac.in ************************************************************************/ #include #include #define BLOCKSIZE 16 #define SIZE 128 cudaDeviceProp deviceProp; cudaEvent_t start,stop; cudaError_t ret; double *host_MatA,*host_MatB,*host_MatC,*CPU_Result; double *device_MatA,*device_MatB,*device_MatC; int size = SIZE; float elapsedTime; double Tsec,gflops; /*kernel funtion*/ __global__ void Muld(double* A, double* B, int wA, int wB, double* C) { int bx = blockIdx.x; int by = blockIdx.y; int tx = threadIdx.x; int ty = threadIdx.y; int aBegin = wA * BLOCKSIZE * by; int aEnd = aBegin + wA - 1; int aStep = BLOCKSIZE; int bBegin = BLOCKSIZE * bx; int bStep = BLOCKSIZE * wB; double Csub = 0; for(int a = aBegin, b = bBegin; a <= aEnd ; a += aStep, b += bStep) { __shared__ double As[BLOCKSIZE][BLOCKSIZE]; __shared__ double Bs[BLOCKSIZE][BLOCKSIZE]; As[ty][tx] = A[a + wA * ty + tx]; Bs[ty][tx] = B[b+ wB * ty + tx]; __syncthreads(); for(int k= 0; k< BLOCKSIZE; ++k) Csub += As[ty][k] * Bs[k][tx]; __syncthreads(); } int c = wB * BLOCKSIZE * by + BLOCKSIZE * bx; C[ c+ wB * ty + tx] = Csub; }/* end of Muld device code */ /*mem error*/ void mem_error(char *arrayname, char *benchmark, int len, char *type) { printf("\nMemory not sufficient to allocate for array %s\n\tBenchmark : %s \n\tMemory requested = %d number of %s elements\n",arrayname, benchmark, len, type); exit(-1); } /*cuda safe call*/ void CUDA_SAFE_CALL(cudaError_t call) { cudaError_t ret = call; //printf("RETURN FROM THE CUDA CALL:%d\t:",ret); switch(ret) { case cudaSuccess: // printf("Success\n"); break; /* case cudaErrorInvalidValue: { printf("ERROR: InvalidValue:%i.\n",__LINE__); exit(-1); break; } case cudaErrorInvalidDevicePointer: { printf("ERROR:Invalid Device pointeri:%i.\n",__LINE__); exit(-1); break; } case cudaErrorInvalidMemcpyDirection: { printf("ERROR:Invalid memcpy direction:%i.\n",__LINE__); exit(-1); break; } */ default: { printf(" ERROR at line :%i.%d' ' %s\n",__LINE__,ret,cudaGetErrorString(ret)); exit(-1); break; } } } /* void SetUp_CUDA_Exe_Config() */ void check_block_grid_dim(cudaDeviceProp devProp,dim3 blockDim,dim3 gridDim) { if( blockDim.x >= devProp.maxThreadsDim[0] || blockDim.y >= devProp.maxThreadsDim[1] || blockDim.z >= devProp.maxThreadsDim[2] ) { printf("\nBlock Dimensions exceed the maximum limits:%d * %d * %d \n",devProp.maxThreadsDim[0],devProp.maxThreadsDim[1],devProp.maxThreadsDim[2]); exit(-1); } if( gridDim.x >= devProp.maxGridSize[0] || gridDim.y >= devProp.maxGridSize[1] || gridDim.z >= devProp.maxGridSize[2] ) { printf("\nGrid Dimensions exceed the maximum limits:%d * %d * %d \n",devProp.maxGridSize[0],devProp.maxGridSize[1],devProp.maxGridSize[2]); exit(-1); } } /*function to free memory*/ void dfree(double * arr[],int len) { for(int i=0;i>>(device_MatA,device_MatB,size,size,device_MatC); cudaEventRecord(stop,0); cudaEventSynchronize(stop); cudaEventElapsedTime(&elapsedTime,start,stop); Tsec=elapsedTime*1.0e-3; calculate_gflops(Tsec); } /* Fill in the vector with double precision values */ void fill_dp_vector(double* vec,int size) { int ind; for(ind=0;ind