/***************************************************************************

        C-DAC Tech Workshop : hyPACK-2013
                October 15-18, 2013

  Example   : cuda-matrix-matrix-addition.cu

  Objective : Write CUDA program to compute Matrix-Matrix addition.

  Input     : None

  Output    : Execution time in seconds , Gflops achieved

  Created   : August-2013

  E-mail    : hpcfte@cdac.in

****************************************************************************/

/*
 * NOTE(review): this file reached review collapsed onto two physical lines
 * with every "<...>" span stripped (include targets, the kernel guard, the
 * <<<grid,block>>> launch configuration, several loop conditions and function
 * headers). The layout below restores one statement per line. Every place
 * where lost text had to be reconstructed carries its own NOTE(review);
 * confirm those against the pristine source.
 */

/* NOTE(review): the two original header names were lost; these cover the
 * printf / exit / fabs calls used below -- confirm against the original. */
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <cuda.h>

#define EPS 1.0e-12
#define GRIDSIZE 10
#define BLOCKSIZE 16
#define SIZE 128

int size = SIZE;
cudaDeviceProp deviceProp;
cudaEvent_t start,stop;
cudaError_t ret;

/*
 * Kernel: element-wise addition matC = matA + matB.
 *
 * Expects a 2D grid of 2D blocks. Each thread derives a column index i from
 * the x dimension and a row index j from the y dimension and handles the
 * flat element i + j*length.
 *
 *   matA, matB : input arrays
 *   matC       : output array
 *   length     : number of rows / columns of the square matrices
 */
__global__ void add_matrix (double *matA,double *matB,double *matC,int length)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    int j = blockIdx.y * blockDim.y + threadIdx.y;
    int k = i + j * length;

    /* NOTE(review): only "if(i" survived; the bounds guard and the store are
     * reconstructed. The guard lets the launcher round the grid up safely. */
    if (i < length && j < length)
        matC[k] = matA[k] + matB[k];
}

/*
 * NOTE(review): the text between the kernel guard above and the first
 * comparison of check_block_grid_dim below was lost. Anything that was
 * defined in that span is NOT recoverable from this view and must be
 * restored from the pristine source.
 */

/*
 * Abort the program when the requested block or grid dimensions exceed the
 * limits reported in devProp.
 *
 *   devProp  : device properties (maxThreadsDim / maxGridSize are read)
 *   blockDim : threads-per-block configuration to validate
 *   gridDim  : blocks-per-grid configuration to validate
 *
 * NOTE(review): the signature is reconstructed from the call site
 * check_block_grid_dim(deviceProp,dimBlock,dimGrid) and from the names used
 * in the body.
 *
 * A dimension equal to the device maximum is valid, so the comparison is
 * strict (">"); the previous ">=" rejected legal configurations.
 */
void check_block_grid_dim(cudaDeviceProp devProp,dim3 blockDim,dim3 gridDim)
{
    if( blockDim.x > (unsigned int)devProp.maxThreadsDim[0] ||
        blockDim.y > (unsigned int)devProp.maxThreadsDim[1] ||
        blockDim.z > (unsigned int)devProp.maxThreadsDim[2] )
    {
        printf("\nBlock Dimensions exceed the maximum limits:%d * %d * %d \n",devProp.maxThreadsDim[0],devProp.maxThreadsDim[1],devProp.maxThreadsDim[2]);
        exit(-1);
    }

    if( gridDim.x > (unsigned int)devProp.maxGridSize[0] ||
        gridDim.y > (unsigned int)devProp.maxGridSize[1] ||
        gridDim.z > (unsigned int)devProp.maxGridSize[2] )
    {
        printf("\nGrid Dimensions exceed the maximum limits:%d * %d * %d \n",devProp.maxGridSize[0],devProp.maxGridSize[1],devProp.maxGridSize[2]);
        exit(-1);
    }
}

/*
 * Report a failed allocation and terminate the program.
 *
 *   arrayname : name of the array that could not be allocated
 *   benchmark : name of the benchmark being run
 *   len       : number of elements that were requested
 *   type      : element type name, used in the message only
 */
void mem_error(char *arrayname, char *benchmark, int len, char *type)
{
    printf("\nMemory not sufficient to allocate for array %s\n\tBenchmark : %s \n\tMemory requested = %d number of %s elements\n",arrayname, benchmark, len, type);
    printf("\tAborting\n");
    exit(-1);
}

/*
 * Configure and launch add_matrix; called from main().
 *
 *   device_MatA, device_MatB : input matrices passed to the kernel
 *   device_MatC              : output matrix passed to the kernel
 *   size                     : number of rows / columns
 *
 * Uses BLOCKSIZE x BLOCKSIZE thread blocks. The grid is rounded UP so that
 * sizes which are not a multiple of BLOCKSIZE are still fully covered (the
 * previous floor division silently skipped the trailing rows and columns);
 * the kernel's bounds guard discards the surplus threads.
 */
void launch_kernel_MatMatAdd(double *device_MatA,double *device_MatB,double *device_MatC,int size)
{
    /* Nothing to compute; a zero-sized grid is an invalid launch. */
    if (size <= 0)
        return;

    dim3 dimBlock(BLOCKSIZE,BLOCKSIZE);
    dim3 dimGrid((size + dimBlock.x - 1)/dimBlock.x,(size + dimBlock.y - 1)/dimBlock.y);

    /* checking the maximum limit of blocksize and gridsize */
    check_block_grid_dim(deviceProp,dimBlock,dimGrid);

    add_matrix<<<dimGrid,dimBlock>>>(device_MatA,device_MatB,device_MatC,size);

    /* A kernel launch returns no status itself; configuration errors are
     * only visible through cudaGetLastError(). */
    cudaError_t launchStatus = cudaGetLastError();
    if (launchStatus != cudaSuccess)
    {
        printf("\nKernel launch failed : %s\n",cudaGetErrorString(launchStatus));
        exit(-1);
    }
}

/*
 * Convert an elapsed time into a Gflops figure.
 *
 *   Tsec : elapsed time in seconds
 *
 * Counts one floating point operation per matrix element, using the
 * file-level `size`, and returns 1.0e-9 * size*size / Tsec.
 */
double calculate_gflops(double &Tsec)
{
    //printf("time taken is %.8lf\n",Tsec);
    double gflops=(1.0e-9 * (( 1.0 * size*size )/Tsec));
    //printf("Gflops is \t%f\n",gflops);
    return gflops;
}

/*
 * Print one result row on screen.
 *
 *   program_name : title printed in the banner
 *   tsec         : execution time in seconds
 *   gflops       : achieved Gflops
 *   size         : matrix dimension
 *   flag         : 1 if gflops has been calculated, else 0
 *
 * When flag is not 1 the time and Gflops columns show "---". These
 * placeholders are strings, so they are printed with %s; the previous %lf
 * conversion with string arguments was undefined behaviour.
 */
void print_on_screen(char * program_name,float tsec,double gflops,int size,int flag)//flag=1 if gflops has been calculated else flag =0
{
    printf("\n---------------%s----------------\n",program_name);
    printf("\tSIZE\t TIME_SEC\t Gflops\n");
    if(flag==1)
        printf("\t%d\t%f\t%lf\t",size,tsec,gflops);
    else
        printf("\t%d\t%s\t%s\t",size,"---","---");
}

/*
 * Function to perform Mat Addition on CPU.
 *
 *   A, B   : input matrices
 *   C      : output matrix
 *   length : number of rows / columns
 *
 * NOTE(review): the loop condition and body were lost; reconstructed as the
 * element-wise reference for add_matrix over length*length elements --
 * confirm against the original.
 */
void CPU_MatMatAdd(double *A,double *B,double *C,int length)
{
    for(int i =0;i<length*length;i++)
        C[i] = A[i] + B[i];
}

/*
 * Compare two result arrays element by element and report the outcome.
 *
 * For each element the relative error is taken with respect to whichever of
 * the two values has the larger magnitude. Verification fails when any
 * relative error exceeds EPS; the largest such error is printed.
 *
 * NOTE(review): this function's header (name, parameter list, local
 * declarations and the loop header) was lost. It is reconstructed from the
 * identifiers used in the surviving body (dRes, hRes, i, relativeError,
 * errorNorm, flag) -- confirm the name and parameter order against the
 * callers.
 */
void relError(double* dRes,double* hRes,int size)
{
    double relativeError=0.0, errorNorm=0.0;
    int flag=0;
    int i;

    for( i = 0; i < size; ++i)
    {
        if (fabs(hRes[i]) > fabs(dRes[i]))
            relativeError = fabs((hRes[i] - dRes[i]) / hRes[i]);
        else
            relativeError = fabs((dRes[i] - hRes[i]) / dRes[i]);

        if (relativeError > EPS && relativeError != 0.0e+00 )
        {
            if(errorNorm < relativeError)
            {
                errorNorm = relativeError;
                flag=1;
            }
        }
    }

    if( flag == 1)
    {
        printf(" \n Results verfication : Failed");
        printf(" \n Considered machine precision : %e", EPS);
        printf(" \n Relative Error                  : %e\n", errorNorm);
    }
    else
        printf("\n Results verfication : Success\n");
}

/* free memory */
/* NOTE(review): the definition below runs past the end of the visible text;
 * it is left exactly as found so that its continuation still attaches. */
void dfree(double * arr[],int len) { for(int i=0;i