/********************************************************************** C-DAC Tech Workshop : hyPACK-2013 October 15-18, 2013 Example : cuda-infinity-norm.cu Objective : Write CUDA program to calculate Infinity Norm of Matrix. Input : None Output : Data Size ,Execution time in seconds Created : August-2013 E-mail : hpcfte@cdac.in **********************************************************************/ #include #include #include #define EPS 1.0e-12 #define GRIDSIZE 10 #define BLOCKSIZE 16 #define SIZE 128 int size = SIZE; cudaDeviceProp deviceProp; cudaEvent_t start,stop; cudaError_t ret; /* kernel funtion */ __global__ void matInfinityNorm(double *device_InMat,double *device_InfinityNorm,int matRowSize, int matColSize, int threadDim) { int tidx = threadIdx.x; int tidy = threadIdx.y; int tindex = (threadDim * tidx) + tidy; int maxNumThread = threadDim * threadDim; int pass = 0; int colCount, tCount ; int curRowInd; double tempInfinityNorm = 0.0; double rowMaxValue = 0.0; for( tCount = 1; tCount < maxNumThread; tCount++) device_InfinityNorm[tCount] = 0.0; while( (curRowInd = (tindex + maxNumThread * pass)) < matRowSize ) { rowMaxValue = 0.0; for( colCount = 0; colCount < matColSize; colCount++) rowMaxValue += fabs(device_InMat[curRowInd* matRowSize + colCount]); tempInfinityNorm = ( tempInfinityNorm>rowMaxValue? tempInfinityNorm:rowMaxValue); pass++; } device_InfinityNorm[ tindex ] = tempInfinityNorm; __syncthreads(); if(tindex == 0) for( tCount = 1; tCount < maxNumThread; tCount++) device_InfinityNorm[0] = device_InfinityNorm[0]> device_InfinityNorm[tCount]? device_InfinityNorm[0]: device_InfinityNorm[tCount]; } /* Check for safe return of all calls to the device */ void CUDA_SAFE_CALL(cudaError_t call) { cudaError_t ret = call; switch(ret) { case cudaSuccess: break; default: { printf(" ERROR at line :%i.%d' ' %s\n",__LINE__,ret,cudaGetErrorString(ret)); exit(-1); break; } } } /* Get the number of GPU devices present on the host */ int get_DeviceCount() { int count; cudaGetDeviceCount(&count); return count; } /* Fill in the vector with double precision values */ void fill_dp_matrix(double* mat,int size) { int ind; for(ind=0;ind= devProp.maxThreadsDim[0] || blockDim.y >= devProp.maxThreadsDim[1] || blockDim.z >= devProp.maxThreadsDim[2] ) { printf("\nBlock Dimensions exceed the maximum limits:%d * %d * %d \n",devProp.maxThreadsDim[0],devProp.maxThreadsDim[1],devProp.maxThreadsDim[2]); exit(-1); } if( gridDim.x >= devProp.maxGridSize[0] || gridDim.y >= devProp.maxGridSize[1] || gridDim.z >= devProp.maxGridSize[2] ) { printf("\nGrid Dimensions exceed the maximum limits:%d * %d * %d \n",devProp.maxGridSize[0],devProp.maxGridSize[1],devProp.maxGridSize[2]); exit(-1); } } /* Function to print memory error */ void mem_error(char *arrayname, char *benchmark, int len, char *type) { printf("\nMemory not sufficient to allocate for array %s\n\tBenchmark : %s \n\tMemory requested = %d number of %s elements\n",arrayname, benchmark, len, type); printf("\tAborting\n"); exit(-1); } /* launch kernel function is called in main() */ void launch_kernel_MatInfinityNorm(double *device_InMat,double *device_InfinityNorm,int size) { dim3 dimBlock(BLOCKSIZE,BLOCKSIZE); dim3 dimGrid(size/dimBlock.x,size/dimBlock.y); /* checking the maximum limit of blocksize and gridsize */ check_block_grid_dim(deviceProp,dimBlock,dimGrid); matInfinityNorm<<>>(device_InMat,device_InfinityNorm,size,size,BLOCKSIZE); } /* Function to calculate gflops */ double calculate_gflops(double &Tsec) { double gflops=(1.0e-9 * (( 1.0 * size*size )/Tsec)); return gflops; } /* prints the result on screen */ void print_on_screen(char * program_name,float tsec,double gflops,int size,int flag)//flag=1 if gflops has been calculated else flag =0 { printf("\n---------------%s----------------\n",program_name); printf("\tSIZE\t TIME_SEC\t Gflops\n"); if(flag==1) printf("\t%d\t%f\t%lf\t",size,tsec,gflops); else printf("\t%d\t%lf\t%lf\t",size,"---","---"); } /* free memory */ void dfree(double * arr[],int len) { for(int i=0;i fabs(*host_InfinityNorm)) relativeError = fabs((temp_norm - *host_InfinityNorm) / temp_norm); else relativeError = fabs((*host_InfinityNorm - temp_norm) / *host_InfinityNorm); if (relativeError > eps && relativeError != 0.0e+00 ) { if(errorNorm < relativeError) { errorNorm = relativeError; flag=1; } } if( flag == 1) { printf(" \n \t\t Results verfication : Failed\n"); printf(" \n \t\t Considered machine precision : %e\n", eps); printf(" \n \t\t Relative Error : %e\n", errorNorm); } else { printf("\n \n\t\tResults verfication : Success\n\n"); } return 0; } /* main() */ int main() { double *host_InMat,*host_InfinityNorm; double *device_InMat,*device_InfinityNorm; int device_Count=get_DeviceCount(); printf("\n\nNUmber of Devices : %d\n\n", device_Count); /* Device Selection, Device 1 */ cudaSetDevice(0); int device; /* Current Device Detection */ cudaGetDevice(&device); cudaGetDeviceProperties(&deviceProp,device); printf("Using device %d: %s \n", device, deviceProp.name); /* event creation */ CUDA_SAFE_CALL(cudaEventCreate (&start)); CUDA_SAFE_CALL(cudaEventCreate (&stop)); /* allocating the memory for each matrix */ host_InMat = new double[size*size]; host_InfinityNorm = new double[sizeof(double)]; if(host_InMat==NULL) mem_error("host_InMat","matInfinityNorm",size,"double"); if(host_InfinityNorm==NULL) mem_error("host_InfinityNorm","matInfinityNorm",sizeof(double),"double"); /* filling the matrix with double precisio */ fill_dp_matrix(host_InMat,size*size); /* allocating memory on GPU */ CUDA_SAFE_CALL(cudaMalloc( (void**)&device_InMat,size*size*sizeof(double))); CUDA_SAFE_CALL(cudaMalloc( (void**)&device_InfinityNorm,sizeof(double))); /* copying host matrix to device matrix */ CUDA_SAFE_CALL(cudaMemcpy((void*)device_InMat, (void*)host_InMat, size*size* sizeof(double) , cudaMemcpyHostToDevice )); CUDA_SAFE_CALL(cudaMemcpy((void*)device_InfinityNorm, (void*)host_InfinityNorm, sizeof(double) , cudaMemcpyHostToDevice )); CUDA_SAFE_CALL(cudaEventRecord (start, 0)); launch_kernel_MatInfinityNorm(device_InMat,device_InfinityNorm,size); //launching the kernel CUDA_SAFE_CALL(cudaEventRecord (stop, 0)); CUDA_SAFE_CALL(cudaEventSynchronize (stop)); /* computing elapsed time */ float elapsedTime; CUDA_SAFE_CALL(cudaEventElapsedTime ( &elapsedTime, start, stop)); double Tsec = elapsedTime *1.0e-3; /* calling funtion for measuring Gflops */ calculate_gflops(Tsec); /* printing the result on screen */ print_on_screen("MAT INFINTYNORM",Tsec,calculate_gflops(Tsec),size,1); /* retriving result from device */ CUDA_SAFE_CALL(cudaMemcpy((void*)host_InfinityNorm, (void*)device_InfinityNorm, sizeof(double) , cudaMemcpyDeviceToHost )); /* to get the result uncomment this part printf("\n ----------------------------------------------------------------------"); printf("InfinityNorm = %lf", *host_InfinityNorm);*/ /* comparing result of CPU-GPU */ matInfinityNormCheckResult(host_InMat,size,size,host_InfinityNorm); /* free the device memory */ double *array[2]; array[0]=device_InMat; array[1]=device_InfinityNorm; dfree(array,2); /* free host memory */ free(host_InMat); free(host_InfinityNorm); cudaDeviceReset(); }// end of main