/*****************************************************************************
        C-DAC Tech Workshop : hyPACK-2013
                 October 15-18, 2013

  Example   : cuda-matrix-vector-multiplication.cu

  Objective : Write CUDA program to compute Matrix-Vector multiplication
              using Synchronus execution and Asynchronus concurrent execution.

  Input     : Can specify the number of kernel. If not specified it will be
              taken as 16 and more than 16 is not allowed.

  Output    : Execution time in seconds, Gflops achieved for both the above
              mentioned execution.

  Created   : August-2013

  E-mail    : hpcfte@cdac.in

****************************************************************************/
/* NOTE(review): this file was recovered from an HTML-mangled paste in which
   every "<...>" span (include names, loop conditions, kernel-launch chevrons,
   the CUDA_SAFE_CALL macro) had been stripped.  The code below restores the
   obvious intent; places that could not be recovered verbatim are marked
   TODO(review) and must be confirmed against the pristine source. */

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <cuda.h>
#include <cuda_runtime.h>

#define BLOCKSIZE 16        /* threads per block dimension (16x16 = 256 threads) */
#define SIZE 1024           /* default matrix order and vector length */
#define EPS 1.0e-15         /* tolerance used as "machine precision" in verification */

/* Abort with file/line context on any CUDA runtime error.
   TODO(review): the macro definition was lost in the mangling; this is the
   conventional hyPACK definition — confirm against the full source. */
#define CUDA_SAFE_CALL(call)                                                  \
    do {                                                                      \
        cudaError_t err__ = (call);                                          \
        if (err__ != cudaSuccess) {                                          \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,    \
                    cudaGetErrorString(err__));                              \
            exit(-1);                                                        \
        }                                                                    \
    } while (0)

cudaDeviceProp deviceProp;   /* properties of the device selected in main */

/* Global declaration */
double *host_Mat, *host_Vect, *host_ResVect, *cpu_ResVect;  /* pinned host buffers + CPU reference */
double *device_Mat, *device_Vect, *device_ResVect;          /* device buffers */
int device_Count;
long int size = SIZE;
int matRowSize = size;
int vlength   = size;
int matColSize = size;
int nkernels = 0;            /* number of concurrent kernel launches requested */
int nstream;                 /* number of CUDA streams */

/* Mat-vect kernel: defined elsewhere in this file (definition not visible in
   this chunk).  TODO(review): confirm kernel name and signature against the
   full source before building. */
__global__ void MatVectMultiplication(double *device_Mat, double *device_Vect,
                                      int matRowSize, int vlength,
                                      double *device_ResVect);

/* Report an allocation failure for `arrayname` (len elements of `type`,
   within benchmark `benchmark`) and abort. */
void mem_error(const char *arrayname, const char *benchmark, int len, const char *type)
{
    printf("\nMemory not sufficient to allocate for array %s\n\tBenchmark : %s \n\tMemory requested = %d number of %s elements\n",
           arrayname, benchmark, len, type);
    exit(-1);
}

/* Gflops for one dense NxN mat-vect (2*N*N flops) executed in Tsec seconds. */
double calculate_gflops(float &Tsec)
{
    float gflops = (1.0e-9 * ((2.0 * size * size) / Tsec));
    return gflops;
}

/* Sequential reference mat-vect multiplication: cpu_ResVect = host_Mat * host_Vect.
   TODO(review): the loop body was destroyed in the mangling; this is the
   standard row-major dense mat-vect — confirm against the full source. */
void CPU_MatVect()
{
    cpu_ResVect = (double *)malloc(matRowSize * sizeof(double));
    if (cpu_ResVect == NULL)
        mem_error("cpu_ResVect", "vectmatmul", size, "double");
    int i, j;
    for (i = 0; i < matRowSize; i++) {
        cpu_ResVect[i] = 0.0;
        for (j = 0; j < matColSize; j++)
            cpu_ResVect[i] += host_Mat[i * matColSize + j] * host_Vect[j];
    }
}

/* Compare device result dRes against host reference hRes element-wise using
   the symmetric relative error; report the worst error above EPS. */
void relError(double *dRes, double *hRes, int size)
{
    double relativeError = 0.0, errorNorm = 0.0;
    int i, flag = 0;
    for (i = 0; i < size; ++i) {
        /* divide by the larger magnitude to keep the ratio well-defined */
        if (fabs(hRes[i]) > fabs(dRes[i]))
            relativeError = fabs((hRes[i] - dRes[i]) / hRes[i]);
        else
            relativeError = fabs((dRes[i] - hRes[i]) / dRes[i]);
        if (relativeError > EPS && relativeError != 0.0e+00) {
            if (errorNorm < relativeError) {
                errorNorm = relativeError;
                flag = 1;
            }
        }
    }
    if (flag == 1) {
        printf(" \n Results verfication : Failed");
        printf(" \n Considered machine precision : %e", EPS);
        printf(" \n Relative Error : %e\n", errorNorm);
    }
    else
        printf("\n Results verfication : Success\n");
}

/* Prints the result on screen.
   flag = 1 if gflops has been calculated, else flag = 0. */
void print_on_screen(char *program_name, float tsec, double gflops, int size, int flag)
{
    printf("\n---------------%s----------------\n", program_name);
    printf("\tSIZE\t TIME_SEC\t Gflops\n");
    if (flag == 1)
        printf("\t%d\t%f\t%lf\t", size, tsec, gflops);
    else
        /* BUG FIX: original used %lf for the char* literals "---", which is
           undefined behavior; print them with %s instead. */
        printf("\t%d\t%s\t%s\t", size, "---", "---");
}

/* Validate the requested launch configuration against the device limits.
   BUG FIX: the limits reported in cudaDeviceProp are inclusive maxima, so a
   configuration exactly at the limit is legal — compare with > rather than >=. */
void check_block_grid_dim(cudaDeviceProp devProp, dim3 blockDim, dim3 gridDim)
{
    if (blockDim.x > devProp.maxThreadsDim[0] ||
        blockDim.y > devProp.maxThreadsDim[1] ||
        blockDim.z > devProp.maxThreadsDim[2]) {
        printf("\nBlock Dimensions exceed the maximum limits:%d * %d * %d \n",
               devProp.maxThreadsDim[0], devProp.maxThreadsDim[1], devProp.maxThreadsDim[2]);
        exit(-1);
    }
    if (gridDim.x > devProp.maxGridSize[0] ||
        gridDim.y > devProp.maxGridSize[1] ||
        gridDim.z > devProp.maxGridSize[2]) {
        printf("\nGrid Dimensions exceed the maximum limits:%d * %d * %d \n",
               devProp.maxGridSize[0], devProp.maxGridSize[1], devProp.maxGridSize[2]);
        exit(-1);
    }
}

/* Get the number of GPU devices present on the host. */
int get_DeviceCount()
{
    int count;
    cudaGetDeviceCount(&count);
    return count;
}

/* Fill in the vector with double precision values.
   TODO(review): the loop body was destroyed in the mangling; drand48() is the
   fill used throughout the hyPACK examples — confirm against the full source. */
void fill_dp_vector(double *vec, int size)
{
    int ind;
    for (ind = 0; ind < size; ind++)
        vec[ind] = drand48();
}

/* Parse the optional [nKernels] command-line argument into the global
   `nkernels` (default 16; at most 16 run truly concurrently).
   TODO(review): the function name and parts of the switch were destroyed in
   the mangling; reconstructed from the surviving messages — confirm. */
void parse_command_line(int argc, char **argv)
{
    switch (argc) {
    case 1:
        nkernels = 16;          /* default when not specified */
        break;
    case 2:
        nkernels = atoi(argv[1]);
        if (nkernels <= 0) {
            printf("\n Usage : <./exe> [nkernels].........aborting \n");
            exit(-1);
        }
        if (nkernels > 16) {
            printf("\n The maximum number of kernel launches that a device can execute concurrently is 16 \n");
            printf("\n Kernels will may not be executed concurrently...... \n");
        }
        break;
    default:
        printf("\n Invalid options...\n");
        printf("\n Usage : <./exe> [nKernels] \n");
        exit(-1);
    }
}

/*==============================================================================================
  Check device Availability
=================================================================================================*/
void checkDeviceAvailability()
{
    cudaError_t err;                 /* holds error value */
    err = cudaSetDevice(0);          /* change this to set the code to another GPU */
    if (err == cudaErrorDevicesUnavailable) {
        printf("\ndevice Not available\n");
        exit(0);
    }
}

/*=============================================================================================
  check device property for concurrent execution
========================================================================================= */
void checkDeviceProperty(cudaDeviceProp deviceProp)
{
    printf("\n DEVICE USED :\t %s", deviceProp.name);
    if ((deviceProp.concurrentKernels == 0)) {   /* check concurrent kernel support */
        printf("> GPU does not support concurrent kernel execution\n");
        printf("  CUDA kernel runs will be serialized\n");
    }
    if (deviceProp.asyncEngineCount == 0) {      /* check concurrent data transfer support */
        printf("GPU does not support concurrent Data transer and overlaping of kernel execution & data transfer\n");
        printf("Mem copy call will be blocking calls\n");
    }
}

/*=============================================================================================
  Memory Allocation
=================================================================== */
void memoryAlloc(long int size)
{
    /* pinned host allocations so cudaMemcpyAsync can actually overlap */
    CUDA_SAFE_CALL(cudaMallocHost((void **)&host_Mat, matRowSize * matColSize * sizeof(double)));
    CUDA_SAFE_CALL(cudaMallocHost((void **)&host_Vect, vlength * sizeof(double)));
    /* NOTE(review): sized with vlength as in the original; it holds matRowSize
       results — equal here since the matrix is square, but matRowSize would be
       the semantically correct size. */
    CUDA_SAFE_CALL(cudaMallocHost((void **)&host_ResVect, vlength * sizeof(double)));

    /* initialize Matrices */
    fill_dp_vector(host_Mat, matRowSize * matColSize);
    fill_dp_vector(host_Vect, vlength);
    for (int index = 0; index < matRowSize; index++)
        host_ResVect[index] = 0;

    /* allocate device memory */
    CUDA_SAFE_CALL(cudaMalloc((void **)&device_Mat, matRowSize * matColSize * sizeof(double)));
    CUDA_SAFE_CALL(cudaMalloc((void **)&device_Vect, vlength * sizeof(double)));
    CUDA_SAFE_CALL(cudaMalloc((void **)&device_ResVect, matRowSize * sizeof(double)));
}

/***************************************************************
  function to implement concurrent kernel execution
***************************************************************/
void funcAsynchConcurrentExec(cudaStream_t *stream)
{
    float elapsedTime;               /* holds timing variables */
    cudaError_t err;                 /* holds error value */

    /* create CUDA event handles */
    cudaEvent_t startEvent, stopEvent;
    CUDA_SAFE_CALL(cudaEventCreate(&startEvent));
    CUDA_SAFE_CALL(cudaEventCreate(&stopEvent));

    /* get all errors before kernel launch */
    if ((err = cudaGetLastError())) {
        printf(" File : %s , Line : %d , Error : %s \n", __FILE__, __LINE__, cudaGetErrorString(err));
    }

    /* Asynchronous kernel execution */
    int max = BLOCKSIZE * BLOCKSIZE;
    int BlocksPerGrid = matRowSize / max + 1;
    dim3 dimBlock(BLOCKSIZE, BLOCKSIZE);
    if (matRowSize % max == 0)
        BlocksPerGrid--;
    dim3 dimGrid(1, BlocksPerGrid);
    check_block_grid_dim(deviceProp, dimBlock, dimGrid);

    /* Starting the Asynchronus copy event */
    cudaEventRecord(startEvent);

    /* TODO(review): SOURCE is truncated inside this function and the launch
       chevrons / loop bounds were destroyed by the mangling.  The visible text
       shows two per-stream kernel-launch loops followed by the start of a
       third loop; everything from here down is reconstructed from that
       pattern and MUST be confirmed against the full source. */
    for (int ind = 0; ind < nkernels; ind++) {
        MatVectMultiplication<<<dimGrid, dimBlock, 0, stream[ind]>>>(device_Mat, device_Vect, matRowSize, vlength, device_ResVect);
    }
    for (int ind = 0; ind < nkernels; ind++) {
        MatVectMultiplication<<<dimGrid, dimBlock, 0, stream[ind]>>>(device_Mat, device_Vect, matRowSize, vlength, device_ResVect);
    }
    for (int ind = 0; ind < nkernels; ind++) {
        CUDA_SAFE_CALL(cudaMemcpyAsync(host_ResVect, device_ResVect,
                                       matRowSize * sizeof(double),
                                       cudaMemcpyDeviceToHost, stream[ind]));
    }

    cudaEventRecord(stopEvent);
    CUDA_SAFE_CALL(cudaEventSynchronize(stopEvent));
    CUDA_SAFE_CALL(cudaEventElapsedTime(&elapsedTime, startEvent, stopEvent));

    float tsec = elapsedTime / 1000.0f;    /* event time is in milliseconds */
    print_on_screen((char *)"MatVect Asynchronous Concurrent Execution",
                    tsec, calculate_gflops(tsec), size, 1);

    CUDA_SAFE_CALL(cudaEventDestroy(startEvent));
    CUDA_SAFE_CALL(cudaEventDestroy(stopEvent));
}