/*********************************************************************** C-DAC Tech Workshop : hyPACK-2013 October 15-18, 2013 Example : cuda-vector-vector-addition-grid-blocks-threads.cu Objective : Write a CUDA program to compute Vector-Vector Addition using global memory implementation. Input : None Output : Execution time in seconds , Gflops achieved Created : August-2013 E-mail : hpcfte@cdac.in ************************************************************************/ #include #include #include #define EPS 1.0e-12 #define BLOCKSIZE 16 #define SIZE 128 /* global variable declaration */ cudaDeviceProp deviceProp; // holds device property /* kernel to calculate vector vector addition */ __global__ void vectvectadd(double* dm1,double* dm2,double *dres,int num) { int tx = blockIdx.x * blockDim.x + threadIdx.x; int ty = blockIdx.y * blockDim.y + threadIdx.y; int tindex=tx + (gridDim.x) * (blockDim.x) * ty; if(tindex < num) dres[tindex] = dm1[tindex] + dm2[tindex]; } /* Check for safe return of all calls to the device */ void CUDA_SAFE_CALL(cudaError_t call) { cudaError_t ret = call; switch(ret) { case cudaSuccess: // printf("Success\n"); break; default: { printf(" ERROR at line :%i.%d' ' %s\n",__LINE__,ret,cudaGetErrorString(ret)); exit(-1); break; } } } /* Fill in the vector with double precision values */ void fill_dp_vector(double* vec,int size) { int index; for(index=0 ; index < size ; index++) vec[index]=drand48(); } /* checl grid and block dimensions */ void check_block_grid_dim(cudaDeviceProp devProp,dim3 blockDim,dim3 gridDim) { if( blockDim.x >= devProp.maxThreadsDim[0] || blockDim.y >= devProp.maxThreadsDim[1] || blockDim.z >= devProp.maxThreadsDim[2] ) { printf("\nBlock Dimensions exceed the maximum limits:%d * %d * %d \n",devProp.maxThreadsDim[0],devProp.maxThreadsDim[1],devProp.maxThreadsDim[2]); exit(-1); } if( gridDim.x >= devProp.maxGridSize[0] || gridDim.y >= devProp.maxGridSize[1] || gridDim.z >= devProp.maxGridSize[2] ) { printf("\nGrid Dimensions exceed the maximum limits:%d * %d * %d \n",devProp.maxGridSize[0],devProp.maxGridSize[1],devProp.maxGridSize[2]); exit(-1); } } /* function to check memory error */ void mem_error(char *arrayname, char *benchmark, int len, char *type) { printf("\nMemory not sufficient to allocate for array %s\n\tBenchmark : %s \n\tMemory requested = %d number of %s elements\n",arrayname, benchmark, len, type); printf("\n\tAborting\n\n"); exit(-1); } /* Get the number of GPU devices present on the host */ int get_DeviceCount() { int count; cudaGetDeviceCount(&count); return count; } /* function to calculate relative error */ void relError(double* dRes,double* hRes,int size) { double relativeError=0.0,errorNorm=0.0; int flag=0; int index; for( index = 0; index < size; ++index) { if (fabs(hRes[index]) > fabs(dRes[index])) relativeError = fabs((hRes[index] - dRes[index]) / hRes[index]); else relativeError = fabs((dRes[index] - hRes[index]) / dRes[index]); if (relativeError > EPS && relativeError != 0.0e+00 ) { if(errorNorm < relativeError) { errorNorm = relativeError; flag=1; } } } if( flag == 1) { printf(" \n\t Results verfication : Failed"); printf(" \n \tConsidered machine precision : %e", EPS); printf(" \n \tRelative Error : %e\n", errorNorm); } else printf("\n \tResults verfication : Success\n"); } /* function to check device availability */ void deviceQuery() { int device_Count; device_Count = get_DeviceCount(); printf("\n\nNUmber of Devices : %d\n\n", device_Count); cudaSetDevice(0); int device; cudaGetDevice(&device); cudaGetDeviceProperties(&deviceProp,device); printf("Using device %d: %s \n", device, deviceProp.name); } /* Launch kernel */ void launch_kernel(double *dVectA , double *dVectB , double *dresult , int vlength) { dim3 dimBlock(BLOCKSIZE, BLOCKSIZE); // holds number of blocks per grid dim3 dimGrid((vlength/BLOCKSIZE*BLOCKSIZE)+1,1); // holds number of threads per block /* check block and grid dimension */ check_block_grid_dim(deviceProp,dimBlock,dimGrid); /* Launch vector vector additon kernel */ vectvectadd<<>>(dVectA, dVectB, dresult,vlength ); } /* Function to print gflosp rating */ double print_Gflops_rating(float Tsec,int size) { double gflops; gflops=(1.0e-9 * (( 1.0 * size )/Tsec)); return gflops; } /* prints the result on screen */ void print_on_screen(char * program_name,float tsec,double gflops,int size,int flag)//flag=1 if gflops has been calculated else flag =0 { printf("\n---------------%s----------------\n",program_name); printf("\tSIZE\t TIME_SEC\t Gflops\n"); if(flag==1) printf("\t%d\t%f\t%lf\t",size,tsec,gflops); else printf("\t%d\t%lf\t%lf\t",size,"---","---"); } /* Function to perform computation on CPU*/ void vectVectAdd_Seq(double *A ,double *B,double *C,int size) { for(int index=0 ; index < size ; index++) C[index] = A[index] + B[index]; } /* main function */ int main() { double *dVectA, *dVectB; // holds device input vector double *hVectA, *hVectB; // holds host input vector double *dresult; // holds device resultant vector double *hVectC; // holds host resultant vector double *cpuVectC; //holds host-cpu resultant vector int vlength ; // holds vector length cudaEvent_t start,stop; // holds cuda event vlength = SIZE; //holds vector length /* check device availability */ deviceQuery(); /* create cuda events */ CUDA_SAFE_CALL(cudaEventCreate (&start)); CUDA_SAFE_CALL(cudaEventCreate (&stop)); /* allocation pageable host memory */ hVectA = (double*) malloc( vlength * sizeof(double)); hVectB = (double*) malloc( vlength * sizeof(double)); hVectC = (double*) malloc( vlength * sizeof(double)); cpuVectC = (double*) malloc( vlength * sizeof(double)); /* check for memory error */ if(hVectA==NULL) mem_error("hVectA","vectvectmul",vlength,"double"); if(hVectB==NULL) mem_error("hVectB","vectvectmul",vlength,"double"); if(hVectC==NULL) mem_error("hVectC","vectvectmul",vlength,"double"); /* allocation device memory */ CUDA_SAFE_CALL(cudaMalloc( (void**)&dVectA, vlength * sizeof(double))); CUDA_SAFE_CALL(cudaMalloc( (void**)&dVectB, vlength * sizeof(double))); CUDA_SAFE_CALL(cudaMalloc( (void**)&dresult, vlength*sizeof(double))); /* fill host vectors with random generated double precision value */ fill_dp_vector(hVectA,vlength); fill_dp_vector(hVectB,vlength); for(int index=0 ; index < vlength ; index++) { hVectC[index] = 0.0; } for(int index=0 ; index < vlength ; index++) { cpuVectC[index]=0.0; } /* copy host vector to device vector */ CUDA_SAFE_CALL(cudaMemcpy((void*)dVectA, (void*)hVectA, vlength* sizeof(double) , cudaMemcpyHostToDevice )); CUDA_SAFE_CALL(cudaMemcpy((void*)dVectB, (void*)hVectB, vlength* sizeof(double) , cudaMemcpyHostToDevice )); /* start time */ CUDA_SAFE_CALL(cudaEventRecord (start, 0)); /* function to launch vector vector addition kernel */ launch_kernel(dVectA,dVectB,dresult,vlength); /* time ends */ CUDA_SAFE_CALL(cudaEventRecord (stop, 0)); CUDA_SAFE_CALL(cudaEventSynchronize (stop)); /* copy resultant vector from device to host */ CUDA_SAFE_CALL(cudaMemcpy((void*)hVectC, (void*)dresult,vlength*sizeof(double) , cudaMemcpyDeviceToHost )); /* computing elapsed time */ float elapsedTime; double Tsec; CUDA_SAFE_CALL(cudaEventElapsedTime(&elapsedTime, start, stop)); Tsec = elapsedTime *1.0e-3; //time in sec now /* calculation of Gflops */ print_Gflops_rating(Tsec,vlength); /* printing the result on screen */ print_on_screen("vect vect ADDITION",Tsec, print_Gflops_rating(Tsec,vlength),vlength,1); /* uncomment to get the sequential result of vector vector additoion */ vectVectAdd_Seq(hVectA ,hVectB,cpuVectC,vlength); /* uncomment to compare host-cpu results with gpu results */ relError(hVectC,cpuVectC,vlength); /* free host memory */ free(hVectA); free(hVectB); free(hVectC); free(cpuVectC); /* free device memory */ cudaFree(dVectA); cudaFree(dVectB); cudaFree(dresult); }