/***************************************************************************
 *  C-DAC Tech Workshop : hyPACK-2013
 *  October 15-18, 2013
 *
 *  Example   : VectVectAdd.cu
 *  Objective : Write a CUDA program to compute Vector-Vector Addition
 *              using global memory implementation.
 *  Input     : None
 *  Output    : Execution time in seconds , Gflops achieved
 *  Created   : August-2013
 *  E-mail    : hpcfte@cdac.in
 **************************************************************************/

/*
 * NOTE(review): the header names of the three original #include lines were
 * lost (everything between '<' and '>' was stripped from this file). The
 * headers below are the ones the visible code requires; <math.h> is
 * presumably needed by the EPS comparison further down -- confirm.
 */
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <cuda_runtime.h>

#define EPS 1.0e-12     /* not used in the visible part of the file */
#define GRIDSIZE 10     /* not used in the visible part of the file */
#define BLOCKSIZE 16    /* edge of the square thread block (BLOCKSIZE x BLOCKSIZE) */
#define SIZE 128        /* initial value of the global `size` */

/* dMatA, dMatB and dresult are the pointers handed to the kernel. */
double *dMatA, *dMatB;
/* presumably the host-side counterparts -- they are not touched in the visible code */
double *hMatA, *hMatB;
double *dresult, *hresult, *CPU_Result, *host_MatC;
int vlength, count = 0;     /* vlength: number of elements passed to the kernel */
int blockWidth;
cudaEvent_t start, stop;
cudaDeviceProp deviceProp;  /* filled in by deviceQuery() */
int device_Count, size = SIZE;

/*
 * Element-wise vector addition: dres[i] = dm1[i] + dm2[i] for i < num.
 *
 * The 2D thread coordinates (tx, ty) are flattened into one linear index
 * using a row width of gridDim.x * blockDim.x. Threads whose index is not
 * below num do nothing, so the launch may over-provision threads.
 *
 * NOTE(review): the text from the bounds check up to the first comparison
 * inside check_block_grid_dim() was lost in this file. The guard and the
 * addition below are reconstructed from the kernel signature and the stated
 * objective; anything else that originally stood between the two functions
 * cannot be recovered from here.
 */
__global__ void vectvectadd(double* dm1, double* dm2, double *dres, int num)
{
    int tx = blockIdx.x * blockDim.x + threadIdx.x;
    int ty = blockIdx.y * blockDim.y + threadIdx.y;
    int tindex = tx + (gridDim.x) * (blockDim.x) * ty;

    if (tindex < num)
        dres[tindex] = dm1[tindex] + dm2[tindex];
}

/* Print a readable message and abort when a CUDA runtime call fails. */
static void vva_check_cuda(cudaError_t status, const char *what)
{
    if (status != cudaSuccess)
    {
        fprintf(stderr, "\nCUDA error in %s : %s\n", what, cudaGetErrorString(status));
        exit(-1);
    }
}

/*
 * Abort if the requested block or grid dimensions exceed the device limits.
 *
 * devProp  : properties of the device the kernel will run on
 * blockDim : requested threads per block in x, y, z
 * gridDim  : requested blocks per grid in x, y, z
 *
 * A dimension equal to the reported maximum is legal, so the comparisons are
 * strict (the earlier `>=` rejected valid configurations).
 *
 * NOTE(review): the signature is reconstructed from the call in
 * launch_kernel() and the names used in the body.
 */
void check_block_grid_dim(cudaDeviceProp devProp, dim3 blockDim, dim3 gridDim)
{
    if (   blockDim.x > (unsigned int)devProp.maxThreadsDim[0]
        || blockDim.y > (unsigned int)devProp.maxThreadsDim[1]
        || blockDim.z > (unsigned int)devProp.maxThreadsDim[2])
    {
        printf("\nBlock Dimensions exceed the maximum limits:%d * %d * %d \n",devProp.maxThreadsDim[0],devProp.maxThreadsDim[1],devProp.maxThreadsDim[2]);
        exit(-1);
    }

    if (   gridDim.x > (unsigned int)devProp.maxGridSize[0]
        || gridDim.y > (unsigned int)devProp.maxGridSize[1]
        || gridDim.z > (unsigned int)devProp.maxGridSize[2])
    {
        printf("\nGrid Dimensions exceed the maximum limits:%d * %d * %d \n",devProp.maxGridSize[0],devProp.maxGridSize[1],devProp.maxGridSize[2]);
        exit(-1);
    }
}

/*
 * Report a failed allocation and terminate the program.
 *
 * arrayname : name of the array that could not be allocated
 * benchmark : name of the benchmark being run
 * len       : number of elements that were requested
 * type      : element type name, used only in the message
 *
 * The string parameters are const so that string literals can be passed.
 */
void mem_error(const char *arrayname, const char *benchmark, int len, const char *type)
{
    printf("\nMemory not sufficient to allocate for array %s\n\tBenchmark : %s \n\tMemory requested = %d number of %s elements\n",arrayname, benchmark, len, type);
    printf("\n\tAborting\n\n");
    exit(-1);
}

/*
 * Get the number of GPU devices present on the host.
 *
 * Returns 0 (after reporting the error on stderr) when the runtime query
 * fails, instead of returning an uninitialised value.
 */
int get_DeviceCount()
{
    int count = 0;
    cudaError_t status = cudaGetDeviceCount(&count);

    if (status != cudaSuccess)
    {
        fprintf(stderr, "\ncudaGetDeviceCount failed : %s\n", cudaGetErrorString(status));
        return 0;
    }
    return count;
}

/*
 * Select device 0, store its properties in the global deviceProp and print
 * the device name. Aborts when no device is available or a runtime call fails.
 */
void deviceQuery()
{
    int device_Count;   /* local: shadows the global of the same name, which is not updated */
    int device;

    device_Count = get_DeviceCount();
    printf("\n\nNUmber of Devices : %d\n\n", device_Count);
    if (device_Count < 1)
    {
        fprintf(stderr, "\nNo CUDA capable device found\n\tAborting\n\n");
        exit(-1);
    }

    vva_check_cuda(cudaSetDevice(0), "cudaSetDevice");
    vva_check_cuda(cudaGetDevice(&device), "cudaGetDevice");
    vva_check_cuda(cudaGetDeviceProperties(&deviceProp, device), "cudaGetDeviceProperties");
    printf("Using device %d: %s \n", device, deviceProp.name);
}

/*
 * Launch kernel.
 *
 * Uses BLOCKSIZE x BLOCKSIZE thread blocks arranged in a one-row grid. Each
 * block supplies BLOCKSIZE*BLOCKSIZE threads, so the grid needs
 * ceil(vlength / (BLOCKSIZE*BLOCKSIZE)) blocks. The earlier expression
 * (vlength/BLOCKSIZE*BLOCKSIZE)+1 evaluated left to right and requested
 * roughly vlength blocks.
 *
 * NOTE(review): the execution configuration between <<< and >>> was lost in
 * this file; (dimGrid, dimBlock) is restored from the two locals built here.
 */
void launch_kernel()
{
    const int threadsPerBlock = BLOCKSIZE * BLOCKSIZE;
    int blocks = (vlength + threadsPerBlock - 1) / threadsPerBlock;

    /* A grid dimension of 0 is an invalid launch configuration. */
    if (blocks < 1)
        blocks = 1;

    dim3 dimBlock(BLOCKSIZE, BLOCKSIZE);
    dim3 dimGrid(blocks, 1);

    check_block_grid_dim(deviceProp, dimBlock, dimGrid);
    vectvectadd<<<dimGrid, dimBlock>>>(dMatA, dMatB, dresult, vlength);

    /* Launch-configuration errors are only visible through cudaGetLastError(). */
    vva_check_cuda(cudaGetLastError(), "vectvectadd launch");
}

/*
 * Compute the Gflops rating.
 *
 * Tsec : elapsed time in seconds
 * size : number of floating-point operations performed
 *
 * Returns 1.0e-9 * size / Tsec, or 0.0 when Tsec is not positive (avoids a
 * division by zero).
 */
double print_Gflops_rating(float Tsec, int size)
{
    if (!(Tsec > 0.0f))
        return 0.0;

    return 1.0e-9 * ((1.0 * size) / Tsec);
}

/*
 * Prints the result on screen.
 *
 * program_name : title printed in the banner
 * tsec         : elapsed time in seconds
 * gflops       : Gflops rating (only printed when flag == 1)
 * size         : problem size
 * flag         : 1 if gflops has been calculated, else 0
 *
 * When flag != 1 the time and Gflops columns are shown as "---"; these are
 * strings and are therefore printed with %s (they were previously passed to
 * %lf, which is undefined behaviour).
 */
void print_on_screen(const char *program_name, float tsec, double gflops, int size, int flag)
{
    printf("\n---------------%s----------------\n",program_name);
    printf("\tSIZE\t TIME_SEC\t Gflops\n");
    if (flag == 1)
        printf("\t%d\t%f\t%lf\t",size,tsec,gflops);
    else
        printf("\t%d\t%s\t%s\t",size,"---","---");
}

/* NOTE(review): dfree() is cut off at the end of the visible source; the fragment below is left exactly as found. */
void dfree(double * arr[],int len) { for(int i=0;i