/********************************************************************
 * C-DAC Tech Workshop : hyPACK-2013, October 15-18, 2013
 *
 * Example   : VectVectAdd_shared.cu
 * Objective : Write a CUDA program to compute vector-vector addition
 *             using a shared-memory implementation.
 * Input     : None
 * Output    : Execution time in seconds, Gflops achieved
 * Created   : August-2013
 * E-mail    : hpcfte@cdac.in
 *
 * NOTE(review): this copy of the file has lost every "<...>" span
 * (include header names, part of the kernel body, the kernel launch
 * configuration, and at least one function header). The code tokens
 * below are kept exactly as found; the damaged spots are flagged with
 * NOTE(review) comments and must be restored from the original source
 * before this file can compile.
 ************************************************************************/
/* NOTE(review): the three header names are missing from these
 * directives -- restore them from the original file. */
#include
#include
#include
/* Compile-time parameters. SIZE initialises the global `size` below and
 * BLOCKSIZE is the block edge used by launch_kernel(); EPS and GRIDSIZE
 * are not referenced in the part of the file visible here. */
#define SIZE 128
#define EPS 1.0e-12
#define GRIDSIZE 10
#define BLOCKSIZE 16
/* File-scope state shared by the functions below.
 * presumably the d-prefixed pointers are device buffers and the h-prefixed
 * ones host buffers -- their allocation is not visible here; confirm. */
double *dMatA, *dMatB;
double *hMatA, *hMatB;
double *dresult, *hresult,*CPU_Result,*host_MatC;
int vlength , count = 0;
int blockWidth;
cudaEvent_t start,stop;
/* Filled in by deviceQuery() and read by launch_kernel(). */
cudaDeviceProp deviceProp;
int device_Count,size=SIZE;
/*
 * Kernel: vectvectadd
 *
 * Parameters: vect1, vect2 (inputs), result (output), width.
 * presumably adds vect1 and vect2 element-wise into result over `width`
 * elements -- the loop body is missing from this copy, so confirm.
 *
 * Each thread derives a flat index `tid` from its 2-D thread and block
 * coordinates.
 *
 * NOTE(review): the blockIdx.x term is scaled by threads_per_block but
 * the blockIdx.y term is multiplied by gridDim.x only, which looks
 * inconsistent for a flat global index. launch_kernel() uses a 1x1 grid,
 * so both block indices are 0 there and the discrepancy has no effect
 * for that launch -- confirm before using larger grids.
 */
__global__ void vectvectadd(double *vect1,double *vect2,double *result,int width)
{
    /* Threads in one 2-D block. */
    int threads_per_block=blockDim.x*blockDim.y;
    /* Blocks in the 2-D grid; its use is in the part of the body that is missing. */
    int total_blocks_in_grid=gridDim.x*gridDim.y;
    int tid=threadIdx.x + threadIdx.y*blockDim.x + blockIdx.x*threads_per_block + blockIdx.y*gridDim.x;
    /* NOTE(review): the text is damaged from here. The kernel's while-loop
     * (condition, body and closing braces) and whatever definitions
     * followed it are missing; the tokens after "while(tid" are the tail
     * of a limit-check condition on blockDim/gridDim against devProp,
     * presumably from check_block_grid_dim(), which launch_kernel() calls
     * with (deviceProp, dimBlock, dimGrid).
     *
     * What is visible of that check: it prints a message and calls
     * exit(-1) when a block or grid dimension is >= the corresponding
     * devProp.maxThreadsDim / devProp.maxGridSize entry.
     * NOTE(review): because the comparison is >=, a dimension exactly
     * equal to the reported maximum is rejected; if those fields are
     * inclusive maxima, ">" is probably what was intended -- confirm. */
    while(tid= devProp.maxThreadsDim[0] ||
        blockDim.y >= devProp.maxThreadsDim[1] ||
        blockDim.z >= devProp.maxThreadsDim[2] )
    {
        printf("\nBlock Dimensions exceed the maximum limits:%d * %d * %d \n",devProp.maxThreadsDim[0],devProp.maxThreadsDim[1],devProp.maxThreadsDim[2]);
        exit(-1);
    }
    if( gridDim.x >= devProp.maxGridSize[0] ||
        gridDim.y >= devProp.maxGridSize[1] ||
        gridDim.z >= devProp.maxGridSize[2] )
    {
        printf("\nGrid Dimensions exceed the maximum limits:%d * %d * %d \n",devProp.maxGridSize[0],devProp.maxGridSize[1],devProp.maxGridSize[2]);
        exit(-1);
    }
}
/*
 * launch_kernel
 *
 * Builds a BLOCKSIZE x BLOCKSIZE block (16 x 16 with the current define)
 * and a 1 x 1 grid, passes both to check_block_grid_dim() together with
 * the global deviceProp, then launches vectvectadd on the global buffers
 * dMatA, dMatB and dresult with vlength as the width argument. Progress
 * messages are printed before and after the launch.
 */
void launch_kernel()
{
    dim3 dimBlock(BLOCKSIZE, BLOCKSIZE);
    dim3 dimGrid(1,1);
    check_block_grid_dim(deviceProp,dimBlock,dimGrid);
    printf("calling kernel\n");
    /* NOTE(review): the execution configuration between <<< and >>> is
     * missing in this copy; presumably it was dimGrid, dimBlock -- restore
     * from the original. No launch error is queried after this statement,
     * so a failed launch would go unreported here. */
    vectvectadd<<>>(dMatA, dMatB, dresult,vlength );
    printf("after kernel call\n");
}
/*
 * deviceQuery
 *
 * Prints the device count obtained from get_DeviceCount() (defined outside
 * the visible text), selects device 0, reads the current device id back,
 * stores that device's properties in the global deviceProp and prints the
 * device id and name.
 *
 * NOTE(review): the local device_Count shadows the file-scope variable of
 * the same name, so the global is not updated by this function. None of
 * the CUDA runtime return codes are checked.
 */
void deviceQuery()
{
    int device_Count;
    device_Count=get_DeviceCount();
    printf("\n\nNUmber of Devices : %d\n\n", device_Count);
    cudaSetDevice(0);
    int device;
    cudaGetDevice(&device);
    cudaGetDeviceProperties(&deviceProp,device);
    printf("Using device %d: %s \n", device, deviceProp.name);
}
/*
 * mem_error
 *
 * Reports an allocation failure and terminates the program.
 *
 *   arrayname - name of the array that could not be allocated
 *   benchmark - name of the benchmark being run
 *   len       - number of elements that were requested
 *   type      - element type name used in the message
 *
 * Does not return: calls exit(-1) after printing.
 *
 * NOTE(review): the string parameters are non-const char *, so passing
 * string literals draws a deprecated-conversion diagnostic in C++.
 */
void mem_error(char *arrayname, char *benchmark, int len, char *type)
{
    printf("\nMemory not sufficient to allocate for array %s\n\tBenchmark : %s \n\tMemory requested = %d number of %s elements\n",arrayname, benchmark, len, type);
    printf("\n\tAborting\n\n");
    exit(-1);
}
/*
 * fill_dp_vector
 *
 * Fills the vector with double-precision values.
 *
 * NOTE(review): the function is cut off inside its for-loop header in
 * this copy; the loop bound and body are not visible.
 */
void fill_dp_vector(double* vec,int size)
{
    int ind;
    for(ind=0;ind