/*****************************************************************************
 *  C-DAC Tech Workshop : hyPACK-2013
 *  October 15-18, 2013
 *
 *  Example   : VectVectMult.cu
 *
 *  Objective : Write a CUDA Program to perform Vector Vector multiplication
 *              using global memory implementation.
 *
 *  Input     : None
 *
 *  Output    : Execution time in seconds , Gflops achieved
 *
 *  Created   : August-2013
 *
 *  E-mail    : hpcfte@cdac.in
 *****************************************************************************/

/* NOTE(review): the targets of the two original #include lines were not
 * legible in the reviewed copy of this file. The headers below cover every
 * name used in this section (printf, exit, CUDA runtime API) -- confirm that
 * no other header is needed by code further down the file. */
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>

#define EPS 1.0e-12
#define GRIDSIZE 10
#define BLOCKSIZE 16
#define SIZE 128

double *dMatA, *dMatB;
double *hMatA, *hMatB;
double *dresult, *hresult;
int vlength , count = 0;
int blockWidth;
cudaEvent_t start,stop;
cudaDeviceProp deviceProp;
int device_Count;
float elapsedTime;
double Tsec,gflops;
long long int start_time,end_time;

/*
 * Kernel Function
 *
 * Element-wise product of two vectors: C[i] = A[i] * B[i] for 0 <= i < len.
 *
 * Launch layout: the flat thread index is built from blockIdx.x and the
 * x/y components of threadIdx/blockDim, so a 1D grid of 1D or 2D blocks is
 * expected (gridDim.y / blockDim.z are not taken into account).
 * Each thread starts at its flat index and advances by the total number of
 * launched threads, so any grid size >= 1 covers the whole vector.
 *
 * len : number of elements to process
 * A,B : input vectors (read only)
 * C   : output vector
 */
__global__ void vvmul(int len, const double* A, const double* B, double *C)
{
    const int threadsInBlock = blockDim.x * blockDim.y;

    /* 64-bit index so that "idx += stride" cannot overflow near INT_MAX. */
    long long idx = (long long)blockIdx.x * threadsInBlock
                  + threadIdx.y * blockDim.x
                  + threadIdx.x;

    /* The stride must count every thread of the (possibly 2D) block;
     * using only blockDim.x made several threads recompute the same
     * elements. */
    const long long stride = (long long)threadsInBlock * gridDim.x;

    for (; idx < len; idx += stride)
    {
        C[idx] = A[idx] * B[idx];
    }
}

/*
 * Check for safe return of all calls to the device.
 *
 * Returns silently when the call succeeded; otherwise prints the numeric
 * error code together with cudaGetErrorString() and terminates the program
 * with exit(-1).
 *
 * Note: __LINE__ below expands to the line of this function, not to the
 * line of the failing call site.
 */
void CUDA_SAFE_CALL(cudaError_t call)
{
    if (call == cudaSuccess)
        return;

    printf(" ERROR at line :%i.%d' ' %s\n", __LINE__, (int)call, cudaGetErrorString(call));
    exit(-1);
}

/*
 * Get the number of GPU devices present on the host.
 *
 * Returns 0 when cudaGetDeviceCount() reports an error, instead of an
 * uninitialised value.
 */
int get_DeviceCount()
{
    int numDevices = 0;

    if (cudaGetDeviceCount(&numDevices) != cudaSuccess)
        numDevices = 0;

    return numDevices;
}

/*
 * Function to launch kernel for execution.
 *
 * Reads the globals vlength, dMatA, dMatB, dresult, start and stop; writes
 * elapsedTime, Tsec and gflops and prints the latter two.
 * The events start/stop are only recorded here, so they must have been
 * created by the caller beforehand.
 */
void launch_kernel()
{
    const dim3 threadsPerBlock(16, 16);
    const int  threadsInBlock = (int)(threadsPerBlock.x * threadsPerBlock.y);

    /* 65535 is a grid x-dimension every compute capability accepts; the
     * kernel loops over the remainder, so clamping stays correct. */
    const int maxBlocks = 65535;

    /* Ceiling division (written without "vlength + 255" to avoid overflow).
     * Always at least one block, also for vlength <= 0. */
    int numBlocks = 1;
    if (vlength > threadsInBlock)
    {
        numBlocks = vlength / threadsInBlock + (vlength % threadsInBlock != 0 ? 1 : 0);
        if (numBlocks > maxBlocks)
            numBlocks = maxBlocks;
    }
    const dim3 blocksPerGrid(numBlocks, 1);

    CUDA_SAFE_CALL(cudaEventRecord(start, 0));
    vvmul<<<blocksPerGrid, threadsPerBlock>>>(vlength, dMatA, dMatB, dresult);
    /* A kernel launch returns no status itself; fetch launch errors here. */
    CUDA_SAFE_CALL(cudaGetLastError());
    CUDA_SAFE_CALL(cudaEventRecord(stop, 0));
    CUDA_SAFE_CALL(cudaEventSynchronize(stop));
    CUDA_SAFE_CALL(cudaEventElapsedTime(&elapsedTime, start, stop));

    /* cudaEventElapsedTime reports milliseconds. */
    Tsec = elapsedTime * 1.0e-3;
    printf("time taken is %.8lf\n", Tsec);

    /* NOTE(review): the factor 2.0 counts two operations per element while
     * the kernel performs one multiplication per element -- confirm the
     * intended flop count before changing the reported figure. */
    gflops = (2.0e-9 * ((vlength) / Tsec));
    printf("Gflops is \t%f\n", gflops);
}

/*
 * Function to get device information.
 *
 * Prints the number of devices, selects device 0 and stores its properties
 * in the global deviceProp. Any failing CUDA call aborts the program via
 * CUDA_SAFE_CALL. The global device_Count is not modified here.
 */
void deviceQuery()
{
    const int numDevices = get_DeviceCount();
    printf("\n\nNUmber of Devices : %d\n\n", numDevices);

    CUDA_SAFE_CALL(cudaSetDevice(0));

    int device;
    CUDA_SAFE_CALL(cudaGetDevice(&device));
    CUDA_SAFE_CALL(cudaGetDeviceProperties(&deviceProp, device));
    printf("Using device %d: %s \n", device, deviceProp.name);
}

/*
 * Function for memory check.
 *
 * Reports a failed allocation and terminates the program with exit(-1).
 *
 * arrayname : name of the array that could not be allocated
 * benchmark : name of the benchmark being run
 * len       : number of elements that were requested
 * type      : name of the element type
 *
 * The string parameters are const so that string literals can be passed.
 */
void mem_error(const char *arrayname, const char *benchmark, int len, const char *type)
{
    printf("\nMemory not sufficient to allocate for array %s\n\tBenchmark : %s \n\tMemory requested = %d number of %s elements\n",arrayname, benchmark, len, type);
    printf("\n\tAborting\n\n");
    exit(-1);
}

/* Fill in the vector with double precision values */
void fill_dp_vector(double* vec,int size) { int ind; for(ind=0;ind