/************************************************************************ C-DAC Tech Workshop : hyPACK-2013 October 15-18, 2013 Source Code : vectorModel.cu Program : GPU as SIMD Processor using Vector Programming model Objective : To demonstrate that better bandwidth can be achieved if each thread handles more than one element using GPU as a 32-way SIMD processor. This Program measures the bandwidth of global memory for simple initialization kernel operation [a(i) = value]. Input : None Output : Bandwidth achieved and timing (average) Created : August-2013 E-mail : hpcfte@cdac.in *******************************************************************************/ #include #include #define ARRAY_SIZE 1280000 #define BLOCK_SIZE 32 #define FACTOR 4 #define NTIMES 10 #define HLINE "--------------------------------------------------------------\n" void printResults(); void printDeviceDetails(); void cudaSafeMalloc(void ** , size_t ); void CudaGetDeviceProperties(cudaDeviceProp *, int); void CudaGetDevice(int *); void checkCudaErrors(); float avgTime[2] = {0}; char *label[] = {"Normal ","Vector Model"}; /////////////////////////////////////////////////////////////////////////////////////////////////////// // Kernel for initializing the array - straightforward: each thread computes idx = threadIdx.x + blockIdx.x * blockDim.x and writes array[idx] = value once, guarded by idx < size /////////////////////////////////////////////////////////////////////////////////////////////////////// __global__ void initializationNormally(float *array, float value, int size) { int idx = threadIdx.x + blockIdx.x * blockDim.x; if (idx < size) array[idx] = value; } /////////////////////////////////////////////////////////////////////////////////////////////////////// // Kernel for initializing the array using Vector Model: each block's base index is blockIdx.x * blockDim.x * FACTOR. NOTE(review): in this copy the loop condition/body of this kernel and everything up to the first kernel launch in main are missing (as are the header names of the two #include directives) - restore from the original file before building /////////////////////////////////////////////////////////////////////////////////////////////////////// __global__ void initializationWithVectorModel(float *array, float value, int size) { int idx = threadIdx.x + blockIdx.x * blockDim.x * FACTOR; if (idx < size) for(int i=0; i>>(d_array, 1, 
ARRAY_SIZE); cudaEventRecord(stop,0); cudaEventSynchronize(stop); cudaEventElapsedTime(&elapsedTimes[0][i],start,stop); checkCudaErrors(); // timing the initialization with the Vector Model kernel (launched on grid2; elapsed time stored in elapsedTimes[1][i]) cudaEventRecord(start,0); initializationWithVectorModel<<< grid2, block>>>(d_array, 1, ARRAY_SIZE); cudaEventRecord(stop,0); cudaEventSynchronize(stop); cudaEventElapsedTime(&elapsedTimes[1][i],start,stop); checkCudaErrors(); } //Computing average time taken for each of the two kernels. NOTE(review): the inner loop starts at j=1, which looks like it skips the first (warm-up) sample - confirm against the full file for(i=0; i<2; i++) { for(j=1; j