/************************************************************************

        C-DAC Tech Workshop : hyPACK-2013
                 October 15-18, 2013

  Source Code : blockPartitioning.cu

  Objective   : This program measures the bandwidth of global memory
                for different block sizes and a fixed-length array
                in a copy operation.

  Input       : None

  Output      : Bandwidth achieved for different block sizes

  Created     : August-2013

  E-mail      : hpcfte@cdac.in

**************************************************************************/

// NOTE(review): the header names were missing from the reviewed copy of this
// file; <stdio.h> and <cuda.h> are presumed -- confirm against the original.
#include <stdio.h>
#include <cuda.h>

#define ARRAY_SIZE 2097120   // number of float elements in each device array
#define N 5                  // number of block sizes being compared
#define HLINE "----------------------------------------------------\n"
#define NTIMES 10            // timed repetitions per block size

// Host-side helpers; their definitions are expected elsewhere in this file.
void printResults();
void printDeviceDetails();
void cudaSafeMalloc(void ** , size_t );
void CudaGetDeviceProperties(cudaDeviceProp *, int);
void CudaGetDevice(int *);
void checkCudaErrors();

// One slot per entry of blockSize[].
float avgTime[N] = {0};

// Threads-per-block configurations that are timed.
static int blockSize[] = {32,64,128,256,512};

///////////////////////////////////////////////////////////////////////////////////////////////////
// Simple Copy Kernel
// Copies one array on the device to another, one element per thread.
//
//   dest : destination device array, at least `size` elements
//   src  : source device array, at least `size` elements
//   size : number of elements to copy
//
// Expects a 1D launch (only the .x components are used) whose total thread
// count is at least `size`; surplus threads do nothing.
///////////////////////////////////////////////////////////////////////////////////////////////////
__global__ void simpleCopyKernel(float* dest,float* src,long size)
{
    // Widen before multiplying: blockIdx.x * blockDim.x is otherwise evaluated
    // in 32-bit unsigned arithmetic and then narrowed to int, which wraps (or
    // goes negative and slips past the bounds check) for large launches.
    long long idx = (long long)blockIdx.x * blockDim.x + threadIdx.x;

    if (idx < size)
        dest[idx] = src[idx];
}

///////////////////////////////////////////////////////////////////////////////////////////////////
// Kernel for initializing the array on the device with a given element.
//
//   array : device array to fill, at least `size` elements
//   value : value written to every element
//   size  : number of elements to fill
//
// Expects a 1D launch (only the .x components are used) whose total thread
// count is at least `size`; surplus threads do nothing.
///////////////////////////////////////////////////////////////////////////////////////////////////
__global__ void setArray(float *array, float value, int size)
{
    // 64-bit index so an oversized launch cannot wrap into the valid range.
    long long idx = (long long)blockIdx.x * blockDim.x + threadIdx.x;

    if (idx < size)
        array[idx] = value;
}

////////////////////////////////////////////////////////////////////////////////////////////////////
// Main Routine
// for timing the copy kernel and printing the results
/////////////////////////////////////////////////////////////////////////////////////////////////// int main(int argc,char* argv[]) { int i,j; float *d_srcArray , *d_destArray; //float bandWidths[N]; float elapsedTimes[N][NTIMES]; cudaEvent_t start,stop; int gridSize[N]; // allocating the memory for the two arrays on the device cudaSafeMalloc((void **)&d_srcArray,ARRAY_SIZE*sizeof(float)); cudaSafeMalloc((void **)&d_destArray,ARRAY_SIZE*sizeof(float)); // event creation, which will be used for timing the code cudaEventCreate(&start); cudaEventCreate(&stop); for(j=0; j< NTIMES; j++) { for(i=0; i>> (d_srcArray,1.0f,ARRAY_SIZE); setArray <<< gridSize[i],blockSize[i] >>> (d_destArray,0.0f,ARRAY_SIZE); cudaThreadSynchronize(); // timing the copy routine for different Block Sizes cudaEventRecord(start,0); simpleCopyKernel <<< gridSize[i],blockSize[i] >>> (d_destArray,d_srcArray,ARRAY_SIZE); cudaEventRecord(stop,0); cudaEventSynchronize(stop); cudaEventElapsedTime(&elapsedTimes[i][j],start,stop); } } //Computing average time for(i=0; i