/************************************************************************************ C-DAC Tech Workshop : hyPACK-2013 October 15-18, 2013 Source Code : sharedMemoryRestructuringDataTypes.cu Objective : This code demonstrates achievable shared memory bandwidth for different inbuilt data types Description : Example code to demonstrate the different shared memory bandwidths achieved when 1) accessing a 3d array of floats 2) accessing a float3 array 3) accessing a 4d array of floats 4) accessing a float4 array Input : None output : The different bandwidths of the shared memory that are achieved in the above mentioned accesses Created : August-2013 E-mail : hpcfte@cdac.in ****************************************************************************************/ #include #include #include #define NO_OF_PATTERNS 4 #define NTIMES 10 #define BLOCK_SIZE 128 #define TRANSFERED_DATA_SIZE 2000000 //2 MB #define MIN(x,y) ((x)<(y)?(x):(y)) #define MAX(x,y) ((x)>(y)?(x):(y)) void printResults(void); //////////////////////////////////////////////////////////////////////////////////////////////// // // intialising an unused 3D float array in shared memory // access pattern involving accessing the i-th element from all the rows by the i-th thread // ///////////////////////////////////////////////////////////////////////////////////////////////// __global__ void set3DFloatArray( float value ) { __shared__ __attribute__((unused)) float array[3][BLOCK_SIZE]; int idx = threadIdx.x; array[0][idx] = value; array[1][idx] = value; array[2][idx] = value; } ///////////////////////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////////////////////////////// // // intialising an unused float3 array in shared memory // access pattern involving accessing all the fields of the i-th element of the array by the i-th thread // //////////////////////////////////////////////////////////////////////////////////////////////////////// __global__ void setFloat3Array( float value ) { __shared__ __attribute__((unused)) float3 array[BLOCK_SIZE]; int idx = threadIdx.x; array[idx].x = value; array[idx].y = value; array[idx].z = value; } //////////////////////////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////////////// // // intialising an unused 4D float array in shared memory // access pattern involving accessing the i-th element from all the rows by the i-th thread // /////////////////////////////////////////////////////////////////////////////////////////////// __global__ void set4DFloatArray( float value ) { __shared__ __attribute__((unused)) float array[4][BLOCK_SIZE]; int idx = threadIdx.x; array[0][idx] = value; array[1][idx] = value; array[2][idx] = value; array[3][idx] = value; } /////////////////////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////////////////////////////// // // intialising an unused float4 array in shared memory // access pattern involving accessing all the fields of the i-th element of the array by the i-th thread // ////////////////////////////////////////////////////////////////////////////////////////////////////////// __global__ void setFloat4Array( float value ) { __shared__ __attribute__((unused)) float4 array[BLOCK_SIZE]; int idx = threadIdx.x; array[idx].x = value; array[idx].y = value; array[idx].z = value; array[idx].w = value; } /////////////////////////////////////////////////////////////////////////////////////////////////////////// static double avgtime[NO_OF_PATTERNS] = {0}, maxtime[NO_OF_PATTERNS] = {0}, mintime[NO_OF_PATTERNS]; static float bandWidths[NO_OF_PATTERNS] = {0}; //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// // // the main routene // for timing the different initialising kernels // finding the band widths // //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// int main(int argc, char* argv[]) { float elapsedTimes[NO_OF_PATTERNS][NTIMES]; cudaEvent_t start,stop; double bytes = sizeof(float) * TRANSFERED_DATA_SIZE; // event creation, which will be used for timing the code cudaEventCreate(&start); cudaEventCreate(&stop); //finding the 1D grid size int gridSize = TRANSFERED_DATA_SIZE/BLOCK_SIZE; if( TRANSFERED_DATA_SIZE % BLOCK_SIZE != 0 ) gridSize += 1; // running each pattern NTIMES for(int k=0; k < NTIMES; k++) { // timing the kernels corresponding to different access patterns // PATTERN 1 cudaEventRecord(start,0); set3DFloatArray <<< gridSize,BLOCK_SIZE >>> (2.0f); cudaEventRecord(stop,0); cudaEventSynchronize(stop); cudaEventElapsedTime(&elapsedTimes[0][k],start,stop); // PATTERN 2 cudaEventRecord(start,0); setFloat3Array <<< gridSize,BLOCK_SIZE >>> (2.0f); cudaEventRecord(stop,0); cudaEventSynchronize(stop); cudaEventElapsedTime(&elapsedTimes[1][k],start,stop); //PATTERN 3 cudaEventRecord(start,0); set4DFloatArray <<< gridSize,BLOCK_SIZE >>> (2.0f); cudaEventRecord(stop,0); cudaEventSynchronize(stop); cudaEventElapsedTime(&elapsedTimes[2][k],start,stop); //PATTERN 3 cudaEventRecord(start,0); setFloat4Array <<< gridSize,BLOCK_SIZE >>> (2.0f); cudaEventRecord(stop,0); cudaEventSynchronize(stop); cudaEventElapsedTime(&elapsedTimes[3][k],start,stop); } // end of the for loop involving NTIMES // intializing the mintime array for(int i=0; i < NO_OF_PATTERNS;i++) mintime[i] = FLT_MAX; for (int k=1; k < NTIMES; k++) // skiping the first iteration { for (int i=0; i < NO_OF_PATTERNS; i++) { avgtime[i] = avgtime[i] + elapsedTimes[i][k]; mintime[i] = MIN(mintime[i],elapsedTimes[i][k]); maxtime[i] = MAX(maxtime[i], elapsedTimes[i][k]); } } // calculation of the different band widths that are achieved by different access patterns for(int i=0; i < NO_OF_PATTERNS; i++) { avgtime[i] = avgtime[i]/(double)(NTIMES-1); // finding the average time bandWidths[i] = bytes/mintime[i]; } printResults(); printf("\n\n**** successful termination of the program ****\n\n"); cudaEventDestroy(start); cudaEventDestroy(stop); return 0; } //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// // // prints the results containig the minimum, maximum, average times taken by different kernels // the associated maximum bandwidth of the sharede memory achieved by the different kernals // /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// void printResults() { printf("Demonstrating differences among the different data types and their accesses in the shared memory\n"); printf("The transfered data size (Bytes): %ld\n",TRANSFERED_DATA_SIZE*sizeof(float)); printf("\n-------------------------------------------------------------------------------------------------------------------------------\n"); printf("Pattern \t\t\t Bandwidth (GB/sec) \t Avg time (ms) \t Min time (ms) \t Max time (ms)\n"); printf("-------------------------------------------------------------------------------------------------------------------------------\n"); // printing the results for different access patterns for(int i=0; i < NO_OF_PATTERNS; i++) { switch(i) { case 0: printf("accessing 3-d float array "); break; case 1: printf("accessing 1-d float3 array"); break; case 2: printf("accessing 4-d float array "); break; case 3: printf("accessing 1-d float4 array"); break; } printf("\t %.6f \t\t %f \t\t %f \t\t %f\n",bandWidths[i]/1000000,avgtime[i],mintime[i],maxtime[i]); } printf("\n ------------------------------------------------------------------------------------------------------------------------------\n"); } //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////