/******************************************************************** C-DAC Tech Workshop : hyPACK-2013 October 15-18, 2013 Source Code : sharedMemoryReadingSameWord.cu Objective : Example code to demonstrate the different access patterns of float3 array in the global memory the corresponding advantages in terms of the bandwidth that is achievable Descritpion : Example code to demonstrate that while reading the same word by all the threads, there will not be any serialization since all threads are accessing from the same bank the 32-bit word gets broadcasted to all the threads -- hence bandwidth can be comparable to the value got when there were no bank conflicts output: The different bandwidths of the shared memory that are achieved while accessing the same word and the access pattern without any bank conflicts Created : August-2013 E-mail : hpcfte@cdac.in ****************************************************************************/ #include #include #include #define BLOCK_SIZE 128 // 128 threads per block #define TRANSFERED_DATA_SIZE 2000000 //2 MB #define NO_OF_PATTERNS 2 #define NTIMES 10 #define MIN(x,y) ((x)<(y)?(x):(y)) #define MAX(x,y) ((x)>(y)?(x):(y)) void printResults(void); //////////////////////////////////////////////////////////////////////////////////// // // access pattern involving accessing the elements with a stride of one 32-bit word // the access pattern causes no bank conflicts // //////////////////////////////////////////////////////////////////////////////////// __global__ void sharedMemAccessWithStride1(void) { __shared__ __attribute__((unused)) float array[BLOCK_SIZE]; int idx = threadIdx.x; float scalar = 5.0f; array[idx] = scalar; } /////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////// // // access pattern involving reading the same word by all the threads // the access pattern is conflict-free // since the word gets broadcasted to all the threads // ///////////////////////////////////////////////////////////////////////////////// __global__ void sharedMemReadSameWord(void) { __shared__ float array[BLOCK_SIZE]; int idx = 3; float __attribute__((unused)) scalar ; scalar = array[idx]; } ///////////////////////////////////////////////////////////////////////////////// static double avgtime[NO_OF_PATTERNS] = {0}, maxtime[NO_OF_PATTERNS] = {0}, mintime[NO_OF_PATTERNS]; static float bandWidths[NO_OF_PATTERNS] = {0}; ////////////////////////////////////////////////////////////////////////////////////// // // the main routene // for timing the two kernals // finding the corresponding band widths // /////////////////////////////////////////////////////////////////////////////////// int main(int argc, char* argv[]) { float elapsedTimes[NO_OF_PATTERNS][NTIMES]; cudaEvent_t start,stop; double bytes = sizeof(float) * TRANSFERED_DATA_SIZE; // event creation, which will be used for timing the code cudaEventCreate(&start); cudaEventCreate(&stop); //finding the 1D grid size int gridSize = TRANSFERED_DATA_SIZE/BLOCK_SIZE; if( TRANSFERED_DATA_SIZE % BLOCK_SIZE != 0 ) gridSize += 1; // running each pattern NTIMES for(int k=0; k < NTIMES; k++) { // timing the kernels corresponding to different access patterns // PATTERN 1 cudaEventRecord(start,0); sharedMemAccessWithStride1 <<< gridSize,BLOCK_SIZE >>> (); cudaEventRecord(stop,0); cudaEventSynchronize(stop); cudaEventElapsedTime(&elapsedTimes[0][k],start,stop); // PATTERN 2 cudaEventRecord(start,0); sharedMemReadSameWord <<< gridSize,BLOCK_SIZE >>> (); cudaEventRecord(stop,0); cudaEventSynchronize(stop); cudaEventElapsedTime(&elapsedTimes[1][k],start,stop); } // end of the for loop involving NTIMES // intializing the mintime array for(int i=0; i < NO_OF_PATTERNS;i++) mintime[i] = FLT_MAX; for (int k=1; k < NTIMES; k++) // skiping the first iteration { for (int i=0; i < NO_OF_PATTERNS; i++) { avgtime[i] = avgtime[i] + elapsedTimes[i][k]; mintime[i] = MIN(mintime[i],elapsedTimes[i][k]); maxtime[i] = MAX(maxtime[i], elapsedTimes[i][k]); } } // calculation of the different band widths that are achieved by different access patterns for(int i=0; i < NO_OF_PATTERNS; i++) { avgtime[i] = avgtime[i]/(double)(NTIMES-1); // finding the average time bandWidths[i] = bytes/mintime[i]; } printResults(); printf("\n\n**** successful termination of the program ****\n\n"); cudaEventDestroy(start); cudaEventDestroy(stop); return 0; } ///////////////////////////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////////////////////////// // // prints the results containig the minimum, maximum, average times taken by the two kernels // the associated maximum bandwidth achieved by the different kernels // ///////////////////////////////////////////////////////////////////////////////////////////////////// void printResults() { printf("Demonstrating that reading same 32-bit word by all threads will not cause bank conflicts\n"); printf("The transfered data size (Bytes): %ld\n",TRANSFERED_DATA_SIZE*sizeof(float)); printf("\n-----------------------------------------------------------------------------------------------------\n"); printf("Pattern \t\t\t Bandwidth (GB/sec) \t Avg time (ms) \t Min time (ms) \t Max time (ms)\n"); printf("--------------------------------------------------------------------------------------------------------\n"); // printing the results for different access patterns for(int i=0; i < NO_OF_PATTERNS; i++) { switch(i) { case 0: printf("Stride of one 32-bit word "); break; case 1: printf("Reading the same 32-bit word"); break; } printf("\t %.6f \t\t %f \t\t %f \t\t %f\n",bandWidths[i]/1000000,avgtime[i],mintime[i],maxtime[i]); } printf("\n ---------------------------------------------------------------------------------------------------\n"); } ///////////////////////////////////////////////////////////////////////////////////////////////////////