/************************************************************************ C-DAC Tech Workshop : hyPACK-2013 October 15-18, 2013 Example : multiple-cuda-streams.cu Objective : Objective is to demonstrate multiple streams for addition of two vectors Input : None Output : Time of cuda event elapsed time. Created : August-2013 E-mail : hpcfte@cdac.in **************************************************************************/ #include void CUDA_SAFE_CALL(cudaError_t call) { cudaError_t ret = call; switch(ret) { case cudaSuccess: break; default: { printf(" ERROR at line :%i.%d' '%s\n",__LINE__,ret,cudaGetErrorString(ret)); exit(-1); break; } } } #define N (1024*1024) /* N = Vector Size */ #define FULL_DATA_SIZE (N*20) __global__ void kernel( int *a, int *b, int *c ) { int idx = threadIdx.x + blockIdx.x * blockDim.x; if (idx < N) { int idx1 = (idx + 1) % 256; int idx2 = (idx + 2) % 256; float as = (a[idx] + a[idx1] + a[idx2]) / 3.0f; float bs = (b[idx] + b[idx1] + b[idx2]) / 3.0f; c[idx] = (as + bs) / 2; } } int main( void ) { cudaDeviceProp prop; int whichDevice; CUDA_SAFE_CALL( cudaGetDevice( &whichDevice ) ); CUDA_SAFE_CALL( cudaGetDeviceProperties( &prop, whichDevice ) ); if (!prop.deviceOverlap) { printf( "Device will not handle overlaps, so no speed up from streams\n" ); return 0; } if( (prop.concurrentKernels == 0 )) //check concurrent kernel support { printf("> GPU does not support concurrent kernel execution\n"); printf(" CUDA kernel runs will be serialized\n"); } if(prop.asyncEngineCount == 0) //check concurrent data transfer support { printf("GPU does not support concurrent Data transer and overlaping of kernel execution & data transfer\n"); printf("Mem copy call will be blocking calls\n"); } cudaEvent_t start, stop; float elapsedTime; cudaStream_t stream0, stream1; int *host_a, *host_b, *host_c; int *dev_a0, *dev_b0, *dev_c0; int *dev_a1, *dev_b1, *dev_c1; // start the timers CUDA_SAFE_CALL( cudaEventCreate( &start ) ); CUDA_SAFE_CALL( cudaEventCreate( &stop ) ); // initialize the streams CUDA_SAFE_CALL( cudaStreamCreate( &stream0 ) ); CUDA_SAFE_CALL( cudaStreamCreate( &stream1 ) ); // allocate the memory on the GPU CUDA_SAFE_CALL( cudaMalloc( (void**)&dev_a0, N * sizeof(int) ) ); CUDA_SAFE_CALL( cudaMalloc( (void**)&dev_b0, N * sizeof(int) ) ); CUDA_SAFE_CALL( cudaMalloc( (void**)&dev_c0, N * sizeof(int) ) ); CUDA_SAFE_CALL( cudaMalloc( (void**)&dev_a1, N * sizeof(int) ) ); CUDA_SAFE_CALL( cudaMalloc( (void**)&dev_b1, N * sizeof(int) ) ); CUDA_SAFE_CALL( cudaMalloc( (void**)&dev_c1, N * sizeof(int) ) ); // allocate host locked memory, used to stream CUDA_SAFE_CALL( cudaHostAlloc( (void**)&host_a, FULL_DATA_SIZE * sizeof(int), cudaHostAllocDefault ) ); CUDA_SAFE_CALL( cudaHostAlloc( (void**)&host_b, FULL_DATA_SIZE * sizeof(int), cudaHostAllocDefault ) ); CUDA_SAFE_CALL( cudaHostAlloc( (void**)&host_c, FULL_DATA_SIZE * sizeof(int), cudaHostAllocDefault ) ); for (int i=0; i>>( dev_a0, dev_b0, dev_c0 ); kernel<<>>( dev_a1, dev_b1, dev_c1 ); // enqueue copies of c from device to locked memory CUDA_SAFE_CALL( cudaMemcpyAsync( host_c+i, dev_c0, N * sizeof(int), cudaMemcpyDeviceToHost, stream0 ) ); CUDA_SAFE_CALL( cudaMemcpyAsync( host_c+i+N, dev_c1, N * sizeof(int), cudaMemcpyDeviceToHost, stream1 ) ); } CUDA_SAFE_CALL( cudaStreamSynchronize( stream0 ) ); CUDA_SAFE_CALL( cudaStreamSynchronize( stream1 ) ); CUDA_SAFE_CALL( cudaEventRecord( stop, 0 ) ); CUDA_SAFE_CALL( cudaEventSynchronize( stop ) ); CUDA_SAFE_CALL( cudaEventElapsedTime( &elapsedTime, start, stop ) ); printf( "Time taken: %3.1f ms\n", elapsedTime ); // cleanup the streams and memory CUDA_SAFE_CALL( cudaFreeHost( host_a ) ); CUDA_SAFE_CALL( cudaFreeHost( host_b ) ); CUDA_SAFE_CALL( cudaFreeHost( host_c ) ); CUDA_SAFE_CALL( cudaFree( dev_a0 ) ); CUDA_SAFE_CALL( cudaFree( dev_b0 ) ); CUDA_SAFE_CALL( cudaFree( dev_c0 ) ); CUDA_SAFE_CALL( cudaFree( dev_a1 ) ); CUDA_SAFE_CALL( cudaFree( dev_b1 ) ); CUDA_SAFE_CALL( cudaFree( dev_c1 ) ); CUDA_SAFE_CALL( cudaStreamDestroy( stream0 ) ); CUDA_SAFE_CALL( cudaStreamDestroy( stream1 ) ); return 0; }