/****************************************************************************
 *                  C-DAC Tech Workshop : hyPACK-2013
 *                         October 15-18, 2013
 *
 * Objective  : Program to solve a vector-vector multiplication using
 *              block-striped partitioning on hybrid computing, using
 *              MPI C++ and CUDA with multiple GPUs.
 * Input      : Process 0 initializes the vectors.
 * Output     : Process 0 prints the resultant value.
 * Necessary
 * Conditions : The size of each vector should be greater than the number of
 *              processes and perfectly divisible by the number of processes.
 * Created    : August-2013
 * E-mail     : hpcfte@cdac.in
 ****************************************************************************/
/*
 * NOTE(review): this copy of the file looks damaged by extraction. All line
 * breaks were collapsed, and every span of text that sat between a '<' and
 * the following '>' appears to be missing: the three #include directives
 * have no header names, loops stop at "for(int i=0;i", and the kernel
 * launches read "vvmul<<>>(...)". The definitions of vvmul, CUDA_SAFE_CALL
 * and print_on_screen are not visible at all. The code is left exactly as
 * found; restore the lost text from the upstream sources before building.
 *
 * What is still readable on the next line:
 *   - BLOCKSIZE is 16 and SIZE is 1024; the globals vlen and size both start
 *     at SIZE.
 *   - Host globals: hVectA / hVectB (the two input vectors, allocated in
 *     init), hRes, the elapsed-time and Tsec values, and four CUDA events.
 *   - struct Data: one work item per GPU, holding deviceId, an element count
 *     (size), host pointers a and b, and a float retVal. "Data vector[2]"
 *     holds the two work items filled in by init().
 *   - compare(): extern "C"; calls init(vlen), sets sum to 0 and starts a
 *     loop whose condition and body are lost.
 *   - Tail of a dimension check against devProp.maxThreadsDim and
 *     devProp.maxGridSize; on failure it prints the limits and calls
 *     exit(-1). Its signature is lost.
 *     NOTE(review): the comparisons use '>=', so a dimension exactly equal
 *     to the device maximum is rejected too. Confirm that is intended.
 *   - dfree(double * arr[], int len): loop body lost.
 *   - Start of routine() (signature lost; prototype above is
 *     "void routine(void * givendata)"): takes a and b from data->a and
 *     data->b, mallocs part_c with len floats, declares c, selects
 *     data->deviceId with cudaSetDevice and allocates d_a.
 *     len is presumably data->size; the assignment is partly lost.
 */
 #include #include #include #define BLOCKSIZE 16 #define SIZE 1024 int vlen=SIZE; int size=SIZE; float *hVectA,*hVectB,hRes; float elapsedTime,elapsedTime1; double Tsec,Tsec1,gflops,gflops1; cudaEvent_t start,stop,start1,stop1; int blocksPerGrid; int gridsPerBlock; void routine(void * givendata); void init(int s); struct Data { int deviceId; int size; float* a; float* b; float retVal; }; Data vector[2]; /*sequential function*/ extern "C" float compare() { init(vlen); float sum=0; for(int i=0;i= devProp.maxThreadsDim[0] || blockDim.y >= devProp.maxThreadsDim[1] || blockDim.z >= devProp.maxThreadsDim[2] ) { printf("\nBlock Dimensions exceed the maximum limits:%d * %d * %d \n",devProp.maxThreadsDim[0],devProp.maxThreadsDim[1],devProp.maxThreadsDim[2]); exit(-1); } if( gridDim.x >= devProp.maxGridSize[0] || gridDim.y >= devProp.maxGridSize[1] || gridDim.z >= devProp.maxGridSize[2] ) { printf("\nGrid Dimensions exceed the maximum limits:%d * %d * %d \n",devProp.maxGridSize[0],devProp.maxGridSize[1],devProp.maxGridSize[2]); exit(-1); } } /*function to free memory*/ void dfree(double * arr[],int len) { for(int i=0;isize; float *a,*b,*part_c; float *d_a,*d_b,*d_part_c; a=data->a; b=data->b; part_c = (float*)malloc(len*sizeof(float)); float c; CUDA_SAFE_CALL(cudaSetDevice(data->deviceId)); CUDA_SAFE_CALL(cudaMalloc((void**)&d_a,len*sizeof(float))); 
/*
 * routine(), continued. The next line:
 *   - allocates d_b and d_part_c (len floats each) and copies len floats of
 *     a and b from host to device;
 *   - declares a 16x16 threadsPerBlock and a blocksPerGrid of
 *     (numBlocks, 1), where numBlocks is 1 when len/256 == 0 and len/100
 *     otherwise (the launch arguments themselves are lost);
 *   - times the vvmul launch with CUDA events: the deviceId == 1 branch uses
 *     start/stop/elapsedTime/Tsec, the other branch uses
 *     start1/stop1/elapsedTime1/Tsec1; elapsed milliseconds are scaled by
 *     1.0e-3 before printing;
 *   - reports cudaPeekAtLastError() if it is non-zero, then copies len
 *     floats from d_part_c back into part_c.
 *
 * NOTE(review): the deviceId == 1 branch prints "time taken by device 0"
 * and the else branch prints "device 1"; the labels look swapped.
 * NOTE(review): len/100 blocks alongside a 256-thread block is unusual.
 * Check it against the kernel's indexing, which is not visible here.
 * NOTE(review): the cudaEvent* calls are not wrapped in CUDA_SAFE_CALL and
 * no cudaEventDestroy is visible.
 * NOTE(review): with the line breaks gone, the first "//" on the next line
 * turns the rest of that line into a comment. The commented-out malloc
 * statements were presumably on lines of their own originally.
 */
CUDA_SAFE_CALL(cudaMalloc((void**)&d_b,len*sizeof(float))); CUDA_SAFE_CALL(cudaMalloc((void**)&d_part_c,len*sizeof(float))); CUDA_SAFE_CALL(cudaMemcpy(d_a,a,len*sizeof(float),cudaMemcpyHostToDevice)); CUDA_SAFE_CALL(cudaMemcpy(d_b,b,len*sizeof(float),cudaMemcpyHostToDevice)); dim3 threadsPerBlock(16,16); int numBlocks; if( len /256 == 0) numBlocks=1; else numBlocks = len/100; dim3 blocksPerGrid(numBlocks ,1); printf("Calling kernel on device: %d\n",data->deviceId); if(data->deviceId==1) { // start=(cudaEvent_t)malloc(sizeof(cudaEvent_t)); //stop=(cudaEvent_t)malloc(sizeof(cudaEvent_t)); cudaEventCreate(&start); cudaEventCreate(&stop); cudaEventRecord(start,0); vvmul<<>>(len,d_a,d_b,d_part_c); cudaEventRecord(stop,0); cudaEventSynchronize(stop); cudaEventElapsedTime(&elapsedTime,start,stop); Tsec=elapsedTime*1.0e-3; printf("\n\ntime taken by device 0 is: %.8lf\n\n",Tsec); print_on_screen("VECTOR VECTOR MULTIPLICATION (MULTIGPU+MPI)",Tsec,0,vlen/2,1); } else { // start1=(cudaEvent_t)malloc(sizeof(cudaEvent_t)); //stop1=(cudaEvent_t)malloc(sizeof(cudaEvent_t)); cudaEventCreate(&start1); cudaEventCreate(&stop1); cudaEventRecord(start1,0); vvmul<<>>(len,d_a,d_b,d_part_c); cudaEventRecord(stop1,0); cudaEventSynchronize(stop1); cudaEventElapsedTime(&elapsedTime1,start1,stop1); Tsec1=elapsedTime1*1.0e-3; printf("\n\ntime taken by device 1: %.8lf\n\n",Tsec1); print_on_screen("VECTOR VECTOR MULTIPLICATION (MULTIGPU+MPI)",Tsec1,0,vlen/2,0); } if(cudaPeekAtLastError()) printf("KERNEL ERROR: %s\t on device:%d\n",cudaGetErrorString(cudaPeekAtLastError()),data->deviceId); CUDA_SAFE_CALL(cudaMemcpy(part_c,d_part_c,len*sizeof(float),cudaMemcpyDeviceToHost)); // this line has problem because the part_c array size / allocation . 
/*
 * End of routine(): a loop over ind (condition and body lost), then c is
 * stored in retVal and an exit message is printed.
 * NOTE(review): c is declared without an initial value and no assignment to
 * it is visible before this loop. No free(part_c) or cudaFree of d_a, d_b or
 * d_part_c is visible either. Both may sit in the lost text; confirm.
 *
 * init(size): host-side setup.
 *   - Queries the CUDA device count; with fewer than 2 devices it prints a
 *     message and calls exit(0).
 *   - Allocates hVectA and hVectB with vlen floats each and fills every
 *     element with 2.
 *   - Fills vector[0] (deviceId 0) with the first size/2 elements and
 *     vector[1] (deviceId 1) with the size/2 elements starting at offset
 *     size/2.
 *   NOTE(review): the allocation length is the global vlen while the split
 *   uses the parameter, so they only agree when size == vlen (compare()
 *   passes vlen). The parameter also shadows the file-level "size".
 *   NOTE(review): an odd size leaves the last element unassigned to either
 *   device.
 *   NOTE(review): malloc results are not checked, and exit(0) reports
 *   success on the "not enough GPUs" path.
 *   NOTE(review): each call allocates new buffers without freeing the
 *   previous ones.
 *
 * hfree(): extern "C"; frees hVectA and hVectB and prints a confirmation.
 *   NOTE(review): the pointers are not reset afterwards, so a second call
 *   would free them again.
 */
int ind; for(ind=0;indretVal=c; printf("Exiting from device :%d \n",data->deviceId); } void init(int size) { int devCount; CUDA_SAFE_CALL(cudaGetDeviceCount(&devCount)); if(devCount < 2) { printf("Atleast 2 GPU's are needed :%d\n",devCount); exit(0); } printf("devices available\n"); int vlength=size; int ind; hVectA=(float*)malloc(vlen*sizeof(float)); hVectB=(float*)malloc(vlen*sizeof(float)); for(ind=0;ind < vlen;ind++) { hVectA[ind]=2; hVectB[ind]=2; } vector[0].deviceId = 0; vector[0].size =vlength/2; vector[0].a =hVectA; vector[0].b =hVectB; vector[1].deviceId = 1; vector[1].size =vlength/2; vector[1].a =hVectA + vlength/2 ; vector[1].b =hVectB + vlength/2 ; } extern "C" void hfree() { free(hVectA); free(hVectB); printf("host mem freed successfully\n"); }