/****************************************************************************

                C-DAC Tech Workshop : hyPACK-2013
                     October 15-18, 2013

  Objective  : Program to solve a matrix-vector multiplication using
               block-striped partitioning on hybrid computing using
               MPI C++ and CUDA.

  Input      : Matrix rows, matrix columns, vector size.
               Process 0 initializes the matrix and the vector.

  Output     : Process 0 prints the resultant vector.

  Necessary  : Number of rows of the matrix should be greater than the
  Conditions   number of processes and perfectly divisible by the number
               of processes. Matrix columns should be equal to the
               vector size.

  Created    : August-2013

  E-mail     : hpcfte@cdac.in

****************************************************************************/
#include <mpi.h>
#include <stdio.h>
#include <stdlib.h>
#include <iostream>
#include <cuda.h>

using namespace std;
//-------------------------------------------------------------------------------------------------------------------------------------------
#define BLOCKSIZE 16
//-------------------------------------------------------------------------------------------------------------------------------------------
int IntializingMatrixVectors(float **, float **, float **, int, int, int);
int CheckDevice(int);
//-------------------------------------------------------------------------------------------------------------------------------------------
//Macro that reports the details of a CUDA error and aborts
#define CUDA_SAFE_CALL(call)                                                 \
        do{                                                                  \
            cudaError_t err = call;                                          \
            if(err != cudaSuccess)                                           \
            {                                                                \
                fprintf(stderr, "Cuda error in file '%s' in line %i : %s.\n",\
                        __FILE__, __LINE__, cudaGetErrorString(err));        \
                exit(1);                                                     \
            }                                                                \
        } while (0)
//----------------------------------------------------------------------------------------------------------------------------------------
//Kernel that performs the matrix-vector multiplication
__global__ void MatrixVectorMultiplication(float *Matrix, float *Vector, float *Solution, int VectorLength, int ScatterSize, int ThreadDim)
{
    int tidx = threadIdx.x;
    int tidy = threadIdx.y;
    int ThreadIndex = (ThreadDim * tidx) + tidy;
    int MaxNumThread = ThreadDim * ThreadDim;
    int count, ThreadColumnIndex, pass = 0;
    float TempResult = 0.0f;

    //Each thread strides over the local rows, computing one dot product per row
    while( (ThreadColumnIndex = (ThreadIndex + MaxNumThread * pass)) < ScatterSize )
    {
        TempResult = 0.0f;
        for(count = 0; count < VectorLength; count++)
            TempResult += Matrix[ThreadColumnIndex * VectorLength + count] * Vector[count];
        Solution[ThreadColumnIndex] = TempResult;
        pass++;
    }
    __syncthreads();
}//End of Matrix Vector Multiplication Device Function
//---------------------------------------------------------------------------------------------------------------------------------------
int main(int argc, char **argv)
{
    int MyRank, NumberOfProcessors;
    int Root = 0, Index, Status = 1;
    float *MatrixA, *VectorB, *ResultVector;
    float *MyMatrixA, *MyResultVector;
    float *DeviceMyMatrixA, *DeviceMyResultVector, *DeviceVectorB;
    int RowsNo, ColsNo, VectorSize, ScatterSize, IndexCol, IndexValue, DeviceStatus;

    //MPI initialization: getting the number of processes and the rank of this process
    MPI::Init(argc, argv);
    NumberOfProcessors = MPI::COMM_WORLD.Get_size();
    MyRank = MPI::COMM_WORLD.Get_rank();

    //Checking if a valid number of arguments has been passed
    if(argc != 4)
    {
        if(MyRank == Root)
            cout<<"Usage: mpirun -np <NumProcesses> ./<ProgramName> <RowsNo> <ColsNo> <VectorSize>"<<endl;
        MPI::Finalize();
        exit(-1);
    }

    //Reading the dimensions of the matrix and the vector from the command line
    RowsNo = atoi(argv[1]);
    ColsNo = atoi(argv[2]);
    VectorSize = atoi(argv[3]);

    //Checking the necessary conditions stated in the header
    if(ColsNo != VectorSize)
    {
        if(MyRank == Root)
            cout<<"Number of columns of the matrix should be equal to the vector size"<<endl;
        MPI::Finalize();
        exit(-1);
    }
    if(RowsNo < NumberOfProcessors || RowsNo % NumberOfProcessors != 0)
    {
        if(MyRank == Root)
            cout<<"Number of rows of the matrix should be greater than and perfectly divisible by the number of processes"<<endl;
        MPI::Finalize();
        exit(-1);
    }

    //Root process initializes the matrix, the vector and the result vector
    if(MyRank == Root)
        Status = IntializingMatrixVectors(&MatrixA, &VectorB, &ResultVector, RowsNo, ColsNo, VectorSize);
    MPI::COMM_WORLD.Bcast(&Status, 1, MPI::INT, Root);
    if(Status == 0)
    {
        if(MyRank == Root)
            cout<<"Memory is not available to allocate the variables"<<endl;
        MPI::Finalize();
        exit(-1);
    }

    //All processes except the root allocate memory for the vector
    if(MyRank != Root)
        VectorB = (float *)malloc(VectorSize * sizeof(float));

    //Broadcasting the vector from the root process to all processes
    MPI::COMM_WORLD.Bcast(VectorB, VectorSize, MPI::FLOAT, Root);

    //Calculating the number of rows scattered to each process
    ScatterSize = RowsNo / NumberOfProcessors;

    //Allocating host memory for the local matrix block and the partial result vector
    MyMatrixA = (float *)malloc(ScatterSize * ColsNo * sizeof(float));
    MyResultVector = (float *)malloc(ScatterSize * sizeof(float));

    //Distributing the rows of the matrix among all the processes
    MPI::COMM_WORLD.Scatter(MatrixA, ScatterSize * ColsNo, MPI::FLOAT, MyMatrixA, ScatterSize * ColsNo, MPI::FLOAT, Root);

    //Checking whether this process has a CUDA device
    DeviceStatus = CheckDevice(MyRank);

    if(DeviceStatus == 0)
    {
        //No device available: computing the partial product on the CPU
        cout<<"Processor with rank "<<MyRank<<" computing the partial product on the CPU"<<endl;
        for(Index = 0; Index < ScatterSize; Index++)
        {
            MyResultVector[Index] = 0.0f;
            IndexValue = Index * ColsNo;
            for(IndexCol = 0; IndexCol < ColsNo; IndexCol++)
                MyResultVector[Index] += MyMatrixA[IndexValue++] * VectorB[IndexCol];
        }
    }
    else
    {
        //Allocating memory on the device
        CUDA_SAFE_CALL( cudaMalloc((void **)&DeviceMyMatrixA, ScatterSize * ColsNo * sizeof(float)) );
        CUDA_SAFE_CALL( cudaMalloc((void **)&DeviceMyResultVector, ScatterSize * sizeof(float)) );
        CUDA_SAFE_CALL( cudaMalloc((void **)&DeviceVectorB, VectorSize * sizeof(float)) );

        //Copying the local matrix block and the vector from host to device
        cudaMemcpy( (void *)DeviceMyMatrixA, (void *)MyMatrixA, ScatterSize * ColsNo * sizeof(float), cudaMemcpyHostToDevice );
        cudaMemcpy( (void *)DeviceVectorB, (void *)VectorB, VectorSize * sizeof(float), cudaMemcpyHostToDevice );

        //Calling the kernel which performs the partial matrix-vector product
        dim3 DimBlock(BLOCKSIZE, BLOCKSIZE);
        MatrixVectorMultiplication<<<1, DimBlock>>>(DeviceMyMatrixA, DeviceVectorB, DeviceMyResultVector, ColsNo, ScatterSize, BLOCKSIZE);

        //Copying the value of the partial result vector from device to host
        cudaMemcpy( (void *)MyResultVector, (void *)DeviceMyResultVector, ScatterSize * sizeof(float), cudaMemcpyDeviceToHost );

        //Freeing the device memory
        CUDA_SAFE_CALL( cudaFree(DeviceMyMatrixA) );
        CUDA_SAFE_CALL( cudaFree(DeviceVectorB) );
        CUDA_SAFE_CALL( cudaFree(DeviceMyResultVector) );
    }

    MPI::COMM_WORLD.Barrier();

    //Root process gathering the partial results from all processes into the final result vector
    MPI::COMM_WORLD.Gather(MyResultVector, ScatterSize, MPI::FLOAT, ResultVector, ScatterSize, MPI::FLOAT, Root);

    //Root process printing the resultant vector
    if(MyRank == Root)
    {
        cout<<"The resultant vector with size "<<RowsNo<<" is"<<endl;
        for(Index = 0; Index < RowsNo; Index++)
            cout<<ResultVector[Index]<<endl;

        //Freeing the memory allocated by the root process
        free(MatrixA);
        free(ResultVector);
    }

    //Freeing the host memory
    free(MyMatrixA);
    free(VectorB);
    free(MyResultVector);

    MPI::Finalize();
    return 0;
}//End of Main function
//---------------------------------------------------------------------------------------------------------------------------------------
//Root process allocates and initializes the matrix and the vectors.
//The fill values below (all 1.0f) are an assumption made for a simple,
//verifiable result: every element of the product then equals ColsNo.
int IntializingMatrixVectors(float **MatrixA, float **VectorB, float **ResultVector, int RowsNo, int ColsNo, int VectorSize)
{
    float *TempMatrixA, *TempVectorB, *TempResultVector;
    int Index;

    //Allocating memory on the host; returning failure status if any allocation fails
    TempMatrixA = (float *)malloc(RowsNo * ColsNo * sizeof(float));
    TempVectorB = (float *)malloc(VectorSize * sizeof(float));
    TempResultVector = (float *)malloc(RowsNo * sizeof(float));
    if(TempMatrixA == NULL || TempVectorB == NULL || TempResultVector == NULL)
        return 0;

    //Initializing the matrix and the vectors
    for(Index = 0; Index < RowsNo * ColsNo; Index++)
        TempMatrixA[Index] = 1.0f;
    for(Index = 0; Index < VectorSize; Index++)
        TempVectorB[Index] = 1.0f;
    for(Index = 0; Index < RowsNo; Index++)
        TempResultVector[Index] = 0.0f;

    *MatrixA = TempMatrixA;
    *VectorB = TempVectorB;
    *ResultVector = TempResultVector;

    return 1;
}//End of the IntializingMatrixVectors function
//---------------------------------------------------------------------------------------------------------------------------------------
//Returns the number of CUDA devices visible to this process
int CheckDevice(int MyRank)
{
    int DeviceCount, Device;
    struct cudaDeviceProp Properties;

    cudaGetDeviceCount(&DeviceCount);
    if(DeviceCount >= 1)
    {
        //Getting the device id
        cudaGetDevice(&Device);

        //Getting the properties of the device
        cudaGetDeviceProperties(&Properties, Device);
        cout<<"Processor with rank "<<MyRank<<" has the device "<<Properties.name<<" and the computation is done on this device"<<endl;
    }
    else
        cout<<"Processor with rank "<<MyRank<<" does not have a device and the computation is done on the CPU"<<endl;

    return DeviceCount;
}//End of the CheckDevice function
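//---------------------------------------------------------------------------------------------------------------------------------------
/* Build and run -- an illustrative sketch only. The file name, compiler
   wrappers, library path and problem sizes below are assumptions about a
   typical MPI + CUDA installation, not part of the original workshop
   material; adjust the include/library paths for your own setup:

       nvcc   -c MatVectMultBlockStriped.cu -o MatVectMult.o -I$MPI_HOME/include
       mpicxx    MatVectMult.o -o MatVectMult -L/usr/local/cuda/lib64 -lcudart
       mpirun -np 4 ./MatVectMult 512 256 256

   This sample run distributes a 512 x 256 matrix over 4 processes
   (128 rows per process) and multiplies it by a vector of length 256.
   With the all-ones initialization assumed above, every element of the
   printed result vector should be 256.0, which makes the output easy to
   verify by hand. */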