/* * C-DAC Tech Workshop : hyPACK-2013 * October 15-18, 2013 * * Program : mpi_tbb_matrix_vector_multiply.cpp * Desc. : [MPI + IntelTBB] program to do vector matrix multiplication. * Input : None * Output : Resultant matrix after matrix-vector multiplication * * Email : hpcfte@cdac.in * */ #include #include #include #include #include #include #include #include #include using namespace tbb; using namespace std; /* * Class construct 'PartialVectorCalc' is used by intelTBB's parallel_for API call. * This API call executes the for-loop in parallel. All the details of this operation * is taken care of by intelTBB API. */ class PartialVectorCalc { private: float *const LocalMatrix, *const Vector, *const ResultVector;//private members of this class int *const NumColumns; public: //constructor initializing private members to parameter-passed. PartialVectorCalc(float *A, float *B, float *R, int *NC):LocalMatrix(A), Vector(B), ResultVector(R), NumColumns(NC){} //operator overloading ; required by intelTBB API void operator()(blocked_range&r) const { float *vA, *vB, *vR; vA = LocalMatrix; vB = Vector; vR = ResultVector; int *NumCol = NumColumns; int index=0; for(size_t i=r.begin(); i!=r.end(); ++i) { vR[i]=0; index = i * (*NumCol);//shifts to 'i'th row of vA (i.e., Local Matrix) for(size_t j=0; j<(*NumCol); j++) { vR[i] += (vA[index++] * vB[j]); } } } };//end of class PartialVectorCalc int main(int argc, char * argv[]) { int ProcessId;//Id of the process (rank) int NumProcs;//total no. of processes FILE* MatrixFileDesc;//file descriptor of Matrix input file FILE* VectorFileDesc;//file descriptor of Vector input file int FileSuccess = 1;//file access success indicator (default to 'true') int iRow, iCol, i;//temps float **Matrix;//Pointer to Matrix float *Vector;//Pointer to Vector float *Buffer;//Stores 1D Matrix float *LocalMatrix;//Part of Matrix with each process in 1D float *LocalResultVec;//Local partial result of the vector float *FinalResultVec;//Final result vector int NumColumns;//no. of columns in Matrix int NumRows;//no. of rows in Matrix int VectorSize;//size of Vector int NumRowsInPartn;//number of rows in each partition int PartnSize;//size of each partition int TermFlag=0; //initialize mpi communicator MPI_Init(&argc, &argv); //get this process id MPI_Comm_rank(MPI_COMM_WORLD, &ProcessId); //get total processes MPI_Comm_size(MPI_COMM_WORLD, &NumProcs); /* * Operation exclusive to Rank=0: * Read: input Matrix from file "Matrix.input" and * input Vector from "Vector.input" * Convert 2D 'Matrix' to 1D 'Buffer' */ if(ProcessId == 0)//if process rank is '0' { //read Matrix from "./Matrix.input" if((MatrixFileDesc = fopen("./Matrix.input", "r"))==NULL) cout << "\nError opening Matrix.input\n"; if(MatrixFileDesc != NULL)//if no error opening { //get NumRows & NumColumns fscanf(MatrixFileDesc, "%d %d\n", &NumRows, &NumColumns); //allocate memory and read Matrix Matrix = (float**)malloc(NumRows * sizeof(float*)); //read all values of Matrix[NumRows][NumColumns] from "Matrix.input" for(iRow=0; iRow(0, NumRowsInPartn, 1000), PartialVectorCalc(LocalMatrix, Vector, LocalResultVec, &NumColumns)); parallel_for(blocked_range(0, NumRowsInPartn), PartialVectorCalc(LocalMatrix, Vector, LocalResultVec, &NumColumns)); //par_matrix_vector_multiply( NumRowsInPartn,NumColumns,LocalMatrix,Vector,LocalResultVec); //par_matrix_vector_multiply( NumRowsInPartn,NumColumns,LocalMatrix,Vector,LocalResultVec); //par_matrix_vector_multiply( PartnSize,NumColumns,LocalMatrix,Vector,LocalResultVec); //barrier synchronization; wait for all tasks to reach here MPI_Barrier(MPI_COMM_WORLD); //allocate memory for FinalResultVec FinalResultVec = (float*) malloc(NumColumns * sizeof(float)); //Gather all partial LocalResultVec from each MPI process to FinalResultVec MPI_Gather(LocalResultVec, NumRowsInPartn, MPI_FLOAT, FinalResultVec, NumRowsInPartn, MPI_FLOAT, 0, MPI_COMM_WORLD); if(ProcessId == 0)//if process '0' (root) { //display FinalResultVec for(iRow=0; iRow