c c******************************************************************* c c C-DAC Tech Workshop : hyPACK-2013 c October 15-18, 2013 c c Example 5.9 : mm_mult_cartesian.f c c Objective : Matrix Matrix multiplication(Using Cartesian Topology) c c Input : Read files (mdata1.inp) for first input matrix c and (mdata2.inp) for second input matrix c c Output : Result of matrix matrix multiplication on Processor 0. c c Necessary Condition : Number of Processes should be less than c or equal to 8. Vector size for Vectors A and c B should be properly striped. that is Vector c size should be properly divisible by c Number of processes used. c c Created : August-2013 c c E-mail : hpcfte@cdac.in c c********************************************************************* program main include 'mpif.h' include 'cart.h' integer A_Bloc_MatrixSize, B_Bloc_MatrixSize integer NoofRows_A, NoofCols_A, NoofRows_B, NoofCols_B integer NoofRows_BlocA, NoofCols_BlocA integer NoofRows_BlocB, NoofCols_BlocB integer Matrix_Size(0:4) integer Local_Index, Global_Row_Index, Global_Col_Index integer Local_varb, irow, icol, jrow integer Local_iproc, Local_irow integer iproc, jproc, index, Proc_Id, Root integer Numprocs, MyRank, MyGlobalRank, root_p real Matrix_A(ROW_SIZE_A,COL_SIZE_A), $ Matrix_B(ROW_SIZE_B,COL_SIZE_B), $ Matrix_C(ROW_SIZE_A,COL_SIZE_B) real MatA_array(TOTAL_SIZE_A), MatB_array(TOTAL_SIZE_B), $ MatC_array(TOTAL_SIZE_C) real A_Bloc_Matrix(TOTAL_SIZE_A), $ B_Bloc_Matrix(TOTAL_SIZE_B), $ C_Bloc_Matrix(TOTAL_SIZE_C) real LocalMatrix_A(TOTAL_SIZE_A) real LocalMatrix_B(TOTAL_SIZE_B) real RowBlock_A(TOTAL_SIZE_A), ColBlock_B(TOTAL_SIZE_B) c For Setting Cartesian Topology c Periods - For Wraparound in each dimension. c Dimensions - Number of processors in each dimension. c Coordinates - processor Row and Column identification c Remain_dims - For row and column communicators. integer Periods(2) integer Dimensions(2) integer Coordinates(2) integer Remain_dims(2) integer Comm2D, Row_comm, Col_comm c .....Initialising call MPI_INIT (ierror) call MPI_COMM_SIZE(MPI_COMM_WORLD, Numprocs, ierror) call MPI_COMM_RANK(MPI_COMM_WORLD, MyGlobalRank, ierror) c ........Set up the MPI_COMM_WORLD and CARTESIAN TOPOLOGY Dimensions(1) = sqrt(dble(Numprocs)) Dimensions(2) = sqrt(dble(Numprocs)) root_p = sqrt(dble(Numprocs)) c ........Wraparound mesh in both dimensions. Periods(1) = 1 Periods(2) = 1 c ........Create Cartesian topology in two dimnesions and c Cartesian decomposition of the processes call MPI_DIMS_CREATE( Numprocs, 2, Dimensions, ierr ) call MPI_CART_CREATE(MPI_COMM_WORLD, 2, Dimensions, $ Periods, .TRUE., Comm2D, ierror) call MPI_COMM_RANK(Comm2D,MyRank,ierror) call MPI_CART_COORDS(Comm2D,MyRank,2,Coordinates,ierror) Row = Coordinates(1) Col = Coordinates(2) c Construction of row communicator and column communicators c (use cartesian row and columne machanism to get c Row and Col Communicators) Remain_dims(1) = 0 Remain_dims(2) = 1 c The output communicator represents the column c containing the process call MPI_CART_SUB(Comm2D, Remain_dims, Row_comm, ierror) Remain_dims(1) = 1 Remain_dims(2) = 0 c The output communicator represents the row containing c the process call MPI_CART_SUB(Comm2D, Remain_dims, Col_comm, ierror) c Reading Input Root = 0 if ( MyRank .eq. Root) then c ......Read the Matrix From Input file ..... open(unit=12, file = './data/mdata1.inp') read(12,*) NoofRows_A, NoofCols_A write(6,*) 'Input matrix - A ' do i = 1,NoofRows_A read(12,*) (Matrix_A(i,j),j=1,NoofCols_A) write(6,75) (Matrix_A(i,j),j=1,NoofCols_A) enddo open(unit=13, file = './data/mdata2.inp') read(13,*) NoofRows_B, NoofCols_B write(6,*) 'Input matrix - B' do i = 1,NoofRows_B read(13,*) (Matrix_B(i,j),j=1,NoofCols_B) write(6,75) (Matrix_B(i,j),j=1,NoofCols_B) enddo close(12) close(13) Matrix_Size(0) = NoofRows_A Matrix_Size(1) = NoofCols_A Matrix_Size(2) = NoofRows_B Matrix_Size(3) = NoofCols_B endif c ..........Send Matrix Size to all processors call MPI_BCAST (NoofRows_A, 1, MPI_INTEGER, Root, $ Comm2D, ierror) call MPI_BCAST (NoofCols_A, 1, MPI_INTEGER, Root, $ Comm2D, ierror) call MPI_BCAST (NoofRows_B, 1, MPI_INTEGER, Root, $ Comm2D, ierror) call MPI_BCAST (NoofCols_B, 1, MPI_INTEGER, Root, $ Comm2D, ierror) if(NoofCols_A .ne. NoofRows_B) then if(MyRank .eq. Root) $ print*," Matrix Dimension is in correct for Multiplication" goto 100 endif c rem_Rows_A = mod(NoofRows_A, Dimensions(1)) c rem_Cols_A = mod(NoofCols_A, Dimensions(1)) c c rem_Rows_B = mod(NoofRows_B, Dimensions(1)) c rem_Cols_B = mod(NoofCols_B, Dimensions(1)) c if((mod(NoofRows_A, Dimensions(1)) .ne. 0) .or. $ (mod(NoofCols_A, Dimensions(1)) .ne. 0) .or. $ (mod(NoofRows_B, Dimensions(1)) .ne. 0) .or. $ (mod(NoofCols_B, Dimensions(1)) .ne. 0)) then if(MyRank .eq. Root) $ print*,"Matrices can't be divided among processors equally" goto 100 endif NoofRows_BlocA = NoofRows_A / Dimensions(1) NoofCols_BlocA = NoofCols_A / Dimensions(1) NoofRows_BlocB = NoofRows_B / Dimensions(1) NoofCols_BlocB = NoofCols_B / Dimensions(1) A_Bloc_MatrixSize = NoofRows_BlocA * NoofCols_BlocA B_Bloc_MatrixSize = NoofRows_BlocB * NoofCols_BlocB c print*,'MyRank',MyRank,NoofRows_BlocA, NoofCols_BlocA c ....Memory allocating for Bloc Matrices c ....Rearrange the input matrices in one dimensional c arrays by approriate order if(MyRank .eq. Root) then c ...... Convert Matrix into one-dimensional array c according to Checkerboard Local_Index = 1 do iproc = 0, root_p-1 do jproc = 0, root_p-1 do irow = 1, NoofRows_BlocA Global_Row_Index = iproc * NoofRows_BlocA + irow do icol = 1, NoofCols_BlocA Global_Col_Index = jproc * NoofCols_BlocA + icol MatA_array(Local_Index) = $ Matrix_A(Global_Row_Index, Global_Col_Index) Local_Index = Local_Index + 1 enddo enddo enddo enddo Local_Index = 1 do iproc = 0, root_p-1 do jproc = 0, root_p-1 do irow = 1, NoofRows_BlocB Global_Row_Index = iproc * NoofRows_BlocB + irow do icol = 1, NoofCols_BlocB Global_Col_Index = jproc * NoofCols_BlocB + icol MatB_array(Local_Index) = $ Matrix_B(Global_Row_Index, Global_Col_Index) Local_Index = Local_Index + 1 enddo enddo enddo enddo endif c Scatter the Data to all processes by MPI_SCATTER call MPI_SCATTER(MatA_array, A_Bloc_MatrixSize, MPI_REAL, $ A_Bloc_Matrix, A_Bloc_MatrixSize, MPI_REAL, $ Root, Comm2D, ierror) call MPI_SCATTER(MatB_array, B_Bloc_MatrixSize, MPI_REAL, $ B_Bloc_Matrix, B_Bloc_MatrixSize, MPI_REAL, $ Root, Comm2D, ierror) c .........Broadcasting Data amongst rows and columns c call MPI_ALLGATHER(A_Bloc_Matrix, A_Bloc_MatrixSize, $ MPI_REAL, RowBlock_A, A_Bloc_MatrixSize, MPI_REAL, $ Row_comm, ierror) call MPI_ALLGATHER (B_Bloc_Matrix, B_Bloc_MatrixSize, $ MPI_REAL, ColBlock_B, B_Bloc_MatrixSize, MPI_REAL, $ Col_comm, ierror) call MPI_BARRIER(Comm2D, ierror) c ..Rearrange Data to do matrix multiplication for RowBlock_A index = 1 do irow = 0, NoofRows_BlocA-1 Local_irow = irow*NoofCols_BlocA do iproc = 0, root_p-1 Local_iproc = iproc * A_Bloc_MatrixSize do icol = 1, NoofCols_BlocA Local_varb = icol + Local_irow + Local_iproc LocalMatrix_A(index) = RowBlock_A(Local_varb) index = index + 1 enddo enddo enddo c ... Rearrange Data to do matrix multiplication for ColBlock_B index = 1 do icol = 1, NoofCols_BlocB do iproc = 0, root_p-1 Local_iproc = iproc*B_Bloc_MatrixSize do irow = 0, NoofRows_BlocB - 1 Local_varb = icol + Local_iproc + irow*NoofCols_BlocB LocalMatrix_B(index) = ColBlock_B(Local_varb) index = index + 1 enddo enddo enddo C ........Multiply LocalBlock matrices to get C_Bloc_matrix index = 1 do irow = 0, NoofRows_BlocA-1 do icol = 0, NoofCols_BlocB-1 C_Bloc_Matrix(index) = 0.0 do jrow = 1, NoofRows_B C_Bloc_Matrix(index) = C_Bloc_Matrix(index) + $ LocalMatrix_A(irow*NoofCols_A + jrow) * $ LocalMatrix_B(icol*NoofRows_B+jrow) enddo index = index + 1 enddo enddo c ........Gather output block matrices at processor 0 call MPI_GATHER(C_Bloc_Matrix, NoofRows_BlocA*NoofCols_BlocB, $ MPI_REAL, MatC_array, NoofRows_BlocA * NoofCols_BlocB, $ MPI_REAL, Root, Comm2D, ierror) c ...Rearranging the output matrix in a array by approriate order if (MyRank .eq. Root) then do iproc = 0, root_p-1 do jproc = 0, root_p-1 Proc_Id = iproc * root_p + jproc do irow = 1, NoofRows_BlocA Global_Row_Index = iproc * NoofRows_BlocA + irow do icol = 1, NoofCols_BlocB Global_Col_Index = jproc * NoofCols_BlocB + icol Local_Index = (Proc_Id*NoofRows_BlocA*NoofCols_BlocB) $ + ((irow-1) * NoofCols_BlocB) + icol Matrix_C(Global_Row_Index,Global_Col_Index) = $ MatC_array(Local_Index) enddo enddo enddo enddo write(6,*) 'Output matrix ' do i = 1,NoofRows_A write(6,75) (Matrix_C(i,j),j=1,NoofCols_B) enddo 75 format(8(2x,f8.3)) endif 100 call MPI_FINALIZE(ierror) stop end