c
c*****************************************************************
c
c		C-DAC Tech Workshop : hyPACK-2013
c                           October 15-18, 2013
c
c*******************************************************************
c
c
c       Example 6.3	     : conjugate-gradient-mpi-code-flang.f
c
c       Objective            : Conjugate Gradient Method to solve 
c                              AX = b matrix system of linear
c                              equations.
c	
c       Input                : Real Symmetric Positive definite 
c                              Matrix and the real vector - Input_A
c                              and Vector_B. Read files (mdatcg.inp) 
c                              for Matrix A and (vdatcg.inp) for 
c                              Vector b
c
c       Description          : Input matrix is stored in n by n
c                              format.Diagonal preconditioning
c                              matrix is used.Rowwise block striped
c                              partitioning matrix is used.Maximum
c                              iterations is given by MAX_ITERATIONS
c                              Tolerance value is given by EPSILON
c                              Header file used  : cgconst.h 
c
c       Output               : The solution of  Ax=b on process 
c                              with Rank 0 and the number of 
c                              iterations for convergence of the 
c                              method.
c
c       Necessary conditions : Number of Processes should be less
c                              than or equal to 8 Input Matrix 
c                              Should be Square Matrix. Matrix size 
c                              for Matrix A and vector size for
c                              vector b should be equally striped,
c                              that is Matrix size and Vector Size 
c                              should be dividible by the number of 
c                              processes used.
c
c   Created                  : August-2013
c
c   E-mail                   : hpcfte@cdac.in     
c
c********************************************************************


	program main

	include 'mpif.h'
	include 'cgconst.h'

	integer  NumProcs, MyRank, Root
	integer  NoofRows, NoofCols, VectorSize
	integer  NoofRows_Bloc, Bloc_MatrixSize
	integer  Bloc_VectorSize
	integer	 irow, icol, index, local_index, Iteration 

	double precision  Matrix_A(TOTAL_SIZE), 
     $	                  Input_A(MATRIX_SIZE, MATRIX_SIZE)
	double precision  Vector_B(VECTOR_SIZE), 
     $ 	                  Vector_X(VECTOR_SIZE)
	double precision  Bloc_Matrix_A(TOTAL_SIZE), 
     $	                  Bloc_Vector_X(VECTOR_SIZE)
	double precision  Direction_Vector(VECTOR_SIZE), 
     $	                  Bloc_Direction_Vector(VECTOR_SIZE)

	double precision  Bloc_Residue_Vector(VECTOR_SIZE)
	double precision  Bloc_Precond_Matrix(TOTAL_SIZE)
	double precision  Buffer(VECTOR_SIZE)
	double precision  Bloc_HVector(VECTOR_SIZE)

        double precision Delta0,Delta1,Bloc_Delta0,Bloc_Delta1
	double precision Tau, val, temp, Beta

	Root = 0
	Iteration = 1

C  	......Initialising MPI 
	call MPI_INIT(ierror)
	call MPI_COMM_SIZE(MPI_COMM_WORLD, NumProcs, ierror)
	call MPI_COMM_RANK(MPI_COMM_WORLD, MyRank, ierror)

C       .....Read Input Matrix_A and Vector_B from file
	if(MyRank .eq. Root) then

	   open(unit = 12, file = './matrix-data-cg.inp')
         write(6,*) 'Input Matrix' 
	   read(12,*) NoofRows, NoofCols
  	   do i = 1, NoofRows
	    read(12,*) (Input_A(i,j), j=1, NoofCols)
         write(6,75) (Input_A(i,j),j=1,NoofCols)
  	   end do	

75    format(8(2x,f8.3))
C          ......Read vector from input file .......
	   open(unit=13, file = './vector-data-cg.inp')
         write(6,*) 'Input Vector' 
         read(13,*) VectorSize
         read(13,*) (Vector_B(i), i=1,VectorSize)
         write(6,75) (Vector_B(i), i = 1, NoofRows)

     	   n_size = VectorSize

C 	   .......Convert Input_A into 1-D array Matrix_A 
	   index = 1
	   do irow=1, NoofRows
	      do icol=1, NoofCols
		 Matrix_A(index) = Input_A(irow, icol)
		 index = index + 1
	      enddo
	   enddo
	endif
c
c  	Broadcast Matrix and Vector size and perform 
c	input validation tests
c
	call MPI_BCAST(NoofRows, 1, MPI_INTEGER, Root, 
     $	MPI_COMM_WORLD, ierror)

	call MPI_BCAST(NoofCols, 1, MPI_INTEGER, Root, 
     $	MPI_COMM_WORLD, ierror)

	call MPI_BCAST(VectorSize, 1, MPI_INTEGER, Root, 
     $	MPI_COMM_WORLD, ierror)

   	if(NoofRows .ne. NoofCols) then
	  if(MyRank .eq. Root) 
     $	  print*,"Error : Input Matrix Should be square matrix"
	  goto 100
	endif

   	if(NoofRows .ne.  VectorSize) then
	   if(MyRank .eq. Root) print*,
     $	      "Error : Matrix Size should be equal to VectorSize"
	   goto 100
	endif

	if(mod(NoofRows, NumProcs) .ne. 0) then
	   if(MyRank .eq. Root) print*, 
     $	     "Matrix cannot be evenly striped among processes"
	   goto 100
	endif

c	.....BroadCast Vector_B.......
	call MPI_BCAST(Vector_B, VectorSize, MPI_DOUBLE_PRECISION, 
     $	     Root, MPI_COMM_WORLD, ierror)

c	....... Scatter Matrix_A .......
   	NoofRows_Bloc   = NoofRows / NumProcs
	Bloc_VectorSize = NoofRows_Bloc
	Bloc_MatrixSize = NoofRows_Bloc * NoofCols

	call MPI_SCATTER(Matrix_A, Bloc_MatrixSize, 
     $	    MPI_DOUBLE_PRECISION, Bloc_Matrix_A, Bloc_MatrixSize, 
     $	    MPI_DOUBLE_PRECISION, Root, MPI_COMM_WORLD, ierror)

c	........Intialise Solution vector it to zero...... 
	do i = 1, VectorSize
       	   Vector_X(i) = 0.0
	end do

c	....Calculation of RESIDUE = Ax-B
	call CalculateResidueVector(Bloc_Residue_Vector,
     $ 	Bloc_Matrix_A, Vector_B, Vector_X, NoofRows_Bloc, 
     $  VectorSize, MyRank)
		
c	......Precondtion Matrix is identity matrix ......

	call GetPreconditionMatrix( Bloc_Precond_Matrix, 
     $	NoofRows_Bloc, NoofCols)

	call SolvePrecondMatrix(Bloc_HVector, Bloc_Residue_Vector, 
     $	Bloc_VectorSize)

c       .......Initailise Bloc Direction Vector = -(Bloc_HVector)...
	do index=1, Bloc_VectorSize
	   Bloc_Direction_Vector(index) = 0 - Bloc_HVector(index)
	enddo

c 	........Calculate Delta0 and check for convergence ..

	Bloc_Delta0 = 0.0
        do index = 1, Bloc_VectorSize 
	   Bloc_Delta0 = Bloc_Delta0 + 
     $	   Bloc_Residue_Vector(index) * Bloc_HVector(index)
	enddo

	call MPI_Allreduce(Bloc_Delta0, Delta0, 1, 
     $	  MPI_DOUBLE_PRECISION, MPI_SUM, MPI_COMM_WORLD,ierror)

	if(Delta0 .le. EPSILON) then
      	   if(MyRank .eq. Root) write(6,77) 
77	format(3x, / '**** Algorithm Converged '/)
	   goto 100
	endif

	Iteration = 0			

 400    Iteration = Iteration + 1

c	......Gather Direction Vector on all processes.......
	call MPI_Allgather(Bloc_Direction_Vector,Bloc_VectorSize, 
     $	  MPI_DOUBLE_PRECISION, Direction_Vector,Bloc_VectorSize, 
     $	  MPI_DOUBLE_PRECISION, MPI_COMM_WORLD, ierror)

c   	Compute Tau value 
c	Tau = Delta0/(DirVector Transpose*Matrix_A*DirVector)..

	do irow=1, NoofRows_Bloc
	   index = (irow-1) * NoofCols + 1
	   Buffer(irow) = 0.0
	   do icol=1, NoofCols
	      Buffer(irow) = Buffer(irow) +  
     $	      Bloc_Matrix_A(index) * Direction_Vector(icol)
	      index = index + 1
	   enddo
	enddo

	temp = 0.0
	do index=1, Bloc_VectorSize
	 temp = temp+Bloc_Direction_Vector(index)*Buffer(index)
	enddo

	call MPI_Allreduce(temp, val, 1, MPI_DOUBLE_PRECISION,
     $	     MPI_SUM, MPI_COMM_WORLD, ierror)

	Tau = Delta0 / val

c       Compute new vector Xnew = Xold + Tau*Direction
c       Compute BlocResidueVector which is given by
c       BlocResidueVect+Tau*Bloc_MatA*DirVector
c
	do index = 1, Bloc_VectorSize
	   local_index = MyRank*Bloc_VectorSize + index
	   Bloc_Vector_X(index) = Vector_X(local_index) + 
     $	                       Tau*Bloc_Direction_Vector(index)
	   Bloc_Residue_Vector(index)=Bloc_Residue_Vector(index) 
     $					+ Tau*Buffer(index)
	enddo

c	...Gather New Vector X at all  processes......
	call MPI_Allgather(Bloc_Vector_X, Bloc_VectorSize, 
     $	     MPI_DOUBLE_PRECISION, Vector_X, Bloc_VectorSize, 
     $	     MPI_DOUBLE_PRECISION, MPI_COMM_WORLD, ierror)


	call SolvePrecondMatrix(Bloc_HVector, Bloc_Residue_Vector, 
     $	     Bloc_VectorSize)
		
	Bloc_Delta1 = 0.0
       	do index = 1, Bloc_VectorSize 
	   Bloc_Delta1 = Bloc_Delta1 + 
     $	   Bloc_Residue_Vector(index) * Bloc_HVector(index)
	enddo

	call MPI_Allreduce(Bloc_Delta1, Delta1, 1, 
     $	     MPI_DOUBLE_PRECISION,MPI_SUM,MPI_COMM_WORLD,ierror)

C	   If(MyRank .eq. 0) then 
C	     write(6,76) Iteration, Delta1
C 76	     format(1x, 'Iteration =', i6, 2x, 'Residue for 
C     $      successive iterations = ', E16.8)
C	   endif

	if(Delta1 .le. EPSILON) then
	   goto 90
	endif

	Beta   = Delta1 / Delta0
	Delta0 = Delta1

	do index=1, Bloc_VectorSize
	   Bloc_Direction_Vector(index) = -Bloc_HVector(index) 
     $		           + Beta*Bloc_Direction_Vector(index)
	enddo

	if(Iteration .lt. MAX_ITERATIONS) then
	   go to 400
	else
	   goto 80
	endif

 80	if(MyRank .eq. Root) print*,'Iterations Exhausted'
	   go to 200
 90	if(MyRank .eq. Root)  write(6,77)
	   go to 200

 200	if(MyRank .eq. 0) then
	   print*,'Number of iterations : ',Iteration
     	   print*,'Number of processors : ',NumProcs
	   print*,'Results for solution of matrix system of equations'
	   do index = 1,NoofRows
	     print*,'X(',index,')= ',Vector_X(index)
	   enddo
	endif

 100	call MPI_FINALIZE(ierror)

	stop
	end	

c 	**************************************************************
c
	subroutine CalculateResidueVector(Bloc_Residue_Vector, 
     $ 	           Bloc_Matrix_A, Vector_B, Vector_X, NoofRows_Bloc, 
     $             VectorSize, MyRank)
c 	**************************************************************
c
	include 'cgconst.h'

	double precision Bloc_Residue_Vector(VECTOR_SIZE)
	double precision Bloc_Matrix_A(TOTAL_SIZE)
	double precision Vector_B(VECTOR_SIZE)
	double precision Vector_X(VECTOR_SIZE)
	integer NoofRows_Bloc, VectorSize, MyRank
 
C	... Computes residue = AX - b .......

	integer   irow, index, GlobalVectorIndex
	double precision value

	GlobalVectorIndex = MyRank * NoofRows_Bloc + 1
	do irow=1, NoofRows_Bloc
	   value = 0.0
	   index = irow * VectorSize
	  do jcol = 1, VectorSize
	    value = value + Bloc_Matrix_A(index)*Vector_X(jcol)
	   index = index + 1
	  enddo
	  Bloc_Residue_Vector(irow) =
     $	       value-Vector_B(GlobalVectorIndex)
	  GlobalVectorIndex = GlobalVectorIndex + 1
	enddo

        return
        end
c
c 	*********************************************************
	subroutine GetPreconditionMatrix( Bloc_Precond_Matrix, 
     $             NoofRows_Bloc, NoofCols)
c
c 	********************************************************
c
	include 'cgconst.h'
	double precision Bloc_Precond_Matrix(TOTAL_SIZE) 

	integer NoofRows_Bloc, NoofCols
	integer	Bloc_MatrixSize
	integer	irow, icol, index

	Bloc_MatrixSize = NoofRows_Bloc*NoofCols

c	......Preconditioned Martix is identity matrix .......
   	index = 1
	do irow=1, NoofRows_Bloc
	   do icol=1, NoofCols
		Bloc_Precond_Matrix(index) = 1.0
		index = index + 1
	   enddo
	enddo

	return
	end

c 	************************************************************
	subroutine SolvePrecondMatrix(HVector, Bloc_Residue_Vector, 
     $	           Bloc_VectorSize)
c 	************************************************************
c
	include 'cgconst.h'
	double precision HVector(VECTOR_SIZE)
	double precision Bloc_Residue_Vector(VECTOR_SIZE) 

	integer Bloc_VectorSize
	integer	index

c	Calculate the HVector 
c	HVector=Inverse of Bloc_Precond_Matrix*Bloc_Residue_Vector.

	do index = 1, Bloc_VectorSize
		HVector(index) = Bloc_Residue_Vector(index)
	enddo

	return
	end

c ********************************************************************