/**************************************************************************** C-DAC Tech Workshop : hyPACK-2013 October 15-18, 2013 Objective : program to solve a linear system of equations (Ax = b) using conjugate gradient method in a GPU (using cublas lib) Input : Number of unknowns and max no. of iterations Output : Solution Vector. Created : August-2013 E-mail : hpcfte@cdac.in ****************************************************************************/ #include #include #include #include #include "cublas.h" //---------------------------------------------------------------------------------------------------- #define BLOCKSIZE 16 #define EPSILON 1.0E-20 //---------------------------------------------------------------------------------------------------- void GetTheInputToTheProblem(float **,float **,float **,int ); void GetPreConditionMatrix(float ** ,int ); void SolvePreConditionMatrix(float *,float *,float *,int ); void GenerateCoefficientMatrix(float **,int); void ConvertMatrixToOneDimension(float **,float *,int ); void GenerateRhsVector(float **,float *,int ); void IntializeDirectionVector(float ** ,int ,float * ,float * ,dim3 ,dim3 ); void CalculateResidueVector(float ** ,float * ,float * ,float * ,int , float * ,float * ,dim3 ,dim3 ); void FreeTheDeviceMemory(float *,float *,float *,float *,float *,float *,float *); void FreeTheHostMemory(float *,float *,float *,float *,float *,float *); void *malloc_safe_call(int size); //-------------------------------------------------------------------------------------------------------- //Pragma routine to report the detail of cuda error #define CUDA_SAFE_CALL(call) \ do{ \ cudaError_t err = call; \ if(err != cudaSuccess) \ { \ fprintf(stderr, "Cuda error in file '%s' in line %i : %s.\n",\ __FILE__, __LINE__, cudaGetErrorString( err) ); \ exit(1); \ } \ } while (0) \ //------------------------------------------------------------------------------------------------------------ int main(int argc,char **argv) { //Checking if Valid number of Arguements have been passed if(argc != 3) { printf("Usage:<./a.out> \n"); exit(-1); } // Variable Declaration float *RhsVector,*SolutionVector,*CoefficientMatrix,*ResidueVector; float *PreConditionMatrix,*HVector,*DirectionVector; float *DeviceRhsVector,*DeviceSolutionVector,*DeviceCoefficientMatrix,*DeviceResidueVector; float *DeviceHVector,*DeviceDirectionVector; int MaxIterations,Size,PresentIteration,rowNum; float Delta0, TempValue, *DeviceTempBuffer; float Delta1; float Tau,Beta = 0.0; struct timeval tv; double timing; //Obtaining the Size of the Matrix and Maximum number of Iterations from the given Arguements Size = atoi(argv[1]); MaxIterations = atoi(argv[2]); // Generating and Intializing the required Vectors and Matrix in the Host GetTheInputToTheProblem(&RhsVector,&SolutionVector,&CoefficientMatrix,Size); //Allocating Memory on Device CUDA_SAFE_CALL(cudaMalloc( (void **)&DeviceRhsVector,Size * sizeof(float))); CUDA_SAFE_CALL(cudaMalloc( (void **)&DeviceSolutionVector,Size * sizeof(float))); CUDA_SAFE_CALL(cudaMalloc( (void **)&DeviceCoefficientMatrix,Size * Size * sizeof(float))); CUDA_SAFE_CALL(cudaMalloc( (void **)&DeviceResidueVector,Size * sizeof(float))); CUDA_SAFE_CALL(cudaMalloc( (void **)&DeviceTempBuffer,Size * sizeof(float))); //Copying Data from Host To Device CUDA_SAFE_CALL(cudaMemcpy((void*)DeviceRhsVector,(void*)RhsVector,Size*sizeof(float),cudaMemcpyHostToDevice)); CUDA_SAFE_CALL(cudaMemcpy((void*)DeviceSolutionVector,(void*)SolutionVector,Size*sizeof(float),cudaMemcpyHostToDevice)); CUDA_SAFE_CALL(cudaMemcpy((void*)DeviceCoefficientMatrix,(void*)CoefficientMatrix,Size*Size*sizeof(float),cudaMemcpyHostToDevice)); //Defining Thread Grid and the Thread Block dim3 DimGrid(1,1); dim3 DimBlock(BLOCKSIZE,BLOCKSIZE); //intialising cublas cublasInit(); //start timing the computation gettimeofday(&tv, NULL); double t1=tv.tv_sec+(tv.tv_usec/1000000.0); //Calculate the Residue vector RESIDUE=AX-B CalculateResidueVector(&ResidueVector,DeviceCoefficientMatrix,DeviceSolutionVector,DeviceTempBuffer,Size,DeviceRhsVector,DeviceResidueVector,DimGrid,DimBlock); //Getting Pre-Conditional Matrix, Preconditional Matrix is Identity Matrix GetPreConditionMatrix(&PreConditionMatrix,Size); //Allocating Memory for HVector on Host HVector = (float *)malloc_safe_call(Size * sizeof(float)); //HVector = PreConditionMatrix Inverse * ResidueVector SolvePreConditionMatrix(PreConditionMatrix,HVector,ResidueVector,Size); //Allocating Memory on Device Memory CUDA_SAFE_CALL(cudaMalloc( (void **)&DeviceHVector,Size*sizeof(float))); CUDA_SAFE_CALL(cudaMalloc( (void **)&DeviceDirectionVector,Size*sizeof(float))); //Copying Hvector from Host to Device cudaMemcpy((void *)DeviceHVector,(void *)HVector,Size*sizeof(float),cudaMemcpyHostToDevice); //Intialize DirectionVector i.e DirectionVector = -(HVector) IntializeDirectionVector(&DirectionVector,Size,DeviceHVector,DeviceDirectionVector,DimGrid,DimBlock); //Compute Delta0 = ResidueVector * HVector Delta0 = cublasSdot (Size, DeviceResidueVector, 1, DeviceHVector, 1); if(Delta0 < EPSILON) exit(-1); PresentIteration = 0; do { //Incrementing the Iteration Count PresentIteration++; //Compute Tau, Tau = Delta0/DirectionVectorTranspose*CoeffientMatrix*DirectionVector //Computing First DirectionVectorTranpose * CoeffientMatrix cublasSgemv('T', Size, Size, 1, DeviceCoefficientMatrix, Size, DeviceDirectionVector, 1, 0, DeviceTempBuffer, 1); //multiplying the above result with direction vector TempValue = cublasSdot (Size, DeviceTempBuffer, 1, DeviceDirectionVector, 1); Tau = Delta0 / TempValue; //Compute New Solution Vector, NewSolutionVector = OldSolutionVector + Tau*DirectionVector cublasSaxpy (Size, Tau, DeviceDirectionVector, 1, DeviceSolutionVector, 1); //Copying the solution from Device to Host cudaMemcpy((void *)SolutionVector,(void *)DeviceSolutionVector,Size*sizeof(float),cudaMemcpyDeviceToHost); //Compute New ResidueVector, NewResidueVector = OldResidueVector + Tau*CoefficientMatrix*DirectionVector cublasSgemv('N', Size, Size, 1, DeviceCoefficientMatrix, Size, DeviceDirectionVector, 1, 0, DeviceTempBuffer, 1); cublasSaxpy (Size, Tau, DeviceTempBuffer, 1, DeviceResidueVector, 1); //Copying ResidueVector From Device to Host cudaMemcpy((void *)ResidueVector,(void *)DeviceResidueVector,Size*sizeof(float),cudaMemcpyDeviceToHost); //Computing the new Delta value i.e Delta1 = DotProduct of ResidueVector and HVector //Updating the HVector SolvePreConditionMatrix(PreConditionMatrix,HVector,ResidueVector,Size); //Copying the new HVector from Host to Device cudaMemcpy((void*)DeviceHVector,(void *)HVector,Size*sizeof(float),cudaMemcpyHostToDevice); Delta1 = cublasSdot (Size, DeviceResidueVector, 1, DeviceHVector, 1); if(Delta1 < EPSILON) break; Beta = Delta1 / Delta0; Delta0 = Delta1; //Computing new Direction Vector,NewDirectionVector = -HVector + Beta*OldDirectionVector cublasSscal (Size, Beta, DeviceDirectionVector, 1); cublasSaxpy (Size, -1, DeviceHVector, 1, DeviceDirectionVector, 1); }while(Delta0 > EPSILON && PresentIteration < MaxIterations); //Stop timing the computation gettimeofday(&tv,NULL); double t2=tv.tv_sec+(tv.tv_usec/1000000.0); //Calculating total time taken for computation timing = t2 - t1; //shutting down cublas cublasShutdown(); //Freeing the Memory allocated in Device FreeTheDeviceMemory(DeviceCoefficientMatrix,DeviceRhsVector,DeviceSolutionVector,DeviceDirectionVector,DeviceHVector,DeviceResidueVector,DeviceTempBuffer); //Printing the Solution for(rowNum=0;rowNum