/* ********************************************************************

     C-DAC Tech Workshop : hyPACK-2013
             October 15-18, 2013

  Example   : matrix-matrix-multiply-mpi-openmp-offload.c

  Objective : To implement the Matrix-Matrix multiplication algorithm
              using OpenMP offload on the Xeon Phi coprocessor

  Input     : Automatic generation of the input matrix data
              Size of the square matrices

  Output    : Time elapsed and GFLOP/s

  Created   : August-2013

  E-mail    : hpcfte@cdac.in

******************************************************************** */

#ifndef MIC_DEV
#define MIC_DEV 0
#endif

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <omp.h>
#include <mpi.h>
#include <mkl.h>    /* dsecnd() */

/* A simple OpenMP matrix multiply, offloaded to the coprocessor */
void doMult(int size, float (* restrict Matrix_A)[size],
            float (* restrict Matrix_B)[size],
            float (* restrict Matrix_C)[size])
{
    #pragma offload target(mic:MIC_DEV) \
                    in(Matrix_A:length(size*size)) \
                    in(Matrix_B:length(size*size)) \
                    out(Matrix_C:length(size*size))
    {
        /* Zero the C matrix */
        #pragma omp parallel for default(none) shared(Matrix_C,size)
        for (int i = 0; i < size; ++i)
            for (int j = 0; j < size; ++j)
                Matrix_C[i][j] = 0.f;

        /* Compute the matrix multiplication; the i-k-j loop order gives
           unit-stride access to B and C in the innermost loop */
        #pragma omp parallel for default(none) shared(Matrix_A,Matrix_B,Matrix_C,size)
        for (int i = 0; i < size; ++i)
            for (int k = 0; k < size; ++k)
                for (int j = 0; j < size; ++j)
                    Matrix_C[i][j] += Matrix_A[i][k] * Matrix_B[k][j];
    }
}
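
/* Note on data movement: the in/out clauses above copy A, B and C over
   PCIe on every call, so the timings in main() include transfer time.
   As a sketch (assuming Intel's LEO offload syntax), buffers can be
   kept resident on the card between calls with the alloc_if/free_if
   modifiers, e.g.

       in(Matrix_A : length(size*size) alloc_if(1) free_if(0))

   on the first offload and alloc_if(0) free_if(0) on later ones. */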
/* ................... Main Program ................ Starts ...........*/
int main(int argc, char* argv[])
{
    /* check the number of arguments */
    if (argc != 4) {
        fprintf(stderr, "Use: %s size nThreads nIter\n", argv[0]);
        return -1;
    }

    int Numprocs, MyRank;

    /* ........ MPI Initialisation ........ */
    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &MyRank);
    MPI_Comm_size(MPI_COMM_WORLD, &Numprocs);

    /* variable declarations */

    /* for getting the host name */
    int namelen;
    char name[MPI_MAX_PROCESSOR_NAME];

    /* for the communication part */
    int tag = 1, index;
    MPI_Status status;
    char msg[6]   = "hello";
    char inmsg[6] = "";

    /* for the matrix multiplication */
    int i, j;
    int size     = atoi(argv[1]);
    int nThreads = atoi(argv[2]);
    int nIter    = atoi(argv[3]);
    double aveTime = 0.0, minTime = 1e6, maxTime = 0.0;

    /* communication between processes : begins */
    if (MyRank == 0) {
        for (index = 1; index < Numprocs; index++) {
            MPI_Send(msg, 6, MPI_CHAR, index, tag, MPI_COMM_WORLD);
            printf("\nMessage sent by rank = %d to rank = %d is %s\n",
                   MyRank, index, msg);
        }
    }
    else {
        MPI_Recv(inmsg, 6, MPI_CHAR, 0, tag, MPI_COMM_WORLD, &status);
        printf("\nMessage received by rank = %d is %s\n", MyRank, inmsg);
    }
    /* communication between processes : ends */

    /* create nThreads number of threads on the host */
    omp_set_num_threads(nThreads);
    printf("\nOpenMP threads = %d\n", omp_get_max_threads());

    /* memory allocation */
    float (*restrict Matrix_A)[size] = malloc(sizeof(float) * size * size);
    float (*restrict Matrix_B)[size] = malloc(sizeof(float) * size * size);
    float (*restrict Matrix_C)[size] = malloc(sizeof(float) * size * size);
    if (Matrix_A == NULL || Matrix_B == NULL || Matrix_C == NULL) {
        fprintf(stderr, "Memory allocation failed\n");
        MPI_Abort(MPI_COMM_WORLD, -1);
    }

    /* fill the A and B matrices */
    #pragma omp parallel for default(none) shared(Matrix_A,Matrix_B,size) \
                             private(i,j)
    for (i = 0; i < size; ++i) {
        for (j = 0; j < size; ++j) {
            Matrix_A[i][j] = (float)i + j;
            Matrix_B[i][j] = (float)i - j;
        }
    }

    /* warm up: the first offload pays the one-time device
       initialisation cost, so it is excluded from the timings */
    doMult(size, Matrix_A, Matrix_B, Matrix_C);

    /* timed multiplications */
    for (int iter = 0; iter < nIter; iter++) {
        double startTime = dsecnd();

        /* matrix multiplication */
        doMult(size, Matrix_A, Matrix_B, Matrix_C);

        double endTime = dsecnd();
        double runtime = endTime - startTime;

        maxTime  = (maxTime > runtime) ? maxTime : runtime;
        minTime  = (minTime < runtime) ? minTime : runtime;
        aveTime += runtime;
    }
    aveTime /= nIter;

    /* get the host name */
    MPI_Get_processor_name(name, &namelen);

    /* 2*size^3 floating-point operations per multiply */
    printf("\nProcess %d is running on %s host with results :\n"
           "%s nThrds %d matrix %d maxRT %g minRT %g aveRT %g ave_GFlop/s %g\n",
           MyRank, name, argv[0], nThreads, size,
           maxTime, minTime, aveTime, 2e-9 * size * size * size / aveTime);

    /* freeing all memories */
    free(Matrix_A);
    free(Matrix_B);
    free(Matrix_C);

    MPI_Finalize();
    return 0;
}
/* .......... Program Listing Completed ............. */
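
/* Build-and-run sketch (assumes the Intel compiler with offload support
   and Intel MPI, as used for hyPACK-2013; exact flag names vary by
   toolchain version):

       mpiicc -openmp -mkl matrix-matrix-multiply-mpi-openmp-offload.c \
              -o run_mmm
       mpirun -np 2 ./run_mmm 2048 240 5    # size nThreads nIter

   The thread count inside the offloaded region is typically set
   separately on the coprocessor, e.g. via MIC_ENV_PREFIX=MIC and
   MIC_OMP_NUM_THREADS. */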