/* *********************************************************************** C-DAC Tech Workshop : hyPACK-2013 October 15-18, 2013 Example 1 : matrix-matrix-multiply-sgemm-mkl-native.c Objective : To implement Matrix Matrix multiplication Algorithm using openMP on Xeon Phi Coprocessor Input : Automatic input generation of Input Matrix data Size of the Square Matrix Output : Print the Gflop/s and output Matrix C Time Elapsed and GFLOPS Created : August-2013 E-mail : hpcfte@cdac.in ************************************************************************ */ #include #include #include "omp.h" #pragma native_attribute(push, target(mic)) #include "mkl.h" #pragma native_attribute(pop) #pragma UNROLL_AND_JAM //#define THREADS 168 #define NITERS 3 void local_sgemm(int N, int LD, float *A, float *B, float *C) { cblas_sgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, N, N, N, 1.0, A, LD, B, LD, 1.0, C, LD); } double native_sgemm(int N, int LD, float *A, float *B, float *C) { double t; static int first_run = 1; t = dsecnd(); local_sgemm(N, LD, A, B, C); t = dsecnd() - t; first_run = 0; return t; } void bench_sgemm(int use_native, int N) { /* Choose such leading dimension that there is no cache aliasing. */ int LD = (N % 512) ? N : N + 128; /* Allocate memory using MKL function to make sure the addresses are * * properly aligned. */ // double *A = mkl_malloc(sizeof(double) * N * LD, 4096); // double *B = mkl_malloc(sizeof(double) * N * LD, 4096); // double *C = mkl_malloc(sizeof(double) * N * LD, 4096); float *Matrix_A = mkl_malloc(sizeof(float) * N * LD, 64); float *Matrix_B = mkl_malloc(sizeof(float) * N * LD, 64); float *Matrix_C = mkl_malloc(sizeof(float) * N * LD, 64); /*Initialise Matrices */ for(int i=0;i 0) t += t_tmp; } mkl_free(Matrix_A); mkl_free(Matrix_B); mkl_free(Matrix_C); const double NOPS = 2.0 * N * N * N; double gflops = NOPS / (t * 1E9 / NITERS); printf("Native %dx%d DGEMM: %8.2f GFlops\n", N, N, gflops); } int main(int argc, char **argv) { int N; if(argc<2) { printf("Syntax %s \n",argv[0]); exit(1); } N=atoi(argv[1]); printf("Matrix : %d", N); printf(" ITR : %d\n", NITERS); /* The following settings will make MKL use OpenMP even when called * from an OpenMP region. */ /* Enables Intel MKL to dynamically change the number of threads */ mkl_set_dynamic(0); /* Enable nested parallel region */ omp_set_nested(1); /* Set number of MKL threads */ mkl_set_num_threads(mkl_get_max_threads()); printf("\nMKL threads = %d\n", mkl_get_max_threads()); /* call fcuntion to tun MKL DGEMM on MIC */ bench_sgemm(1, N); printf("\n-----------------------------------------------------\n\n"); return 0; }