/********************************************************************

		C-DAC Tech Workshop : hyPACK-2013
                           October 15-18, 2013

  Example     :  VectVectAdd_shared.cu

  Objective   : Write a CUDA  program to compute Vector-Vector Addition 
                using shared memory implementation.

  Input       : None

  Output      : Execution time in seconds , Gflops achieved

  Created     : August-2013

  E-mail      : hpcfte@cdac.in     

************************************************************************/

#include<math.h>
#include<stdio.h>
#include<cuda.h>

/* Default number of vector elements: initial value of `size`, and assigned to `vlength` in main. */
#define SIZE 128

/* Small tolerance constant, presumably meant for comparing floating-point results. */
#define EPS 1.0e-12
/* NOTE(review): grid-size constant; the kernel-launch code visible in this file does not use it. */
#define GRIDSIZE 10
/* Thread-block edge length: launch_kernel builds a BLOCKSIZE x BLOCKSIZE block. */
#define BLOCKSIZE 16


/* Device-side input vectors (allocated with cudaMalloc in main). */
double *dMatA, *dMatB;
/* Host-side input vectors (allocated with malloc in main). */
double *hMatA, *hMatB;
/* dresult: device output vector; hresult: host copy of the output.
 * NOTE(review): CPU_Result and host_MatC are not used by any live code visible in this file. */
double *dresult, *hresult,*CPU_Result,*host_MatC;

/* vlength: number of elements per vector, passed to the kernel as `width`.
 * NOTE(review): the globals `count` and `blockWidth` are not read by any code visible in this file. */
int   vlength , count = 0;
int   blockWidth;

/* Events recorded around the kernel launch in main to measure elapsed time. */
cudaEvent_t start,stop;
/* Properties of the device in use: filled in by deviceQuery, read by launch_kernel. */
cudaDeviceProp deviceProp;

/* size: vector length, initialised to SIZE and read by main.
 * NOTE(review): no code visible in this file reads the global `device_Count`. */
int device_Count,size=SIZE;

/*
 * Kernel: element-wise vector addition, result[i] = vect1[i] + vect2[i].
 *
 *   vect1, vect2 : device pointers to the input vectors  (at least width elements)
 *   result       : device pointer to the output vector   (at least width elements)
 *   width        : number of elements to add; nothing is written when width <= 0
 *
 * Launch layout : any 1D/2D/3D grid of any 1D/2D/3D blocks. Each thread derives a
 *                 unique flat id and then walks the vector in a grid-stride loop,
 *                 so correctness does not depend on the launch configuration.
 * Shared memory : none, hence no block-level barrier is required.
 */
__global__ void vectvectadd(double *vect1,double *vect2,double *result,int width)
{
   /* 64-bit arithmetic so that ids and the stride cannot overflow on large launches */
   const long long threads_per_block    = (long long)blockDim.x * blockDim.y * blockDim.z;
   const long long total_blocks_in_grid = (long long)gridDim.x * gridDim.y * gridDim.z;

   /* position of this thread inside its block (x fastest, then y, then z) */
   const long long thread_in_block = threadIdx.x
                                   + (long long)threadIdx.y * blockDim.x
                                   + (long long)threadIdx.z * blockDim.x * blockDim.y;

   /* position of this block inside the grid (x fastest, then y, then z) */
   const long long block_in_grid = blockIdx.x
                                 + (long long)blockIdx.y * gridDim.x
                                 + (long long)blockIdx.z * gridDim.x * gridDim.y;

   /* every block owns threads_per_block consecutive ids; the block offset must
      therefore be scaled by threads_per_block for ids to be unique */
   const long long stride = threads_per_block * total_blocks_in_grid;

   for (long long tid = block_in_grid * threads_per_block + thread_in_block; tid < width; tid += stride)
   {
      result[tid] = vect1[tid] + vect2[tid];
   }
}

/*
 * Check for safe return of all calls to the device.
 *
 * CUDA_SAFE_CALL(call) evaluates `call` exactly once. If the resulting
 * cudaError_t is not cudaSuccess, the numeric code, its description and the
 * file/line of the *call site* are written to stderr and the program exits
 * with status -1. On success it does nothing.
 *
 * It is a macro so that __FILE__/__LINE__ expand where the failing call is
 * written; a plain function would always report its own line.
 */
static void cuda_safe_call_report(cudaError_t ret, const char *file, int line)
{
        if (ret == cudaSuccess)
                return;

        fprintf(stderr, " ERROR at %s:%d : %d (%s)\n", file, line, (int)ret, cudaGetErrorString(ret));
        exit(-1);
}

#define CUDA_SAFE_CALL(call) cuda_safe_call_report((call), __FILE__, __LINE__)


/*
 * Get the number of GPU devices present on the host.
 *
 * Returns the device count reported by cudaGetDeviceCount, or 0 when the
 * query itself fails (a diagnostic is written to stderr in that case), so
 * the caller never sees an uninitialised value.
 */

int get_DeviceCount()
{
	int count = 0;
	cudaError_t status = cudaGetDeviceCount(&count);

	if (status != cudaSuccess)
	{
		fprintf(stderr, "cudaGetDeviceCount failed: %s\n", cudaGetErrorString(status));
		return 0;
	}
	return count;
}

/*
 * Calculate Gflops.
 *
 *   Tsec : elapsed time in seconds
 *   size : number of vector elements
 *
 * Returns 1.0e-9 * (10 * size) / Tsec. Despite its name the function prints
 * nothing; the caller is responsible for output.
 */

double print_Gflops_rating(float Tsec,int size)
{
	/* NOTE(review): the operation count is taken as 10 * size; confirm that this
	   factor is intended for a vector addition. */
	const double operations     = 1.0 * size * 10;
	const double ops_per_second = operations / Tsec;

	return 1.0e-9 * ops_per_second;
}

/*
 * Function to check block and grid dimensions.
 *
 *   devProp  : properties of the device the kernel will run on
 *   blockDim : requested threads-per-block layout
 *   gridDim  : requested blocks-per-grid layout
 *
 * The limits in cudaDeviceProp are inclusive maxima, so a dimension equal to
 * the limit is accepted. In addition to the per-axis limits, the total number
 * of threads in a block is checked against maxThreadsPerBlock. On any
 * violation a message is written to stderr and the program exits with -1.
 */

void check_block_grid_dim(cudaDeviceProp devProp,dim3 blockDim,dim3 gridDim)
{
        const unsigned long long threadsPerBlock =
                (unsigned long long)blockDim.x * blockDim.y * blockDim.z;

        if( blockDim.x > (unsigned int)devProp.maxThreadsDim[0] ||
            blockDim.y > (unsigned int)devProp.maxThreadsDim[1] ||
            blockDim.z > (unsigned int)devProp.maxThreadsDim[2] )
        {
                fprintf(stderr,"\nBlock Dimensions exceed the maximum limits:%d * %d * %d \n",devProp.maxThreadsDim[0],devProp.maxThreadsDim[1],devProp.maxThreadsDim[2]);
                exit(-1);
        }

        /* each axis can be within range while the product is still too large */
        if( threadsPerBlock > (unsigned long long)devProp.maxThreadsPerBlock )
        {
                fprintf(stderr,"\nThreads per block (%llu) exceed the maximum limit:%d \n",threadsPerBlock,devProp.maxThreadsPerBlock);
                exit(-1);
        }

        if( gridDim.x > (unsigned int)devProp.maxGridSize[0] ||
            gridDim.y > (unsigned int)devProp.maxGridSize[1] ||
            gridDim.z > (unsigned int)devProp.maxGridSize[2] )
        {
                fprintf(stderr,"\nGrid Dimensions exceed the maximum limits:%d * %d * %d \n",devProp.maxGridSize[0],devProp.maxGridSize[1],devProp.maxGridSize[2]);
                exit(-1);
        }
}


/*
 * Function to launch the kernel.
 *
 * Uses a BLOCKSIZE x BLOCKSIZE thread block and a one-dimensional grid with
 * enough blocks to give every vector element its own thread (at least one
 * block, and kept strictly below the device's grid-x limit). The launch
 * configuration is validated first, and a failed launch is reported through
 * CUDA_SAFE_CALL instead of passing silently.
 *
 * Reads the globals dMatA, dMatB, dresult, vlength and deviceProp.
 */

void launch_kernel()
{
	 const int threadsPerBlock = BLOCKSIZE * BLOCKSIZE;

	 /* ceil(vlength / threadsPerBlock), computed in 64 bits to avoid overflow */
	 long long blocks = ((long long)vlength + threadsPerBlock - 1) / threadsPerBlock;
	 const long long blockLimit = (long long)deviceProp.maxGridSize[0] - 1;
	 if (blocks > blockLimit)
		 blocks = blockLimit;
	 if (blocks < 1)
		 blocks = 1;	/* a grid with zero blocks is an invalid configuration */

	 dim3 dimBlock(BLOCKSIZE, BLOCKSIZE);
	 dim3 dimGrid((unsigned int)blocks, 1);
	 check_block_grid_dim(deviceProp,dimBlock,dimGrid);

	 printf("calling kernel\n");
	 vectvectadd<<<dimGrid, dimBlock>>>(dMatA, dMatB, dresult,vlength );
	 /* a kernel launch returns no status itself; fetch launch errors explicitly */
	 CUDA_SAFE_CALL(cudaGetLastError());
	 printf("after kernel call\n");
}

/*
 * Function for device information.
 *
 * Prints the number of CUDA devices, selects device 0 and stores its
 * properties in the global deviceProp. The program exits with -1 when no
 * device is available or when any of the runtime calls fails.
 */

void deviceQuery()
{
        const int numDevices = get_DeviceCount();
        printf("\n\nNUmber of Devices : %d\n\n", numDevices);

        if (numDevices <= 0)
        {
                fprintf(stderr, "No CUDA-capable device found\n");
                exit(-1);
        }

        CUDA_SAFE_CALL(cudaSetDevice(0));

        int device = 0;
        CUDA_SAFE_CALL(cudaGetDevice(&device));
        CUDA_SAFE_CALL(cudaGetDeviceProperties(&deviceProp,device));
        printf("Using device %d: %s \n", device, deviceProp.name);
}

/*
 * Report a failed host allocation and abort.
 *
 *   arrayname : name of the array that could not be allocated
 *   benchmark : name of the benchmark being run
 *   len       : number of elements that were requested
 *   type      : element type name, e.g. "double"
 *
 * Writes the diagnostic to stderr and never returns (exits with -1). The
 * string parameters are const because callers pass string literals.
 */
void mem_error(const char *arrayname, const char *benchmark, int len, const char *type)
{

        fprintf(stderr,"\nMemory not sufficient to allocate for array %s\n\tBenchmark : %s  \n\tMemory requested = %d number of %s elements\n",arrayname, benchmark, len, type);
        fprintf(stderr,"\n\tAborting\n\n");
        exit(-1);
}

/*
 * Fill the first `size` entries of `vec` with double precision values
 * drawn from drand48(). Nothing is written when size <= 0.
 */
void fill_dp_vector(double* vec,int size)
{
        double *out = vec;

        for (int left = size; left > 0; --left)
        {
                *out = drand48();
                ++out;
        }
}

/*
 * Prints the result on screen.
 *
 *   program_name : title shown in the banner line
 *   tsec         : elapsed time in seconds
 *   gflops       : computed Gflops rating
 *   size         : problem size
 *   flag         : 1 if tsec/gflops have been calculated; any other value
 *                  prints "---" placeholders in their place
 */

void print_on_screen(const char * program_name,float tsec,double gflops,int size,int flag)
{
        printf("\n---------------%s----------------\n",program_name);
        printf("\tSIZE\t TIME_SEC\t Gflops\n");
        if(flag==1)
                printf("\t%d\t%f\t%lf\t",size,tsec,gflops);
        else
                /* placeholders are strings, so they need %s (passing them to %lf is undefined) */
                printf("\t%d\t%s\t%s\t",size,"---","---");

}

/*
 * Release `len` device allocations whose pointers are stored in `arr`.
 * Every cudaFree result goes through CUDA_SAFE_CALL; a confirmation line
 * is printed once all entries have been processed.
 */
void dfree(double * arr[],int len)
{
        int idx = 0;

        while (idx < len)
        {
                CUDA_SAFE_CALL(cudaFree(arr[idx]));
                ++idx;
        }

        printf("mem freed\n");
}


/*
 * Driver: adds two random vectors of SIZE doubles on the GPU, verifies the
 * result against a host computation, and prints the elapsed time and the
 * Gflops rating.
 *
 * Returns 0 when the GPU result matches the host result, 1 otherwise.
 */
int main()
{
	double *array[3];
	float elapsedTime = 0.0f;
	double Tsec;
	double gflops;
	int mismatch = -1;	/* index of the first wrong element, -1 if none */

	deviceQuery();

	vlength=SIZE;
	CUDA_SAFE_CALL(cudaEventCreate (&start));
	CUDA_SAFE_CALL(cudaEventCreate (&stop));

	/* allocation host memory */
	hMatA = (double*) malloc( vlength * sizeof(double));
	hMatB = (double*) malloc( vlength * sizeof(double));
	hresult = (double*) malloc( vlength * sizeof(double));

	if(hMatA==NULL)
		mem_error("hMatA","vectvectadd",vlength,"double");

	if(hMatB==NULL)
		mem_error("hMatB","vectvectadd",vlength,"double");

	if(hresult==NULL)
		mem_error("hresult","vectvectadd",vlength,"double");

	/* allocation device memory */
	CUDA_SAFE_CALL(cudaMalloc( (void**)&dMatA, vlength * sizeof(double)));
	CUDA_SAFE_CALL(cudaMalloc( (void**)&dMatB, vlength * sizeof(double)));
	CUDA_SAFE_CALL(cudaMalloc( (void**)&dresult, vlength*sizeof(double)));

	/* record the device pointers only now that they are valid; capturing them
	   before cudaMalloc would hand NULLs to dfree and leak the buffers */
	array[0]=dMatA;
	array[1]=dMatB;
	array[2]=dresult;

	fill_dp_vector(hMatA,vlength);
	fill_dp_vector(hMatB,vlength);

	CUDA_SAFE_CALL(cudaMemcpy((void*)dMatA, (void*)hMatA, vlength* sizeof(double) , cudaMemcpyHostToDevice ));
	CUDA_SAFE_CALL(cudaMemcpy((void*)dMatB, (void*)hMatB, vlength* sizeof(double) , cudaMemcpyHostToDevice ));

	/* calling device kernel; the stop event is recorded immediately after the
	   launch so that host-side printing does not inflate the measurement */
	CUDA_SAFE_CALL(cudaEventRecord (start, 0));
	launch_kernel();
	CUDA_SAFE_CALL(cudaEventRecord (stop, 0));
	CUDA_SAFE_CALL(cudaGetLastError());	/* surface launch-configuration errors */
	printf("after launch kernel\n");
	CUDA_SAFE_CALL(cudaEventSynchronize (stop));
	printf("before memcpy\n");
	CUDA_SAFE_CALL(cudaMemcpy((void*)hresult, (void*)dresult,vlength*sizeof(double) , cudaMemcpyDeviceToHost ));
	printf("\n ----------------------------------------------------------------------");

	/* verify against the host result; written as !(diff <= EPS) so that a NaN
	   in the device output is reported as a mismatch as well */
	for(int i=0;i<vlength;i++)
	{
		const double expected = hMatA[i]+hMatB[i];
		if(!(fabs(hresult[i]-expected) <= EPS))
		{
			printf("error%d\n",i);
			mismatch = i;
			break;
		}
	}
	if(mismatch < 0)
		printf("\n cpu gpu results matched\n");
	else
		printf("\n cpu gpu results did NOT match\n");

	/* computing elapsed time */
	CUDA_SAFE_CALL(cudaEventElapsedTime(&elapsedTime, start, stop));
	Tsec = elapsedTime *1.0e-3; //time in sec now

	/* calculation of Gflops */
	gflops = print_Gflops_rating(Tsec,vlength);

	/* printing the result on screen */
	print_on_screen("vect vect ADDITION",Tsec,gflops,size,1);

	/* free the memory */
	dfree(array,3);
	free(hMatA);
	free(hMatB);
	free(hresult);
	CUDA_SAFE_CALL(cudaEventDestroy(start));
	CUDA_SAFE_CALL(cudaEventDestroy(stop));

	return (mismatch < 0) ? 0 : 1;
}