



hyPACK-2013 GPGPU OpenCL Prog. using AMD-APP

AMD Accelerated Parallel Processing (AMD APP) software harnesses the processing power of GPUs for high-performance, data-parallel computing in a wide range of applications. The AMD Accelerated Parallel Processing system includes a software stack and the AMD GPUs. The AMD APP SDK, available for x86-based CPUs, provides a complete heterogeneous OpenCL development platform for both the CPU and the GPU. The software includes the OpenCL compiler and runtime, the device driver for the GPU compute device, the AMD performance profiling tools (AMD APP Profiler and AMD APP KernelAnalyzer), and performance libraries (AMD Core Math Library, ACML). Please refer to the AMD Accelerated Parallel Processing (AMD APP) OpenCL Programming Guide to understand the relationship between the AMD Accelerated Parallel Processing components.

(Download README       Makefile )

Courtesy : References & Web-Pages : GPGPU & GPU Computing       Web-sites


Introduction - OpenCL Standard 1.2

OpenCL is an industry-standard framework for programming computers composed of a combination of CPUs, GPUs, and other processors. Such systems are called heterogeneous computing platforms, and they address the programming of complex applications in scientific, engineering, and commercial domains. The OpenCL standard was first released in December 2008 and is a relatively new technology.

OpenCL provides an opportunity for developers to effectively use the multiple heterogeneous compute resources of their CPUs, GPUs, and other processors. OpenCL supports a wide range of applications, from embedded and consumer software to HPC solutions, through a low-level, high-performance, portable abstraction. It is expected that OpenCL will form the foundation layer of a parallel programming eco-system of platform-independent tools, middleware, and applications.

OpenCL was created by the Khronos Group with the participation of many industry-leading companies such as AMD, Apple, IBM, Intel, Imagination Technologies, Motorola, NVIDIA and others. OpenCL is the first open, royalty-free standard for cross-platform, parallel programming of modern processors found in personal computers, servers and handheld/embedded devices. OpenCL (Open Computing Language) greatly improves speed and responsiveness for a wide spectrum of applications in areas ranging from gaming and entertainment to scientific and medical software.

OpenCL programs can run on a wide range of computing systems, from cell phones and laptops with multi-core processors to nodes in massive supercomputers. OpenCL provides a uniform programming environment for software developers to write efficient, portable code for high-performance compute servers, desktop computer systems and handheld devices using a diverse mix of multi-core CPUs, GPUs, Cell-type architectures and other parallel processors such as DSPs. OpenCL is a framework for parallel programming and includes a language, API, libraries, and runtime system to support software development. Using OpenCL, a programmer can write general-purpose programs that execute on GPUs without the need to map their algorithms onto a 3D graphics API such as OpenGL.

Most importantly, OpenCL delivers high levels of portability by exposing the hardware, not by hiding it behind elegant abstractions. OpenCL programmers explicitly define the platform, its context, and how work is scheduled onto different devices. The OpenCL framework can be downloaded from http://www.khronos.org/opencl/




Conceptual Foundations of OpenCL

Mapping common programming models such as data parallelism and task parallelism onto the real hardware of heterogeneous computing systems is a challenging task: the computational elements in the system may have different instruction sets and different memory architectures and may run at different speeds. An effective program must map the parallel software onto the most suitable OpenCL devices of the heterogeneous platform. OpenCL programming exposes this heterogeneity rather than hiding it behind abstractions: high-level frameworks simplify the programming problem by mapping onto high-level languages, which in turn map onto a low-level hardware abstraction layer for portability. OpenCL supports a wide range of applications that run on heterogeneous computing systems, and the following steps are carried out:

  • Discover the components that make up the heterogeneous system

  • Probe the characteristics of these components, so that the software can adapt to the specific features of different hardware elements

  • Create the blocks of instructions (kernels) that will run on the platform

The above steps are accomplished through a series of APIs inside OpenCL, plus a programming environment for the kernels. The OpenCL specification is divided into four models: the platform model, the execution model, the memory model, and the programming model.

OpenCL - Platform Model

OpenCL Platform model is a high-level description of the heterogeneous system. The model specifies that there is one processor coordinating execution (the host) and one or more processors capable of executing OpenCL C code (the devices). It defines an abstract hardware model that is used by programmers when writing OpenCL C functions (called kernels) that execute on the devices. The platform model defines a relationship between the host and the device.

The host is connected to one or more OpenCL devices. The device is where the stream of instructions (or kernel) executes; an OpenCL device is referred to as a compute device. A device can be an x86 CPU, a GPU, a DSP, or any other processor provided by the hardware and supported by the OpenCL vendor. Platforms can be thought of as vendor-specific implementations of the OpenCL API.

The platform model also presents an abstract device architecture that programmers target when writing OpenCL C code. Vendors map this abstract architecture to the physical hardware. OpenCL devices are divided into compute units, which are further divided into one or more processing elements (PEs). Each compute unit is functionally independent from the rest. This platform device model closely corresponds to the hardware model of some GPUs.

  • The Platform model consists of a host connected to one or more OpenCL devices.

  • An OpenCL device is divided into one or more compute units (CUs), which are further divided into one or more processing elements (PEs). Computations on a device occur within the processing elements.

  • An OpenCL application runs on the host according to the models native to the host platform.

  • The OpenCL application submits commands from the host to execute computations on the processing elements within a device.

  • The processing elements within a compute unit execute a single stream of instructions as SIMD units or SPMD units.

For example, the AMD FireStream 9350 GPU card comprises 18 SIMD engines (compute units), each containing 16 thread processors (processing elements), for a total of 1440 stream cores, and delivers a peak of 2.0 TFLOPS in single-precision floating point. See the FireStream 9350 GPU Data Sheet.

The API function clGetPlatformIDs() is used to discover the set of available platforms for a given system.


  cl_int
  clGetPlatformIDs(cl_uint num_entries,
                          cl_platform_id *platforms,
                          cl_uint *num_platforms)

The API function clGetPlatformIDs() is typically called twice: first with platforms set to NULL to obtain the number of available platforms, and then again with an array of that size to retrieve the platform IDs. After the platforms have been discovered, the API function clGetPlatformInfo() can be used to determine which implementation (vendor) defined each platform.

The API function clGetDeviceIDs() can be called to discover the devices of a platform. It works similarly to clGetPlatformIDs(), but takes the additional arguments of a platform and a device type. The device_type argument can be used to limit the devices to GPUs only (CL_DEVICE_TYPE_GPU), to CPUs only (CL_DEVICE_TYPE_CPU), to all available devices (CL_DEVICE_TYPE_ALL), as well as other options.

The API call clGetDeviceInfo() is called to retrieve information such as name, type, and vendor from each device.



  cl_int
  clGetDeviceIDs(cl_platform_id platform,
                          cl_device_type device_type,
                          cl_uint num_entries,
                          cl_device_id *devices,
                          cl_uint *num_devices)



The CLInfo program in the AMD APP SDK uses the clGetPlatformInfo() and clGetDeviceInfo() calls to print detailed information about the OpenCL platforms and devices supported in a system. A snippet of the output from the CLInfo program is given below.


OpenCL CLInfo Program print-out :

Feature Description
Number of Platforms : 1
Platform Profile : FULL_PROFILE
Platform Version : OpenCL 1.1 AMD-APP SDK-v2.4
Platform Name : AMD Accelerated Parallel Processing
Platform Vendor : Advanced Micro Devices, Inc.
No. of devices : 2
Device Type : CL_DEVICE_TYPE_GPU
Name : Cypress
Max compute units : 20
Address bits : 32
Max Memory allocations : 268435456
Global Memory size : 1073741824
Constant buffer size : 65536
Local Memory type : Scratchpad
Local Memory size : 32768
Device endianness : Little

Device Type : CL_DEVICE_TYPE_CPU
Max Compute units : 16
Name : AMD Phenom(tm) II X4 945 Processor


OpenCL - Execution Model

An OpenCL application consists of a host program and a collection of one or more kernels. The host program runs on the host; OpenCL defines how it interacts with the objects defined within OpenCL. The kernels execute on the OpenCL devices and do the real work of the application.

  • Execution of an OpenCL program occurs in two parts: a host program that executes on the particular host platform and kernels that execute on one or more OpenCL devices.

  • The core of the OpenCL execution model is defined by how the kernels execute. Each instance of a kernel is called a work-item, and work-items are organized into work-groups.

  • Execution Model: Context and Command Queues - The host defines a context for the execution of the kernels. The context includes

  • Devices : the collection of OpenCL devices to be used by the host;
    Kernels : the OpenCL functions that run on OpenCL devices;
    Program Objects : the program source and executables that implement the kernels;
    Memory Objects : a set of memory objects visible to the host and the OpenCL devices. Memory objects contain values that can be operated on by instances of a kernel.

Kernels are typically simple functions that transform input memory objects into output memory objects. OpenCL defines two types of kernels:

  • OpenCL Kernels : functions written with the OpenCL programming language and compiled with the OpenCL compiler. An OpenCL implementation must support OpenCL kernels.

  • Native Kernels : functions created outside of OpenCL and accessed within OpenCL through a pointer. These functions could be functions defined in the host source code or exported from a specialized library.

    The OpenCL execution model defines how the kernels execute, and it can be explained in several parts.

    First, we explain how an individual kernel runs on an OpenCL device; then we describe how the host defines the context for kernel execution and how the kernels are enqueued for execution.

How a kernel is executed on an OpenCL Device

A kernel is defined on the host. The host program issues a command that submits the kernel for execution on an OpenCL device. When this command is issued by the host, the OpenCL runtime system creates an integer index space. An instance of the kernel executes for each point in this index space. We call each instance of an executing kernel a work-item, which is identified by its coordinates in the index space. These coordinates are the global ID of the work-item.

  • The command that submits a kernel for execution, therefore, creates a collection of work-items, each of which uses the same sequence of instructions defined by a single kernel.

  • Each work-item selects its data through its global ID, and work-items are organized into work-groups. Work-groups are assigned a unique ID with the same dimensionality as the index space used for the work-items. Work-items are assigned a unique local ID within a work-group, so that a work-item can be uniquely identified either by its global ID or by the combination of its local ID and its work-group ID.

  • The work-items in a given work-group execute concurrently on the processing elements of a single compute unit. Understanding "concurrency" (streams of operations performed in an independent fashion) is important here: an implementation is free to serialize the execution of kernels, and even to serialize the execution of work-groups within a single kernel execution. OpenCL only assures that the work-items within a work-group execute concurrently (and share processor resources on the device). You can never assume that work-groups or kernel invocations execute concurrently.

  • When a kernel is executed, the programmer specifies the number of work-items that should be created as an n-dimensional range (NDRange). The index space spans an N-dimensional range of values and is thus called an NDRange. An NDRange is a one-, two-, or three-dimensional index space of work-items that will often map to the dimensions of either the input or the output data. Inside an OpenCL program, an NDRange is defined by an integer array of length N specifying the size of the index space in each dimension. Each work-item's global and local ID is an N-dimensional tuple.

  • Work-groups are assigned IDs using a similar approach to that used for work-items. An array of length N defines the number of work-groups in each dimension. OpenCL requires that the number of work-groups in each dimension evenly divides the size of the NDRange index space in each dimension, so that all work-groups are full and of the same size.

Table : Work-items and work-groups :

S.No.  Description

1. work-item :
The unit of concurrent execution in OpenCL C is a work-item. (For example, a single iteration of a for loop in typical multi-threaded code can be mapped to a work-item.) The OpenCL runtime generates as many work-items as there are elements in the input and output arrays and maps those work-items to the underlying hardware (CPU or GPU). (Conceptually, this is very similar to the parallelism inherent in a functional map operation, or to a data-parallel for loop in a model such as OpenMP.)

2. Identification of a work-item :
When an OpenCL device begins executing a kernel, it provides intrinsic functions that allow a work-item to identify itself. For example, calling get_global_id(0) gives the position of the current work-item, which in the loop example above recovers the loop counter.

3. work-groups (N-dimensional range, NDRange) :
Each work-item selects its data through its global ID, and work-items are organized into work-groups. Work-groups are assigned a unique ID with the same dimensionality as the index space used for the work-items. Work-items are assigned a unique local ID within a work-group, so that a work-item can be uniquely identified either by its global ID or by the combination of its local ID and its work-group ID.

4. Execution in fine-grained work-items (N-dimensional range, NDRange) :
OpenCL describes execution in terms of fine-grained work-items and can dispatch vast numbers of work-items on architectures with hardware support for fine-grained threading. Scalability is achieved through this support for large numbers of work-items.

  • When a kernel is executed, the programmer specifies the number of work-items that should be created as an n-dimensional range (NDRange).

  • An NDRange is a one-, two- or three-dimensional index space of work-items that will often map to the dimensions of either the input or the output data.

  • The dimensions of the NDRange are specified as an N-element array of type size_t, where N represents the number of dimensions used to describe the work-items being created.




  • 2D NDRange Example
    • Consider a 2D NDRange example. We use the lowercase letter “g” for the global ID of a work-item in each dimension, given by a subscript x or y. An uppercase letter “G” indicates the size of the index space in each dimension. Work-groups are assigned IDs using a similar approach to that used for work-items: an array of length “N” defines the number of work-groups in each dimension. Hence each work-item has a coordinate “(gx, gy)“ in a global NDRange index space of size “(Gx, Gy)“, where gx takes on the values [0, 1, 2, ....., (Gx-1)] and gy the values [0, 1, 2, ....., (Gy-1)].

      We divide the NDRange index space into work-groups. OpenCL requires that the number of work-groups in each dimension evenly divides the size of the NDRange index space in each dimension, so that all work-groups are full and of the same size. We use a lowercase “w” for the work-group ID and an uppercase “W” for the number of work-groups in each dimension. The work-group size in each direction (x and y in our 2D example) defines a local index space for each work-item; we refer to this index space inside a work-group as the local index space. The size of the local index space in each dimension (“x” and “y”) is indicated with an uppercase “L”, and the local ID inside a work-group uses a lowercase “l”.

      Assume that an NDRange index space of size Gx by Gy is divided into work-groups indexed over a Wx-by-Wy space with indices (wx, wy). Each work-group is of size Lx by Ly, where:

      Lx = Gx / Wx
      Ly = Gy / Wy

      We can define a work-item by its global ID “(gx, gy)“ or by the combination of its local ID “(lx, ly)“ in a work-group and the work-group ID “(wx, wy)“ :

      gx = wx * Lx + lx
      gy = wy * Ly + ly


      Also, the local ID and work-group ID can be derived as follows :

      wx = gx / Lx ;                 wy = gy / Ly

      lx = gx % Lx ;                 ly = gy % Ly

      The above equations use integer division (division with truncation) and the modulus, or “integer remainder”, operation (%). They assume that the index space starts at zero in each dimension. OpenCL also provides the option to define an offset for the starting point of the global index space. The offset is defined for each direction (x, y), and we use a lowercase “o” for the offset. With the inclusion of a non-zero offset (ox, oy), the final equations are as follows :

      gx = wx * Lx + lx + ox
      gy = wy * Ly + ly + oy

      The OpenCL execution model is quite flexible and supports a wide range of programming models.



    Execution Model : Context

    In OpenCL, the host defines the kernels, and the host establishes the context for the kernels. The host also defines the NDRange and the queues that control the details of how and when the kernels execute. The first task for the host is to define the context for the OpenCL application.

    The context defines the environment within which the kernels are defined and execute. The context is an abstract container that exists on the host: it coordinates the mechanisms for host-device interaction, manages the memory objects that are available to the devices, and keeps track of the programs and kernels that are created for each device. A context is defined in terms of the following resources.

    • Devices : the collection of OpenCL devices to be used by the host

    • Kernels : the OpenCL functions that run on the OpenCL device.

    • Program Objects : the program source code and executable that implement the kernels

    • Memory Objects : a set of objects in memory that are visible to OpenCL devices and contain values that can be operated on by instances of a kernel.

    The context is created and manipulated by the host using functions from the OpenCL API, in particular the API function clCreateContext(). The properties argument of clCreateContext() is used to restrict the scope of the context: it may select a specific platform, enable graphics interoperability, or enable other parameters in the future.

    A programmer is required to provide contexts for multiple platforms to fully utilize a system comprising resources from a mixture of vendors, and the number and IDs of the devices that the programmer wants to associate with the context must be supplied. OpenCL allows user callbacks to be provided when creating a context; these can be used to report additional error information that may be generated. In OpenCL, the process of discovering platforms and devices and setting up a context is tedious. However, after the code to perform these steps is written once, it can be reused for almost any project.

    OpenCL Context API

      cl_context
      clCreateContext
    (const cl_context_properties *properties,
                      cl_uint num_devices,
                      const cl_device_id *devices,
                      void (CL_CALLBACK *pfn_notify) (
                            const char *errinfo,
                            const void *private_info,
                            size_t cb,
                            void *user_data),
                      void *user_data,
                      cl_int *errcode_ret)

    The OpenCL specification also provides an API call that alleviates the need to build a list of devices: clCreateContextFromType() allows a programmer to create a context that automatically includes all devices of the specified type (e.g., CPUs, GPUs, or all devices). After creating a context, the function clGetContextInfo() can be used to query information such as the number of devices present and the device structures. The code for discovering platforms and devices and setting up a context can be re-used across many applications.
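Putting the discovery and context steps together, host code typically looks like the following sketch. It is a hedged illustration, not a tested program: error handling is elided, and compiling it requires an OpenCL SDK and linking against the OpenCL library on a machine with a suitable device.

```c
#include <CL/cl.h>
#include <stdio.h>

int main(void) {
    cl_int err;

    /* Discover one platform (a real program would use the two-call idiom). */
    cl_platform_id platform;
    err = clGetPlatformIDs(1, &platform, NULL);

    /* Pick one GPU device on that platform. */
    cl_device_id device;
    err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL);

    /* The properties list restricts the context to the chosen platform. */
    cl_context_properties props[] = {
        CL_CONTEXT_PLATFORM, (cl_context_properties)platform, 0 };
    cl_context context = clCreateContext(props, 1, &device,
                                         NULL, NULL, &err);

    printf("context %s\n", err == CL_SUCCESS ? "created" : "failed");
    if (err == CL_SUCCESS)
        clReleaseContext(context);
    return 0;
}
```

Passing NULL for the pfn_notify callback is permitted; a production program would supply one to receive the additional error information mentioned above.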



    Execution Model : Command-Queues

    The host program issues commands to the OpenCL devices. The host posts these commands, which mediate the interaction between the host and the OpenCL devices, to a command-queue. The commands wait in the command-queue until they execute on the OpenCL device. Communication with a device occurs by the host submitting commands to a command queue; the command queue is the mechanism that the host uses to request action by the devices.
    A command queue is created by the host and attached to a single OpenCL device after the context has been defined. Each command queue is associated with only one device. Whenever the host needs an action to be performed by a device, it submits commands to the proper command queue. The API function clCreateCommandQueue() is used to create a command queue and associate it with a device.

    OpenCL command-queue

      cl_command_queue
      clCreateCommandQueue
    (
                            cl_context context,
                            cl_device_id device,
                            cl_command_queue_properties properties,
                            cl_int *errcode_ret)

    The host places commands into the command-queue, and the commands are then scheduled for execution on the associated device. OpenCL supports three types of commands :

    • Kernel Execution commands : executes a kernel on the processing elements of an OpenCL device

    • Memory commands : transfer data between the host and device memory objects, move data between memory objects, or map and unmap memory objects from the host address space.

    • Synchronization commands : put constraints on the order in which commands execute.

    In the host program, it is the programmer's responsibility to define the context and the command queue, to define the memory and program objects, and to build the data structures needed on the host to support the application.

    Before the program issues commands to the OpenCL devices, memory objects are moved from the host onto the devices. The kernel arguments are then associated with memory objects, and the kernels are submitted to the command-queue for execution on the device.

    When multiple kernels are submitted to the queue, the kernels may interact: memory objects generated by one set of kernels may be consumed by another set. In such situations, synchronization commands can be used to force one set of kernels to complete before the other kernels begin.

    OpenCL uses an in-order command queue by default: commands are launched in the order in which they appear in the command-queue and complete in order. If out-of-order queues are used, it is up to the user to specify dependencies that enforce a correct execution order.


    Command-Queues - Event Objects


    To support synchronization protocols, commands submitted to a command-queue generate event objects. Any operation that enqueues a command into a command queue, that is, any API call that begins with clEnqueue, generates an event. Events represent dependencies and provide a mechanism for profiling. API calls that begin with clEnqueue also take a “wait list” of events as a parameter; the enqueued command will not execute until all the events in its wait list have completed.



    Command-Queues - Flush & Finish


    The flush and finish commands are two coarse-grained synchronization operations for a command queue. The clFinish() function blocks until all of the commands in a command queue have completed. The clFlush() function blocks until all of the commands in a command queue have been issued from the queue to the device.



    Memory Model

    The OpenCL memory model covers memory objects, which are created within a context, manipulated through command-queues, and shared between the host and kernels. OpenCL defines two types of memory objects: buffer objects and image objects. Buffers are equivalent to arrays in C created using malloc(): data elements are stored contiguously in memory. A programmer can map data structures onto a buffer and access the buffer through pointers.
    Whenever a memory object is created, it is valid only within a single context. Movement to and from specific devices is managed by the OpenCL runtime as necessary to satisfy data dependencies.

    The API function clCreateBuffer() allocates the buffer and returns a memory object.

    Creating a buffer requires supplying the size of the buffer and a context in which the buffer will be allocated; the buffer is then visible to all devices associated with that context.

      cl_mem clCreateBuffer(
                            cl_context context,
                            cl_mem_flags flags,
                            size_t size,
                            void *host_ptr,
                            cl_int *errcode_ret)

    The OpenCL API calls for reading and writing buffers are similar; the write-buffer call is described below.


      cl_int
      clEnqueueWriteBuffer
    (
                            cl_command_queue command_queue,
                            cl_mem buffer,
                            cl_bool blocking_write,
                            size_t offset,
                            size_t cb,
                            const void *ptr,
                            cl_uint num_events_in_wait_list,
                            const cl_event *event_wait_list,
                            cl_event *event)
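The two calls above combine as in the following sketch. It is a hedged, untested illustration (it needs an OpenCL runtime): the helper name write_input() is invented here, and the context and queue are assumed to have been created as shown earlier.

```c
/* Allocate a device buffer and copy host data into it with a blocking
 * write. Returns the OpenCL error code; the buffer handle is returned
 * through 'buf_out'. */
cl_int write_input(cl_context context, cl_command_queue queue,
                   const float *host_data, size_t nbytes, cl_mem *buf_out)
{
    cl_int err;
    cl_mem buf = clCreateBuffer(context, CL_MEM_READ_ONLY,
                                nbytes, NULL, &err);
    if (err != CL_SUCCESS)
        return err;

    err = clEnqueueWriteBuffer(queue, buf,
                               CL_TRUE,        /* blocking write            */
                               0,              /* offset into the buffer    */
                               nbytes,         /* number of bytes (cb)      */
                               host_data,
                               0, NULL, NULL); /* no event wait list        */
    *buf_out = buf;
    return err;
}
```

Because blocking_write is CL_TRUE, the call returns only after host_data has been copied, so the host array may be reused immediately; with CL_FALSE the returned event would be used to track completion.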


    Image Objects

    Images are a type of OpenCL memory object that abstracts the storage of physical data to allow for device-specific optimizations. The OpenCL framework provides functions to manipulate images. Images are an example of the OpenCL standard being dependent on the underlying hardware of a particular device. The API function clCreateImage2D() or clCreateImage3D() is used to create an OpenCL image.



      cl_mem
      clCreateImage2D
    (
                            cl_context context,
                            cl_mem_flags flags,
                            const cl_image_format *image_format,
                            size_t image_width,
                            size_t image_height,
                            size_t image_row_pitch,
                            void *host_ptr,
                            cl_int *errcode_ret)

    The OpenCL memory model defines five distinct memory regions in addition to memory objects. These memory spaces are relevant within OpenCL programs. The keywords associated with each space can be used to specify where a variable should be created or where the data that a pointer references resides.

    • Host memory : the host manages this memory, and OpenCL defines how the host memory interacts with OpenCL objects and constructs.

    • Global memory : is visible to all compute units on the device (similar to main memory on a CPU-based host system). Whenever data is transferred from the host to the device, the data resides in global memory; any data that is to be transferred from the device back to the host must also reside in global memory. This memory region permits read/write access to all work-items in all work-groups: work-items can read from or write to any element of a memory object in global memory. Reads and writes in global memory may be cached, depending on the capabilities of the device. The keyword __global is added to a pointer declaration to specify that the data referenced by the pointer resides in global memory.

    • Constant memory : is “read-only” memory accessed simultaneously by all work-items. This region of global memory remains constant during the execution of a kernel. The host allocates and initializes the memory objects placed into constant memory. Constant memory is modeled as a part of global memory, so memory objects that are transferred to global memory can be specified as constant. Data is mapped to constant memory by using the __constant keyword.

    • Local Memory : is a “scratchpad memory” whose address space is unique to each compute unit; it may be implemented as a dedicated region of memory on the OpenCL device. Local memory is modeled as being shared by all work-items in a work-group. Calling clSetKernelArg() with a size but no argument value allows local memory to be allocated at runtime, where the corresponding kernel parameter is declared as a __local pointer (e.g. __local float *sharedData). Also, arrays can be statically declared in local memory by prefixing the declaration with the keyword __local; in this case it is necessary to specify the array size at compile time.

    • Private Memory : is memory “unique” to an individual work-item. Variables defined in a work-item's private memory are not visible to other work-items. Local variables and non-pointer kernel arguments are private by default.

    In OpenCL, the work-items run on processing elements (PEs) and have their own private memory. A work-group runs on a compute unit and shares a local memory region with the work-items in the group. The OpenCL device memory works with the host to support global memory. The interaction between host and device occurs either by explicitly copying data or by mapping and unmapping regions of a memory object. OpenCL defines a relaxed consistency model: the consistency of memory objects is defined relative to the commands on the command-queue, and memory is consistent between the host and the OpenCL device only at synchronization points on the command-queue.
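The five regions map directly onto address-space qualifiers in OpenCL C kernel source. The kernel below is a hedged sketch with an invented name (scale) and invented arguments; it is compiled at runtime by the OpenCL compiler, so it is shown here as an illustration rather than a tested program.

```c
/* OpenCL C kernel illustrating the address-space qualifiers. */
__kernel void scale(__global float *out,       /* global memory: visible to
                                                  all work-items            */
                    __constant float *coeff,   /* constant memory: read-only */
                    __local float *scratch)    /* local memory: one region
                                                  per work-group, size set on
                                                  the host via
                                                  clSetKernelArg()          */
{
    int gid = get_global_id(0);
    int lid = get_local_id(0);

    float tmp = coeff[0];        /* 'tmp' lives in private memory */
    scratch[lid] = tmp;

    /* Work-items in the same work-group synchronize on local memory. */
    barrier(CLK_LOCAL_MEM_FENCE);

    out[gid] = out[gid] * scratch[lid];
}
```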

    The table below describes, for each memory region, whether the host or the kernel can allocate it (statically, i.e. at compile time, versus dynamically, i.e. at runtime) and the type of access allowed, i.e. whether the kernel or the host can read and/or write the region.

                  Global               Constant             Local                Private
    Host          Dynamic allocation   Dynamic allocation   Dynamic allocation   No allocation
                  Read/Write access    Read/Write access    No access            No access
    Kernel        No allocation        Static allocation    Static allocation    Static allocation
                  Read/Write access    Read-only access     Read/Write access    Read/Write access
    • The application running on the host uses the OpenCL API to create memory objects in global memory, and to enqueue memory commands (refer to the OpenCL API specification) that operate on those memory objects.

    • OpenCL uses a relaxed consistency memory model: i.e. the state of memory visible to a work-item is not guaranteed to be consistent across the collection of work-items at all times.



    Programming Model

    The OpenCL programming model defines two different styles of parallelism: data parallelism and task parallelism. In the data-parallel model, the algorithm designer aligns the data structures of the problem with the NDRange index space, which is defined when the kernel is launched. These data structures are mapped onto OpenCL memory objects. The kernel defines the sequence of instructions to be applied concurrently by the work-items in an OpenCL computation.

    Work-items in a single work-group can share data, and work-items in a single work-group can participate in a work-group barrier. OpenCL 1.1 does not provide any mechanism for synchronization between work-items from different work-groups while a kernel is executing.

    The OpenCL specification defines two variants of data parallelism, i.e. explicit and implicit models. In the explicit model, the programmer takes responsibility for explicitly defining the sizes of the work-groups. In the implicit model, the programmer defines only the NDRange space and leaves it to the system to choose the work-group sizes. The programmer's responsibility in the data-parallel model is to avoid branch statements in the computation and to ensure that each work-item executes identical operations, but on a subset of data items selected by its global ID. Equivalently, the computations need to be expressed in the SIMD or SPMD style; OpenCL supports both SIMD and SPMD models of data-parallel computation.

    In the task-parallel programming model, OpenCL defines a task as a kernel that executes as a single work-item, regardless of the NDRange used by other kernels in the OpenCL application. Another form of task parallelism arises when kernels are submitted to an out-of-order queue, in which case the scheduling of work across compute units is done by the runtime. The OpenCL event model can also express task parallelism through dependences between the commands submitted to a queue; using events, the OpenCL programmer can define static task graphs to carry out task-parallel computations.

    The OpenCL programmer is free to combine OpenCL's programming models with other programming models in hybrid schemes, such as MPI, Pthreads and Intel TBB.



    Creating an OpenCL Program Object

    The OpenCL C language code which runs on an OpenCL device is called a program. A program is a collection of functions called kernels, where kernels are units of execution that can be scheduled to run on a device. OpenCL programs are compiled at runtime through a series of API calls; runtime compilation gives the system an opportunity to optimize for a specific device. OpenCL software links only to a common runtime layer (called the ICD); all platform-specific SDK activity is performed at runtime through a dynamic library interface. The process of creating a kernel is as follows.

    • The OpenCL C source code is stored in a character string.

    • The source code is turned into a program object, cl_program, by calling clCreateProgramWithSource().

    • The program object is then compiled, for one or more OpenCL devices, with clBuildProgram(). If there are compiler errors, they will be reported here.

    The precise binary representation used is very vendor-specific. OpenCL provides a function to return information about program objects, clGetProgramInfo(). OpenCL also provides clCreateProgramWithBinary(), which takes a list of binaries that matches its device list.
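    The three steps above can be sketched in host code. This is a minimal sketch, assuming a valid context and device are already in hand; error checking is reduced to printing the build log, which is how clBuildProgram compiler errors are usually inspected (via clGetProgramBuildInfo). It requires the OpenCL headers and library and is not runnable stand-alone:

```c
/* Sketch: create and build a program object.
   Assumes `context`, `device`, and `source` (a NUL-terminated
   OpenCL C string) already exist. */
#include <CL/cl.h>
#include <stdio.h>

cl_program build_program(cl_context context, cl_device_id device,
                         const char *source)
{
    cl_int err;
    cl_program program =
        clCreateProgramWithSource(context, 1, &source, NULL, &err);

    err = clBuildProgram(program, 1, &device, NULL, NULL, NULL);
    if (err != CL_SUCCESS) {
        /* Compiler errors are reported through the build log. */
        char log[4096];
        clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG,
                              sizeof(log), log, NULL);
        fprintf(stderr, "Build failed:\n%s\n", log);
    }
    return program;
}
```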




    The OpenCL Kernel Execution

    First Step : Obtain a cl_kernel Object :

    To execute a kernel on a device, a cl_kernel object is needed; it is extracted from the cl_program. The name of the kernel that the program exports is used to request it from the compiled program object: the kernel name is passed to clCreateKernel(), along with the program object, and the kernel object will be returned if the program object is valid and the particular kernel name is found.

    Second Step : Dispatching the Kernel through an Enqueue Function :
    Before dispatch, the kernel arguments and the information on how they are to be transferred to the device must be set. Executing a kernel then requires dispatching it through an enqueue function. Because kernel arguments are persistent, each kernel argument is specified individually using the function clSetKernelArg(). This function takes a kernel object, an index specifying the argument number, the size of the argument, and a pointer to the argument. When the kernel is executed, this information is used to transfer the arguments to the device.

    After any required memory objects are transferred to the device and the kernel arguments are set, the kernel is ready to be executed. Requesting that a device begin executing a kernel is done with a call to clEnqueueNDRangeKernel().

      cl_int     clEnqueueNDRangeKernel
      (
                            cl_command_queue command_queue,
                            cl_kernel kernel,
                            cl_uint work_dim,
                            const size_t *global_work_offset,
                            const size_t *global_work_size,
                            const size_t *local_work_size,
                            cl_uint num_events_in_wait_list,
                            const cl_event *event_wait_list,
                            cl_event *event)

    The clEnqueueNDRangeKernel() call is asynchronous: it returns immediately, and the kernel begins execution only after all the events in its wait list have completed.
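    The two steps, setting arguments and dispatching, look like this in host code. This is a sketch that assumes a valid queue, kernel and cl_mem buffer already exist; the work sizes are illustrative, and an event is used to wait for completion:

```c
/* Sketch: dispatch a 1-D kernel and wait for it to finish.
   Assumes `queue`, `kernel` and `buffer` are valid objects. */
#include <CL/cl.h>

void run_kernel(cl_command_queue queue, cl_kernel kernel, cl_mem buffer)
{
    size_t global_size = 1024;  /* total work-items (illustrative)      */
    size_t local_size  = 64;    /* work-items per work-group            */
    cl_event done;

    /* Each argument is set individually by index. */
    clSetKernelArg(kernel, 0, sizeof(cl_mem), &buffer);

    /* Asynchronous: returns immediately; `done` tracks completion. */
    clEnqueueNDRangeKernel(queue, kernel, 1, NULL,
                           &global_size, &local_size,
                           0, NULL, &done);
    clWaitForEvents(1, &done);
    clReleaseEvent(done);
}
```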


    The OpenCL Platform Layer APIs

    • The OpenCL platform layer implements platform-specific features that allow applications to query OpenCL device configuration information and to create OpenCL contexts using one or more devices.

    • Querying Platform Info

    • The function

      cl_int     clGetPlatformInfo
      (
      cl_platform_id platform,
      cl_platform_info param_name,
      size_t param_value_size,
      void *param_value,
      size_t *param_value_size_ret
      )
      gets specific information about the OpenCL platform.

    • Querying Devices

    • The list of devices available on a platform can be obtained using the following function

      cl_int     clGetDeviceIDs
      (
      cl_platform_id platform,
      cl_device_type device_type,
      cl_uint num_entries,
      cl_device_id *devices,
      cl_uint *num_devices
      )
      which gets specific information about the list of devices available.


    • Contexts

      The function

      cl_context     clCreateContext
      (
      const cl_context_properties *properties,
      cl_uint num_devices,
      const cl_device_id *devices,
      void (*pfn_notify) (const char *errinfo,
           const void *private_info,
           size_t cb, void *user_data),
      void *user_data,
      cl_int *errcode_ret
      )

      creates an OpenCL context. An OpenCL context is created with one or more devices. Contexts are used by the OpenCL runtime for managing objects such as command-queues, memory, program and kernel objects, and for executing kernels on one or more devices specified in the context.

    • Contexts

      The function

      cl_context     clCreateContextFromType
      (
      const cl_context_properties *properties,
      cl_device_type device_type,
      void (*pfn_notify) (const char *errinfo, const void *private_info,
              size_t cb, void *user_data),
      void *user_data,
      cl_int *errcode_ret
      )

      creates an OpenCL context from a device type that identifies the specific device(s) to use. properties specifies a list of context property names and their corresponding values; each property name is immediately followed by the corresponding desired value.




    The OpenCL Runtime APIs

    • OpenCL provides API calls that manage OpenCL objects such as command-queues, memory objects, program objects and kernel objects for __kernel functions in a program, and calls that allow you to enqueue commands to a command-queue, such as executing a kernel, or reading and writing a memory object.

    • Command Queues

      The function

      cl_command_queue     clCreateCommandQueue
      (
      cl_context context,
      cl_device_id device,
      cl_command_queue_properties properties,
      cl_int *errcode_ret
      )

      creates a command-queue on a specific device.

    • Memory Objects

      Memory objects are categorized into two types : buffer objects and image objects. A buffer object stores a one-dimensional collection of elements, whereas an image object is used to store a two- or three-dimensional texture, frame-buffer or image.

      A buffer object is created using the following function

      cl_mem     clCreateBuffer
      (
      cl_context context,
      cl_mem_flags flags,
      size_t size,
      void *host_ptr,
      cl_int *errcode_ret
      )

      context is a valid OpenCL context used to create the buffer object.
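      A typical call looks like the sketch below: it copies a host array into a new read-only device buffer at creation time. The flag choice and names are illustrative, and the code assumes `context` is a valid OpenCL context:

```c
/* Sketch: allocate a device buffer initialised from host data.
   CL_MEM_COPY_HOST_PTR copies host_data into the buffer at creation. */
#include <CL/cl.h>

cl_mem make_input_buffer(cl_context context, const float *host_data,
                         size_t n_elems)
{
    cl_int err;
    cl_mem buf = clCreateBuffer(context,
                                CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
                                n_elems * sizeof(float),
                                (void *)host_data, &err);
    return buf;
}
```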

    • Reading, Writing and Copying Buffer Objects

      The following functions enqueue commands to read from a buffer object to host memory, or write to a buffer object from host memory.

      cl_int     clEnqueueReadBuffer
      (
      cl_command_queue command_queue,
      cl_mem buffer,
      cl_bool blocking_read,
      size_t offset,
      size_t cb,
      void *ptr,
      cl_uint num_events_in_wait_list,
      const cl_event *event_wait_list,
      cl_event *event
      )


      cl_int     clEnqueueWriteBuffer
      (
      cl_command_queue command_queue,
      cl_mem buffer,
      cl_bool blocking_write,
      size_t offset,
      size_t cb,
      const void *ptr,
      cl_uint num_events_in_wait_list,
      const cl_event *event_wait_list,
      cl_event *event
      )


      cl_int     clEnqueueCopyBuffer
      (
      cl_command_queue command_queue,
      cl_mem src_buffer,
      cl_mem dst_buffer,
      size_t src_offset,
      size_t dst_offset,
      size_t cb,
      cl_uint num_events_in_wait_list,
      const cl_event *event_wait_list,
      cl_event *event
      )
    • Retaining and Releasing Memory Objects

      The function

      cl_int     clRetainMemObject
      (
      cl_mem memobj
      )

      increments the memobj reference count. clRetainMemObject returns CL_SUCCESS if the function is executed successfully. It returns CL_INVALID_MEM_OBJECT if memobj is not a valid memory object.

    • Creating Image Objects


      A 2D image object is created using the following function

      cl_mem     clCreateImage2D
      (
      cl_context context,
      cl_mem_flags flags,
      const cl_image_format *image_format,
      size_t image_width,
      size_t image_height,
      size_t image_row_pitch,
      void *host_ptr,
      cl_int *errcode_ret
      )

      A 3D image object is created using the following function

      cl_mem     clCreateImage3D
      (
      cl_context context,
      cl_mem_flags flags,
      const cl_image_format *image_format,
      size_t image_width,
      size_t image_height,
      size_t image_depth,
      size_t image_row_pitch,
      size_t image_slice_pitch,
      void *host_ptr,
      cl_int *errcode_ret
      )
    • Querying List of Supported Image Formats


      The function

      cl_int     clGetSupportedImageFormats
      (
      cl_context context,
      cl_mem_flags flags,
      cl_mem_object_type image_type,
      cl_uint num_entries,
      cl_image_format *image_formats,
      cl_uint *num_image_formats
      )

      can be used to get the list of image formats supported by an OpenCL implementation when the following information about an image memory object is specified.

    • Context
    • Image type - 2D or 3D image
    • Image object allocation information
    • Reading, Writing and Copying Image Objects


      The following functions enqueue commands to read from a 2D or 3D image object to host memory, or write to a 2D or 3D image object from host memory.

      cl_int     clEnqueueReadImage
      (
      cl_command_queue command_queue,
      cl_mem image,
      cl_bool blocking_read,
      const size_t origin[3],
      const size_t region[3],
      size_t row_pitch,
      size_t slice_pitch,
      void *ptr,
      cl_uint num_events_in_wait_list,
      const cl_event *event_wait_list,
      cl_event *event
      )


      cl_int     clEnqueueWriteImage
      (
      cl_command_queue command_queue,
      cl_mem image,
      cl_bool blocking_write,
      const size_t origin[3],
      const size_t region[3],
      size_t input_row_pitch,
      size_t input_slice_pitch,
      const void *ptr,
      cl_uint num_events_in_wait_list,
      const cl_event *event_wait_list,
      cl_event *event
      )

      The function

      cl_int     clEnqueueCopyImage
      (
      cl_command_queue command_queue,
      cl_mem src_image,
      cl_mem dst_image,
      const size_t src_origin[3],
      const size_t dst_origin[3],
      const size_t region[3],
      cl_uint num_events_in_wait_list,
      const cl_event *event_wait_list,
      cl_event *event
      )

      enqueues a command to copy image objects. src_image and dst_image can be 2D or 3D image objects, allowing us to perform the following actions.

    • Copy a 2D image object to a 2D image object
    • Copy a 2D image object to a 2D slice of a 3D image Object
    • Copy a 2D slice of a 3D image object to a 2D image object
    • Copy a 3D image object to a 3D image object.
    • Copying between Image and Buffer Objects

      The function

      cl_int     clEnqueueCopyImageToBuffer
      (
      cl_command_queue command_queue,
      cl_mem src_image,
      cl_mem dst_buffer,
      const size_t src_origin[3],
      const size_t region[3],
      size_t dst_offset,
      cl_uint num_events_in_wait_list,
      const cl_event *event_wait_list,
      cl_event *event
      )

      enqueues a command to copy an image object to a buffer object.

      cl_int     clEnqueueCopyBufferToImage
      (
      cl_command_queue command_queue,
      cl_mem src_buffer,
      cl_mem dst_image,
      size_t src_offset,
      const size_t dst_origin[3],
      const size_t region[3],
      cl_uint num_events_in_wait_list,
      const cl_event *event_wait_list,
      cl_event *event
      )

      enqueues a command to copy a buffer object to an image object.


    • Mapping and Unmapping Memory Objects


      The function

      void *     clEnqueueMapBuffer
      (
      cl_command_queue command_queue,
      cl_mem buffer,
      cl_bool blocking_map,
      cl_map_flags map_flags,
      size_t offset,
      size_t cb,
      cl_uint num_events_in_wait_list,
      const cl_event *event_wait_list,
      cl_event *event,
      cl_int *errcode_ret
      )

      enqueues a command to map a region of the buffer object given by buffer into the host address space and returns a pointer to this mapped region.


      The function

      void *     clEnqueueMapImage
      (
      cl_command_queue command_queue,
      cl_mem image,
      cl_bool blocking_map,
      cl_map_flags map_flags,
      const size_t origin[3],
      const size_t region[3],
      size_t *image_row_pitch,
      size_t *image_slice_pitch,
      cl_uint num_events_in_wait_list,
      const cl_event *event_wait_list,
      cl_event *event,
      cl_int *errcode_ret
      )

      enqueues a command to map a region of the image object given by image into the host address space and returns a pointer to this mapped region.

    • Memory Objects Queries


      To get information that is common to all memory objects (buffer and image objects), use the following function

      cl_int     clGetMemObjectInfo
      (
      cl_mem memobj,
      cl_mem_info param_name,
      size_t param_value_size,
      void *param_value,
      size_t *param_value_size_ret
      )

      memobj specifies the memory object being queried.

      cl_int     clGetImageInfo
      (
      cl_mem memobj,
      cl_image_info param_name,
      size_t param_value_size,
      void *param_value,
      size_t *param_value_size_ret
      )

      The above function gets information specific to an image object created with clCreateImage{2D|3D}.

    • Sampler Objects


      A sampler object describes how to sample an image when the image is read in the kernel. The following OpenCL functions are used to create and query sampler objects.

      cl_sampler     clCreateSampler
      (
      cl_context context,
      cl_bool normalized_coords,
      cl_addressing_mode addressing_mode,
      cl_filter_mode filter_mode,
      cl_int *errcode_ret
      )

      The above function creates a sampler object.

      cl_int     clGetSamplerInfo
      (
      cl_sampler sampler,
      cl_sampler_info param_name,
      size_t param_value_size,
      void *param_value,
      size_t *param_value_size_ret
      )

      The above function gets information specific to a sampler object created with clCreateSampler.

    • Program Objects


      An OpenCL program consists of a set of kernels that are identified as functions declared with the __kernel qualifier in the program source. OpenCL programs may also contain auxiliary functions and constant data that can be used by __kernel functions.

      Creating Program Objects

      The function

      cl_program     clCreateProgramWithSource
      (
      cl_context context,
      cl_uint count,
      const char **strings,
      const size_t *lengths,
      cl_int *errcode_ret
      )

      The above function creates a program object for a context, and loads the source code specified by the text strings in the strings array into the program object. The devices associated with the program object are the devices associated with the context.

      cl_program     clCreateProgramWithBinary
      (
      cl_context context,
      cl_uint num_devices,
      const cl_device_id *device_list,
      const size_t *lengths,
      const unsigned char **binaries,
      cl_int *binary_status,
      cl_int *errcode_ret
      )

      The above function creates a program object for a context, and loads the binary bits specified by binary into the program object.

      Building Program Executables

      The function

      cl_int     clBuildProgram
      (
      cl_program program,
      cl_uint num_devices,
      const cl_device_id *device_list,
      const char *options,
      void (*pfn_notify) (cl_program program, void *user_data),
      void *user_data
      )

      builds (compiles & links) a program executable from the program source or binary for all the devices or a specific device(s) in the OpenCL context associated with program.

      Building Options

      The build options are categorized as pre-processor options, options for math intrinsics, options that control optimization and miscellaneous options.

    • Kernel Objects


      A kernel is a function declared in a program. A kernel is identified by the __kernel qualifier applied to any function in a program. A kernel object encapsulates the specific __kernel function declared in a program and the argument values to be used when executing this __kernel function.

      Creating Kernel Objects

      To create a kernel object, use the function

      cl_kernel     clCreateKernel
      (
      cl_program program,
      const char *kernel_name,
      cl_int *errcode_ret
      )


    • To create kernel objects for all kernel functions in the program, use the function

      cl_int     clCreateKernelsInProgram
      (
      cl_program program,
      cl_uint num_kernels,
      cl_kernel *kernels,
      cl_uint *num_kernels_ret
      )


      Setting Kernel Arguments

      To execute a kernel, the kernel arguments must be set. The function

      cl_int     clSetKernelArg
      (
      cl_kernel kernel,
      cl_uint arg_index,
      size_t arg_size,
      const void *arg_value
      )


      is used to set the argument value for a specific argument of a kernel.

    • Executing Kernels


      The function

      cl_int     clEnqueueNDRangeKernel
      (
      cl_command_queue command_queue ,
      cl_kernel kernel,
      cl_uint work_dim ,
      const size_t *global_work_offset,
      const size_t *global_work_size,
      const size_t *local_work_size,
      cl_uint num_events_in_wait_list,
      const cl_event *event_wait_list,
      cl_event *event
      )


      enqueues a command to execute a kernel on a device.



      The function

      cl_int     clEnqueueTask
      (
      cl_command_queue command_queue ,
      cl_kernel kernel,
      cl_uint num_events_in_wait_list,
      const cl_event *event_wait_list,
      cl_event *event
      )


      enqueues a command to execute a kernel on a device using a single work-item.


      The function

      cl_int     clEnqueueNativeKernel
      (
      cl_command_queue command_queue,
      void (*user_func) (void *),
      void *args,
      size_t cb_args,
      cl_uint num_mem_objects,
      const cl_mem *mem_list,
      const void **args_mem_loc,
      cl_uint num_events_in_wait_list,
      const cl_event *event_wait_list,
      cl_event *event
      )

      enqueues a command to execute a native C/C++ function not compiled using the OpenCL Compiler.


      Event objects can be used to refer to a kernel execution command (clEnqueueNDRangeKernel, clEnqueueTask, clEnqueueNativeKernel) or to read, write, map and copy commands on memory objects (clEnqueue{Read|Write|Map}{Buffer|Image}, clEnqueueCopy{Buffer|Image}, clEnqueueCopyBufferToImage, or clEnqueueCopyImageToBuffer).

      An event object can be used to track the execution of a command.

      The function

      cl_int     clWaitForEvents
      (
      cl_uint num_events,
      const cl_event *event_list
      )

      waits on the host thread for commands identified by event objects in event_list to complete.

      The function

      cl_int     clGetEventInfo
      (
      cl_event event,
      cl_event_info param_name,
      size_t param_value_size,
      void *param_value,
      size_t *param_value_size_ret
      )

      returns information about the event object.

    • Out-of-Order Execution of Kernels and Memory Object Commands


      The OpenCL functions that are submitted to a command-queue are queued in the order the calls are made, but can be configured to execute in-order or out-of-order. The properties argument of clCreateCommandQueue can be used to specify the execution order.

    • Profiling Operations on Memory Objects and Kernels

      OpenCL supports the profiling of functions that are enqueued as commands to a command-queue. The enqueued commands are identified by unique event objects, which can be used to capture profiling information that measures the execution time of a command.

    • The function

      cl_int     clFlush
      (
      cl_command_queue command_queue
      )

      issues all previously queued OpenCL commands in command_queue to all devices associated with command_queue.




    An Overview of Basic Programming Steps :

    Given below are the basic programming steps required for a minimal OpenCL program. Most test programs require similar steps; the steps below do not include error checks.

    1. Query OpenCL Platform : The host program must select an OpenCL platform, which is an abstraction for a given OpenCL implementation. The developer can use the clGetPlatformIDs(..) API to get the available OpenCL platforms.

    2. Query OpenCL Device : A device id for an OpenCL device is requested. The developer can use the clGetDeviceIDs(..) API to find an OpenCL device. A CPU device could be requested by using CL_DEVICE_TYPE_CPU instead. The device can be a physical device, such as a given GPU.

    3. Create Context : On the selected device, an OpenCL context is created. The developer can use clCreateContext(..) or a related API to create a context. A context ties together devices and the memory buffers related to those devices.

    4. Create Command Queue : After creating the OpenCL context, create a command queue. A command queue can be created using the clCreateCommandQueue(..) API. The command queue issues commands to a specific compute device and ensures that a set of operations occurs in a particular order.

    5. Create Program Object : Before an OpenCL kernel can be launched, its program source is compiled and a handle to the kernel is created. Create a program object directly from the source code of an OpenCL program and compile it at application runtime. Alternatively, a program object can be created from a previously built binary to avoid compilation at runtime. To build a program object from source, use clCreateProgramWithSource(..); to build a program object from a binary, use clCreateProgramWithBinary(..).

    6. Build Program Executable : After creating a program object using either clCreateProgramWithSource(..) or clCreateProgramWithBinary(..), the developer must build a program executable from the contents of that program object. Building the program compiles any source code that is in the program object and links the resultant machine code into an executable program. Use the clBuildProgram(..) API for that.

    7. Create Kernel Object : A kernel object encapsulates the specific kernel function declared in a program and can also encapsulate the argument values to use when executing this kernel. The developer can use the clCreateKernel(..) API to create a single kernel object, or call the clCreateKernelsInProgram(..) API to create kernel objects for all the kernels in the OpenCL program.

    8. Create Memory Object : Memory objects are reserved regions of global device memory that can serve as containers for your data. Memory buffers can be allocated on the device as per program requirements. The developer can use the clCreateBuffer(..) API to create a data buffer.

    9. Launch / Execute Kernel : The developer can use the clEnqueueNDRangeKernel(..) API to enqueue a command to execute a kernel on a device.

    10. Retrieve the Results : After the kernel has completed execution, read the data from the device back to the host, where it is accessible to the host application. To read the data, call clEnqueueReadBuffer(..) or another related API.

    11. Clean Up : After the host application no longer requires the various objects associated with the OpenCL runtime and context, it should free these resources. The developer can use the following APIs to release OpenCL objects:
    clReleaseMemObject(...)
    clReleaseKernel(...)
    clReleaseProgram(...)
    clReleaseCommandQueue(...)
    clReleaseContext(...)
    or other related APIs.
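    The basic steps above can be condensed into a single hedged sketch: a vector-add host program with error checks omitted. The kernel source and all names are illustrative, and the code assumes a working OpenCL installation with at least one platform and device:

```c
/* Sketch: minimal end-to-end OpenCL host program (no error checks). */
#include <CL/cl.h>

static const char *src =
    "__kernel void vadd(__global const float *a,"
    "                   __global const float *b,"
    "                   __global float *c) {"
    "    int i = get_global_id(0);"
    "    c[i] = a[i] + b[i];"
    "}";

int main(void)
{
    enum { N = 1024 };
    float a[N], b[N], c[N];
    for (int i = 0; i < N; ++i) { a[i] = (float)i; b[i] = (float)(2 * i); }

    cl_platform_id platform;                                            /* steps 1-2 */
    cl_device_id device;
    clGetPlatformIDs(1, &platform, NULL);
    clGetDeviceIDs(platform, CL_DEVICE_TYPE_DEFAULT, 1, &device, NULL);

    cl_context ctx = clCreateContext(NULL, 1, &device,
                                     NULL, NULL, NULL);                 /* step 3 */
    cl_command_queue q = clCreateCommandQueue(ctx, device, 0, NULL);    /* step 4 */

    cl_program prog =
        clCreateProgramWithSource(ctx, 1, &src, NULL, NULL);            /* step 5 */
    clBuildProgram(prog, 1, &device, NULL, NULL, NULL);                 /* step 6 */
    cl_kernel k = clCreateKernel(prog, "vadd", NULL);                   /* step 7 */

    cl_mem da = clCreateBuffer(ctx, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
                               sizeof a, a, NULL);                      /* step 8 */
    cl_mem db = clCreateBuffer(ctx, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
                               sizeof b, b, NULL);
    cl_mem dc = clCreateBuffer(ctx, CL_MEM_WRITE_ONLY, sizeof c, NULL, NULL);

    clSetKernelArg(k, 0, sizeof da, &da);
    clSetKernelArg(k, 1, sizeof db, &db);
    clSetKernelArg(k, 2, sizeof dc, &dc);

    size_t global = N;
    clEnqueueNDRangeKernel(q, k, 1, NULL, &global,
                           NULL, 0, NULL, NULL);                        /* step 9 */
    clEnqueueReadBuffer(q, dc, CL_TRUE, 0, sizeof c, c,
                        0, NULL, NULL);                                 /* step 10 */

    clReleaseMemObject(da); clReleaseMemObject(db);                     /* step 11 */
    clReleaseMemObject(dc);
    clReleaseKernel(k); clReleaseProgram(prog);
    clReleaseCommandQueue(q); clReleaseContext(ctx);
    return 0;
}
```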






    Build application - Steps

    • First, query the runtime to determine which platforms are present. Any number of different OpenCL implementations can be installed on a single system.

    • Create a context (The OpenCL Context has associated with it a number of compute devices such as CPU or GPU devices)

      Within a context, OpenCL guarantees a relaxed consistency between these devices. This means that memory objects, such as buffers or images, are allocated per context, but changes made by one device are only guaranteed to be visible to another device at well-defined synchronization points.

    • OpenCL provides events, with the ability to synchronize on a given event to enforce the correct order of execution.

    • Many operations are performed with respect to a given context, but there are also many operations that are specific to a device. For example, program compilation and kernel execution are done on a per-device basis.

    • Performing work with a device, such as executing kernels or moving data to and from the device's local memory, is done using a corresponding command queue.

    • A command queue is associated with a single device and a given context: all work for a specific device is done through this interface. Note that while a single command queue can be associated with only a single device, a single device may have multiple command queues. For example, it is possible to have one command queue for executing kernels and another command queue for managing data transfers between the host and the device.

    Most OpenCL programs follow the same pattern: given a specific platform, select a device or devices, create a context, allocate memory, create device-specific command queues, and perform data transfers and computations.

    Generally, the platform is the gateway to accessing specific devices; given these devices and a corresponding context, the application is independent of the platform. Given a context, the application can:

    • Create command queues

    • Create programs to run on one or more associated devices

    • Create kernels within those programs

    • Allocate memory buffers or images, either on the host or on the device(s) (memory can be copied between the host and device)

    • Write data to the device

    • Submit the kernel (with appropriate arguments) to the command queue for execution.

    • Read data back to the host from the device.

    • The relationship between context(s), device(s), buffer(s), program(s), kernel(s), and command queue(s) is best seen by looking at simple code.




    The OpenCL Framework
    • The OpenCL framework allows applications to use the host and one or more OpenCL devices as a single heterogeneous parallel computer system. The framework contains the following components: the OpenCL platform layer, the OpenCL runtime, and the OpenCL compiler.

    The OpenCL Platform Layer

    • The OpenCL platform layer implements platform-specific features that allow applications to query OpenCL device configuration information and to create OpenCL contexts using one or more devices.

    • Querying Platform Info
    • Querying Devices
    • Contexts

    The OpenCL Runtime

    • Command Queues
    • Memory Objects
      - Creating Buffer Objects,
      - Reading, Writing and copying Buffer Objects
      - Retaining and Releasing Memory Objects
      - Creating Image Objects,
      - Querying List of Supported Image formats,
      - Reading, Writing and Copying Image objects
      - Copying between Image and Buffer Objects
      - Mapping and Unmapping Memory Objects
      - Memory Object Queries
    • Sampler Objects
    • Program Objects
      - Creating Program Objects
      - Building Program Executables
    • Build Options
      - Options (Preprocessor, Math Intrinsic, Optimization)
      - Unloading the OpenCL compiler
      - Program Object Queries
    • Kernel Objects
      - Creating Kernel Objects
      - Setting Kernel Arguments
      - Kernel Object Queries
    • Executing Kernels
    • Event Objects
    • Profiling Operations on Memory Objects and Kernels
    • Flush and Finish

    The OpenCL Compilation




    The OpenCL Compilation :

    The compiler tool-chain provides a common framework for both CPUs & GPUs, sharing the front-end and some high-level compiler transformations. The back-ends are optimized for the device type (CPU or GPU). Most of the application remains the same, but OpenCL APIs are included at various parts of the code. The kernels are compiled by the OpenCL compiler to either CPU binaries or GPU binaries, depending on the target device.

    • CPU Processing : For CPU processing, the OpenCL runtime uses the LLVM (Low-Level Virtual Machine) framework to generate x86 binaries. The OpenCL runtime automatically determines the number of processing elements, or cores, present in the CPU and distributes the OpenCL kernel between them.

    • GPU Processing : For GPU processing, the OpenCL runtime layer generates GPU-specific binaries: CAL-based binaries for AMD/ATI GPUs, or GPU binaries for CUDA-enabled NVIDIA architectures.

    Compilation Program

    An OpenCL application consists of a host program (C/C++) and an optional kernel program (.cl). To compile an OpenCL application, the host program must be compiled; this can be done with an off-the-shelf compiler such as g++ or MSVC++. The application kernels are compiled into device-specific binaries using the OpenCL compiler. The compiler uses a standard C front-end as well as the LLVM framework, with extensions for OpenCL.

    Compiling OpenCL applications on Windows requires that Visual Studio 2008 Professional Edition or the Intel C compiler is installed, and all C++ files must be compiled with appropriate settings. Compiling OpenCL applications on Linux requires that gcc or the Intel C compiler is installed, and all C++ files must be compiled with appropriate settings on 32-bit / 64-bit systems.

    The OpenCL library and runtime environment depend upon the target GPU (i.e., a CUDA-enabled NVIDIA GPU or the AMD ATI Stream SDK).

    For more control over the process of compiling and linking OpenCL programs, you should use a Makefile. The user has to specify the name of the program and the correct platform available on the system in the Makefile. To compile an OpenCL program, type the command,

    make

    The appropriate OpenCL program will be compiled and the executable is installed in the "/bin" directory.
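    As a sketch, such a Makefile might look like the following. The program name, kernel file, and the OpenCL include/library paths here are assumptions and must be adapted to the SDK actually installed on the local system:

    ```makefile
    # Hypothetical Makefile sketch for an OpenCL host program.
    # Adjust OPENCL_INC / OPENCL_LIB to match the installed SDK
    # (e.g. AMD APP SDK or NVIDIA CUDA toolkit paths).
    CXX        = g++
    OPENCL_INC = /usr/local/include
    OPENCL_LIB = /usr/local/lib
    CXXFLAGS   = -O2 -I$(OPENCL_INC)
    LDFLAGS    = -L$(OPENCL_LIB) -lOpenCL
    TARGET     = bin/PrefixSum

    $(TARGET): PrefixSum.cpp
		mkdir -p bin
		$(CXX) $(CXXFLAGS) -o $@ $< $(LDFLAGS)

    clean:
		rm -f $(TARGET)
    ```

    The kernel file (.cl) needs no rule of its own, since it is compiled at runtime by the OpenCL compiler.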

    Running Program

    Once an OpenCL application is compiled on the target system, the runtime system assigns the work in the command queues to the underlying devices. Commands are placed into the queue using the clEnqueue commands shown below. The commands can be broadly classified into three categories.

    • Kernel commands (for example, clEnqueueNDRangeKernel(), etc.),

    • Memory commands (for example, clEnqueueReadBuffer(), etc.), and

    • Event commands (for example, clEnqueueWaitForEvents(), etc.).

    An OpenCL application can create multiple command queues; for details, please refer to the OpenCL specification, the OpenCL Programming Guide for the CUDA Architecture, or the AMD ATI Stream Computing OpenCL Programming Guide.


    To execute an OpenCL program, change to the "/bin" directory and give the name of the executable at the command prompt.

    $ cd ./bin
    $ ./<Name of the Executable>
    Example :
    $ ./PrefixSum


    Read the instructions displayed on screen to execute the programs.

    NOTE : For compilation & execution of OpenCL programs, the OpenCL driver should be installed properly on the system, and the OpenCL library path should be included in the environment variable "LD_LIBRARY_PATH".
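    For example, on a typical Linux installation the library path can be added as below. The directory shown is an assumption (it varies by SDK and architecture); replace it with the directory that actually contains libOpenCL.so on your system:

    ```shell
    # Hypothetical library location -- replace with the actual
    # directory containing libOpenCL.so on your system.
    export LD_LIBRARY_PATH=/opt/AMDAPP/lib/x86_64:$LD_LIBRARY_PATH
    echo "$LD_LIBRARY_PATH"
    ```

    Adding this line to ~/.bashrc makes the setting persistent across sessions.
    
    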



    Simple Vector Vector Addition Program

    The simple vector-vector addition example demonstrates the simplest approach for an OpenCL implementation, in which an appropriate platform is chosen. The main() function is reproduced along with the kernel source code. The main() function of the vector-vector addition example either implements or calls the functions that perform the following operations.

    • Create an OpenCL context on the first available device

    • Create a command-queue on the first available device

    • Load a kernel file ( vect-vect-addition.cl ) and build it into a program object

    • Create a kernel object for the kernel function vect-vect-addition()

    • Queue the kernel for execution

    • Read the results of the kernel back into the result buffer




    Example Program : Vector-Vector Addition OpenCL Kernel & Main Function

     
     VectVectAddition.cl :
        __kernel void VectVectAddition_kernel( __global const float *a,
                                              __global const float *b,
                                              __global float *result)

     {
              int gid = get_global_id(0);
              result[gid] = a[gid] + b[gid];
     }

     //
     // Cleanup any created OpenCL resources
     //
      void Cleanup( cl_context context,
                    cl_command_queue commandQueue,
                    cl_program program, cl_kernel kernel,
                    cl_mem memObjects[3] )
      {

      // Release only the three memory objects that were created
      for(int i = 0; i < 3; i++) {
            if (memObjects[i] != 0)
                   clReleaseMemObject(memObjects[i]);
      }

            if (commandQueue != 0)
                   clReleaseCommandQueue(commandQueue);

            if (kernel != 0)
                   clReleaseKernel(kernel);

            if (program != 0)
                   clReleaseProgram(program);

            if (context != 0)
                   clReleaseContext(context);

      }

      VectVectAddition.cpp:


        int main(int argc, char** argv) {
      cl_context context = 0;
      cl_command_queue commandQueue = 0;
      cl_program program = 0;
      cl_device_id device = 0;
      cl_kernel kernel = 0;
      cl_mem memObjects[3] = { 0, 0, 0 };
      cl_int errNum;

      // Create an OpenCL context on first available platform
      context = CreateContext();
      if(context == NULL)
      {
            cerr << "Failed to create OpenCL context." << endl;
            return 1;
      }

      // Create a command-queue on the first device available
      // on the created context
      commandQueue = CreateCommandQueue(context, &device);
      if(commandQueue == NULL)
      {
            Cleanup(context, commandQueue, program, kernel,
                        memObjects);
            return 1;
      }

      // Create OpenCL program from VectVectAddition.cl kernel source
      program =   CreateProgram(context, device, "VectVectAddition.cl");
      if (program == NULL)
      {
            Cleanup(context, commandQueue, program, kernel,
                        memObjects);
            return 1;
      }

      // Create OpenCL kernel
      kernel = clCreateKernel(program, "VectVectAddition_kernel", NULL);
      if(kernel == NULL)
      {
            cerr << "Failed to create kernel." << endl;
            return 1;
      }

      // Create memory objects that will be used as arguments to
      // kernel. First create host memory arrays that will be
      // used to store the arguments to the kernel
      float result[ARRAY_SIZE];
      float a[ARRAY_SIZE];
      float b[ARRAY_SIZE];
      //Fill Arrays 'a' and 'b' on the host
      for(int i = 0; i < ARRAY_SIZE; i++) {
            a[i] = i;
            b[i] = i+2;
      }


      // Copy the arrays into memory objects that will be passed
      // to this kernel
      if (!CreateMemObjects(context, memObjects, a, b) )
      {
            Cleanup(context, commandQueue, program, kernel,
                        memObjects);
            return 1;
      }

      // Set the kernel arguments (result, a, b)
      errNum = clSetKernelArg(kernel, 0,
                                          sizeof ( cl_mem ), &memObjects[0]);
      errNum |= clSetKernelArg(kernel, 1,
                                          sizeof ( cl_mem ), &memObjects[1]);
      errNum |= clSetKernelArg(kernel, 2,
                                          sizeof ( cl_mem ), &memObjects[2]);
      if(errNum != CL_SUCCESS)
      {
            cerr << "Error setting kernel arguments." << endl;
            Cleanup(context, commandQueue, program, kernel,
                        memObjects);
            return 1;
      }

      size_t globalWorkSize[1] = { ARRAY_SIZE };
      size_t localWorkSize[1] = { 1 };

      // Queue the kernel up for execution across the array
      errNum = clEnqueueNDRangeKernel(commandQueue, kernel, 1, NULL,
                                          globalWorkSize, localWorkSize,
                                          0, NULL, NULL);
      if(errNum != CL_SUCCESS)
      {
            cerr << "Error queuing kernel for execution." << endl;
            Cleanup(context, commandQueue, program, kernel,
                        memObjects);
            return 1;
      }

      // Read the output buffer back to the Host
      errNum = clEnqueueReadBuffer(commandQueue, memObjects[2],
                                          CL_TRUE, 0, ARRAY_SIZE * sizeof(float),
                                          result, 0, NULL, NULL);
      if(errNum != CL_SUCCESS)
      {
            cerr << "Error reading result buffer." << endl;
            Cleanup(context, commandQueue, program, kernel,
                        memObjects);
            return 1;
      }

      // Output the result buffer
      for(int i = 0; i < ARRAY_SIZE; i++) {
            cout << result[i] << " ";
      }

      cout << endl;
      cout << "Executed program successfully." << endl;
            Cleanup(context, commandQueue, program, kernel,
                        memObjects);
            return 0;
      }





    Example : Write an OpenCL program to calculate the multiplication of a scalar value and a vector
    (Download source code - Double Precision- Real : ScalarVectGlobalMemDP_kernel.cl    and    ScalarVectGlobalMemDP.c)

    ( Code html document )    
    • Objective

      Write an OpenCL program to calculate the multiplication of a scalar value and a vector

    • Description

      We create a one-dimensional globalWorkSize array that is overlaid on the vector. The input vectors, using single-precision/double-precision data, are generated on the host CPU and transferred to the device GPU for scalar-vector multiplication. In global memory, a simple kernel based on a one-dimensional index space of work-groups is generated, in which each work-item is given a unique ID within its work-group. The final resultant vector is generated on the device and transferred to the host. This code demonstrates the development of an OpenCL kernel for simple computations.

    • Each work-item performs the multiplication of one element of the vector with the scalar, using its work-item ID.

    • The choice of work-items in the code is given as
      size_t globalWorkSize[3] = {128, 1, 1};
      Using local memory and choosing multiple work-items in the same work-group may increase performance. It is possible to achieve this with an explicit assignment from a global memory pointer to a local memory pointer. The table below indicates the important steps to be performed for execution on the device.
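    The per-work-item computation can be cross-checked against a plain host-side C version. The sketch below mirrors what each work-item does (result[i] = scalar * vec[i]); the array size and scalar value are arbitrary illustration choices, not taken from the course code:

    ```c
    /* Host-side reference for scalar-vector multiplication:
     * result[i] = scalar * vec[i].
     * N and scalar are arbitrary illustration values. */
    #include <stdio.h>

    #define N 8

    int main(void)
    {
        double vec[N], result[N];
        double scalar = 2.5;

        for (int i = 0; i < N; i++)
            vec[i] = (double)i;          /* fill the input on the host */

        for (int i = 0; i < N; i++)      /* one "work-item" per element */
            result[i] = scalar * vec[i];

        for (int i = 0; i < N; i++)      /* prints 0.0 2.5 5.0 ... 17.5 */
            printf("%.1f ", result[i]);
        printf("\n");
        return 0;
    }
    ```

    Comparing such a CPU result with the GPU result is exactly the correctness check listed as step 8 in the table below.
    
    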

    A brief summary of the total operations required for the OpenCL program.
    Steps Description
    1. Memory allocation on host and Input data Generation
    Do memory allocation on the host CPU and fill with single- or double-precision data.

    2. Set OpenCL execution environment :
    Call the function setExeEnv, which sets the execution environment for OpenCL and performs the following :
    - Get Platform Information
    - Get Device Information
    - Create context for GPU-devices to be used
    - Create program object.
    - Build the program executable from the program source.
    The function performs

    (a). Discover & initialise the platforms;
    (b). Discover & initialise the devices;
    (c). Create a context; and
    (d). Create a program object and build the program executable

    3. Create a command queue using clCreateCommandQueue(*) and associate it with the device you want to execute on.

    4. Create device buffers
    using the clCreateBuffer() API that will contain the data from the host buffer.
    5. Write host-CPU data to device buffers
    6. Kernel Launch :

    (a). Create kernel handle;
    (b). Set kernel arguments;
    (c). Configure the work-item structure (define global and local work sizes and launch the kernel for execution on the device GPU); and
    (d). Enqueue the kernel for execution
    7. Read the output buffer back to the host (copy the result from the device GPU to the host CPU)
    Use clEnqueueReadBuffer() API.
    8. Check correctness of result on host-CPU
    Perform computation on host-CPU and compare CPU and GPU results.
    9. Release OpenCL resources (Free the memory)
    Free the memory of arrays allocated on host-CPU & device-GPU


    Kernel Execution on a device : Work-groups & work-items :
    S.No. Description
    1.

    Example : Assume that 1024 elements are taken in each vector. The size can be specified as a one-, two-, or three-dimensional vector. The host code to specify an NDRange for 1024 elements is as follows :

    size_t indexSpaceSize[3] = {1024, 1, 1};

    Most importantly, the work-items of an NDRange are divided into smaller, equal-sized work-groups, as shown in the Figure.

    • An index space with N dimensions requires work-groups to be specified using N dimensions; thus, a three-dimensional index space requires three-dimensional work-groups.
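    The way OpenCL maps a (work-group ID, local ID) pair to a global work-item ID can be sketched in plain host-side C. The helper function global_id below is our own illustration of the rule global_id = group_id * local_size + local_id, not an OpenCL API; the local size of 64 matches the work-group example in the next row:

    ```c
    /* Sketch of the OpenCL global-ID rule:
     * global_id = group_id * local_size + local_id. */
    #include <assert.h>
    #include <stddef.h>

    static size_t global_id(size_t group, size_t local, size_t local_size)
    {
        return group * local_size + local;
    }

    int main(void)
    {
        size_t index_space = 1024;                   /* total work-items */
        size_t local_size  = 64;                     /* per work-group */
        size_t num_groups  = index_space / local_size;

        /* 1024 / 64 = 16 work-groups */
        assert(num_groups == 16);

        /* the last work-item of the last group maps to index 1023 */
        assert(global_id(num_groups - 1, local_size - 1, local_size)
               == index_space - 1);
        return 0;
    }
    ```
    
    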


    2.

    Example :
    Perform Barrier Operations & synchronization

    Work-items within a work-group can perform barrier operations to synchronize, and they have access to a shared memory address space. Because work-group sizes are fixed, this communication does not need to scale and hence does not affect the scalability of a large concurrent dispatch.

    For example 1.3, i.e., Vector Vector Addition, the workgroup can be specified as

    size_t workGroupSize[3] = {64, 1, 1};

    If the total number of work-items per array is 1024, this results in creating 16 work-groups (1024 work-items / 64 work-items per work-group).

    Most importantly, OpenCL requires that the index space sizes are evenly divisible by the work-group sizes in each dimension.

    For hardware efficiency, the workgroup size is usually fixed to a favourable size, and we round up the index space size in each dimension to satisfy this divisibility requirement.

    • In the kernel code, the user can specify that extra work-items in each dimension simply return immediately without outputting any data.

    • For many highly data-parallel computations in which memory access for the arrays drives the computation (for example, vector-vector addition), OpenCL allows the local work-group size to be ignored by the programmer and generated automatically by the implementation; in this case, the developer passes NULL instead.
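    The divisibility requirement above is usually met by rounding the index space size up to the next multiple of the work-group size. The helper roundUp below is our own sketch, not part of the OpenCL API:

    ```c
    /* Round global up to the next multiple of local, so that the
     * index space is evenly divisible by the work-group size.
     * Extra (padded) work-items should simply return from the
     * kernel without writing any output. */
    #include <assert.h>
    #include <stddef.h>

    static size_t roundUp(size_t global, size_t local)
    {
        size_t r = global % local;
        return (r == 0) ? global : global + (local - r);
    }

    int main(void)
    {
        assert(roundUp(1024, 64) == 1024);   /* already divisible */
        assert(roundUp(1000, 64) == 1024);   /* padded up to 1024 */
        return 0;
    }
    ```
    
    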


    OpenCL Code : Multiplication of Scalar with Vector

    The examples illustrate how to use the OpenCL APIs to execute a kernel on a device, and the algorithms that are used in numerical computations. The examples should not be considered examples of how to address performance tuning of OpenCL kernels on target systems. Selected example programs will be made available during the laboratory session.

    Example Program : OpenCL (html)


    The first OpenCL program performs vector-vector multiplication. It illustrates the basic programming steps with the required amount of code. Error checks are incorporated after each OpenCL call. This code can be generalized.


    // Info on Header files / definitions /Error Checks

      #include <CL/cl.h>
      #include <iostream>
      #include <fstream>
      #include <cstdlib>
      #include <cstring>
      #include <cmath>

      using namespace std;

      #define KERNEL_SOURCE_PATH         "VectVectMult_kernel.cl"
      #define GLOBAL_WORK_SIZE 4    // number of global items in work dimension
      #define LOCAL_WORK_SIZE 4    // number of work items per group

    // Error checking after each OpenCL call
      #define STATUSCHKMSG(x)     if(status != CL_SUCCESS) \
              { cout<< "\n Operation is not successful :";   cout<< x<<" \n"; exit(1);}


    // Subroutine to read OpenCL kernel source.

    //
    //
    //
    //
    //
    //
    //
    //
    //

    Read the kernel source code from the specified kernel source file. The kernel source code is compiled dynamically at runtime, built & linked. " KERNEL_SOURCE_PATH " is a user-defined macro, which defines the physical path to the kernel source file. " readKernelSource " is a function that reads the kernel source and puts it into a character string. Before an OpenCL kernel can be launched, its program source is compiled, and a handle to the kernel is created. The string is used to build the kernel using the following APIs.
      clCreateProgramWithSource()
      clBuildProgram()
      clCreateKernel()

     // @param[in,out] path Path to the OpenCL kernel source file
     // @return Returns a character string representing the OpenCL kernel source code


    // Subroutine to read kernel source code into character string
      char* readKernelSource(char* path)
      {
    int srcLen;
    char* sProgramSource;
    ifstream srcFile;
    srcFile.open(path, ifstream::in);

    srcFile.seekg(0,ios::end);
    srcLen = srcFile.tellg();
    srcFile.seekg(0,ios::beg);

    sProgramSource = ( char*) malloc( (srcLen + 1) * sizeof(char));
    srcFile.read(sProgramSource, srcLen);
    sProgramSource[srcLen] = '\0';   // null-terminate for strlen()
    return sProgramSource;
      }     // end of readKernelSource





    // Subroutine to Set OpenCL programming environment and build kernel
    //
    //
    //
    //
    //

    The setExeEnv() function performs OpenCL context and other platform-related setup, such as setting up the device context, platform identification, selection of the target device, calling the "readKernelSource" function to read the kernel source file into a character string, creating the program and building the kernel source code, and binding the kernel binary object with the selected context. Please refer to the setExeEnv() function for more detail.

     // @param[in,out] context Handle to current execution context
     // @param[in,out] deviceListSize Hold device list length.
     // @param[in,out] devices Handle to list of devices.
     // @param[in,out] queue Handle to command queue for the currently used context with a specific device.
     // @param[in,out] hProgram Handle to kernel source program
     // @param[in,out] path Relative path to kernel source code
     // @param[in,out] deviceType Targeted device type.
     // @return On successful execution returns void (nothing)



      // Set UP function to include OpenCL Prog, APIs
      void setExeEnv  ( cl_context *context,
                  size_t *deviceListSize,
                  cl_device_id **devices,
                  cl_command_queue *queue,
                  cl_program *hProgram,
                  char* path,
                  cl_device_type deviceType)
     {
    cl_int status = CL_SUCCESS;

    //
    //
    //
    //

    Querying Platform Info : Obtain the list of platforms available:
    The function clGetPlatformIDs(*,*,*) gives the list of platforms; num_Platforms returns the number of platforms found. If platforms is not NULL, num_platform_entries (the number of cl_platform_id entries that can be added to platforms) must be greater than zero.


       cl_uint num_platform_entries = 10;
       cl_uint num_Platforms;
       cl_platform_id platforms[10];

      status = clGetPlatformIDs(num_platform_entries, platforms,
            &num_Platforms);


       STATUSCHKMSG("clGetPlatformIDs Failed ");

    //
    //

    Querying Platform Info : Gets specific information about the OpenCL platform.
    Platform_Buffer[] is the memory into which the queried information is copied; its size in bytes is passed to the call.

       cl_platform_id platform;
       cout << " Available OpenCL Platforms : \n";

       for( unsigned i = 0; i < num_Platforms; ++i)
       {
    char Platform_Buffer[100];
      status = clGetPlatformInfo( platforms[i], CL_PLATFORM_VENDOR,
    sizeof(Platform_Buffer), Platform_Buffer, NULL);


    STATUSCHKMSG("clGetPlatformInfo Failed");

    if( (!strcmp(Platform_Buffer, "Advanced Micro Devices, Inc.")) || (!strcmp(Platform_Buffer,"NVIDIA Corporation")))
    {
       platform = platforms[i];
       cout << "\t" << i+1 << " ) " << Platform_Buffer;
       break;
       }
       }


    //
    //
    //
    //
    //

    Contexts : creates an OpenCL context from a device type.
    Setting up the context properties;
    The function clCreateContextFromType(*,*,*,*,*) creates an OpenCL context from a device type that identifies the specific device(s) to use. cprops specifies a list of context property names and their corresponding values.

    cl_context_properties cps[3] = { CL_CONTEXT_PLATFORM, (cl_context_properties)platform,0};

    cl_context_properties* cprops = ( NULL == platform ) ? NULL : cps;
      *context = clCreateContextFromType(cprops,deviceType,
    NULL,NULL,&status);

    STATUSCHKMSG(" context ");

    //
    //
    //

    Contexts : get size of device list data
    The function clGetContextInfo (*,*,*,*,*) can be used to query information about a context. context specifies the OpenCL context being queried.

      status = clGetContextInfo(*context, CL_CONTEXT_DEVICES, 0,NULL,
    deviceListSize);

    STATUSCHKMSG(" Device list");

    //

    Pointer to memory which will hold the device info

    (*devices) = (cl_device_id *) malloc ( *deviceListSize);
    if( *devices == NULL ) {
       cout << " Failed to allocate memory ";
       exit(-1);
    }

    //

    Context :get device list info using clGetContextInfo(...)

      status = clGetContextInfo(*context, CL_CONTEXT_DEVICES,
    *deviceListSize, *devices, NULL);
    STATUSCHKMSG("device info");

    //
    //
    //
    //

    Command-queues : used to queue a set of operations
    The function cl_command_queue clCreateCommandQueue creates a command-queue on a specific device. context must be a valid OpenCL context; device must be a device associated with context

      *queue = clCreateCommandQueue( *context, (*devices)[0], 0, &status);
    STATUSCHKMSG("command queue");

    //
    //

    Querying Device Info : Identify the device type.
    The clGetDeviceInfo gets specific information about an OpenCL device.

    size_t retInfoSize;
    cl_device_type inputDeviceType;
    clGetDeviceInfo((*devices)[0], CL_DEVICE_TYPE, sizeof(cl_device_type), &inputDeviceType, &retInfoSize);

    if( inputDeviceType == CL_DEVICE_TYPE_GPU )
        cout<<"\n \n Device Type : CL_DEVICE_TYPE_GPU";
    else if ( inputDeviceType == CL_DEVICE_TYPE_CPU)
        cout<<" \n \n Device Type : CL_DEVICE_TYPE_CPU";

    //
    //
    //
    //
    //
    //
    //
    //
    //

    Program Objects : Create CL program using kernel source.
    The function clCreateProgramWithSource creates a program object for a context and loads the source code specified by the text strings in the strings array into the program object. The devices associated with the program object are the devices associated with context . context must be a valid OpenCL context.
    A program object encapsulates the following information: an associated context; a program source or binary; the latest successfully built program executable; the list of devices for which the program executable is built; the build options used and a build log; and the number of kernel objects currently attached.


    const char* sProgramSource = readKernelSource(path);
    size_t sourceSize[] = { strlen(sProgramSource) };
      *hProgram = clCreateProgramWithSource(*context, 1, &sProgramSource, sourceSize, &status);
    STATUSCHKMSG("create source handle");

    //
    //
    //
    //
    //
    //
    //

    Build Program : Building Program Executables .
    The function clBuildProgram (*,*,*,*,*,*) builds (compiles & links) a program executable from the program source or binary for all the devices or a specific device(s) in the OpenCL context associated with program. OpenCL allows program executables to be built using the source or the binary. clBuildProgram must be called for program created using either clCreateProgramWithSource or clCreateProgramWithBinary to build the program executable for one or more devices associated with program


    status = clBuildProgram( *hProgram,1, (*devices), NULL,NULL,NULL);
    STATUSCHKMSG("build");
    }// end of setExeEnv


    // Subroutine Input Array : Initialize input vectors with random input values

      // @param[in] length Size of the input array
      // @param[out] hInArray Input array, to be filled with random integer values.

    void fillInArray(cl_int *hInArray, size_t length)
    {
       for(size_t count=0; count< length; count++)
       hInArray[count] = rand()%10;
    } // end of fillinArray


    // Subroutine Input program Data : Parse command line argument
    // to get the program input

      // @param[in] argc Number of command line argument.
      // @param[out] argv Command line arguments.
      // @param[in,out] len Pointer to input variable "length of vector"
      // @param[in,out] deviceType Pointer to device type specification.

    void readInput(int argc, char* argv[], size_t *len, cl_device_type *deviceType)
    {

    if( argc != 3 || (!(strcmp("GPU",argv[2]) ^ strcmp("CPU",argv[2]))))
    {
    cout<<"\n Error : Invalid number of input arguments"
        << "\n Syntax : ./VectVectMult <vector length> <CPU|GPU>"
        << "\n Example : ./VectVectMult 8 GPU \n"; exit(-1);
    }
    else
    {
    if(!strcmp("GPU",argv[2])){
    (*deviceType) = CL_DEVICE_TYPE_GPU;
       }
    else
    {
    (*deviceType) = CL_DEVICE_TYPE_CPU;
    }
    (*len) = atoi(argv[1]);
    }
    } // end of readInput


    // Main Program Vector-Vector Multiplication

    int main(int argc, char* argv[])
    {

    cl_device_type deviceType;
    cl_int status = CL_SUCCESS;
    size_t length;

    // CALL function readInput() to get the program input
    readInput(argc,argv,&length, &deviceType); // Variable declaration
    cl_context context;
    size_t deviceListSize;
    cl_device_id* devices;
    cl_command_queue queue;

    // CALL function setExeEnv() to set Prog. Env. and build kernel
    char path[100];
    strcpy(path,KERNEL_SOURCE_PATH);
    cl_program hProgram;
    cout<< "\n---------------------------------------------------\n";
    setExeEnv(&context, &deviceListSize, &devices, &queue, &hProgram, path, deviceType);
    cout<<"\n---------------------------------------------------\n";

    //
    //
    //
    //
    //
    //
    //
    //
    //

    Kernel Objects : Create kernel handle
    To create a kernel handle, use the function clCreateKernel(cl_program program, const char *kernel_name, cl_int *errcode_ret). program (hProgram) is a program object with a successfully built executable. kernel_name ("vectVectMult_kernel") is a function name in the program declared with the __kernel qualifier.
    A kernel is a function declared in a program. A kernel is identified by the __kernel qualifier applied to any function in a program. A kernel object encapsulates the specific __kernel function declared in a program and the argument values to be used when executing this __kernel function.

    cl_kernel VectVectMult_kernel;
    VectVectMult_kernel =
          clCreateKernel( hProgram, "vectVectMult_kernel", &status);

    STATUSCHKMSG("kernel handle");

    //
    //
    //
    //

    Memory Objects : Create memory objects to hold input and output on the host / device
    A buffer object is created using the function clCreateBuffer(*,*,*,*,*). Elements in a buffer are stored in sequential fashion and can be accessed using a pointer by a kernel executing on a device.


    // Create input vector 1 on - Host : hInVectOne / Device : dInVectOne
    cl_int *hInVectOne;
    hInVectOne = new cl_int[length];
    fillInArray(hInVectOne, length);
      cl_mem dInVectOne = clCreateBuffer( context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
    length * sizeof(cl_int), (void*) hInVectOne, &status);
    STATUSCHKMSG("memory allocation Vect one");

    // Create input vector 2 on - Host : hInVectTwo / Device :
    // dInVectTwo

    cl_int *hInVectTwo;
    hInVectTwo = new cl_int[length];
    fillInArray(hInVectTwo, length);
      cl_mem dInVectTwo = clCreateBuffer( context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
    length * sizeof(cl_int), (void*)hInVectTwo, &status);
    STATUSCHKMSG("memory allocation vect two");

    // Create space for holding vector length : Host & Device
    cl_int hLength = length;
      cl_mem dLength = clCreateBuffer( context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(cl_int),(void*)&hLength, &status);
    STATUSCHKMSG("scalar memory setting");

    // Create space for output on - Host : hOutScalar
    // Device : dOutScalar

    cl_int *hOutScalar;
    hOutScalar = new cl_int[1];
      cl_mem dOutScalar = clCreateBuffer( context, CL_MEM_WRITE_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(cl_int),(void*)hOutScalar, &status);
    STATUSCHKMSG("o/p memory allocation");

    //
    //
    //
    //

    Kernel Objects : Setting Kernel Arguments :
    To execute a kernel, the kernel arguments must be set. The function
    cl_int clSetKernelArg (*,*,*,*)
    is used to set the argument value for a specific argument of a kernel. kernel is a valid kernel object.


    //set first input vector argument
      status = clSetKernelArg(VectVectMult_kernel,0,sizeof(cl_mem), (void*) &dInVectOne);
    STATUSCHKMSG("in arg setting vectone");

    //set second input vector argument
      status = clSetKernelArg(VectVectMult_kernel,1,sizeof(cl_mem), (void*) &dInVectTwo);
    STATUSCHKMSG("in arg setting vecttwo");

    //set length of two input vector argument
      status = clSetKernelArg(VectVectMult_kernel,2,sizeof(cl_mem), (void*) &dLength);
    STATUSCHKMSG("scalar value argument length");

    //set output scalar value argument
    status = clSetKernelArg(VectVectMult_kernel,3,sizeof(cl_mem), (void*) &dOutScalar);
    STATUSCHKMSG("scalar value argument output value");

    //set space for temporary value argument
      status = clSetKernelArg(VectVectMult_kernel,4,sizeof(int), NULL);
    STATUSCHKMSG("scalar value argument temp variable");

    // Enqueue/Launch kernel

    // number of global items in work dimension
    size_t globalThreads[] = { GLOBAL_WORK_SIZE };
    // number of work items per group
    size_t localThreads[] = { LOCAL_WORK_SIZE};


    //
    //
    //
    //
    //
    //
    //
    //
    //
    //
    //

    Enqueue Kernel :
    The runtime system assigns the work in the command queues to the underlying devices. Commands are placed into the queue using the clEnqueue commands. The commands can be broadly classified into three categories:

    Kernel commands (for example, clEnqueueNDRangeKernel(), etc.),
    Memory commands (for example, clEnqueueReadBuffer(), etc.), and
    Event commands (for example, clEnqueueWaitForEvents(), etc.).

    The function clEnqueueNDRangeKernel () enqueues a command to execute a kernel on a device.

      status = clEnqueueNDRangeKernel( queue,VectVectMult_kernel,1,
                     NULL, globalThreads, localThreads,0,NULL,NULL);

    STATUSCHKMSG("kernel enqueue");

    // Wait for kernel execution to finish
    status = clFinish(queue);
    STATUSCHKMSG("clFinish");
    cl_event events[1];

    // Read output result from device to host
      status = clEnqueueReadBuffer(queue, dOutScalar, CL_TRUE, 0,
                     sizeof(cl_int), hOutScalar, 0, NULL, &events[0]);

    STATUSCHKMSG("read output");

    // Wait for the read buffer to complete reading the output produced by the kernel
    status = clWaitForEvents(1, &events[0]);
    STATUSCHKMSG("read event not completed");

    // Print the Input & Output vectors
    cout<<"\n Input Vector One : \n";
    for(size_t count=0; count < length; count++)
    {
       cout << " " << hInVectOne[ count ];
    }

    cout<< "\n Input Vector Two \n";
    for(size_t count=0; count < length; count++)
    {
        cout << " " << hInVectTwo[ count ];
    }
    cout <<"\n Output : "<< (*hOutScalar) << "\n";

    // Releasing memory objects.
    clReleaseMemObject(dInVectOne);
    clReleaseMemObject(dInVectTwo);
    clReleaseMemObject(dLength);
    clReleaseMemObject(dOutScalar);
    clReleaseKernel(VectVectMult_kernel);
    clReleaseProgram(hProgram);
    clReleaseCommandQueue(queue);
    clReleaseContext(context);

    delete [] hInVectOne;
    delete [] hInVectTwo;
    delete [] hOutScalar;
    } // end of main
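    The host listing above relies on a STATUSCHKMSG helper after every OpenCL call. Its definition is not shown in this section; the usual pattern is "if the last call did not return CL_SUCCESS (which is defined as 0), report which step failed and stop". The sketch below writes that pattern as a plain function over int so it is self-contained; the name check_status and its return convention are assumptions, not the course's actual macro.

    ```cpp
    #include <cstdio>

    // Sketch of the error-check pattern behind STATUSCHKMSG (assumed
    // helper, not from the course sources). CL_SUCCESS is 0, so any
    // non-zero status means the last OpenCL call failed.
    static int check_status(int status, const char *msg)
    {
        if (status != 0) {   /* CL_SUCCESS == 0 */
            std::fprintf(stderr, "OpenCL error %d at step: %s\n", status, msg);
            return 1;        /* caller should release resources and exit */
        }
        return 0;
    }
    ```

    In the listing above the equivalent check runs after each call, e.g. after clEnqueueNDRangeKernel(), so a failing step is reported by name instead of surfacing later as a wrong result.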




    // VectVectMult_kernel.cl : Function __kernel void vectVectMult_kernel


    Kernel : A kernel is a function declared in a program and executed on an OpenCL device. A kernel is identified by the __kernel qualifier applied to any function defined in a program.
    The OpenCL kernel implementation below computes the vector-vector multiplication (dot product).


      // @param[in]  inVectOne  Handle to input vector one
      // @param[in]  inVectTwo  Handle to input vector two
      // @param[in]  length     Handle to the variable holding the length of the input vectors
      // @param[out] outScalar  Handle to the output scalar variable
      // @param[in]  tempScalar Handle to a temporary scalar variable in local memory



    __kernel void vectVectMult_kernel( __global int *inVectOne, __global int *inVectTwo,
                                       __global int *length, __global int *outScalar,
                                       __local int *tempScalar )

    // Synchronization functions

    barrier() : All work-items in a work-group executing the kernel on a processor must execute this function before any are allowed to continue execution beyond the barrier. The function must be encountered by all work-items in a work-group executing the kernel. Note that barrier() synchronizes only the work-items within one work-group; it does not synchronize across work-groups.
    CLK_LOCAL_MEM_FENCE : The barrier function will either flush any variables stored in local memory or queue a memory fence to ensure correct ordering of memory operations to local memory.

    {
    // get the unique global work-item ID
    unsigned int gid = get_global_id(0);
    // get the number of global work-items
    unsigned int global_work_items = get_global_size(0);

    // Accumulate into a private variable: tempScalar is shared by the
    // whole work-group, so accumulating into it directly would be a
    // data race between work-items.
    int partialSum = 0;

    for( int currCell = gid; currCell < (*length); currCell += global_work_items )
    {
        partialSum = partialSum + inVectOne[ currCell ] * inVectTwo[ currCell ];
    }

    // Store the partial sum; inVectTwo is reused as scratch space in
    // global memory, hence the global memory fence.
    inVectTwo[gid] = partialSum;
    barrier(CLK_GLOBAL_MEM_FENCE);

    // Work-item 0 reduces the partial sums. Since barrier() synchronizes
    // only within one work-group, this final step is correct only when
    // the kernel is launched with a single work-group.
    if( gid == 0 )
    {
        *outScalar = 0;

        for( int cellInd = 0; cellInd < global_work_items; cellInd++ )
        {
            *outScalar = (*outScalar) + inVectTwo[cellInd];
        }
    }
    }
    // end vectVectMult_kernel
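    The two-phase reduction performed by the kernel can be checked against a plain host-side computation. The sketch below (a hypothetical helper, not part of the course sources) reproduces phase 1, each work-item's strided partial sum, and phase 2, work-item 0 summing the partials, in ordinary C++; work_items plays the role of get_global_size(0).

    ```cpp
    #include <cstddef>
    #include <vector>

    // Host-side reference for the kernel above (a sketch, not from the
    // course sources). Mirrors the kernel's two phases sequentially.
    int dotProductReference(const std::vector<int> &a,
                            const std::vector<int> &b,
                            std::size_t work_items)
    {
        std::vector<int> partial(work_items, 0);

        // Phase 1: each "work-item" gid accumulates a strided partial
        // sum, exactly like the kernel's loop over currCell.
        for (std::size_t gid = 0; gid < work_items; ++gid)
            for (std::size_t i = gid; i < a.size(); i += work_items)
                partial[gid] += a[i] * b[i];

        // Phase 2: "work-item 0" reduces the partial sums.
        int result = 0;
        for (std::size_t gid = 0; gid < work_items; ++gid)
            result += partial[gid];
        return result;
    }
    ```

    Copying the host arrays into std::vector and comparing the returned value with *hOutScalar gives a quick correctness check of the device result.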


    References

    1. AMD Fusion
    2. APU
    3. All about AMD FUSION APUs (APU 101)
    4. AMD A6 3500 APU Llano
    5. AMD A6 3500 APU review
    6. AMD APP SDK with OpenCL 1.2 Support
    7. AMD-APP-SDKv2.7 (Linux) with OpenCL 1.2 Support
    8. AMD Accelerated Parallel Processing Math Libraries (APPML)
    9. AMD Accelerated Parallel Processing (AMD APP) Programming Guide OpenCL : May 2012
    10. MAGMA OpenCL
    11. AMD Accelerated Parallel Processing (APP) SDK (formerly ATI Stream) with AMD APP Math Libraries (APPML); AMD Core Math Library (ACML); AMD Core Math Library for Graphic Processors (ACML-GPU)
    12. Getting Started with OpenCL
    13. Aparapi - API & Java
    14. AMD Developer Central - OpenCL Zone
    15. AMD Developer Central - SDKs
    16. ATI GPU Services (AGS) Library
    17. AMD GPU - Global Memory for Accelerators (GMAC)
    18. AMD Developer Central - Programming in OpenCL
    19. AMD GPU Task Manager (TM)
    20. AMD APP Documentation
    21. AMD Developer OpenCL FORUM
    22. AMD Developer Central - Programming in OpenCL - Benchmarks performance
    23. OpenCL 1.2 (pdf file)
    24. OpenCL™ Optimization Case Study Fast Fourier Transform - Part 1
    25. AMD GPU PerfStudio 2
    26. Open Source Zone - AMD CodeAnalyst Performance Analyzer for Linux
    27. AMD ATI Stream Computing OpenCL - Programming Guide
    28. AMD OpenCL Emulator-Debugger
    29. GPGPU : http://www.gpgpu.org and Stanford BrookGPU discussion forum http://www.gpgpu.org/forums/
    30. Apple : Snowleopard - OpenCL
    31. The OpenCL Specification Version : v1.0, Khronos OpenCL Working Group
    32. Khronos V1.0 Introduction and Overview, June 2010
    33. The OpenCL 1.1 Quick Reference card.
    34. OpenCL 1.2 Specification (Document Revision 15), Last Released November 15, 2011
    35. The OpenCL 1.2 Specification (Document Revision 15) Last Released November 15, 2011 Editor : Aaftab Munshi Khronos OpenCL Working Group
    36. OpenCL1.1 Reference Pages
    37. MATLAB
    38. OpenCL Toolbox v0.17 for MATLAB
    39. NAG
    40. AMD Compute Abstraction Layer (CAL) Intermediate Language (IL) Reference Manual. Published by AMD.
    41. C++ AMP (C++ Accelerated Massive Parallelism)
    42. C++ AMP for the OpenCL Programmer
    43. C++ AMP for the OpenCL Programmer
    44. MAGMA SC 2011 Handout
    45. AMD Accelerated Parallel Processing Math Libraries (APPML) MAGMA
    46. Benedict R. Gaster, Lee Howes, David R. Kaeli, Perhaad Mistry, Dana Schaa, Heterogeneous Computing with OpenCL, Elsevier, Morgan Kaufmann Publishers, 2011
    47. David B. Kirk, Wen-mei W. Hwu, Programming Massively Parallel Processors - A Hands-on Approach, NVIDIA Corporation, 2010, Elsevier, Morgan Kaufmann Publishers, 2011
    48. Aaftab Munshi, Benedict R. Gaster, Timothy G. Mattson, James Fung, Dan Ginsburg, OpenCL Programming Guide, Addison-Wesley, Pearson Education, 2012
    49. AMD gDEBugger
    50. The HSA (Heterogeneous System Architecture) Foundation

    Centre for Development of Advanced Computing