/*************************************************************************** C-DAC Tech Workshop : hyPACK-2013 October 15-18, 2013 Example : Mat_Vect_Mult_Pthreads_OpenCL_mGPU.c Objective : Example program to demonstrate mixed mode programming using opencl and pthreads for Matrix-Vector Multiplication. Description : Here we are performing Matrix-Vector Multiplication using opencl-pthread mixed model. We are taking number of rows and number of columns/ vectorLenght from the user. According to number of devices available program will choose number of threads and launch on separate device. Condition : Number of rows should be divisible by number of devices available. Input : Number of Rows , Vector Length / Number of Columns Output : Resultant Vector (along with Input Matrix and Vector ) Created : August-2013 E-mail : hpcfte@cdac.in *****************************************************************************/ #include #include #include #include #include #include //-----------------------------------------------Kernel Function Starts-------------------------------------------------------------- // string containing kernel source const char *programSource = "__kernel void matrixvectorMult(__global int* input_mat, __global int* input_vec,__global int* result_vec, __global int* vecLen , __global int* numRows)\n" "{\n" " int threadGid = get_global_id(0);\n" " int numThread = get_global_size(0);\n" " int tempResult = 0; \n" " if( threadGid < (*numRows))\n" " {\n" " for( int rowCount = threadGid; rowCount < (*numRows); rowCount = rowCount + 1)\n" " {\n" " tempResult = 0;\n" " for(int colCount = 0; colCount < (*vecLen); colCount++ )\n" " {\n" " tempResult = tempResult + input_vec[colCount] * input_mat[rowCount * (*vecLen) + colCount];\n" " }\n" " result_vec[rowCount] = tempResult; \n" " }\n" " }\n" "}\n"; //--------------------------------------------------Kernel Function Ends-------------------------------------------------------------- //-------------------------------------------------- Declaration Part Starts------------------------------------------------------- cl_int status = CL_SUCCESS; // integer value used to check whether operation is successfull or not // variables declared for platform information cl_uint maxNumofPlatforms; cl_uint numOfPlatformsAvailable; cl_platform_id *platforms; char platformName[200]; cl_int platformId; // variables declared for device information cl_uint numOfDevicesAvailable; cl_device_id *devices; cl_device_type deviceType; char deviceBuffer[100]; char **deviceName; // context related variables cl_context_properties contextProp[3]; cl_context context; cl_command_queue *cmdQueue; // command Q variable cl_program progObject; // program object cl_kernel *myKernelMatVectMult; // kernel object size_t global[1]; // array to define global work size void* threadWorkMatVecMultiplication(int); // thread function void check_status(char*); // function to check whether operation is successfull or not void getPlatformDeviceInfo(); // function to get platform and device information int checkResultCPUGPU(int*,int*,int,int); // check correctness of GPU vs CPU // variables for vector addition on host cl_int *hst_inputMat , *hst_inputVec; cl_int *hst_resultVec; cl_int numRows,numCols,vecLength; int numThreads , threadPart; //-------------------------------------------------- Declaration Part Ends--------------------------------------------------- //-------------------------------------------------- Main Starts--------------------------------------------------- int main(int argc , char *argv[]) { pthread_t *threads; int threadCount , threadStatus , chkresult ; int i ,j; size_t rowCount ,colCount ,vecCount ; if(argc!=3) { printf("\n Usage : \n"); exit(0); } numRows = atoi(argv[1]); numCols = vecLength = atoi(argv[2]); getPlatformDeviceInfo(); // call the function to get platform and device information if((numRows % numThreads) != 0) // check input condition { printf("\n Number of Rows should be divisible by numOfDevicesAvailable......\n\n"); exit(0); } threadPart = numRows / numThreads ; // define chunk size of each thread /* Create Context to associate devices context properties list - must be terminated with 0 */ contextProp[0] = CL_CONTEXT_PLATFORM; // prop name contextProp[1] = (cl_context_properties)platforms[platformId]; // prop value contextProp[2] = 0; // must be terminated with 0 context = clCreateContext(contextProp , numOfDevicesAvailable , devices ,NULL , NULL , &status); check_status("clCreateContext"); // command Queue memory allocation according to num of device available assert( (cmdQueue=(cl_command_queue *) malloc (sizeof(cl_command_queue) * numOfDevicesAvailable))) ; printf("\n"); // Create program object progObject = clCreateProgramWithSource(context , 1 , &programSource , NULL , &status); check_status("clCreateProgramWithSource"); // Build Program Executables status = clBuildProgram(progObject,numOfDevicesAvailable ,devices , NULL,NULL,NULL); check_status("clBuildProgram"); // kernel Object memory allocation according to num of device available assert((myKernelMatVectMult = (cl_kernel *) malloc (sizeof(cl_kernel) * numOfDevicesAvailable))); // Memory allocation and data filling on host assert(hst_inputMat = (cl_int*)malloc(numRows*numCols*sizeof(cl_int))); assert(hst_inputVec = (cl_int*)malloc(vecLength*sizeof(cl_int))); assert(hst_resultVec = (cl_int*)malloc(numRows*sizeof(cl_int))); for(rowCount=0; rowCount < numRows; rowCount++) for(colCount=0; colCount < numCols; colCount++) hst_inputMat[rowCount * numCols + colCount] = (cl_int)rand()%10; for(vecCount=0; vecCount < vecLength; vecCount++) hst_inputVec[vecCount] = (cl_int)rand()%10; for(colCount=0; colCount < numRows; colCount++) hst_resultVec[colCount] = 0; global[0] = threadPart; // global work size for each kernel assert(threads = (pthread_t *)malloc(numThreads * sizeof(pthread_t))); // allocate memory for number of threads // Call thread function for(threadCount = 0 ; threadCount < numThreads ; threadCount++) { threadStatus = pthread_create(&threads[threadCount], NULL, (void *(*) (void *))threadWorkMatVecMultiplication, (void *)(threadCount)); if(threadStatus) { printf("Error in creating the thread and the return status is %d \n",threadStatus); exit(-1); } } // join threads with main thread for(threadCount = 0 ; threadCount < numThreads ; threadCount++) { threadStatus = pthread_join(threads[threadCount], NULL); if(threadStatus) { printf("Error in joining the threads and the return status is %d \n",threadStatus); exit(-1); } } // Check correctness of result chkresult = checkResultCPUGPU(hst_inputMat, hst_inputVec,numRows,vecLength); if(chkresult) { printf("\n Uncorrect result :CPU and GPU results are not same.....Aborting... \n"); exit(1); } printf("\n Successfull....CPU and GPU Results are same....!!\n\n"); // Print Input Matrix,Vector and resultant vector /*printf("\nInput Matrix \n"); for(rowCount = 0; rowCount < numRows ; rowCount++) { printf("\n"); for(colCount = 0; colCount < numCols; colCount++) { printf("%d\t", hst_inputMat[rowCount*numCols+colCount]); } } printf("\n"); printf("\nInput Vec \n"); for(vecCount = 0; vecCount < vecLength ; vecCount++) { printf("%d\t", hst_inputVec[vecCount]); } printf("\n"); printf("\nResult Vector \n"); for(vecCount = 0; vecCount < numRows ; vecCount++) { printf("%d\t", hst_resultVec[vecCount]); }*/ printf("\n"); // Cleanup status = clReleaseProgram(progObject); check_status(" clReleaseProgram progObject"); status = clReleaseContext(context); check_status("clReleaseContext"); free(hst_inputMat); free(hst_inputVec); free(hst_resultVec); return 0; } //-------------------------------------------------- Main Ends--------------------------------------------------- //-------------------------------------------------- Thread Function Starts--------------------------------------------------- // thread function will launch the kernel according to threadId void* threadWorkMatVecMultiplication(int threadId) { int iStrt , iEnd ,i ; cl_mem devc_inputMat , devc_inputVec; cl_mem devc_resultVec; cl_mem devc_vecLength , devc_numRows; // Create command Queue cmdQueue[threadId] = clCreateCommandQueue( context, devices[threadId], 0, &status); check_status("clCreateCommandQueue"); printf("\n\t Command Queue has been created for device = %s ",deviceName[threadId]); printf("\n"); // Create kernel object myKernelMatVectMult[threadId] = clCreateKernel ( progObject, "matrixvectorMult", &status); check_status(" clCreateKernel "); printf("\n\t Kernel Object has been created for device = %s ",deviceName[threadId]); printf("\n"); // chunk size range for each thread iStrt = threadId * (threadPart*numCols) ; iEnd = iStrt + ((threadPart*numCols)-1); printf(" \n\n\t Chunk size for device %s is from %d to %d", deviceName[threadId],iStrt,iEnd); // create buffer objects devc_inputMat = clCreateBuffer ( context, CL_MEM_READ_ONLY , numCols * threadPart * sizeof(int),NULL, &status); check_status("devc_inputMat"); devc_inputVec = clCreateBuffer ( context, CL_MEM_READ_ONLY , vecLength * sizeof(int),NULL, &status); check_status("devc_inputVec"); devc_resultVec = clCreateBuffer ( context, CL_MEM_WRITE_ONLY , threadPart * sizeof(int),NULL, &status); check_status("devc_resultVec"); devc_numRows = clCreateBuffer(context,CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(cl_int), (void*) &threadPart, &status); check_status(" create buffer object for devc_numRows"); devc_vecLength = clCreateBuffer(context,CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(cl_int), (void*) &vecLength, &status); check_status(" create buffer object for devc_vecLength"); // copy data from host to device memory status = clEnqueueWriteBuffer (cmdQueue[threadId], devc_inputMat, CL_TRUE, 0, numCols * threadPart * sizeof(float), &hst_inputMat[threadId * threadPart * numCols], 0, 0 , 0); check_status("clEnqueueWriteBuffer for VectorA"); status = clEnqueueWriteBuffer (cmdQueue[threadId], devc_inputVec, CL_TRUE, 0, vecLength * sizeof(float), hst_inputVec, 0, 0 , 0); check_status("clEnqueueWriteBuffer for VectorB"); // set kernel arguments status = clSetKernelArg( myKernelMatVectMult[threadId],0,sizeof(cl_mem),(void *) &devc_inputMat); check_status("clSetKernelArg 0"); status = clSetKernelArg( myKernelMatVectMult[threadId],1,sizeof(cl_mem),(void *) &devc_inputVec); check_status("clSetKernelArg 1"); status = clSetKernelArg( myKernelMatVectMult[threadId],2,sizeof(cl_mem),(void *) &devc_resultVec); check_status("clSetKernelArg 2"); status = clSetKernelArg( myKernelMatVectMult[threadId],3,sizeof(cl_mem),(void *) &devc_vecLength); check_status("clSetKernelArg 3"); status = clSetKernelArg( myKernelMatVectMult[threadId],4,sizeof(cl_mem),(void *) &devc_numRows); check_status("clSetKernelArg 4"); // enqueue kernel for execution in command queue status = clEnqueueNDRangeKernel(cmdQueue[threadId] , myKernelMatVectMult[threadId] , 1 , NULL , global , NULL , 0 , NULL , NULL); check_status("clEnqueueNDRangeKernel"); // read result from device into host memory status = clEnqueueReadBuffer(cmdQueue[threadId] , devc_resultVec , CL_TRUE , 0 , threadPart * sizeof(cl_int) , &hst_resultVec[threadId*threadPart] ,0 ,NULL,NULL); check_status("clEnqueueReadBuffer"); // wait until all commands complete execution status=clFinish(cmdQueue[threadId]); check_status("clFinish"); // CleanUp clReleaseMemObject(devc_inputMat); check_status("clReleaseMemObject devc_inputMat"); clReleaseMemObject(devc_inputVec); check_status("clReleaseMemObject devc_inputVec"); clReleaseMemObject(devc_resultVec); check_status("clReleaseMemObject devc_resultVec"); clReleaseMemObject(devc_vecLength); check_status("clReleaseMemObject devc_vecLength"); clReleaseMemObject(devc_numRows); check_status("clReleaseMemObject devc_numRows"); clReleaseKernel(myKernelMatVectMult[threadId]); check_status("clReleaseKernel"); status = clReleaseCommandQueue(cmdQueue[threadId]); check_status("clReleaseCommandQueue"); pthread_exit((void *) NULL); } //-------------------------------------------------- Thread Function Ends--------------------------------------------------- //-------------------------------------------------- check_status Function Starts--------------------------------------------------- // Function to check correctness of OpenCL APIs void check_status(char* op) { if(status != CL_SUCCESS) { printf("\n Operation %s is not sucessfull.....\n",op); exit(1); } } //-------------------------------------------------- check_status Function Ends--------------------------------------------------- //-------------------------------------------------- gettformDeviceInfo() Function Starts--------------------------------------------- void getPlatformDeviceInfo() { int i , j ; // 1 .query host system for platform information status = clGetPlatformIDs(0 , 0 , &numOfPlatformsAvailable); check_status("clGetPlatformIDs"); assert(platforms = (cl_platform_id*)malloc(numOfPlatformsAvailable * sizeof(cl_platform_id))); status = clGetPlatformIDs(numOfPlatformsAvailable , platforms , NULL); check_status("clGetPlatformIDs"); printf("\n Total number of platforms available are %d \n",numOfPlatformsAvailable); for(i = 0 ; i < numOfPlatformsAvailable ; i++) { status = clGetPlatformInfo(platforms[i] , CL_PLATFORM_NAME , sizeof(platformName) , &platformName , NULL); check_status("clGetPlatformInfo for CL_PLATFORM_NAME"); printf("\n PLATFORM_NAME = %s \n",platformName); } for(i = 0 ; i < numOfPlatformsAvailable ; i++) { status = clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_GPU , 0 , 0 , &numOfDevicesAvailable); if(status != CL_SUCCESS || numOfDevicesAvailable == 0) { status = clGetPlatformInfo(platforms[i] , CL_PLATFORM_NAME , sizeof(platformName) , &platformName , NULL); printf("\n------------------------------------------------------------------\n"); printf("\nNo GPU device found for the %s\n",platformName); printf("\n------------------------------------------------------------------\n"); continue; } else { assert(devices = (cl_device_id*)malloc(numOfDevicesAvailable * sizeof(cl_device_id))); status = clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_GPU , numOfDevicesAvailable , devices , 0); platformId = i; } } //2.query host system for device information assert( (deviceName = (char **) malloc( sizeof(char *) * numOfDevicesAvailable)) != NULL); for ( i = 0 ; i < numOfDevicesAvailable ; i++) { // Get devices Name status = clGetDeviceInfo(devices[i], CL_DEVICE_NAME, sizeof(deviceBuffer), &deviceBuffer, NULL); check_status("clGetDeviceInfo "); assert( (deviceName[i] = (char *) malloc( sizeof(char ) * sizeof(deviceBuffer))) != NULL); strcpy(deviceName[i],deviceBuffer); printf("\n GPU Device %d Name \t= %s ",i,deviceBuffer); } printf("\n"); numThreads = numOfDevicesAvailable ; } //-------------------------------------------------- gettformDeviceInfo() Function Ends--------------------------------------------- //-------------------------------------------------- checkResultCPUGPU Function Starts--------------------------------------------- int checkResultCPUGPU(int* input_mat,int* input_vec,int numRows , int vecLength) { int *result_vec; int rowCount,colCount ,err = 0 , tempResult ; assert(result_vec = (int *)malloc(sizeof(int)* numRows)); for( rowCount = 0; rowCount < numRows; rowCount++) { tempResult = 0; for(colCount = 0; colCount < vecLength; colCount++ ) { tempResult = tempResult + input_vec[colCount] * input_mat[(rowCount * vecLength) + colCount]; } result_vec[rowCount] = tempResult; } /*printf("\nCPU result vector\n"); for(rowCount=0;rowCount < numRows; rowCount++) { printf("%d\t",result_vec[rowCount]); } printf("\nGPU result vector\n"); for(rowCount=0;rowCount < numRows; rowCount++) { printf("%d\t",hst_resultVec[rowCount]); }*/ printf("\n"); for(rowCount = 0; rowCount < numRows; rowCount++) { if(result_vec[rowCount] == hst_resultVec[rowCount]) err = 0; else { err = 1; free(result_vec); break; } } //free(result_vec); return err; } //-------------------------------------------------- checkResultCPUGPU Function Ends---------------------------------------------