From 819c9ee22b91c5b861c0c08838db01454780b59a Mon Sep 17 00:00:00 2001 From: Blaise Tine Date: Thu, 21 Nov 2019 23:32:06 -0500 Subject: [PATCH] bfs benchmark --- benchmarks/opencl/bfs/CLHelper.h | 814 +++++++++++++++++++++++++++++++ benchmarks/opencl/bfs/Makefile | 4 +- benchmarks/opencl/bfs/kernel.cl | 53 ++ benchmarks/opencl/bfs/libbfs.a | Bin 0 -> 7846 bytes benchmarks/opencl/bfs/main.cc | 299 ++++++++++++ benchmarks/opencl/bfs/run | 1 + benchmarks/opencl/bfs/timer.cc | 78 +++ benchmarks/opencl/bfs/timer.h | 128 +++++ benchmarks/opencl/bfs/util.h | 72 +++ 9 files changed, 1447 insertions(+), 2 deletions(-) create mode 100755 benchmarks/opencl/bfs/CLHelper.h create mode 100755 benchmarks/opencl/bfs/kernel.cl create mode 100644 benchmarks/opencl/bfs/libbfs.a create mode 100755 benchmarks/opencl/bfs/main.cc create mode 100755 benchmarks/opencl/bfs/run create mode 100755 benchmarks/opencl/bfs/timer.cc create mode 100755 benchmarks/opencl/bfs/timer.h create mode 100755 benchmarks/opencl/bfs/util.h diff --git a/benchmarks/opencl/bfs/CLHelper.h b/benchmarks/opencl/bfs/CLHelper.h new file mode 100755 index 00000000..b9a873e4 --- /dev/null +++ b/benchmarks/opencl/bfs/CLHelper.h @@ -0,0 +1,814 @@ +//------------------------------------------ +//--cambine:helper function for OpenCL +//--programmer: Jianbin Fang +//--date: 27/12/2010 +//------------------------------------------ +#ifndef _CL_HELPER_ +#define _CL_HELPER_ + +#include +#include +#include +#include +#include + +using std::string; +using std::ifstream; +using std::cerr; +using std::endl; +using std::cout; +//#pragma OPENCL EXTENSION cl_nv_compiler_options:enable +#define WORK_DIM 2 //work-items dimensions + +struct oclHandleStruct +{ + cl_context context; + cl_device_id *devices; + cl_command_queue queue; + cl_program program; + cl_int cl_status; + std::string error_str; + std::vector kernel; +}; + +struct oclHandleStruct oclHandles; + +char kernel_file[100] = "Kernels.cl"; +int total_kernels = 2; +string kernel_names[2] = {"BFS_1", "BFS_2"}; +int work_group_size = 512; +int device_id_inused = 0; //deviced id used (default : 0) + +/* + * Converts the contents of a file into a string + */ +string FileToString(const string fileName) +{ + ifstream f(fileName.c_str(), ifstream::in | ifstream::binary); + + try + { + size_t size; + char* str; + string s; + + if(f.is_open()) + { + size_t fileSize; + f.seekg(0, ifstream::end); + size = fileSize = f.tellg(); + f.seekg(0, ifstream::beg); + + str = new char[size+1]; + if (!str) throw(string("Could not allocate memory")); + + f.read(str, fileSize); + f.close(); + str[size] = '\0'; + + s = str; + delete [] str; + return s; + } + } + catch(std::string msg) + { + cerr << "Exception caught in FileToString(): " << msg << endl; + if(f.is_open()) + f.close(); + } + catch(...) + { + cerr << "Exception caught in FileToString()" << endl; + if(f.is_open()) + f.close(); + } + string errorMsg = "FileToString()::Error: Unable to open file " + + fileName; + throw(errorMsg); +} +//--------------------------------------- +//Read command line parameters +// +void _clCmdParams(int argc, char* argv[]){ + for (int i =0; i < argc; ++i) + { + switch (argv[i][1]) + { + case 'g': //--g stands for size of work group + if (++i < argc) + { + sscanf(argv[i], "%u", &work_group_size); + } + else + { + std::cerr << "Could not read argument after option " << argv[i-1] << std::endl; + throw; + } + break; + case 'd': //--d stands for device id used in computaion + if (++i < argc) + { + sscanf(argv[i], "%u", &device_id_inused); + } + else + { + std::cerr << "Could not read argument after option " << argv[i-1] << std::endl; + throw; + } + break; + default: + ; + } + } + +} + +//--------------------------------------- +//Initlize CL objects +//--description: there are 5 steps to initialize all the OpenCL objects needed +//--revised on 04/01/2011: get the number of devices and +// devices have no relationship with context +void _clInit() +{ + int DEVICE_ID_INUSED = device_id_inused; + cl_int resultCL; + + oclHandles.context = NULL; + oclHandles.devices = NULL; + oclHandles.queue = NULL; + oclHandles.program = NULL; + + cl_uint deviceListSize; + + //----------------------------------------------- + //--cambine-1: find the available platforms and select one + + cl_uint numPlatforms; + cl_platform_id targetPlatform = NULL; + + resultCL = clGetPlatformIDs(0, NULL, &numPlatforms); + if (resultCL != CL_SUCCESS) + throw (string("InitCL()::Error: Getting number of platforms (clGetPlatformIDs)")); + //printf("number of platforms:%d\n",numPlatforms); //by cambine + + if (!(numPlatforms > 0)) + throw (string("InitCL()::Error: No platforms found (clGetPlatformIDs)")); + + cl_platform_id* allPlatforms = (cl_platform_id*) malloc(numPlatforms * sizeof(cl_platform_id)); + + resultCL = clGetPlatformIDs(numPlatforms, allPlatforms, NULL); + if (resultCL != CL_SUCCESS) + throw (string("InitCL()::Error: Getting platform ids (clGetPlatformIDs)")); + + /* Select the target platform. Default: first platform */ + targetPlatform = allPlatforms[0]; + for (int i = 0; i < numPlatforms; i++) + { + char pbuff[128]; + resultCL = clGetPlatformInfo( allPlatforms[i], + CL_PLATFORM_VENDOR, + sizeof(pbuff), + pbuff, + NULL); + if (resultCL != CL_SUCCESS) + throw (string("InitCL()::Error: Getting platform info (clGetPlatformInfo)")); + + //printf("vedor is %s\n",pbuff); + + } + free(allPlatforms); + + //----------------------------------------------- + //--cambine-2: create an OpenCL context + cl_context_properties cprops[3] = { CL_CONTEXT_PLATFORM, (cl_context_properties)targetPlatform, 0 }; + oclHandles.context = clCreateContextFromType(cprops, + CL_DEVICE_TYPE_GPU, + NULL, + NULL, + &resultCL); + + if ((resultCL != CL_SUCCESS) || (oclHandles.context == NULL)) + throw (string("InitCL()::Error: Creating Context (clCreateContextFromType)")); + //----------------------------------------------- + //--cambine-3: detect OpenCL devices + /* First, get the size of device list */ + oclHandles.cl_status = clGetDeviceIDs(targetPlatform, CL_DEVICE_TYPE_GPU, 0, NULL, &deviceListSize); + if(oclHandles.cl_status!=CL_SUCCESS){ + throw(string("exception in _clInit -> clGetDeviceIDs")); + } + if (deviceListSize == 0) + throw(string("InitCL()::Error: No devices found.")); + + //std::cout<<"device number:"< clGetDeviceIDs-2")); + } + //----------------------------------------------- + //--cambine-4: Create an OpenCL command queue + oclHandles.queue = clCreateCommandQueue(oclHandles.context, + oclHandles.devices[DEVICE_ID_INUSED], + 0, + &resultCL); + + if ((resultCL != CL_SUCCESS) || (oclHandles.queue == NULL)) + throw(string("InitCL()::Creating Command Queue. (clCreateCommandQueue)")); + //----------------------------------------------- + //--cambine-5: Load CL file, build CL program object, create CL kernel object + std::string source_str = FileToString(kernel_file); + const char * source = source_str.c_str(); + size_t sourceSize[] = { source_str.length() }; + + oclHandles.program = clCreateProgramWithSource(oclHandles.context, + 1, + &source, + sourceSize, + &resultCL); + + if ((resultCL != CL_SUCCESS) || (oclHandles.program == NULL)) + throw(string("InitCL()::Error: Loading Binary into cl_program. (clCreateProgramWithBinary)")); + //insert debug information + //std::string options= "-cl-nv-verbose"; //Doesn't work on AMD machines + //options += " -cl-nv-opt-level=3"; + resultCL = clBuildProgram(oclHandles.program, deviceListSize, oclHandles.devices, NULL, NULL,NULL); + + if ((resultCL != CL_SUCCESS) || (oclHandles.program == NULL)) + { + cerr << "InitCL()::Error: In clBuildProgram" << endl; + + size_t length; + resultCL = clGetProgramBuildInfo(oclHandles.program, + oclHandles.devices[DEVICE_ID_INUSED], + CL_PROGRAM_BUILD_LOG, + 0, + NULL, + &length); + if(resultCL != CL_SUCCESS) + throw(string("InitCL()::Error: Getting Program build info(clGetProgramBuildInfo)")); + + char* buffer = (char*)malloc(length); + resultCL = clGetProgramBuildInfo(oclHandles.program, + oclHandles.devices[DEVICE_ID_INUSED], + CL_PROGRAM_BUILD_LOG, + length, + buffer, + NULL); + if(resultCL != CL_SUCCESS) + throw(string("InitCL()::Error: Getting Program build info(clGetProgramBuildInfo)")); + + cerr << buffer << endl; + free(buffer); + + throw(string("InitCL()::Error: Building Program (clBuildProgram)")); + } + + //get program information in intermediate representation + #ifdef PTX_MSG + size_t binary_sizes[deviceListSize]; + char * binaries[deviceListSize]; + //figure out number of devices and the sizes of the binary for each device. + oclHandles.cl_status = clGetProgramInfo(oclHandles.program, CL_PROGRAM_BINARY_SIZES, sizeof(size_t)*deviceListSize, &binary_sizes, NULL ); + if(oclHandles.cl_status!=CL_SUCCESS){ + throw(string("--cambine:exception in _InitCL -> clGetProgramInfo-2")); + } + + std::cout<<"--cambine:"< clGetProgramInfo-3")); + } + for(int i=0;i getting resource information")); + } + + build_log = (char *)malloc(ret_val_size+1); + oclHandles.cl_status = clGetProgramBuildInfo(oclHandles.program, oclHandles.devices[DEVICE_ID_INUSED], CL_PROGRAM_BUILD_LOG, ret_val_size, build_log, NULL); + if(oclHandles.cl_status!=CL_SUCCESS){ + throw(string("exceptions in _InitCL -> getting resources allocation information-2")); + } + build_log[ret_val_size] = '\0'; + std::cout<<"--cambine:"<d_mem_pinned")); + #endif + //------------ + d_mem = clCreateBuffer(oclHandles.context, CL_MEM_READ_ONLY, \ + size, NULL, &oclHandles.cl_status); + #ifdef ERRMSG + if(oclHandles.cl_status != CL_SUCCESS) + throw(string("excpetion in _clCreateAndCpyMem() -> d_mem ")); + #endif + //---------- + h_mem_pinned = (cl_float *)clEnqueueMapBuffer(oclHandles.queue, d_mem_pinned, CL_TRUE, \ + CL_MAP_WRITE, 0, size, 0, NULL, \ + NULL, &oclHandles.cl_status); + #ifdef ERRMSG + if(oclHandles.cl_status != CL_SUCCESS) + throw(string("excpetion in _clCreateAndCpyMem() -> clEnqueueMapBuffer")); + #endif + int element_number = size/sizeof(float); + #pragma omp parallel for + for(int i=0;i clEnqueueWriteBuffer")); + #endif + + return d_mem; +} + + +//-------------------------------------------------------- +//--cambine:create write only buffer on device +cl_mem _clMallocWO(int size) throw(string){ + cl_mem d_mem; + d_mem = clCreateBuffer(oclHandles.context, CL_MEM_WRITE_ONLY, size, 0, &oclHandles.cl_status); + #ifdef ERRMSG + if(oclHandles.cl_status != CL_SUCCESS) + throw(string("excpetion in _clCreateMem()")); + #endif + return d_mem; +} + +//-------------------------------------------------------- +//transfer data from device to host +void _clMemcpyD2H(cl_mem d_mem, int size, void * h_mem) throw(string){ + oclHandles.cl_status = clEnqueueReadBuffer(oclHandles.queue, d_mem, CL_TRUE, 0, size, h_mem, 0,0,0); + #ifdef ERRMSG + oclHandles.error_str = "excpetion in _clCpyMemD2H -> "; + switch(oclHandles.cl_status){ + case CL_INVALID_COMMAND_QUEUE: + oclHandles.error_str += "CL_INVALID_COMMAND_QUEUE"; + break; + case CL_INVALID_CONTEXT: + oclHandles.error_str += "CL_INVALID_CONTEXT"; + break; + case CL_INVALID_MEM_OBJECT: + oclHandles.error_str += "CL_INVALID_MEM_OBJECT"; + break; + case CL_INVALID_VALUE: + oclHandles.error_str += "CL_INVALID_VALUE"; + break; + case CL_INVALID_EVENT_WAIT_LIST: + oclHandles.error_str += "CL_INVALID_EVENT_WAIT_LIST"; + break; + case CL_MEM_OBJECT_ALLOCATION_FAILURE: + oclHandles.error_str += "CL_MEM_OBJECT_ALLOCATION_FAILURE"; + break; + case CL_OUT_OF_HOST_MEMORY: + oclHandles.error_str += "CL_OUT_OF_HOST_MEMORY"; + break; + default: + oclHandles.error_str += "Unknown reason"; + break; + } + if(oclHandles.cl_status != CL_SUCCESS) + throw(oclHandles.error_str); + #endif +} + +//-------------------------------------------------------- +//set kernel arguments +void _clSetArgs(int kernel_id, int arg_idx, void * d_mem, int size = 0) throw(string){ + if(!size){ + oclHandles.cl_status = clSetKernelArg(oclHandles.kernel[kernel_id], arg_idx, sizeof(d_mem), &d_mem); + #ifdef ERRMSG + oclHandles.error_str = "excpetion in _clSetKernelArg() "; + switch(oclHandles.cl_status){ + case CL_INVALID_KERNEL: + oclHandles.error_str += "CL_INVALID_KERNEL"; + break; + case CL_INVALID_ARG_INDEX: + oclHandles.error_str += "CL_INVALID_ARG_INDEX"; + break; + case CL_INVALID_ARG_VALUE: + oclHandles.error_str += "CL_INVALID_ARG_VALUE"; + break; + case CL_INVALID_MEM_OBJECT: + oclHandles.error_str += "CL_INVALID_MEM_OBJECT"; + break; + case CL_INVALID_SAMPLER: + oclHandles.error_str += "CL_INVALID_SAMPLER"; + break; + case CL_INVALID_ARG_SIZE: + oclHandles.error_str += "CL_INVALID_ARG_SIZE"; + break; + case CL_OUT_OF_RESOURCES: + oclHandles.error_str += "CL_OUT_OF_RESOURCES"; + break; + case CL_OUT_OF_HOST_MEMORY: + oclHandles.error_str += "CL_OUT_OF_HOST_MEMORY"; + break; + default: + oclHandles.error_str += "Unknown reason"; + break; + } + if(oclHandles.cl_status != CL_SUCCESS) + throw(oclHandles.error_str); + #endif + } + else{ + oclHandles.cl_status = clSetKernelArg(oclHandles.kernel[kernel_id], arg_idx, size, d_mem); + #ifdef ERRMSG + oclHandles.error_str = "excpetion in _clSetKernelArg() "; + switch(oclHandles.cl_status){ + case CL_INVALID_KERNEL: + oclHandles.error_str += "CL_INVALID_KERNEL"; + break; + case CL_INVALID_ARG_INDEX: + oclHandles.error_str += "CL_INVALID_ARG_INDEX"; + break; + case CL_INVALID_ARG_VALUE: + oclHandles.error_str += "CL_INVALID_ARG_VALUE"; + break; + case CL_INVALID_MEM_OBJECT: + oclHandles.error_str += "CL_INVALID_MEM_OBJECT"; + break; + case CL_INVALID_SAMPLER: + oclHandles.error_str += "CL_INVALID_SAMPLER"; + break; + case CL_INVALID_ARG_SIZE: + oclHandles.error_str += "CL_INVALID_ARG_SIZE"; + break; + case CL_OUT_OF_RESOURCES: + oclHandles.error_str += "CL_OUT_OF_RESOURCES"; + break; + case CL_OUT_OF_HOST_MEMORY: + oclHandles.error_str += "CL_OUT_OF_HOST_MEMORY"; + break; + default: + oclHandles.error_str += "Unknown reason"; + break; + } + if(oclHandles.cl_status != CL_SUCCESS) + throw(oclHandles.error_str); + #endif + } +} +void _clFinish() throw(string){ + oclHandles.cl_status = clFinish(oclHandles.queue); + #ifdef ERRMSG + oclHandles.error_str = "excpetion in _clFinish"; + switch(oclHandles.cl_status){ + case CL_INVALID_COMMAND_QUEUE: + oclHandles.error_str += "CL_INVALID_COMMAND_QUEUE"; + break; + case CL_OUT_OF_RESOURCES: + oclHandles.error_str += "CL_OUT_OF_RESOURCES"; + break; + case CL_OUT_OF_HOST_MEMORY: + oclHandles.error_str += "CL_OUT_OF_HOST_MEMORY"; + break; + default: + oclHandles.error_str += "Unknown reasons"; + break; + + } + if(oclHandles.cl_status!=CL_SUCCESS){ + throw(oclHandles.error_str); + } + #endif +} +//-------------------------------------------------------- +//--cambine:enqueue kernel +void _clInvokeKernel(int kernel_id, int work_items, int work_group_size) throw(string){ + cl_uint work_dim = WORK_DIM; + cl_event e[1]; + if(work_items%work_group_size != 0) //process situations that work_items cannot be divided by work_group_size + work_items = work_items + (work_group_size-(work_items%work_group_size)); + size_t local_work_size[] = {work_group_size, 1}; + size_t global_work_size[] = {work_items, 1}; + oclHandles.cl_status = clEnqueueNDRangeKernel(oclHandles.queue, oclHandles.kernel[kernel_id], work_dim, 0, \ + global_work_size, local_work_size, 0 , 0, &(e[0]) ); + #ifdef ERRMSG + oclHandles.error_str = "excpetion in _clInvokeKernel() -> "; + switch(oclHandles.cl_status) + { + case CL_INVALID_PROGRAM_EXECUTABLE: + oclHandles.error_str += "CL_INVALID_PROGRAM_EXECUTABLE"; + break; + case CL_INVALID_COMMAND_QUEUE: + oclHandles.error_str += "CL_INVALID_COMMAND_QUEUE"; + break; + case CL_INVALID_KERNEL: + oclHandles.error_str += "CL_INVALID_KERNEL"; + break; + case CL_INVALID_CONTEXT: + oclHandles.error_str += "CL_INVALID_CONTEXT"; + break; + case CL_INVALID_KERNEL_ARGS: + oclHandles.error_str += "CL_INVALID_KERNEL_ARGS"; + break; + case CL_INVALID_WORK_DIMENSION: + oclHandles.error_str += "CL_INVALID_WORK_DIMENSION"; + break; + case CL_INVALID_GLOBAL_WORK_SIZE: + oclHandles.error_str += "CL_INVALID_GLOBAL_WORK_SIZE"; + break; + case CL_INVALID_WORK_GROUP_SIZE: + oclHandles.error_str += "CL_INVALID_WORK_GROUP_SIZE"; + break; + case CL_INVALID_WORK_ITEM_SIZE: + oclHandles.error_str += "CL_INVALID_WORK_ITEM_SIZE"; + break; + case CL_INVALID_GLOBAL_OFFSET: + oclHandles.error_str += "CL_INVALID_GLOBAL_OFFSET"; + break; + case CL_OUT_OF_RESOURCES: + oclHandles.error_str += "CL_OUT_OF_RESOURCES"; + break; + case CL_MEM_OBJECT_ALLOCATION_FAILURE: + oclHandles.error_str += "CL_MEM_OBJECT_ALLOCATION_FAILURE"; + break; + case CL_INVALID_EVENT_WAIT_LIST: + oclHandles.error_str += "CL_INVALID_EVENT_WAIT_LIST"; + break; + case CL_OUT_OF_HOST_MEMORY: + oclHandles.error_str += "CL_OUT_OF_HOST_MEMORY"; + break; + default: + oclHandles.error_str += "Unkown reseason"; + break; + } + if(oclHandles.cl_status != CL_SUCCESS) + throw(oclHandles.error_str); + #endif + //_clFinish(); + // oclHandles.cl_status = clWaitForEvents(1, &e[0]); + // #ifdef ERRMSG + // if (oclHandles.cl_status!= CL_SUCCESS) + // throw(string("excpetion in _clEnqueueNDRange() -> clWaitForEvents")); + // #endif +} +void _clInvokeKernel2D(int kernel_id, int range_x, int range_y, int group_x, int group_y) throw(string){ + cl_uint work_dim = WORK_DIM; + size_t local_work_size[] = {group_x, group_y}; + size_t global_work_size[] = {range_x, range_y}; + cl_event e[1]; + /*if(work_items%work_group_size != 0) //process situations that work_items cannot be divided by work_group_size + work_items = work_items + (work_group_size-(work_items%work_group_size));*/ + oclHandles.cl_status = clEnqueueNDRangeKernel(oclHandles.queue, oclHandles.kernel[kernel_id], work_dim, 0, \ + global_work_size, local_work_size, 0 , 0, &(e[0]) ); + #ifdef ERRMSG + oclHandles.error_str = "excpetion in _clInvokeKernel() -> "; + switch(oclHandles.cl_status) + { + case CL_INVALID_PROGRAM_EXECUTABLE: + oclHandles.error_str += "CL_INVALID_PROGRAM_EXECUTABLE"; + break; + case CL_INVALID_COMMAND_QUEUE: + oclHandles.error_str += "CL_INVALID_COMMAND_QUEUE"; + break; + case CL_INVALID_KERNEL: + oclHandles.error_str += "CL_INVALID_KERNEL"; + break; + case CL_INVALID_CONTEXT: + oclHandles.error_str += "CL_INVALID_CONTEXT"; + break; + case CL_INVALID_KERNEL_ARGS: + oclHandles.error_str += "CL_INVALID_KERNEL_ARGS"; + break; + case CL_INVALID_WORK_DIMENSION: + oclHandles.error_str += "CL_INVALID_WORK_DIMENSION"; + break; + case CL_INVALID_GLOBAL_WORK_SIZE: + oclHandles.error_str += "CL_INVALID_GLOBAL_WORK_SIZE"; + break; + case CL_INVALID_WORK_GROUP_SIZE: + oclHandles.error_str += "CL_INVALID_WORK_GROUP_SIZE"; + break; + case CL_INVALID_WORK_ITEM_SIZE: + oclHandles.error_str += "CL_INVALID_WORK_ITEM_SIZE"; + break; + case CL_INVALID_GLOBAL_OFFSET: + oclHandles.error_str += "CL_INVALID_GLOBAL_OFFSET"; + break; + case CL_OUT_OF_RESOURCES: + oclHandles.error_str += "CL_OUT_OF_RESOURCES"; + break; + case CL_MEM_OBJECT_ALLOCATION_FAILURE: + oclHandles.error_str += "CL_MEM_OBJECT_ALLOCATION_FAILURE"; + break; + case CL_INVALID_EVENT_WAIT_LIST: + oclHandles.error_str += "CL_INVALID_EVENT_WAIT_LIST"; + break; + case CL_OUT_OF_HOST_MEMORY: + oclHandles.error_str += "CL_OUT_OF_HOST_MEMORY"; + break; + default: + oclHandles.error_str += "Unkown reseason"; + break; + } + if(oclHandles.cl_status != CL_SUCCESS) + throw(oclHandles.error_str); + #endif + //_clFinish(); + /*oclHandles.cl_status = clWaitForEvents(1, &e[0]); + + #ifdef ERRMSG + + if (oclHandles.cl_status!= CL_SUCCESS) + + throw(string("excpetion in _clEnqueueNDRange() -> clWaitForEvents")); + + #endif*/ +} + +//-------------------------------------------------------- +//release OpenCL objects +void _clFree(cl_mem ob) throw(string){ + if(ob!=NULL) + oclHandles.cl_status = clReleaseMemObject(ob); + #ifdef ERRMSG + oclHandles.error_str = "excpetion in _clFree() ->"; + switch(oclHandles.cl_status) + { + case CL_INVALID_MEM_OBJECT: + oclHandles.error_str += "CL_INVALID_MEM_OBJECT"; + break; + case CL_OUT_OF_RESOURCES: + oclHandles.error_str += "CL_OUT_OF_RESOURCES"; + break; + case CL_OUT_OF_HOST_MEMORY: + oclHandles.error_str += "CL_OUT_OF_HOST_MEMORY"; + break; + default: + oclHandles.error_str += "Unkown reseason"; + break; + } + if (oclHandles.cl_status!= CL_SUCCESS) + throw(oclHandles.error_str); + #endif +} +#endif //_CL_HELPER_ diff --git a/benchmarks/opencl/bfs/Makefile b/benchmarks/opencl/bfs/Makefile index 5ad87059..0472f69f 100644 --- a/benchmarks/opencl/bfs/Makefile +++ b/benchmarks/opencl/bfs/Makefile @@ -29,12 +29,12 @@ CXXFLAGS += -Wl,--gc-sections # enable garbage collection of unused input sectio LIBS = -lOpenCL -PROJECT=saxpy +PROJECT=bfs all: $(PROJECT).dump $(PROJECT).hex lib$(PROJECT).a: kernel.cl - POCL_DEBUG=all POCL_DEBUG_LLVM_PASSES=1 LD_LIBRARY_PATH=$(RISCV_TOOL_PATH)/lib:$(POCL_CC_PATH)/lib $(POCL_CC_PATH)/bin/poclcc -o kernel.pocl kernel.cl + POCL_DEBUG=all POCL_DEBUG_LLVM_PASSES=1 LD_LIBRARY_PATH=$(RISCV_TOOL_PATH)/lib:$(POCL_CC_PATH)/lib $(POCL_CC_PATH)/bin/poclcc -o lib$(PROJECT).a kernel.cl $(PROJECT).elf: main.cc lib$(PROJECT).a $(CXX) $(CXXFLAGS) -I$(POCL_RT_PATH)/include -L$(POCL_RT_PATH)/lib/static -L. $(VX_SRCS) main.cc -Wl,--whole-archive -l$(PROJECT) -Wl,--no-whole-archive $(LIBS) -o $(PROJECT).elf diff --git a/benchmarks/opencl/bfs/kernel.cl b/benchmarks/opencl/bfs/kernel.cl new file mode 100755 index 00000000..51ce5a08 --- /dev/null +++ b/benchmarks/opencl/bfs/kernel.cl @@ -0,0 +1,53 @@ +/* ============================================================ +//--cambine: kernel funtion of Breadth-First-Search +//--author: created by Jianbin Fang +//--date: 06/12/2010 +============================================================ */ + +//#pragma OPENCL EXTENSION cl_khr_byte_addressable_store: enable + +//Structure to hold a node information +typedef struct{ + int starting; + int no_of_edges; +} Node; + +//--7 parameters +__kernel void BFS_1( const __global Node* g_graph_nodes, + const __global int* g_graph_edges, + __global char* g_graph_mask, + __global char* g_updating_graph_mask, + __global char* g_graph_visited, + __global int* g_cost, + const int no_of_nodes){ + int tid = get_global_id(0); + if( tid^6+YL`c5uiN76aXa1Suwg1nk&;u^ms-L^=W+izrjKX+o%vXU7Q{PMq3? z!76QGj}W>ds^TKL?oS07FovdTt~5>R4>C1?*wp^mk4_ar)vRpQG;K@}hT0PEJJ0Vq z&k2yAb(My=(S7&abM8I&p7-5%@BN%7zpSR?yT4N1)>xVRmTF11J{ubY(V)>{S2Yni ziCo8L@)ui)N?937#godOM0zlhRG!-Wj1r)^D&_mB^q#(SYVXhkjY_YY$Fo?~ULY^Tv9;e!nl(R$Vr=$GnMNZ!F;r zMZKX8UnW&y?G1b7h&L2s5AAY=sVC%($Gq(kuNw9GdyS55R%kE}Yt@QTxV<#XKr|FC zO?OxIkGi@y3n2hlDG9nsGc}^Dqoi-wi*-~Bt6PYor}^{bS#yxwb|E5Y<90acB-;FS zm`H+NYaHixq7U`i?LDny(`KLFpl@!(K@qGMca>8wb12=;A<99wP@nf}hbbTtl@^*F z1W`Px4)!(fOQbXXslmpm&+iK~uG^i>4rRX7*4Edb-Mu&Fi>C(KlF5Ao-huvfI+bqI z`}kmNj3s*HNKbn#&=F6Bqfxmx9t?#eiH@E`IM5r8$^PCRe>@fmwan|2{p&8{!QPPU zkH&(5a6IM@s(w{X1R`=s4z)*nq8*`FDCQ4^bFB)Vdz;l_53~>I7lRxP5pNvuQ2p1oBExQ8~7z7v25yhL0)0#SvK`a$o0H0BC%}h zUx8e|h_ys4n|iJbiJwG^s$G%;--IunNUFa6!TzkGrqk*HQg&<$1eGn_Pe0Y!tti`+ zKrpE8&8C!eqOU)bO{C3THv83J*FK|rHnBfTyt)SxgV;Ki`2PJ$UosU_lgd6dt+03U zWn!5O`S_viSnwEuu6k~8)fbPG zY4V2R-d@$4z(%clOD3LLR*m5`9T+H;J!)2+i$+LM_A5#kUVMrhaJPUjP{MQ}&lm75 z8B0KK{!3J@SiUb)?1GhLW7$zV znqz+e*&GXES^g=Mthcr^J{4|OZZ`zk>hD8~joX=X`mNzF$Vwz`XU4>J9*Z;UYLU%x zZ$LJEKrGAL)~bIM9XLLYm*-=~`wht4){o5T7V=xVRC<`>HbY?9OmA2#P@2^+1gX22 zf@+xCj$vGWFFY!ttyuLTy=@uOVHg?rrAf~eOFO-g@v}sMTJB5X%D$5_ezxH>Jucb z-%5q8zrX8tZH2Cd@-Lhut+g6*E9H-#pxleqBweVXynR$?Z`&#UW4kSHe_rH{c2n*r z4OF=H+8wLCaNItfTe^Yd7vX0N;~ZK_p2nY1{?K9J8Gc)6>)~&`3;rxVF243j>c*-m zx3J&UHguA-0e7SI<7g*@*4ing)`LQ885dHEuxV>LZMZwxaCfo+1F3IVzdHd7;IIZb ztOgE^0btPyEPz9+3M_y_GklyH5%S^R*|akglz-_GN#`7R?-N368WGYj9HcdM3GF>`cJCfIxq%DHCna$}9iBkxjtXOji5+B(LoYWLc<{1Z%YhV=kr&9~-B1yrd1|zzHC%?*vW%bTjq^)K=7z^OQ*tPsGu((t#V=;1yv`Y?rJ5FGMv6cagqj}O8 zi>sAbXw5a`KG95h$7!s6tQD*y*?zW+g**Z*M!J9p@V<8Dj@8ase7^z1z0UWijdm#Xbiv>!W}SU-${<_MZEm+6I<}kl zDE-_HJ?y#t|G>+)e`>rOuS@6U=6&x``gT7nUd~JOF)yFt&5wEc9Pa;F@N&y9sGlk? z4~CUkK=DTvHKeo$D|mS@?2QGy{-{?Cc}u3*hmbGxn3wY&`vJ1`C^OHoA?P3T^8bRD zcbLEWmx%eiyj(VZfNz}tclYhaiy$7eueYQ@b zKXbX|MfOqHF$qow&a#F1L)CP?>Rs{-|6Alo-_c9Y@NMD2%G3MoUVaxm_+#+kA~lcWj6YIh*+ ze}K143fWQ6OV9A1@f!eQU;{q!a_&OaFS`*ADFK{y|`vNWCZtMx;1tMdIBbDJ;C=2jQ6FD#T>4O>Xh{kYYe YXtNM*HDo1tn6!B&&E{6z?q1yLKT3|^-T(jq literal 0 HcmV?d00001 diff --git a/benchmarks/opencl/bfs/main.cc b/benchmarks/opencl/bfs/main.cc new file mode 100755 index 00000000..eacc9cbf --- /dev/null +++ b/benchmarks/opencl/bfs/main.cc @@ -0,0 +1,299 @@ +//--by Jianbin Fang + +#define __CL_ENABLE_EXCEPTIONS +#include +#include +#include +#include + +#ifdef PROFILING +#include "timer.h" +#endif + +#include "CLHelper.h" +#include "util.h" + +#define MAX_THREADS_PER_BLOCK 256 + +//Structure to hold a node information +struct Node +{ + int starting; + int no_of_edges; +}; + + +//---------------------------------------------------------- +//--bfs on cpu +//--programmer: jianbin +//--date: 26/01/2011 +//--note: width is changed to the new_width +//---------------------------------------------------------- +void run_bfs_cpu(int no_of_nodes, Node *h_graph_nodes, int edge_list_size, \ + int *h_graph_edges, char *h_graph_mask, char *h_updating_graph_mask, \ + char *h_graph_visited, int *h_cost_ref){ + char stop; + int k = 0; + do{ + //if no thread changes this value then the loop stops + stop=false; + for(int tid = 0; tid < no_of_nodes; tid++ ) + { + if (h_graph_mask[tid] == true){ + h_graph_mask[tid]=false; + for(int i=h_graph_nodes[tid].starting; i<(h_graph_nodes[tid].no_of_edges + h_graph_nodes[tid].starting); i++){ + int id = h_graph_edges[i]; //--cambine: node id is connected with node tid + if(!h_graph_visited[id]){ //--cambine: if node id has not been visited, enter the body below + h_cost_ref[id]=h_cost_ref[tid]+1; + h_updating_graph_mask[id]=true; + } + } + } + } + + for(int tid=0; tid< no_of_nodes ; tid++ ) + { + if (h_updating_graph_mask[tid] == true){ + h_graph_mask[tid]=true; + h_graph_visited[tid]=true; + stop=true; + h_updating_graph_mask[tid]=false; + } + } + k++; + } + while(stop); +} +//---------------------------------------------------------- +//--breadth first search on GPUs +//---------------------------------------------------------- +void run_bfs_gpu(int no_of_nodes, Node *h_graph_nodes, int edge_list_size, \ + int *h_graph_edges, char *h_graph_mask, char *h_updating_graph_mask, \ + char *h_graph_visited, int *h_cost) + throw(std::string){ + + //int number_elements = height*width; + char h_over; + cl_mem d_graph_nodes, d_graph_edges, d_graph_mask, d_updating_graph_mask, \ + d_graph_visited, d_cost, d_over; + try{ + //--1 transfer data from host to device + _clInit(); + d_graph_nodes = _clMalloc(no_of_nodes*sizeof(Node), h_graph_nodes); + d_graph_edges = _clMalloc(edge_list_size*sizeof(int), h_graph_edges); + d_graph_mask = _clMallocRW(no_of_nodes*sizeof(char), h_graph_mask); + d_updating_graph_mask = _clMallocRW(no_of_nodes*sizeof(char), h_updating_graph_mask); + d_graph_visited = _clMallocRW(no_of_nodes*sizeof(char), h_graph_visited); + + + d_cost = _clMallocRW(no_of_nodes*sizeof(int), h_cost); + d_over = _clMallocRW(sizeof(char), &h_over); + + _clMemcpyH2D(d_graph_nodes, no_of_nodes*sizeof(Node), h_graph_nodes); + _clMemcpyH2D(d_graph_edges, edge_list_size*sizeof(int), h_graph_edges); + _clMemcpyH2D(d_graph_mask, no_of_nodes*sizeof(char), h_graph_mask); + _clMemcpyH2D(d_updating_graph_mask, no_of_nodes*sizeof(char), h_updating_graph_mask); + _clMemcpyH2D(d_graph_visited, no_of_nodes*sizeof(char), h_graph_visited); + _clMemcpyH2D(d_cost, no_of_nodes*sizeof(int), h_cost); + + //--2 invoke kernel +#ifdef PROFILING + timer kernel_timer; + double kernel_time = 0.0; + kernel_timer.reset(); + kernel_timer.start(); +#endif + do{ + h_over = false; + _clMemcpyH2D(d_over, sizeof(char), &h_over); + //--kernel 0 + int kernel_id = 0; + int kernel_idx = 0; + _clSetArgs(kernel_id, kernel_idx++, d_graph_nodes); + _clSetArgs(kernel_id, kernel_idx++, d_graph_edges); + _clSetArgs(kernel_id, kernel_idx++, d_graph_mask); + _clSetArgs(kernel_id, kernel_idx++, d_updating_graph_mask); + _clSetArgs(kernel_id, kernel_idx++, d_graph_visited); + _clSetArgs(kernel_id, kernel_idx++, d_cost); + _clSetArgs(kernel_id, kernel_idx++, &no_of_nodes, sizeof(int)); + + //int work_items = no_of_nodes; + _clInvokeKernel(kernel_id, no_of_nodes, work_group_size); + + //--kernel 1 + kernel_id = 1; + kernel_idx = 0; + _clSetArgs(kernel_id, kernel_idx++, d_graph_mask); + _clSetArgs(kernel_id, kernel_idx++, d_updating_graph_mask); + _clSetArgs(kernel_id, kernel_idx++, d_graph_visited); + _clSetArgs(kernel_id, kernel_idx++, d_over); + _clSetArgs(kernel_id, kernel_idx++, &no_of_nodes, sizeof(int)); + + //work_items = no_of_nodes; + _clInvokeKernel(kernel_id, no_of_nodes, work_group_size); + + _clMemcpyD2H(d_over,sizeof(char), &h_over); + }while(h_over); + + _clFinish(); +#ifdef PROFILING + kernel_timer.stop(); + kernel_time = kernel_timer.getTimeInSeconds(); +#endif + //--3 transfer data from device to host + _clMemcpyD2H(d_cost,no_of_nodes*sizeof(int), h_cost); + //--statistics +#ifdef PROFILING + std::cout<<"kernel time(s):"<\n", argv[0]); + +} +//---------------------------------------------------------- +//--cambine: main function +//--author: created by Jianbin Fang +//--date: 25/01/2011 +//---------------------------------------------------------- +int main(int argc, char * argv[]) +{ + int no_of_nodes; + int edge_list_size; + FILE *fp; + Node* h_graph_nodes; + char *h_graph_mask, *h_updating_graph_mask, *h_graph_visited; + try{ + char *input_f; + if(argc!=2){ + Usage(argc, argv); + exit(0); + } + + input_f = argv[1]; + printf("Reading File\n"); + //Read in Graph from a file + fp = fopen(input_f,"r"); + if(!fp){ + printf("Error Reading graph file\n"); + return 0; + } + + int source = 0; + + fscanf(fp,"%d",&no_of_nodes); + + int num_of_blocks = 1; + int num_of_threads_per_block = no_of_nodes; + + //Make execution Parameters according to the number of nodes + //Distribute threads across multiple Blocks if necessary + if(no_of_nodes>MAX_THREADS_PER_BLOCK){ + num_of_blocks = (int)ceil(no_of_nodes/(double)MAX_THREADS_PER_BLOCK); + num_of_threads_per_block = MAX_THREADS_PER_BLOCK; + } + work_group_size = num_of_threads_per_block; + // allocate host memory + h_graph_nodes = (Node*) malloc(sizeof(Node)*no_of_nodes); + h_graph_mask = (char*) malloc(sizeof(char)*no_of_nodes); + h_updating_graph_mask = (char*) malloc(sizeof(char)*no_of_nodes); + h_graph_visited = (char*) malloc(sizeof(char)*no_of_nodes); + + int start, edgeno; + // initalize the memory + for(int i = 0; i < no_of_nodes; i++){ + fscanf(fp,"%d %d",&start,&edgeno); + h_graph_nodes[i].starting = start; + h_graph_nodes[i].no_of_edges = edgeno; + h_graph_mask[i]=false; + h_updating_graph_mask[i]=false; + h_graph_visited[i]=false; + } + //read the source node from the file + fscanf(fp,"%d",&source); + source=0; + //set the source node as true in the mask + h_graph_mask[source]=true; + h_graph_visited[source]=true; + fscanf(fp,"%d",&edge_list_size); + int id,cost; + int* h_graph_edges = (int*) malloc(sizeof(int)*edge_list_size); + for(int i=0; i < edge_list_size ; i++){ + fscanf(fp,"%d",&id); + fscanf(fp,"%d",&cost); + h_graph_edges[i] = id; + } + + if(fp) + fclose(fp); + // allocate mem for the result on host side + int *h_cost = (int*) malloc(sizeof(int)*no_of_nodes); + int *h_cost_ref = (int*)malloc(sizeof(int)*no_of_nodes); + for(int i=0;i(h_cost_ref, h_cost, no_of_nodes); + //release host memory + free(h_graph_nodes); + free(h_graph_mask); + free(h_updating_graph_mask); + free(h_graph_visited); + + } + catch(std::string msg){ + std::cout<<"--cambine: exception in main ->"< +#include +#include +#include + +#include "timer.h" + + +using namespace std; + +double timer::CPU_speed_in_MHz = timer::get_CPU_speed_in_MHz(); + + +double timer::get_CPU_speed_in_MHz() +{ +#if defined __linux__ + ifstream infile("/proc/cpuinfo"); + char buffer[256], *colon; + + while (infile.good()) { + infile.getline(buffer, 256); + + if (strncmp("cpu MHz", buffer, 7) == 0 && (colon = strchr(buffer, ':')) != 0) + return atof(colon + 2); + } +#endif + + return 0.0; +} + + +void timer::print_time(ostream &str, const char *which, double time) const +{ + static const char *units[] = { " ns", " us", " ms", " s", " ks", 0 }; + const char **unit = units; + + time = 1000.0 * time / CPU_speed_in_MHz; + + while (time >= 999.5 && unit[1] != 0) { + time /= 1000.0; + ++ unit; + } + + str << which << " = " << setprecision(3) << setw(4) << time << *unit; +} + + +ostream &timer::print(ostream &str) +{ + str << left << setw(25) << (name != 0 ? name : "timer") << ": " << right; + + if (CPU_speed_in_MHz == 0) + str << "could not determine CPU speed\n"; + else if (count > 0) { + double total = static_cast(total_time); + + print_time(str, "avg", total / static_cast(count)); + print_time(str, ", total", total); + str << ", count = " << setw(9) << count << '\n'; + } + else + str << "not used\n"; + + return str; +} + + +ostream &operator << (ostream &str, class timer &timer) +{ + return timer.print(str); +} + +double timer::getTimeInSeconds() +{ + double total = static_cast(total_time); + double res = (total / 1000000.0) / CPU_speed_in_MHz; + return res; +} diff --git a/benchmarks/opencl/bfs/timer.h b/benchmarks/opencl/bfs/timer.h new file mode 100755 index 00000000..e5efdc18 --- /dev/null +++ b/benchmarks/opencl/bfs/timer.h @@ -0,0 +1,128 @@ +#ifndef timer_h +#define timer_h + +#include + + +class timer { + public: + timer(const char *name = 0); + timer(const char *name, std::ostream &write_on_exit); + + ~timer(); + + void start(), stop(); + void reset(); + std::ostream &print(std::ostream &); + + double getTimeInSeconds(); + + private: + void print_time(std::ostream &, const char *which, double time) const; + + union { + long long total_time; + struct { +#if defined __PPC__ + int high, low; +#else + int low, high; +#endif + }; + }; + + unsigned long long count; + const char *const name; + std::ostream *const write_on_exit; + + static double CPU_speed_in_MHz, get_CPU_speed_in_MHz(); +}; + + +std::ostream &operator << (std::ostream &, class timer &); + + +inline void timer::reset() +{ + total_time = 0; + count = 0; +} + + +inline timer::timer(const char *name) +: + name(name), + write_on_exit(0) +{ + reset(); +} + + +inline timer::timer(const char *name, std::ostream &write_on_exit) +: + name(name), + write_on_exit(&write_on_exit) +{ + reset(); +} + + +inline timer::~timer() +{ + if (write_on_exit != 0) + print(*write_on_exit); +} + + +inline void timer::start() +{ +#if (defined __PATHSCALE__) && (defined __i386 || defined __x86_64) + unsigned eax, edx; + + asm volatile ("rdtsc" : "=a" (eax), "=d" (edx)); + + total_time -= ((unsigned long long) edx << 32) + eax; +#elif (defined __GNUC__ || defined __INTEL_COMPILER) && (defined __i386 || defined __x86_64) + asm volatile + ( + "rdtsc\n\t" + "subl %%eax, %0\n\t" + "sbbl %%edx, %1" + : + "+m" (low), "+m" (high) + : + : + "eax", "edx" + ); +#else +#error Compiler/Architecture not recognized +#endif +} + + +inline void timer::stop() +{ +#if (defined __PATHSCALE__) && (defined __i386 || defined __x86_64) + unsigned eax, edx; + + asm volatile ("rdtsc" : "=a" (eax), "=d" (edx)); + + total_time += ((unsigned long long) edx << 32) + eax; +#elif (defined __GNUC__ || defined __INTEL_COMPILER) && (defined __i386 || defined __x86_64) + asm volatile + ( + "rdtsc\n\t" + "addl %%eax, %0\n\t" + "adcl %%edx, %1" + : + "+m" (low), "+m" (high) + : + : + "eax", "edx" + ); +#endif + + ++ count; +} + +#endif diff --git a/benchmarks/opencl/bfs/util.h b/benchmarks/opencl/bfs/util.h new file mode 100755 index 00000000..425edfba --- /dev/null +++ b/benchmarks/opencl/bfs/util.h @@ -0,0 +1,72 @@ +#ifndef _C_UTIL_ +#define _C_UTIL_ +#include +#include + +//------------------------------------------------------------------- +//--initialize array with maximum limit +//------------------------------------------------------------------- +template +void fill(datatype *A, const int n, const datatype maxi){ + for (int j = 0; j < n; j++) + { + A[j] = ((datatype) maxi * (rand() / (RAND_MAX + 1.0f))); + } +} + +//--print matrix +template +void print_matrix(datatype *A, int height, int width){ + for(int i=0; i +void verify_array(const datatype *cpuResults, const datatype *gpuResults, const int size){ + + char passed = true; +#pragma omp parallel for + for (int i=0; i MAX_RELATIVE_ERROR){ + passed = false; + } + } + if (passed){ + std::cout << "--cambine:passed:-)" << endl; + } + else{ + std::cout << "--cambine: failed:-(" << endl; + } + return ; +} +template +void compare_results(const datatype *cpu_results, const datatype *gpu_results, const int size){ + + char passed = true; +//#pragma omp parallel for + for (int i=0; i