diff --git a/benchmarks/opencl/sad/DESCRIPTION b/benchmarks/opencl/sad/DESCRIPTION new file mode 100755 index 00000000..87ef107a --- /dev/null +++ b/benchmarks/opencl/sad/DESCRIPTION @@ -0,0 +1 @@ +Inputs: reference.bin frame.bin diff --git a/benchmarks/opencl/sad/Makefile b/benchmarks/opencl/sad/Makefile new file mode 100644 index 00000000..8b843513 --- /dev/null +++ b/benchmarks/opencl/sad/Makefile @@ -0,0 +1,68 @@ +RISCV_TOOL_PATH = $(wildcard ~/dev/riscv-gnu-toolchain/drops) +POCL_CC_PATH = $(wildcard ~/dev/pocl/drops_riscv_cc) +POCL_INC_PATH = $(wildcard ../include) +POCL_LIB_PATH = $(wildcard ../lib) +VX_RT_PATH = $(wildcard ../../../runtime) +VX_SIMX_PATH = $(wildcard ../../../simX/obj_dir) + +CC = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-gcc +CXX = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-g++ +DMP = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-objdump +HEX = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-objcopy +GDB = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-gdb + +VX_SRCS = $(VX_RT_PATH)/newlib/newlib.c +VX_SRCS += $(VX_RT_PATH)/startup/vx_start.s +VX_SRCS += $(VX_RT_PATH)/intrinsics/vx_intrinsics.s +VX_SRCS += $(VX_RT_PATH)/io/vx_io.s $(VX_RT_PATH)/io/vx_io.c +VX_SRCS += $(VX_RT_PATH)/fileio/fileio.s +VX_SRCS += $(VX_RT_PATH)/tests/tests.c +VX_SRCS += $(VX_RT_PATH)/vx_api/vx_api.c +VX_SRCS += $(VX_STR) $(VX_FIO) $(VX_NEWLIB) $(VX_INT) $(VX_IO) $(VX_API) $(VX_TEST) + +VX_CFLAGS = -nostartfiles -Wl,-Bstatic,-T,$(VX_RT_PATH)/mains/vortex_link.ld + +CXXFLAGS = -g -O0 -march=rv32im -mabi=ilp32 +CXXFLAGS += -ffreestanding # program may not begin at main() +CXXFLAGS += -Wl,--gc-sections # enable garbage collection of unused input sections +CXXFLAGS += -fno-rtti -fno-non-call-exceptions # disable RTTI and exceptions +CXXFLAGS += -I$(POCL_INC_PATH) -I. + +VX_LIBS = -Wl,--whole-archive lib$(PROJECT).a -Wl,--no-whole-archive $(POCL_LIB_PATH)/libOpenCL.a +QEMU_LIBS = -Wl,--whole-archive lib$(PROJECT).a -Wl,--no-whole-archive $(POCL_LIB_PATH)/qemu/libOpenCL.a + +PROJECT = sad + +SRCS = main.cc args.c parboil_opencl.c ocl.c gpu_info.c file.c image.c OpenCL_common.cpp + +all: $(PROJECT).dump $(PROJECT).hex + +lib$(PROJECT).a: kernel.cl + POCL_DEBUG=all POCL_DEBUG_LLVM_PASSES=1 LD_LIBRARY_PATH=$(RISCV_TOOL_PATH)/lib:$(POCL_CC_PATH)/lib $(POCL_CC_PATH)/bin/poclcc -o lib$(PROJECT).a kernel.cl + +$(PROJECT).elf: $(SRCS) lib$(PROJECT).a + $(CXX) $(CXXFLAGS) $(VX_CFLAGS) $(VX_SRCS) $(SRCS) $(VX_LIBS) -o $(PROJECT).elf + +$(PROJECT).qemu: $(SRCS) lib$(PROJECT).a + $(CXX) $(CXXFLAGS) $(SRCS) $(QEMU_LIBS) -o $(PROJECT).qemu + +$(PROJECT).hex: $(PROJECT).elf + $(HEX) -O ihex $(PROJECT).elf $(PROJECT).hex + +$(PROJECT).dump: $(PROJECT).elf + $(DMP) -D $(PROJECT).elf > $(PROJECT).dump + +run: $(PROJECT).hex + POCL_DEBUG=all $(VX_SIMX_PATH)/Vcache_simX -E -a rv32i --core $(PROJECT).hex -s -b 1> emulator.debug + +qemu: $(PROJECT).qemu + POCL_DEBUG=all $(RISCV_TOOL_PATH)/bin/qemu-riscv32 -d in_asm -D debug.log $(PROJECT).qemu + +gdb-s: $(PROJECT).qemu + POCL_DEBUG=all $(RISCV_TOOL_PATH)/bin/qemu-riscv32 -g 1234 -d in_asm -D debug.log $(PROJECT).qemu + +gdb-c: $(PROJECT).qemu + $(GDB) $(PROJECT).qemu + +clean: + rm -rf *.o *.elf *.dump *.hex *.qemu *.log *.debug \ No newline at end of file diff --git a/benchmarks/opencl/sad/OpenCL_common.cpp b/benchmarks/opencl/sad/OpenCL_common.cpp new file mode 100644 index 00000000..2e147e4c --- /dev/null +++ b/benchmarks/opencl/sad/OpenCL_common.cpp @@ -0,0 +1,298 @@ + + +#include "OpenCL_common.h" +#include +#include + +// -1 for NO suitable device found, 0 if an appropriate device was found +int getOpenCLDevice(cl_platform_id *platform, cl_device_id *device, cl_device_type *reqDeviceType, int numRequests, ...) { + + // Supported Device Requests (anything that returns cl_bool) + // CL_DEVICE_IMAGE_SUPPORT + // CL_DEVICE_HOST_UNIFIED_MEMORY + // CL_DEVICE_ERROR_CORRECTION_SUPPORT + // CL_DEVICE_AVAILABLE + // CL_DEVICE_COMPILER_AVAILABLE + + cl_uint numEntries = 16; + cl_platform_id clPlatforms[numEntries]; + cl_uint numPlatforms; + + cl_device_id clDevices[numEntries]; + cl_uint numDevices; + + OCL_ERRCK_RETVAL ( clGetPlatformIDs(numEntries, clPlatforms, &numPlatforms) ); + //fprintf(stderr, "Number of Platforms found: %d\n", numPlatforms); + bool needDevice = true; + + for (int ip = 0; ip < numPlatforms && needDevice; ++ip) { + + cl_platform_id clPlatform = clPlatforms[ip]; + + OCL_ERRCK_RETVAL ( clGetDeviceIDs(clPlatform, CL_DEVICE_TYPE_ALL, numEntries, clDevices, &numDevices) ); + //fprintf(stderr, " Number of Devices found for Platform %d: %d\n", ip, numDevices); + + for (int id = 0; (id < numDevices) && needDevice ; ++id) { + cl_device_id clDevice = clDevices[id]; + cl_device_type clDeviceType; + + bool canSatisfy = true; + + if (reqDeviceType != NULL) { + OCL_ERRCK_RETVAL( clGetDeviceInfo(clDevice, CL_DEVICE_TYPE, sizeof(cl_device_type), &clDeviceType, NULL)); + if (*reqDeviceType != CL_DEVICE_TYPE_ALL) { + if (*reqDeviceType != clDeviceType) { + canSatisfy = false; + } + } + } + + va_list paramList; + va_start(paramList, numRequests); + for (int i = 0; (i < numRequests) && canSatisfy ; ++i) { + + cl_device_info devReq = va_arg( paramList, cl_device_info ); + cl_bool clInfoBool; + size_t infoRetSize = sizeof(cl_bool); + + OCL_ERRCK_RETVAL( clGetDeviceInfo(clDevice, devReq, infoRetSize, &clInfoBool, NULL)); + if (clInfoBool != true) { + canSatisfy = false; + } + } + + va_end(paramList); + if (canSatisfy) { + *device = clDevice; + *platform = clPlatform; + needDevice = false; + if (reqDeviceType != NULL && (*reqDeviceType == CL_DEVICE_TYPE_ALL)) { + *reqDeviceType = clDeviceType; + } + } + } // End checking all devices for a platform + } // End checking all platforms + + int retVal = -1; + if (needDevice) { + retVal = -1; + } else { + retVal = 0; + } + + return retVal; + +} + +const char* oclErrorString(cl_int error) +{ +// From NVIDIA SDK + static const char* errorString[] = { + "CL_SUCCESS", + "CL_DEVICE_NOT_FOUND", + "CL_DEVICE_NOT_AVAILABLE", + "CL_COMPILER_NOT_AVAILABLE", + "CL_MEM_OBJECT_ALLOCATION_FAILURE", + "CL_OUT_OF_RESOURCES", + "CL_OUT_OF_HOST_MEMORY", + "CL_PROFILING_INFO_NOT_AVAILABLE", + "CL_MEM_COPY_OVERLAP", + "CL_IMAGE_FORMAT_MISMATCH", + "CL_IMAGE_FORMAT_NOT_SUPPORTED", + "CL_BUILD_PROGRAM_FAILURE", + "CL_MAP_FAILURE", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "CL_INVALID_VALUE", + "CL_INVALID_DEVICE_TYPE", + "CL_INVALID_PLATFORM", + "CL_INVALID_DEVICE", + "CL_INVALID_CONTEXT", + "CL_INVALID_QUEUE_PROPERTIES", + "CL_INVALID_COMMAND_QUEUE", + "CL_INVALID_HOST_PTR", + "CL_INVALID_MEM_OBJECT", + "CL_INVALID_IMAGE_FORMAT_DESCRIPTOR", + "CL_INVALID_IMAGE_SIZE", + "CL_INVALID_SAMPLER", + "CL_INVALID_BINARY", + "CL_INVALID_BUILD_OPTIONS", + "CL_INVALID_PROGRAM", + "CL_INVALID_PROGRAM_EXECUTABLE", + "CL_INVALID_KERNEL_NAME", + "CL_INVALID_KERNEL_DEFINITION", + "CL_INVALID_KERNEL", + "CL_INVALID_ARG_INDEX", + "CL_INVALID_ARG_VALUE", + "CL_INVALID_ARG_SIZE", + "CL_INVALID_KERNEL_ARGS", + "CL_INVALID_WORK_DIMENSION", + "CL_INVALID_WORK_GROUP_SIZE", + "CL_INVALID_WORK_ITEM_SIZE", + "CL_INVALID_GLOBAL_OFFSET", + "CL_INVALID_EVENT_WAIT_LIST", + "CL_INVALID_EVENT", + "CL_INVALID_OPERATION", + "CL_INVALID_GL_OBJECT", + "CL_INVALID_BUFFER_SIZE", + "CL_INVALID_MIP_LEVEL", + "CL_INVALID_GLOBAL_WORK_SIZE", + }; + + const int errorCount = sizeof(errorString) / sizeof(errorString[0]); + + const int index = -error; + + return (index >= 0 && index < errorCount) ? errorString[index] : ""; +} + + +const char* oclDebugErrString(cl_int error, cl_device_id device) +{ +// From NVIDIA SDK + static const char* errorString[] = { + "CL_SUCCESS", + "CL_DEVICE_NOT_FOUND", + "CL_DEVICE_NOT_AVAILABLE", + "CL_COMPILER_NOT_AVAILABLE", + "CL_MEM_OBJECT_ALLOCATION_FAILURE", + "CL_OUT_OF_RESOURCES", + "CL_OUT_OF_HOST_MEMORY", + "CL_PROFILING_INFO_NOT_AVAILABLE", + "CL_MEM_COPY_OVERLAP", + "CL_IMAGE_FORMAT_MISMATCH", + "CL_IMAGE_FORMAT_NOT_SUPPORTED", + "CL_BUILD_PROGRAM_FAILURE", + "CL_MAP_FAILURE", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "", + "CL_INVALID_VALUE", + "CL_INVALID_DEVICE_TYPE", + "CL_INVALID_PLATFORM", + "CL_INVALID_DEVICE", + "CL_INVALID_CONTEXT", + "CL_INVALID_QUEUE_PROPERTIES", + "CL_INVALID_COMMAND_QUEUE", + "CL_INVALID_HOST_PTR", + "CL_INVALID_MEM_OBJECT", + "CL_INVALID_IMAGE_FORMAT_DESCRIPTOR", + "CL_INVALID_IMAGE_SIZE", + "CL_INVALID_SAMPLER", + "CL_INVALID_BINARY", + "CL_INVALID_BUILD_OPTIONS", + "CL_INVALID_PROGRAM", + "CL_INVALID_PROGRAM_EXECUTABLE", + "CL_INVALID_KERNEL_NAME", + "CL_INVALID_KERNEL_DEFINITION", + "CL_INVALID_KERNEL", + "CL_INVALID_ARG_INDEX", + "CL_INVALID_ARG_VALUE", + "CL_INVALID_ARG_SIZE", + "CL_INVALID_KERNEL_ARGS", + "CL_INVALID_WORK_DIMENSION", + "CL_INVALID_WORK_GROUP_SIZE", + "CL_INVALID_WORK_ITEM_SIZE", + "CL_INVALID_GLOBAL_OFFSET", + "CL_INVALID_EVENT_WAIT_LIST", + "CL_INVALID_EVENT", + "CL_INVALID_OPERATION", + "CL_INVALID_GL_OBJECT", + "CL_INVALID_BUFFER_SIZE", + "CL_INVALID_MIP_LEVEL", + "CL_INVALID_GLOBAL_WORK_SIZE", + }; + + const int errorCount = sizeof(errorString) / sizeof(errorString[0]); + + const int index = -error; + + if (index == 4) { + cl_uint maxMemAlloc = 0; + + OCL_ERRCK_RETVAL ( clGetDeviceInfo( device, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(cl_ulong), &maxMemAlloc, NULL) ); + + + fprintf(stderr, " Device Maximum block allocation size: %lu\n", maxMemAlloc); + } + + return (index >= 0 && index < errorCount) ? errorString[index] : ""; +} + +char* oclLoadProgSource(const char* cFilename, const char* cPreamble, size_t* szFinalLength) +{ + // locals + FILE* pFileStream = NULL; + size_t szSourceLength; + + // open the OpenCL source code file + #ifdef _WIN32 // Windows version + if(fopen_s(&pFileStream, cFilename, "rb") != 0) + { + return NULL; + } + #else // Linux version + pFileStream = fopen(cFilename, "rb"); + if(pFileStream == 0) + { + return NULL; + } + #endif + + size_t szPreambleLength = strlen(cPreamble); + szPreambleLength = 0; + + // get the length of the source code + fseek(pFileStream, 0, SEEK_END); + szSourceLength = ftell(pFileStream); + fseek(pFileStream, 0, SEEK_SET); + + // allocate a buffer for the source code string and read it in + char* cSourceString = (char *)malloc(szSourceLength + szPreambleLength + 1); + memcpy(cSourceString, cPreamble, szPreambleLength); + if (fread((cSourceString) + szPreambleLength, szSourceLength, 1, pFileStream) != 1) + { + fclose(pFileStream); + free(cSourceString); + return 0; + } + + // close the file and return the total length of the combined (preamble + source) string + fclose(pFileStream); + if(szFinalLength != 0) + { + *szFinalLength = szSourceLength + szPreambleLength; + } + cSourceString[szSourceLength + szPreambleLength] = '\0'; + + return cSourceString; +} diff --git a/benchmarks/opencl/sad/OpenCL_common.h b/benchmarks/opencl/sad/OpenCL_common.h new file mode 100644 index 00000000..c5180053 --- /dev/null +++ b/benchmarks/opencl/sad/OpenCL_common.h @@ -0,0 +1,22 @@ + +#ifndef __OPENCL_COMMON_H_ +#define __OPENCL_COMMON_H_ + +#include +#include +#include + +int getOpenCLDevice(cl_platform_id *platform, cl_device_id *device, cl_device_type *reqDeviceType, int numRequests, ...); +const char* oclErrorString(cl_int error); +const char* oclDebugErrString(cl_int error, cl_device_id device); + +#define OCL_ERRCK_VAR(var) \ + { if (var != CL_SUCCESS) fprintf(stderr, "OpenCL Error (%s: %d): %s\n", __FILE__, __LINE__, oclErrorString(var)); } + +#define OCL_ERRCK_RETVAL(s) \ + { cl_int clerr = (s);\ + if (clerr != CL_SUCCESS) fprintf(stderr, "OpenCL Error (%s: %d): %s\n", __FILE__, __LINE__, oclErrorString(clerr)); } + +char* oclLoadProgSource(const char* cFilename, const char* cPreamble, size_t* szFinalLength); + +#endif diff --git a/benchmarks/opencl/sad/args.c b/benchmarks/opencl/sad/args.c new file mode 100644 index 00000000..9d751e29 --- /dev/null +++ b/benchmarks/opencl/sad/args.c @@ -0,0 +1,617 @@ + +#include +#include +#include +#include +#include +#include + +/*****************************************************************************/ +/* Memory management routines */ + +/* Free an array of owned strings. */ +void +pb_FreeStringArray(char **string_array) +{ + char **p; + + if (!string_array) return; + for (p = string_array; *p; p++) free(*p); + free(string_array); +} + +struct pb_PlatformParam * +pb_PlatformParam(char *name, char *version) +{ + if (name == NULL) { + fprintf(stderr, "pb_PlatformParam: Invalid argument\n"); + exit(-1); + } + + struct pb_PlatformParam *ret = + (struct pb_PlatformParam *)malloc(sizeof (struct pb_PlatformParam)); + + ret->name = name; + ret->version = version; + return ret; +} + +void +pb_FreePlatformParam(struct pb_PlatformParam *p) +{ + if (p == NULL) return; + + free(p->name); + free(p->version); + free(p); +} + +struct pb_DeviceParam * +pb_DeviceParam_index(int index) +{ + struct pb_DeviceParam *ret = + (struct pb_DeviceParam *)malloc(sizeof (struct pb_DeviceParam)); + ret->criterion = pb_Device_INDEX; + ret->index = index; + return ret; +} + +struct pb_DeviceParam * +pb_DeviceParam_cpu(void) +{ + struct pb_DeviceParam *ret = + (struct pb_DeviceParam *)malloc(sizeof (struct pb_DeviceParam)); + ret->criterion = pb_Device_CPU; + return ret; +} + +struct pb_DeviceParam * +pb_DeviceParam_gpu(void) +{ + struct pb_DeviceParam *ret = + (struct pb_DeviceParam *)malloc(sizeof (struct pb_DeviceParam)); + ret->criterion = pb_Device_GPU; + return ret; +} + +struct pb_DeviceParam * +pb_DeviceParam_accelerator(void) +{ + struct pb_DeviceParam *ret = + (struct pb_DeviceParam *)malloc(sizeof (struct pb_DeviceParam)); + ret->criterion = pb_Device_ACCELERATOR; + return ret; +} + +struct pb_DeviceParam * +pb_DeviceParam_name(char *name) +{ + struct pb_DeviceParam *ret = + (struct pb_DeviceParam *)malloc(sizeof (struct pb_DeviceParam)); + ret->criterion = pb_Device_NAME; + ret->name = name; + return ret; +} + +void +pb_FreeDeviceParam(struct pb_DeviceParam *p) +{ + if (p == NULL) return; + + switch(p->criterion) { + case pb_Device_NAME: + free(p->name); + break; + case pb_Device_INDEX: + case pb_Device_CPU: + case pb_Device_ACCELERATOR: + break; + default: + fprintf(stderr, "pb_FreeDeviceParam: Invalid argument\n"); + exit(-1); + } +} + +void +pb_FreeParameters(struct pb_Parameters *p) +{ + free(p->outFile); + pb_FreeStringArray(p->inpFiles); + pb_FreePlatformParam(p->platform); + pb_FreeDeviceParam(p->device); + free(p); +} + +/*****************************************************************************/ + +/* Parse a comma-delimited list of strings into an + * array of strings. */ +static char ** +read_string_array(char *in) +{ + char **ret; + int i; + int count; /* Number of items in the input */ + char *substring; /* Current substring within 'in' */ + + /* Count the number of items in the string */ + count = 1; + for (i = 0; in[i]; i++) if (in[i] == ',') count++; + + /* Allocate storage */ + ret = (char **)malloc((count + 1) * sizeof(char *)); + + /* Create copies of the strings from the list */ + substring = in; + for (i = 0; i < count; i++) { + char *substring_end; + int substring_length; + + /* Find length of substring */ + for (substring_end = substring; + (*substring_end != ',') && (*substring_end != 0); + substring_end++); + + substring_length = substring_end - substring; + + /* Allocate memory and copy the substring */ + ret[i] = (char *)malloc(substring_length + 1); + memcpy(ret[i], substring, substring_length); + ret[i][substring_length] = 0; + + /* go to next substring */ + substring = substring_end + 1; + } + ret[i] = NULL; /* Write the sentinel value */ + + return ret; +} + +static void +report_parse_error(const char *str) +{ + fputs(str, stderr); +} + +/* Interpret a string as a 'pb_DeviceParam' value. + * Return a pointer to a new value, or NULL on failure. + */ +static struct pb_DeviceParam * +read_device_param(char *str) +{ + /* Try different ways of interpreting 'device_string' until one works */ + + /* If argument is an integer, then interpret it as a device index */ + errno = 0; + char *end; + long device_int = strtol(str, &end, 10); + if (!errno) { + /* Negative numbers are not valid */ + if (device_int < 0 || device_int > INT_MAX) return NULL; + + return pb_DeviceParam_index(device_int); + } + + /* Match against predefined strings */ + if (strcmp(str, "CPU") == 0) + return pb_DeviceParam_cpu(); + if (strcmp(str, "GPU") == 0) + return pb_DeviceParam_gpu(); + if (strcmp(str, "ACCELERATOR") == 0) + return pb_DeviceParam_accelerator(); + + /* Assume any other string is a device name */ + return pb_DeviceParam_name(strdup(str)); +} + +/* Interpret a string as a 'pb_PlatformParam' value. + * Return a pointer to a new value, or NULL on failure. + */ +static struct pb_PlatformParam * +read_platform_param(char *str) +{ + int separator_index; /* Index of the '-' character separating + * name and version number. It's -1 if + * there's no '-' character. */ + + /* Find the last occurrence of '-' in 'str' */ + { + char *cur; + separator_index = -1; + for (cur = str; *cur; cur++) { + if (*cur == '-') separator_index = cur - str; + } + } + + /* The platform name is either the entire string, or all characters before + * the separator */ + int name_length = separator_index == -1 ? strlen(str) : separator_index; + char *name_str = (char *)malloc(name_length + 1); + memcpy(name_str, str, name_length); + name_str[name_length] = 0; + + /* The version is either NULL, or all characters after the separator */ + char *version_str; + if (separator_index == -1) { + version_str = NULL; + } + else { + const char *version_input_str = str + separator_index + 1; + int version_length = strlen(version_input_str); + + version_str = (char *)malloc(version_length + 1); + memcpy(version_str, version_input_str, version_length); + version_str[version_length] = 0; + } + + /* Create output structure */ + return pb_PlatformParam(name_str, version_str); +} + +/****************************************************************************/ +/* Argument parsing state */ + +/* Argument parsing state. + * + * Arguments that are interpreted by the argument parser are removed from + * the list. Variables 'argc' and 'argn' do not count arguments that have + * been removed. + * + * During argument parsing, the array of arguments is compacted, overwriting + * the erased arguments. Variable 'argv_put' points to the array element + * where the next argument will be written. Variable 'argv_get' points to + * the array element where the next argument will be read from. + */ +struct argparse { + int argc; /* Number of arguments. Mutable. */ + int argn; /* Current argument index. */ + char **argv_get; /* Argument value being read. */ + char **argv_put; /* Argument value being written. + * argv_put <= argv_get. */ +}; + +static void +initialize_argparse(struct argparse *ap, int argc, char **argv) +{ + ap->argc = argc; + ap->argn = 0; + ap->argv_get = ap->argv_put = argv; +} + +/* Finish argument parsing, without processing the remaining arguments. + * Write new argument count into _argc. */ +static void +finalize_argparse(struct argparse *ap, int *_argc, char **argv) +{ + /* Move the remaining arguments */ + for(; ap->argn < ap->argc; ap->argn++) + *ap->argv_put++ = *ap->argv_get++; + + /* Update the argument count */ + *_argc = ap->argc; + + /* Insert a terminating NULL */ + argv[ap->argc] = NULL; +} + +/* Delete the current argument. The argument will not be visible + * when argument parsing is done. */ +static void +delete_argument(struct argparse *ap) +{ + if (ap->argn >= ap->argc) { + fprintf(stderr, "delete_argument\n"); + } + ap->argc--; + ap->argv_get++; +} + +/* Go to the next argument. Also, move the current argument to its + * final location in argv. */ +static void +next_argument(struct argparse *ap) +{ + if (ap->argn >= ap->argc) { + fprintf(stderr, "next_argument\n"); + } + /* Move argument to its new location. */ + *ap->argv_put++ = *ap->argv_get++; + ap->argn++; +} + +static int +is_end_of_arguments(struct argparse *ap) +{ + return ap->argn == ap->argc; +} + +/* Get the current argument */ +static char * +get_argument(struct argparse *ap) +{ + return *ap->argv_get; +} + +/* Get the current argument, and also delete it */ +static char * +consume_argument(struct argparse *ap) +{ + char *ret = get_argument(ap); + delete_argument(ap); + return ret; +} + +/****************************************************************************/ + +/* The result of parsing a command-line argument */ +typedef enum { + ARGPARSE_OK, /* Success */ + ARGPARSE_ERROR, /* Error */ + ARGPARSE_DONE /* Success, and do not continue parsing */ +} result; + +typedef result parse_action(struct argparse *ap, struct pb_Parameters *params); + + +/* A command-line option */ +struct option { + char short_name; /* If not 0, the one-character + * name of this option */ + const char *long_name; /* If not NULL, the long name of this option */ + parse_action *action; /* What to do when this option occurs. + * Sentinel value is NULL. + */ +}; + +/* Output file + * + * -o FILE + */ +static result +parse_output_file(struct argparse *ap, struct pb_Parameters *params) +{ + if (is_end_of_arguments(ap)) + { + report_parse_error("Expecting file name after '-o'\n"); + return ARGPARSE_ERROR; + } + + /* Replace the output file name */ + free(params->outFile); + params->outFile = strdup(consume_argument(ap)); + + return ARGPARSE_OK; +} + +/* Input files + * + * -i FILE,FILE,... + */ +static result +parse_input_files(struct argparse *ap, struct pb_Parameters *params) +{ + if (is_end_of_arguments(ap)) + { + report_parse_error("Expecting file name after '-i'\n"); + return ARGPARSE_ERROR; + } + + /* Replace the input file list */ + pb_FreeStringArray(params->inpFiles); + params->inpFiles = read_string_array(consume_argument(ap)); + return ARGPARSE_OK; +} + +/* End of options + * + * -- + */ + +static result +parse_end_options(struct argparse *ap, struct pb_Parameters *params) +{ + return ARGPARSE_DONE; +} + +/* OpenCL device + * + * --device X + */ + +static result +parse_device(struct argparse *ap, struct pb_Parameters *params) +{ + /* Read the next argument, which specifies a device */ + + if (is_end_of_arguments(ap)) + { + report_parse_error("Expecting device specification after '--device'\n"); + return ARGPARSE_ERROR; + } + + char *device_string = consume_argument(ap); + struct pb_DeviceParam *device_param = read_device_param(device_string); + + if (!device_param) { + report_parse_error("Unrecognized device specification format on command line\n"); + return ARGPARSE_ERROR; + } + + /* Save the result */ + pb_FreeDeviceParam(params->device); + params->device = device_param; + + return ARGPARSE_OK; +} + +static result +parse_platform(struct argparse *ap, struct pb_Parameters *params) +{ + /* Read the next argument, which specifies a platform */ + + if (is_end_of_arguments(ap)) + { + report_parse_error("Expecting device specification after '--platform'\n"); + return ARGPARSE_ERROR; + } + + char *platform_string = consume_argument(ap); + struct pb_PlatformParam *platform_param = read_platform_param(platform_string); + + if (!platform_param) { + report_parse_error("Unrecognized platform specification format on command line\n"); + return ARGPARSE_ERROR; + } + + /* Save the result */ + pb_FreePlatformParam(params->platform); + params->platform = platform_param; + + return ARGPARSE_OK; +} + + +static struct option options[] = { + { 'o', NULL, &parse_output_file }, + { 'i', NULL, &parse_input_files }, + { '-', NULL, &parse_end_options }, + { 0, "device", &parse_device }, + { 0, "platform", &parse_platform }, + { 0, NULL, NULL } +}; + +static int +is_last_option(struct option *op) +{ + return op->action == NULL; +} + +/****************************************************************************/ + +/* Parse command-line parameters. + * Return zero on error, nonzero otherwise. + * On error, the other outputs may be invalid. + * + * The information collected from parameters is used to update + * 'ret'. 'ret' should be initialized. + * + * '_argc' and 'argv' are updated to contain only the unprocessed arguments. + */ +static int +pb_ParseParameters (struct pb_Parameters *ret, int *_argc, char **argv) +{ + char *err_message; + struct argparse ap; + + /* Each argument */ + initialize_argparse(&ap, *_argc, argv); + while(!is_end_of_arguments(&ap)) { + result arg_result; /* Result of parsing this option */ + char *arg = get_argument(&ap); + + /* Process this argument */ + if (arg[0] == '-') { + /* Single-character flag */ + if ((arg[1] != 0) && (arg[2] == 0)) { + delete_argument(&ap); /* This argument is consumed here */ + + /* Find a matching short option */ + struct option *op; + for (op = options; !is_last_option(op); op++) { + if (op->short_name == arg[1]) { + arg_result = (*op->action)(&ap, ret); + goto option_was_processed; + } + } + + /* No option matches */ + report_parse_error("Unexpected command-line parameter\n"); + arg_result = ARGPARSE_ERROR; + goto option_was_processed; + } + + /* Long flag */ + if (arg[1] == '-') { + delete_argument(&ap); /* This argument is consumed here */ + + /* Find a matching long option */ + struct option *op; + for (op = options; !is_last_option(op); op++) { + if (op->long_name && strcmp(&arg[2], op->long_name) == 0) { + arg_result = (*op->action)(&ap, ret); + goto option_was_processed; + } + } + + /* No option matches */ + report_parse_error("Unexpected command-line parameter\n"); + arg_result = ARGPARSE_ERROR; + goto option_was_processed; + } + } + else { + /* Other arguments are ignored */ + next_argument(&ap); + arg_result = ARGPARSE_OK; + goto option_was_processed; + } + + option_was_processed: + /* Decide what to do next based on 'arg_result' */ + switch(arg_result) { + case ARGPARSE_OK: + /* Continue processing */ + break; + + case ARGPARSE_ERROR: + /* Error exit from the function */ + return 0; + + case ARGPARSE_DONE: + /* Normal exit from the argument parsing loop */ + goto end_of_options; + } + } /* end for each argument */ + + /* If all arguments were processed, then normal exit from the loop */ + + end_of_options: + finalize_argparse(&ap, _argc, argv); + return 1; +} + +/*****************************************************************************/ +/* Other exported functions */ + +struct pb_Parameters * +pb_ReadParameters(int *_argc, char **argv) +{ + struct pb_Parameters *ret = + (struct pb_Parameters *)malloc(sizeof(struct pb_Parameters)); + + /* Initialize the parameters structure */ + ret->outFile = NULL; + ret->inpFiles = (char **)malloc(sizeof(char *)); + ret->inpFiles[0] = NULL; + ret->platform = NULL; + ret->device = NULL; + + /* Read parameters and update _argc, argv */ + if (!pb_ParseParameters(ret, _argc, argv)) { + /* Parse error */ + pb_FreeParameters(ret); + return NULL; + } + + return ret; +} + +int +pb_Parameters_CountInputs(struct pb_Parameters *p) +{ + int n; + + for (n = 0; p->inpFiles[n]; n++); + return n; +} + diff --git a/benchmarks/opencl/sad/file.c b/benchmarks/opencl/sad/file.c new file mode 100644 index 00000000..5187c7f7 --- /dev/null +++ b/benchmarks/opencl/sad/file.c @@ -0,0 +1,55 @@ +/*************************************************************************** + *cr + *cr (C) Copyright 2007 The Board of Trustees of the + *cr University of Illinois + *cr All Rights Reserved + *cr + ***************************************************************************/ + +#include +#include "file.h" + +unsigned short +read16u(FILE *f) +{ + int n; + + n = fgetc(f); + n += fgetc(f) << 8; + + return n; +} + +short +read16i(FILE *f) +{ + int n; + + n = fgetc(f); + n += fgetc(f) << 8; + + return n; +} + +void +write32u(FILE *f, unsigned int i) +{ + putc(i, f); + putc(i >> 8, f); + putc(i >> 16, f); + putc(i >> 24, f); +} + +void +write16u(FILE *f, unsigned short h) +{ + putc(h, f); + putc(h >> 8, f); +} + +void +write16i(FILE *f, short h) +{ + putc(h, f); + putc(h >> 8, f); +} diff --git a/benchmarks/opencl/sad/file.h b/benchmarks/opencl/sad/file.h new file mode 100644 index 00000000..5d783e91 --- /dev/null +++ b/benchmarks/opencl/sad/file.h @@ -0,0 +1,22 @@ +/*************************************************************************** + *cr + *cr (C) Copyright 2007 The Board of Trustees of the + *cr University of Illinois + *cr All Rights Reserved + *cr + ***************************************************************************/ + +#ifdef __cplusplus +extern "C" { +#endif + +unsigned short read16u(FILE *f); +short read16i(FILE *f); + +void write32u(FILE *f, unsigned int i); +void write16u(FILE *f, unsigned short h); +void write16i(FILE *f, short h); + +#ifdef __cplusplus +} +#endif diff --git a/benchmarks/opencl/sad/frame.bin b/benchmarks/opencl/sad/frame.bin new file mode 100755 index 00000000..f8142c46 Binary files /dev/null and b/benchmarks/opencl/sad/frame.bin differ diff --git a/benchmarks/opencl/sad/gpu_info.c b/benchmarks/opencl/sad/gpu_info.c new file mode 100644 index 00000000..4d641f81 --- /dev/null +++ b/benchmarks/opencl/sad/gpu_info.c @@ -0,0 +1,55 @@ +/*************************************************************************** + *cr + *cr (C) Copyright 2010 The Board of Trustees of the + *cr University of Illinois + *cr All Rights Reserved + *cr + ***************************************************************************/ +//#include +#include +#include +#include +#include + +#include "gpu_info.h" + +void compute_active_thread(size_t *thread, + size_t *grid, + int task, + int pad, + int major, + int minor, + int sm) +{ + int max_thread; + int max_block=8; + if(major==1) + { + if(minor>=2) + max_thread=1024; + else + max_thread=768; + } + else if(major==2) + max_thread=1536; + else + //newer GPU //keep using 2.0 + max_thread=1536; + + int _grid; + int _thread; + + if(task*pad>sm*max_thread) + { + _thread=max_thread/max_block; + _grid = ((task*pad+_thread-1)/_thread)*_thread; + } + else + { + _thread=pad; + _grid=task*pad; + } + + thread[0]=_thread; + grid[0]=_grid; +} diff --git a/benchmarks/opencl/sad/gpu_info.h b/benchmarks/opencl/sad/gpu_info.h new file mode 100644 index 00000000..4219cda9 --- /dev/null +++ b/benchmarks/opencl/sad/gpu_info.h @@ -0,0 +1,20 @@ +/*************************************************************************** + *cr + *cr (C) Copyright 2010 The Board of Trustees of the + *cr University of Illinois + *cr All Rights Reserved + *cr + ***************************************************************************/ + +#ifndef __GPUINFOH__ +#define __GPUINFOH__ + +void compute_active_thread(size_t *thread, + size_t *grid, + int task, + int pad, + int major, + int minor, + int sm); + +#endif diff --git a/benchmarks/opencl/sad/image.c b/benchmarks/opencl/sad/image.c new file mode 100644 index 00000000..d7ed0fcc --- /dev/null +++ b/benchmarks/opencl/sad/image.c @@ -0,0 +1,56 @@ +/*************************************************************************** + *cr + *cr (C) Copyright 2007 The Board of Trustees of the + *cr University of Illinois + *cr All Rights Reserved + *cr + ***************************************************************************/ + +#include +#include +#include "file.h" +#include "image.h" + +struct image_i16 * +load_image(char *filename) +{ + FILE *infile; + short *data; + int w; + int h; + + infile = fopen(filename, "r"); + + if (!infile) + { + fprintf(stderr, "Cannot find file '%s'\n", filename); + exit(-1); + } + + /* Read image dimensions */ + w = read16u(infile); + h = read16u(infile); + + /* Read image contents */ + data = (short *)malloc(w * h * sizeof(short)); + fread(data, sizeof(short), w * h, infile); + + fclose(infile); + + /* Create the return data structure */ + { + struct image_i16 *ret = + (struct image_i16 *)malloc(sizeof(struct image_i16)); + ret->width = w; + ret->height = h; + ret->data = data; + return ret; + } +} + +void +free_image(struct image_i16 *img) +{ + free(img->data); + free(img); +} diff --git a/benchmarks/opencl/sad/image.h b/benchmarks/opencl/sad/image.h new file mode 100644 index 00000000..27fc3e0b --- /dev/null +++ b/benchmarks/opencl/sad/image.h @@ -0,0 +1,25 @@ +/*************************************************************************** + *cr + *cr (C) Copyright 2007 The Board of Trustees of the + *cr University of Illinois + *cr All Rights Reserved + *cr + ***************************************************************************/ + +struct image_i16 +{ + int width; + int height; + short *data; +}; + +#ifdef __cplusplus +extern "C" { +#endif + +struct image_i16 * load_image(char *filename); +void free_image(struct image_i16 *); + +#ifdef __cplusplus +} +#endif diff --git a/benchmarks/opencl/sad/kernel.cl b/benchmarks/opencl/sad/kernel.cl new file mode 100644 index 00000000..f0e1c2e0 --- /dev/null +++ b/benchmarks/opencl/sad/kernel.cl @@ -0,0 +1,326 @@ +/*************************************************************************** + *cr + *cr (C) Copyright 2007 The Board of Trustees of the + *cr University of Illinois + *cr All Rights Reserved + *cr + ***************************************************************************/ + +#ifndef MAX_POS +#define MAX_POS 1089 +#define CEIL_POS 61 +#define POS_PER_THREAD 18 +#define MAX_POS_PADDED 1096 +#define THREADS_W 1 +#define THREADS_H 1 +#define SEARCH_RANGE 16 +#define SEARCH_DIMENSION 33 +#endif + +/* The compute kernel. */ +/* The macros THREADS_W and THREADS_H specify the width and height of the + * area to be processed by one thread, measured in 4-by-4 pixel blocks. + * Larger numbers mean more computation per thread block. + * + * The macro POS_PER_THREAD specifies the number of search positions for which + * an SAD is computed. A larger value indicates more computation per thread, + * and fewer threads per thread block. It must be a multiple of 3 and also + * must be at most 33 because the loop to copy from shared memory uses + * 32 threads per 4-by-4 pixel block. + * + */ + +// AMD OpenCL fails UINT_CUDA_V +#define SHORT2_V 0 +#define UINT_CUDA_V 0 + +// Either works +#define VEC_LOAD 0 + +// CAST_STORE is only method that works for all implementations of OpenCL tested +#define VEC_STORE 0 +#define CAST_STORE 0 +#define SCALAR_STORE 1 + +__kernel void mb_sad_calc(__global unsigned short *blk_sad, + __global unsigned short *frame, + int mb_width, + int mb_height, + __global unsigned short* img_ref) // __read_only image2d_t img_ref) +{ + int tx = (get_local_id(0) / CEIL_POS) % THREADS_W; + int ty = (get_local_id(0) / CEIL_POS) / THREADS_W; + int bx = get_group_id(0); + int by = get_group_id(1); + int img_width = mb_width*16; + int lidx = get_local_id(0); + + // Macroblock and sub-block coordinates + int mb_x = (tx + bx * THREADS_W) >> 2; + int mb_y = (ty + by * THREADS_H) >> 2; + int block_x = (tx + bx * THREADS_W) & 0x03; + int block_y = (ty + by * THREADS_H) & 0x03; + + // If this thread is assigned to an invalid 4x4 block, do nothing + if ((mb_x < mb_width) && (mb_y < mb_height)) + { + // Pixel offset of the origin of the current 4x4 block + int frame_x = ((mb_x << 2) + block_x) << 2; + int frame_y = ((mb_y << 2) + block_y) << 2; + + // Origin of the search area for this 4x4 block + int ref_x = frame_x - SEARCH_RANGE; + int ref_y = frame_y - SEARCH_RANGE; + + // Origin in the current frame for this 4x4 block + int cur_o = frame_y * img_width + frame_x; + + int search_pos; + int search_pos_base = + (lidx % CEIL_POS) * POS_PER_THREAD; + int search_pos_end = search_pos_base + POS_PER_THREAD; + + // Don't go past bounds + if (search_pos_end > MAX_POS) { + search_pos_end = MAX_POS; + } + + // For each search position, within the range allocated to this thread + for (search_pos = search_pos_base; + search_pos < search_pos_end; + search_pos++) { + unsigned short sad4x4 = 0; + int search_off_x = ref_x + (search_pos % SEARCH_DIMENSION); + int search_off_y = ref_y + (search_pos / SEARCH_DIMENSION); + + // 4x4 SAD computation + for(int y=0; y<4; y++) { + for (int x=0; x<4; x++) { + + // ([unsigned] short)read_imageui or + // read_imagei is required for correct calculation. + // Though read_imagei() is shorter, its results are undefined by specification since the input + // is an unsigned type, CL_UNSIGNED_INT16 + + int sx = search_off_x + x; + sx = (sx < 0) ? 0 : sx; + sx = (sx >= img_width) ? img_width - 1 : sx; + int sy = search_off_y + y; + sy = (sy < 0) ? 0 : sy; + sy = (sy >= mb_height * 16) ? mb_height * 16 - 1 : sy; + sad4x4 += abs((unsigned short) img_ref[(sx) + (sy) * img_width] - + frame[cur_o + y * img_width + x]); + } + } + + // Save this value into the local SAD array + blk_sad[mb_width * mb_height * MAX_POS_PADDED * (9 + 16) + + (mb_y * mb_width + mb_x) * MAX_POS_PADDED * 16 + + (4 * block_y + block_x) * MAX_POS_PADDED+search_pos] = sad4x4; + } + } + +} + + +//typedef unsigned int uint; + +__kernel void larger_sad_calc_8(__global unsigned short *blk_sad, + int mb_width, + int mb_height) +{ + int tx = get_local_id(1) & 1; + int ty = get_local_id(1) >> 1; + + // Macroblock and sub-block coordinates + int mb_x = get_group_id(0); + int mb_y = get_group_id(1); + int lidx = get_local_id(0); + + // Number of macroblocks in a frame + int macroblocks = mul24(mb_width, mb_height); + int macroblock_index = (mul24(mb_y, mb_width) + mb_x) * MAX_POS_PADDED; + + __global unsigned short *bi; + __global unsigned short *bo_6, *bo_5, *bo_4; + + // MXPA + bo_4 = (__global unsigned short *) tx; + bo_5 = (__global unsigned short *) tx; + + + bi = blk_sad + + (mul24(macroblocks, 25) + (ty * 8 + tx * 2)) * MAX_POS_PADDED + + macroblock_index * 16; + + // Block type 6: 4x8 + bo_6 = blk_sad + + ((macroblocks << 4) + macroblocks + (ty * 4 + tx * 2)) * MAX_POS_PADDED + + macroblock_index * 8; + + if (ty < 100) // always true, but improves register allocation + { + // Block type 5: 8x4 + bo_5 = blk_sad + + ((macroblocks << 3) + macroblocks + (ty * 4 + tx)) * MAX_POS_PADDED + + macroblock_index * 8; + + // Block type 4: 8x8 + bo_4 = blk_sad + + ((macroblocks << 2) + macroblocks + (ty * 2 + tx)) * MAX_POS_PADDED + + macroblock_index * 4; + } + + for (int search_pos = lidx; search_pos < (MAX_POS+1)/2; search_pos += 32) + { +#if SHORT2_V + #if VEC_LOAD + ushort2 s00 = vload2(search_pos, bi); + ushort2 s01 = vload2(search_pos+ MAX_POS_PADDED/2, bi); + ushort2 s10 = vload2(search_pos+4*MAX_POS_PADDED/2, bi); + ushort2 s11 = vload2(search_pos+5*MAX_POS_PADDED/2, bi); + #else + ushort2 s00 = (ushort2) (bi[search_pos*2], bi[search_pos*2+1]); + ushort2 s01 = (ushort2) (bi[(search_pos + MAX_POS_PADDED/2)*2], bi[(search_pos + MAX_POS_PADDED/2)*2+1]); + ushort2 s10 = (ushort2) (bi[(search_pos + 4*MAX_POS_PADDED/2)*2], bi[(search_pos + 4*MAX_POS_PADDED/2)*2+1]); + ushort2 s11 = (ushort2) (bi[(search_pos + 5*MAX_POS_PADDED/2)*2], bi[(search_pos + 5*MAX_POS_PADDED/2)*2+1]); + #endif + + #if VEC_STORE + ushort2 s0010 = s00 + s10; + ushort2 s0111 = s01 + s11; + ushort2 s0001 = s00 + s01; + ushort2 s1011 = s10 + s11; + ushort2 s00011011 = s0001 + s1011; + + vstore2(s0010, search_pos, bo_6); + vstore2(s0111, search_pos+MAX_POS_PADDED/2, bo_6); + vstore2(s0001, search_pos, bo_5); + vstore2(s1011, search_pos+2*MAX_POS_PADDED/2, bo_5); + vstore2(s00011011, search_pos, bo_4); + #elif CAST_STORE + ((__global ushort2 *)bo_6)[search_pos] = s00 + s10; + ((__global ushort2 *)bo_6)[search_pos+MAX_POS_PADDED/2] = s01 + s11; + ((__global ushort2 *)bo_5)[search_pos] = s00 + s01; + ((__global ushort2 *)bo_5)[search_pos+2*MAX_POS_PADDED/2] = s10 + s11; + ((__global ushort2 *)bo_4)[search_pos] = (s00 + s01) + (s10 + s11); + #else // SCALAR_STORE + bo_6[search_pos*2] = s00.x + s10.x; + bo_6[search_pos*2+1] = s00.y + s10.y; + bo_6[(search_pos+MAX_POS_PADDED/2)*2] = s01.x + s11.x; + bo_6[(search_pos+MAX_POS_PADDED/2)*2+1] = s01.y + s11.y; + bo_5[search_pos*2] = s00.x + s01.x; + bo_5[search_pos*2+1] = s00.y + s01.y; + bo_5[(search_pos+2*MAX_POS_PADDED/2)*2] = s10.x + s11.x; + bo_5[(search_pos+2*MAX_POS_PADDED/2)*2+1] = s10.y + s11.y; + bo_4[search_pos*2] = (s00.x + s01.x) + (s10.x + s11.x); + bo_4[search_pos*2+1] = (s00.y + s01.y) + (s10.y + s11.y); + #endif +#else // UINT_CUDA_V + uint i00 = ((__global uint *)bi)[search_pos]; + uint i01 = ((__global uint *)bi)[search_pos + MAX_POS_PADDED/2]; + uint i10 = ((__global uint *)bi)[search_pos + 4*MAX_POS_PADDED/2]; + uint i11 = ((__global uint *)bi)[search_pos + 5*MAX_POS_PADDED/2]; + + ((__global uint *)bo_6)[search_pos] = i00 + i10; + ((__global uint *)bo_6)[search_pos+MAX_POS_PADDED/2] = i01 + i11; + ((__global uint *)bo_5)[search_pos] = i00 + i01; + ((__global uint *)bo_5)[search_pos+2*MAX_POS_PADDED/2] = i10 + i11; + ((__global uint *)bo_4)[search_pos] = (i00 + i01) + (i10 + i11); +#endif + } + +} + + + +__kernel void larger_sad_calc_16(__global unsigned short *blk_sad, + int mb_width, + int mb_height) +{ + // Macroblock coordinates + int mb_x = get_group_id(0); + int mb_y = get_group_id(1); + int search_pos = get_local_id(0); + + // Number of macroblocks in a frame + int macroblocks = mul24(mb_width, mb_height) * MAX_POS_PADDED; + int macroblock_index = (mul24(mb_y, mb_width) + mb_x) * MAX_POS_PADDED; + + __global unsigned short *bi; + __global unsigned short *bo_3, *bo_2, *bo_1; + + //bi = blk_sad + macroblocks * 5 + macroblock_index * 4; + bi = blk_sad + ((macroblocks + macroblock_index) << 2) + macroblocks; + + // Block type 3: 8x16 + //bo_3 = blk_sad + macroblocks * 3 + macroblock_index * 2; + bo_3 = blk_sad + ((macroblocks + macroblock_index) << 1) + macroblocks; + + // Block type 5: 8x4 + bo_2 = blk_sad + macroblocks + macroblock_index * 2; + + // Block type 4: 8x8 + bo_1 = blk_sad + macroblock_index; + + for ( ; search_pos < (MAX_POS+1)/2; search_pos += 32) + { +#if SHORT2_V + #if VEC_LOAD + ushort2 s00 = vload2(search_pos, bi); + ushort2 s01 = vload2(search_pos+ MAX_POS_PADDED/2, bi); + ushort2 s10 = vload2(search_pos+2*MAX_POS_PADDED/2, bi); + ushort2 s11 = vload2(search_pos+3*MAX_POS_PADDED/2, bi); + #else + ushort2 s00 = (ushort2) (bi[search_pos*2], bi[search_pos*2+1]); + ushort2 s01 = (ushort2) (bi[(search_pos + MAX_POS_PADDED/2)*2], bi[(search_pos + MAX_POS_PADDED/2)*2+1]); + ushort2 s10 = (ushort2) (bi[(search_pos + 2*MAX_POS_PADDED/2)*2], bi[(search_pos + 2*MAX_POS_PADDED/2)*2+1]); + ushort2 s11 = (ushort2) (bi[(search_pos + 3*MAX_POS_PADDED/2)*2], bi[(search_pos + 3*MAX_POS_PADDED/2)*2+1]); + #endif + + #if VEC_STORE + ushort2 s0010 = s00 + s10; + ushort2 s0111 = s01 + s11; + ushort2 s0001 = s00 + s01; + ushort2 s1011 = s10 + s11; + ushort2 s00011011 = s0001 + s1011; + + vstore2(s0010, search_pos, bo_3); + vstore2(s0111, search_pos+MAX_POS_PADDED/2, bo_3); + vstore2(s0001, search_pos, bo_2); + vstore2(s1011, search_pos+MAX_POS_PADDED/2, bo_2); + vstore2(s00011011, search_pos, bo_1); + #elif CAST_STORE + ((__global ushort2 *)bo_3)[search_pos] = s00 + s10; + ((__global ushort2 *)bo_3)[search_pos+MAX_POS_PADDED/2] = s01 + s11; + ((__global ushort2 *)bo_2)[search_pos] = s00 + s01; + ((__global ushort2 *)bo_2)[search_pos+MAX_POS_PADDED/2] = s10 + s11; + ((__global ushort2 *)bo_1)[search_pos] = (s00 + s01) + (s10 + s11); + #else // SCALAR_STORE + bo_3[search_pos*2] = s00.x + s10.x; + bo_3[search_pos*2+1] = s00.y + s10.y; + bo_3[(search_pos+MAX_POS_PADDED/2)*2] = s01.x + s11.x; + bo_3[(search_pos+MAX_POS_PADDED/2)*2+1] = s01.y + s11.y; + bo_2[search_pos*2] = s00.x + s01.x; + bo_2[search_pos*2+1] = s00.y + s01.y; + bo_2[(search_pos+MAX_POS_PADDED/2)*2] = s10.x + s11.x; + bo_2[(search_pos+MAX_POS_PADDED/2)*2+1] = s10.y + s11.y; + bo_1[search_pos*2] = (s00.x + s01.x) + (s10.x + s11.x); + bo_1[search_pos*2+1] = (s00.y + s01.y) + (s10.y + s11.y); + #endif +#else // UINT_CUDA_V + uint i00 = ((__global uint *)bi)[search_pos]; + uint i01 = ((__global uint *)bi)[search_pos + MAX_POS_PADDED/2]; + uint i10 = ((__global uint *)bi)[search_pos + 2*MAX_POS_PADDED/2]; + uint i11 = ((__global uint *)bi)[search_pos + 3*MAX_POS_PADDED/2]; + + ((__global uint *)bo_3)[search_pos] = i00 + i10; + ((__global uint *)bo_3)[search_pos+MAX_POS_PADDED/2] = i01 + i11; + ((__global uint *)bo_2)[search_pos] = i00 + i01; + ((__global uint *)bo_2)[search_pos+MAX_POS_PADDED/2] = i10 + i11; + ((__global uint *)bo_1)[search_pos] = (i00 + i01) + (i10 + i11); +#endif + } +} + + diff --git a/benchmarks/opencl/sad/libsad.a b/benchmarks/opencl/sad/libsad.a new file mode 100644 index 00000000..fa6e1a00 Binary files /dev/null and b/benchmarks/opencl/sad/libsad.a differ diff --git a/benchmarks/opencl/sad/main.cc b/benchmarks/opencl/sad/main.cc new file mode 100644 index 00000000..a156bd3b --- /dev/null +++ b/benchmarks/opencl/sad/main.cc @@ -0,0 +1,545 @@ +/*************************************************************************** + *cr + *cr (C) Copyright 2007 The Board of Trustees of the + *cr University of Illinois + *cr All Rights Reserved + *cr + ***************************************************************************/ + +#include +#include +#include +#include +#include +#include +#include + +#include "OpenCL_common.h" +#include "file.h" +#include "image.h" +#include "sad.h" +#include "sad_kernel.h" + +static unsigned short *load_sads(char *filename); +static void write_sads(char *filename, int image_width_macroblocks, + int image_height_macroblocks, unsigned short *sads); +static void write_sads_directly(char *filename, int width, int height, + unsigned short *sads); + +/* FILE I/O */ + +unsigned short *load_sads(char *filename) { + FILE *infile; + unsigned short *sads; + int w; + int h; + int sads_per_block; + + infile = fopen(filename, "r"); + + if (!infile) { + fprintf(stderr, "Cannot find file '%s'\n", filename); + exit(-1); + } + + /* Read image dimensions (measured in macroblocks) */ + w = read16u(infile); + h = read16u(infile); + + /* Read SAD values. Only interested in the 4x4 SAD values, which are + * at the end of the file. */ + sads_per_block = MAX_POS_PADDED * (w * h); + fseek(infile, 25 * sads_per_block * sizeof(unsigned short), SEEK_CUR); + + sads = (unsigned short *)malloc(sads_per_block * 16 * sizeof(unsigned short)); + fread(sads, sizeof(unsigned short), sads_per_block * 16, infile); + fclose(infile); + + return sads; +} + +/* Compare the reference SADs to the expected SADs. + */ +void check_sads(unsigned short *sads_reference, unsigned short *sads_computed, + int image_size_macroblocks) { + int block; + + /* Check the 4x4 SAD values. These are in sads_reference. + * Ignore the data at the beginning of sads_computed. */ + sads_computed += 25 * MAX_POS_PADDED * image_size_macroblocks; + + for (block = 0; block < image_size_macroblocks; block++) { + int subblock; + + for (subblock = 0; subblock < 16; subblock++) { + int sad_index; + + for (sad_index = 0; sad_index < MAX_POS; sad_index++) { + int index = (block * 16 + subblock) * MAX_POS_PADDED + sad_index; + + if (sads_reference[index] != sads_computed[index]) { +#if 0 + /* Print exactly where the mismatch was seen */ + printf("M %3d %2d %4d (%d = %d)\n", block, subblock, sad_index, sads_reference[index], sads_computed[index]); +#else + goto mismatch; +#endif + } + } + } + } + + printf("Success.\n"); + return; + +mismatch: + printf("Computed SADs do not match expected values.\n"); +} + +/* Extract the SAD data for a particular block type for a particular + * macroblock from the array of SADs of that block type. */ +static inline void write_subblocks(FILE *outfile, + unsigned short *subblock_array, + int macroblock, int count) { + int block; + int pos; + + for (block = 0; block < count; block++) { + unsigned short *vec = + subblock_array + (block + macroblock * count) * MAX_POS_PADDED; + + /* Write all SADs for this sub-block */ + for (pos = 0; pos < MAX_POS; pos++) + write16u(outfile, *vec++); + } +} + +/* Write some SAD data to a file for output checking. + * + * All SAD values for six rows of macroblocks are written. + * The six rows consist of the top two, middle two, and bottom two image rows. + */ +void write_sads(char *filename, int mb_width, int mb_height, + unsigned short *sads) { + FILE *outfile = fopen(filename, "w"); + int mbs = mb_width * mb_height; + int row_indir; + int row_indices[6] = { + 0, 1, mb_height / 2 - 1, mb_height / 2, mb_height - 2, mb_height - 1}; + + if (outfile == NULL) { + fprintf(stderr, "Cannot open output file\n"); + exit(-1); + } + + /* Write the number of output macroblocks */ + write32u(outfile, mb_width * 6); + + /* Write zeros */ + write32u(outfile, 0); + + /* Each row */ + for (row_indir = 0; row_indir < 6; row_indir++) { + int row = row_indices[row_indir]; + + /* Each block in row */ + int block; + for (block = mb_width * row; block < mb_width * (row + 1); block++) { + int blocktype; + + /* Write SADs for all sub-block types */ + for (blocktype = 1; blocktype <= 7; blocktype++) + write_subblocks(outfile, sads + SAD_TYPE_IX(blocktype, mbs), block, + SAD_TYPE_CT(blocktype)); + } + } + + fclose(outfile); +} + +/* FILE I/O for debugging */ + +static void write_sads_directly(char *filename, int width, int height, + unsigned short *sads) { + FILE *f = fopen(filename, "w"); + int n; + + write16u(f, width); + write16u(f, height); + for (n = 0; n < 41 * MAX_POS_PADDED * (width * height); n++) { + write16u(f, sads[n]); + } + fclose(f); +} + +static void print_test_sad_vector(unsigned short *base, int macroblock, + int count) { + int n; + int searchpos = 17 * 33 + 17; + for (n = 0; n < count; n++) + printf(" %d", base[(count * macroblock + n) * MAX_POS_PADDED + searchpos]); +} + +static void print_test_sads(unsigned short *sads_computed, int mbs) { + int macroblock = 5; + int blocktype; + + for (blocktype = 1; blocktype <= 7; blocktype++) { + printf("%d:", blocktype); + print_test_sad_vector(sads_computed + SAD_TYPE_IX(blocktype, mbs), + macroblock, SAD_TYPE_CT(blocktype)); + puts("\n"); + } +} + +/* MAIN */ + +int main(int argc, char **argv) { + struct image_i16 *ref_image; + struct image_i16 *cur_image; + unsigned short *sads_computed; /* SADs generated by the program */ + + int image_size_bytes; + int image_width_macroblocks, image_height_macroblocks; + int image_size_macroblocks; + + struct pb_TimerSet timers; + struct pb_Parameters *params; + + char oclOverhead[] = "OpenCL Overhead"; + + pb_InitializeTimerSet(&timers); + pb_AddSubTimer(&timers, oclOverhead, pb_TimerID_KERNEL); + + params = pb_ReadParameters(&argc, argv); + params->inpFiles = (char **)malloc(sizeof(char *) * 3); + params->inpFiles[0] = (char *)malloc(100); + params->inpFiles[1] = (char *)malloc(100); + params->inpFiles[2] = NULL; + strncpy(params->inpFiles[0], "reference.bin", 100); + strncpy(params->inpFiles[1], "frame.bin", 100); + + if (pb_Parameters_CountInputs(params) != 2) { + fprintf(stderr, "Expecting two input filenames\n"); + exit(-1); + } + + /* Read input files */ + pb_SwitchToTimer(&timers, pb_TimerID_IO); + ref_image = load_image(params->inpFiles[0]); + cur_image = load_image(params->inpFiles[1]); + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); + + printf("Ok\n"); + + if ((ref_image->width != cur_image->width) || + (ref_image->height != cur_image->height)) { + fprintf(stderr, "Input images must be the same size\n"); + exit(-1); + } + if ((ref_image->width % 16) || (ref_image->height % 16)) { + fprintf(stderr, "Input image size must be an integral multiple of 16\n"); + exit(-1); + } + + printf("Ok\n"); + + /* Compute parameters, allocate memory */ + image_size_bytes = ref_image->width * ref_image->height * sizeof(short); + image_width_macroblocks = ref_image->width >> 4; + image_height_macroblocks = ref_image->height >> 4; + image_size_macroblocks = image_width_macroblocks * image_height_macroblocks; + + sads_computed = (unsigned short *)malloc( + 41 * MAX_POS_PADDED * image_size_macroblocks * sizeof(short)); + + // Run the kernel code + // ************************************************************************ + + cl_int ciErrNum; + cl_command_queue clCommandQueue; + + cl_kernel mb_sad_calc; + cl_kernel larger_sad_calc_8; + cl_kernel larger_sad_calc_16; + + cl_mem imgRef; /* Reference image on the device */ + cl_mem d_cur_image; /* Current image on the device */ + cl_mem d_sads; /* SADs on the device */ + + // x : image_width_macroblocks + // y : image_height_macroblocks + + pb_Context *pb_context; + pb_context = pb_InitOpenCLContext(params); + if (pb_context == NULL) { + fprintf(stderr, "Error: No OpenCL platform/device can be found."); + return -1; + } + + printf("Ok+\n"); + + cl_int clStatus; + cl_device_id clDevice = (cl_device_id)pb_context->clDeviceId; + cl_platform_id clPlatform = (cl_platform_id)pb_context->clPlatformId; + cl_context clContext = (cl_context)pb_context->clContext; + + clCommandQueue = clCreateCommandQueue(clContext, clDevice, + CL_QUEUE_PROFILING_ENABLE, &ciErrNum); + OCL_ERRCK_VAR(ciErrNum); + + printf("Ok!\n"); + + pb_SetOpenCL(&clContext, &clCommandQueue); + + printf("Ok!\n"); + + pb_SwitchToSubTimer(&timers, oclOverhead, pb_TimerID_KERNEL); + + // Read Source Code File + /*size_t program_length; +const char* source_path = "src/opencl_base/kernel.cl"; +char* source = oclLoadProgSource(source_path, "", &program_length); +if(!source) { + fprintf(stderr, "Could not load program source\n"); exit(1); +} + + cl_program clProgram = clCreateProgramWithSource(clContext, 1, (const char +**)&source, &program_length, &ciErrNum);*/ +printf("Ok//-\n"); + cl_program clProgram = clCreateProgramWithBuiltInKernels( + clContext, 1, &clDevice, "mb_sad_calc;larger_sad_calc_8;larger_sad_calc_16", &ciErrNum); + printf("Ok//+\n"); + OCL_ERRCK_VAR(ciErrNum); + + printf("Ok+\n"); + + //free(source); + + // JIT Compilation Options + char compileOptions[1024]; + // -cl-nv-verbose + sprintf(compileOptions, "\ + -D MAX_POS=%u -D CEIL_POS=%u\ + -D POS_PER_THREAD=%u -D MAX_POS_PADDED=%u\ + -D THREADS_W=%u -D THREADS_H=%u\ + -D SEARCH_RANGE=%u -D SEARCH_DIMENSION=%u\ + \0", + MAX_POS, CEIL(MAX_POS, POS_PER_THREAD), POS_PER_THREAD, + MAX_POS_PADDED, THREADS_W, THREADS_H, SEARCH_RANGE, SEARCH_DIMENSION); + printf("options = %s\n", compileOptions); + + OCL_ERRCK_RETVAL( + clBuildProgram(clProgram, 1, &clDevice, compileOptions, NULL, NULL)); + + /* + char *build_log; + size_t ret_val_size; + OCL_ERRCK_RETVAL( clGetProgramBuildInfo(clProgram, clDevice, + CL_PROGRAM_BUILD_LOG, 0, NULL, &ret_val_size) ); + build_log = (char *)malloc(ret_val_size+1); + OCL_ERRCK_RETVAL( clGetProgramBuildInfo(clProgram, clDevice, + CL_PROGRAM_BUILD_LOG, ret_val_size, build_log, NULL) ); + + // Null terminate (original writer wasn't sure) + build_log[ret_val_size] = '\0'; + + fprintf(stderr, "%s\n", build_log ); + */ + + mb_sad_calc = clCreateKernel(clProgram, "mb_sad_calc", &ciErrNum); + OCL_ERRCK_VAR(ciErrNum); + larger_sad_calc_8 = clCreateKernel(clProgram, "larger_sad_calc_8", &ciErrNum); + OCL_ERRCK_VAR(ciErrNum); + larger_sad_calc_16 = + clCreateKernel(clProgram, "larger_sad_calc_16", &ciErrNum); + OCL_ERRCK_VAR(ciErrNum); + + size_t wgSize; + size_t comp_wgSize[3]; + cl_ulong localMemSize; + size_t prefwgSizeMult; + cl_ulong privateMemSize; + + pb_SwitchToTimer(&timers, pb_TimerID_COPY); + + printf("Ok++\n"); + +#if 0 + cl_image_format img_format; + img_format.image_channel_order = CL_R; + img_format.image_channel_data_type = CL_UNSIGNED_INT16; + + /* Transfer reference image to device */ + imgRef = clCreateImage2D(clContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, &img_format, + ref_image->width /** sizeof(unsigned short)*/, // width + ref_image->height, // height + ref_image->width * sizeof(unsigned short), // row_pitch + ref_image->data, &ciErrNum); +#endif + +#if 1 + imgRef = clCreateBuffer(clContext, CL_MEM_READ_ONLY, + ref_image->width * ref_image->height * + sizeof(unsigned short), + NULL, &ciErrNum); + OCL_ERRCK_VAR(ciErrNum); + OCL_ERRCK_RETVAL(clEnqueueWriteBuffer(clCommandQueue, imgRef, CL_TRUE, 0, + ref_image->width * ref_image->height * + sizeof(unsigned short), + ref_image->data, 0, NULL, NULL)); +#else + imgRef = clCreateBuffer(clContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, + ref_image->width * ref_image->height * + sizeof(unsigned short), + ref_image->data, &ciErrNum); + printf("Allocating %d bytes\n", + ref_image->width * ref_image->height * sizeof(unsigned short)); + +#endif + OCL_ERRCK_VAR(ciErrNum); + + /* Allocate SAD data on the device */ + + unsigned short *tmpZero = (unsigned short *)calloc( + 41 * MAX_POS_PADDED * image_size_macroblocks, sizeof(unsigned short)); + + /* + size_t max_alloc_size = 0; + clGetDeviceInfo(clDevice, CL_DEVICE_MAX_MEM_ALLOC_SIZE, + sizeof(max_alloc_size), &max_alloc_size, NULL); + if (max_alloc_size < (41 * MAX_POS_PADDED * + image_size_macroblocks * sizeof(unsigned short))) { + fprintf(stderr, "Can't allocate sad buffer: max alloc size is %dMB\n", + (int) (max_alloc_size >> 20)); + exit(-1); + } + */ + + d_sads = clCreateBuffer(clContext, CL_MEM_COPY_HOST_PTR, + 41 * MAX_POS_PADDED * image_size_macroblocks * + sizeof(unsigned short), + tmpZero, &ciErrNum); + OCL_ERRCK_VAR(ciErrNum); + free(tmpZero); + + d_cur_image = + clCreateBuffer(clContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, + image_size_bytes, cur_image->data, &ciErrNum); + OCL_ERRCK_VAR(ciErrNum); + + /* Set Kernel Parameters */ + + OCL_ERRCK_RETVAL( + clSetKernelArg(mb_sad_calc, 0, sizeof(cl_mem), (void *)&d_sads)); + OCL_ERRCK_RETVAL( + clSetKernelArg(mb_sad_calc, 1, sizeof(cl_mem), (void *)&d_cur_image)); + OCL_ERRCK_RETVAL( + clSetKernelArg(mb_sad_calc, 2, sizeof(int), &image_width_macroblocks)); + OCL_ERRCK_RETVAL( + clSetKernelArg(mb_sad_calc, 3, sizeof(int), &image_height_macroblocks)); + OCL_ERRCK_RETVAL( + clSetKernelArg(mb_sad_calc, 4, sizeof(cl_mem), (void *)&imgRef)); + + OCL_ERRCK_RETVAL( + clSetKernelArg(larger_sad_calc_8, 0, sizeof(cl_mem), (void *)&d_sads)); + OCL_ERRCK_RETVAL(clSetKernelArg(larger_sad_calc_8, 1, sizeof(int), + &image_width_macroblocks)); + OCL_ERRCK_RETVAL(clSetKernelArg(larger_sad_calc_8, 2, sizeof(int), + &image_height_macroblocks)); + + OCL_ERRCK_RETVAL( + clSetKernelArg(larger_sad_calc_16, 0, sizeof(cl_mem), (void *)&d_sads)); + OCL_ERRCK_RETVAL(clSetKernelArg(larger_sad_calc_16, 1, sizeof(int), + &image_width_macroblocks)); + OCL_ERRCK_RETVAL(clSetKernelArg(larger_sad_calc_16, 2, sizeof(int), + &image_height_macroblocks)); + + size_t mb_sad_calc_localWorkSize[2] = { + CEIL(MAX_POS, POS_PER_THREAD) * THREADS_W * THREADS_H, 1}; + size_t mb_sad_calc_globalWorkSize[2] = { + mb_sad_calc_localWorkSize[0] * CEIL(ref_image->width / 4, THREADS_W), + mb_sad_calc_localWorkSize[1] * CEIL(ref_image->height / 4, THREADS_H)}; + + size_t larger_sad_calc_8_localWorkSize[2] = {32, 4}; + size_t larger_sad_calc_8_globalWorkSize[2] = {image_width_macroblocks * 32, + image_height_macroblocks * 4}; + + size_t larger_sad_calc_16_localWorkSize[2] = {32, 1}; + size_t larger_sad_calc_16_globalWorkSize[2] = {image_width_macroblocks * 32, + image_height_macroblocks * 1}; + + printf("Ok+++\n"); + + pb_SwitchToTimer(&timers, pb_TimerID_KERNEL); + + /* Run the 4x4 kernel */ + OCL_ERRCK_RETVAL(clEnqueueNDRangeKernel(clCommandQueue, mb_sad_calc, 2, 0, + mb_sad_calc_globalWorkSize, + mb_sad_calc_localWorkSize, 0, 0, 0)); + + /* Run the larger-blocks kernels */ + OCL_ERRCK_RETVAL(clEnqueueNDRangeKernel( + clCommandQueue, larger_sad_calc_8, 2, 0, larger_sad_calc_8_globalWorkSize, + larger_sad_calc_8_localWorkSize, 0, 0, 0)); + + OCL_ERRCK_RETVAL(clEnqueueNDRangeKernel(clCommandQueue, larger_sad_calc_16, 2, + 0, larger_sad_calc_16_globalWorkSize, + larger_sad_calc_16_localWorkSize, 0, + 0, 0)); + + OCL_ERRCK_RETVAL(clFinish(clCommandQueue)); + pb_SwitchToTimer(&timers, pb_TimerID_COPY); + + /* Transfer SAD data to the host */ + OCL_ERRCK_RETVAL(clEnqueueReadBuffer( + clCommandQueue, d_sads, CL_TRUE, 0, + 41 * MAX_POS_PADDED * image_size_macroblocks * sizeof(unsigned short), + sads_computed, 0, NULL, NULL)); + + /* Free GPU memory */ + OCL_ERRCK_RETVAL(clReleaseKernel(larger_sad_calc_8)); + OCL_ERRCK_RETVAL(clReleaseKernel(larger_sad_calc_16)); + OCL_ERRCK_RETVAL(clReleaseProgram(clProgram)); + + OCL_ERRCK_RETVAL(clReleaseMemObject(d_sads)); + OCL_ERRCK_RETVAL(clReleaseMemObject(imgRef)); + OCL_ERRCK_RETVAL(clReleaseMemObject(d_cur_image)); + + OCL_ERRCK_RETVAL(clFinish(clCommandQueue)); + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); + + // ************************************************************************ + // End GPU Code + + /* Print output */ + if (params->outFile) { + pb_SwitchToTimer(&timers, pb_TimerID_IO); + write_sads(params->outFile, image_width_macroblocks, + image_height_macroblocks, sads_computed); + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); + } + +#if 0 /* Debugging */ + print_test_sads(sads_computed, image_size_macroblocks); + write_sads_directly("sad-debug.bin", + ref_image->width / 16, ref_image->height / 16, + sads_computed); +#endif + + /* Free memory */ + free(sads_computed); + free_image(ref_image); + free_image(cur_image); + + pb_SwitchToTimer(&timers, pb_TimerID_NONE); + pb_PrintTimerSet(&timers); + pb_FreeParameters(params); + + OCL_ERRCK_RETVAL(clReleaseCommandQueue(clCommandQueue)); + OCL_ERRCK_RETVAL(clReleaseContext(clContext)); + + pb_DestroyTimerSet(&timers); + + return 0; +} diff --git a/benchmarks/opencl/sad/ocl.c b/benchmarks/opencl/sad/ocl.c new file mode 100644 index 00000000..9ce9a2f5 --- /dev/null +++ b/benchmarks/opencl/sad/ocl.c @@ -0,0 +1,50 @@ +#include +#include +#include +#include +#include "ocl.h" + +char* readFile(const char* fileName) +{ + FILE* fp; + fp = fopen(fileName,"r"); + if(fp == NULL) + { + printf("Error 1!\n"); + exit(1); + } + + fseek(fp,0,SEEK_END); + long size = ftell(fp); + rewind(fp); + + char* buffer = (char*)malloc(sizeof(char)*(size+1)); + if(buffer == NULL) + { + printf("Error 2!\n"); + fclose(fp); + exit(1); + } + + size_t res = fread(buffer,1,size,fp); + if(res != size) + { + printf("Error 3!\n"); + fclose(fp); + exit(1); + } + + buffer[size] = 0; + fclose(fp); + return buffer; +} + +void clMemSet(cl_command_queue clCommandQueue, cl_mem buf, int val, size_t size) +{ + cl_int clStatus; + char* temp = (char*)malloc(size); + memset(temp,val,size); + clStatus = clEnqueueWriteBuffer(clCommandQueue,buf,CL_TRUE,0,size,temp,0,NULL,NULL); + CHECK_ERROR("clEnqueueWriteBuffer") + free(temp); +} diff --git a/benchmarks/opencl/sad/ocl.h b/benchmarks/opencl/sad/ocl.h new file mode 100644 index 00000000..8840a868 --- /dev/null +++ b/benchmarks/opencl/sad/ocl.h @@ -0,0 +1,21 @@ +#ifndef __OCLH__ +#define __OCLH__ + +typedef struct { + cl_uint major; + cl_uint minor; + cl_uint multiProcessorCount; +} OpenCLDeviceProp; + +void clMemSet(cl_command_queue, cl_mem, int, size_t); +char* readFile(const char*); + +#define CHECK_ERROR(errorMessage) \ + if(clStatus != CL_SUCCESS) \ + { \ + printf("Error: %s!\n",errorMessage); \ + printf("Line: %d\n",__LINE__); \ + exit(1); \ + } + +#endif diff --git a/benchmarks/opencl/sad/parboil.c b/benchmarks/opencl/sad/parboil.c new file mode 100644 index 00000000..54fca9d0 --- /dev/null +++ b/benchmarks/opencl/sad/parboil.c @@ -0,0 +1,427 @@ +/* + * (c) 2007 The Board of Trustees of the University of Illinois. + */ + +#include +#include +#include +#include + +#if _POSIX_VERSION >= 200112L +# include +#endif + + +/*****************************************************************************/ +/* Timer routines */ + +static void +accumulate_time(pb_Timestamp *accum, + pb_Timestamp start, + pb_Timestamp end) +{ +#if _POSIX_VERSION >= 200112L + *accum += end - start; +#else +# error "Timestamps not implemented for this system" +#endif +} + +#if _POSIX_VERSION >= 200112L +static pb_Timestamp get_time() +{ + struct timeval tv; + gettimeofday(&tv, NULL); + return (pb_Timestamp) (tv.tv_sec * 1000000LL + tv.tv_usec); +} +#else +# error "no supported time libraries are available on this platform" +#endif + +void +pb_ResetTimer(struct pb_Timer *timer) +{ + timer->state = pb_Timer_STOPPED; + +#if _POSIX_VERSION >= 200112L + timer->elapsed = 0; +#else +# error "pb_ResetTimer: not implemented for this system" +#endif +} + +void +pb_StartTimer(struct pb_Timer *timer) +{ + if (timer->state != pb_Timer_STOPPED) { + fputs("Ignoring attempt to start a running timer\n", stderr); + return; + } + + timer->state = pb_Timer_RUNNING; + +#if _POSIX_VERSION >= 200112L + { + struct timeval tv; + gettimeofday(&tv, NULL); + timer->init = tv.tv_sec * 1000000LL + tv.tv_usec; + } +#else +# error "pb_StartTimer: not implemented for this system" +#endif +} + +void +pb_StartTimerAndSubTimer(struct pb_Timer *timer, struct pb_Timer *subtimer) +{ + unsigned int numNotStopped = 0x3; // 11 + if (timer->state != pb_Timer_STOPPED) { + fputs("Warning: Timer was not stopped\n", stderr); + numNotStopped &= 0x1; // Zero out 2^1 + } + if (subtimer->state != pb_Timer_STOPPED) { + fputs("Warning: Subtimer was not stopped\n", stderr); + numNotStopped &= 0x2; // Zero out 2^0 + } + if (numNotStopped == 0x0) { + fputs("Ignoring attempt to start running timer and subtimer\n", stderr); + return; + } + + timer->state = pb_Timer_RUNNING; + subtimer->state = pb_Timer_RUNNING; + +#if _POSIX_VERSION >= 200112L + { + struct timeval tv; + gettimeofday(&tv, NULL); + + if (numNotStopped & 0x2) { + timer->init = tv.tv_sec * 1000000LL + tv.tv_usec; + } + + if (numNotStopped & 0x1) { + subtimer->init = tv.tv_sec * 1000000LL + tv.tv_usec; + } + } +#else +# error "pb_StartTimer: not implemented for this system" +#endif + +} + +void +pb_StopTimer(struct pb_Timer *timer) +{ + + pb_Timestamp fini; + + if (timer->state != pb_Timer_RUNNING) { + fputs("Ignoring attempt to stop a stopped timer\n", stderr); + return; + } + + timer->state = pb_Timer_STOPPED; + +#if _POSIX_VERSION >= 200112L + { + struct timeval tv; + gettimeofday(&tv, NULL); + fini = tv.tv_sec * 1000000LL + tv.tv_usec; + } +#else +# error "pb_StopTimer: not implemented for this system" +#endif + + accumulate_time(&timer->elapsed, timer->init, fini); + timer->init = fini; + +} + +void pb_StopTimerAndSubTimer(struct pb_Timer *timer, struct pb_Timer *subtimer) { + + pb_Timestamp fini; + + unsigned int numNotRunning = 0x3; // 0b11 + if (timer->state != pb_Timer_RUNNING) { + fputs("Warning: Timer was not running\n", stderr); + numNotRunning &= 0x1; // Zero out 2^1 + } + if (subtimer->state != pb_Timer_RUNNING) { + fputs("Warning: Subtimer was not running\n", stderr); + numNotRunning &= 0x2; // Zero out 2^0 + } + if (numNotRunning == 0x0) { + fputs("Ignoring attempt to stop stopped timer and subtimer\n", stderr); + return; + } + + + timer->state = pb_Timer_STOPPED; + subtimer->state = pb_Timer_STOPPED; + +#if _POSIX_VERSION >= 200112L + { + struct timeval tv; + gettimeofday(&tv, NULL); + fini = tv.tv_sec * 1000000LL + tv.tv_usec; + } +#else +# error "pb_StopTimer: not implemented for this system" +#endif + + if (numNotRunning & 0x2) { + accumulate_time(&timer->elapsed, timer->init, fini); + timer->init = fini; + } + + if (numNotRunning & 0x1) { + accumulate_time(&subtimer->elapsed, subtimer->init, fini); + subtimer->init = fini; + } + +} + +/* Get the elapsed time in seconds. */ +double +pb_GetElapsedTime(struct pb_Timer *timer) +{ + double ret; + + if (timer->state != pb_Timer_STOPPED) { + fputs("Elapsed time from a running timer is inaccurate\n", stderr); + } + +#if _POSIX_VERSION >= 200112L + ret = timer->elapsed / 1e6; +#else +# error "pb_GetElapsedTime: not implemented for this system" +#endif + return ret; +} + +void +pb_InitializeTimerSet(struct pb_TimerSet *timers) +{ + int n; + + timers->wall_begin = get_time(); + + timers->current = pb_TimerID_NONE; + + timers->async_markers = NULL; + + + for (n = 0; n < pb_TimerID_LAST; n++) { + pb_ResetTimer(&timers->timers[n]); + timers->sub_timer_list[n] = NULL; // free first? + } +} + +void +pb_AddSubTimer(struct pb_TimerSet *timers, char *label, enum pb_TimerID pb_Category) { + + struct pb_SubTimer *subtimer = (struct pb_SubTimer *) malloc + (sizeof(struct pb_SubTimer)); + + int len = strlen(label); + + subtimer->label = (char *) malloc (sizeof(char)*(len+1)); + sprintf(subtimer->label, "%s\0", label); + + pb_ResetTimer(&subtimer->timer); + subtimer->next = NULL; + + struct pb_SubTimerList *subtimerlist = timers->sub_timer_list[pb_Category]; + if (subtimerlist == NULL) { + subtimerlist = (struct pb_SubTimerList *) malloc + (sizeof(struct pb_SubTimerList)); + subtimerlist->subtimer_list = subtimer; + timers->sub_timer_list[pb_Category] = subtimerlist; + } else { + // Append to list + struct pb_SubTimer *element = subtimerlist->subtimer_list; + while (element->next != NULL) { + element = element->next; + } + element->next = subtimer; + } + +} + +void +pb_SwitchToSubTimer(struct pb_TimerSet *timers, char *label, enum pb_TimerID category) +{ + +// switchToSub( NULL, NONE +// switchToSub( NULL, some +// switchToSub( some, some +// switchToSub( some, NONE -- tries to find "some" in NONE's sublist, which won't be printed + + struct pb_Timer *topLevelToStop = NULL; + if (timers->current != category && timers->current != pb_TimerID_NONE) { + // Switching to subtimer in a different category needs to stop the top-level current, different categoried timer. + // NONE shouldn't have a timer associated with it, so exclude from branch + topLevelToStop = &timers->timers[timers->current]; + } + + struct pb_SubTimerList *subtimerlist = timers->sub_timer_list[timers->current]; + struct pb_SubTimer *curr = (subtimerlist == NULL) ? NULL : subtimerlist->current; + + if (timers->current != pb_TimerID_NONE) { + if (curr != NULL && topLevelToStop != NULL) { + pb_StopTimerAndSubTimer(topLevelToStop, &curr->timer); + } else if (curr != NULL) { + pb_StopTimer(&curr->timer); + } else { + pb_StopTimer(topLevelToStop); + } + } + + subtimerlist = timers->sub_timer_list[category]; + struct pb_SubTimer *subtimer = NULL; + + if (label != NULL) { + subtimer = subtimerlist->subtimer_list; + while (subtimer != NULL) { + if (strcmp(subtimer->label, label) == 0) { + break; + } else { + subtimer = subtimer->next; + } + } + } + + if (category != pb_TimerID_NONE) { + + if (subtimerlist != NULL) { + subtimerlist->current = subtimer; + } + + if (category != timers->current && subtimer != NULL) { + pb_StartTimerAndSubTimer(&timers->timers[category], &subtimer->timer); + } else if (subtimer != NULL) { + // Same category, different non-NULL subtimer + pb_StartTimer(&subtimer->timer); + } else{ + // Different category, but no subtimer (not found or specified as NULL) -- unprefered way of setting topLevel timer + pb_StartTimer(&timers->timers[category]); + } + } + + timers->current = category; + +} + +void +pb_SwitchToTimer(struct pb_TimerSet *timers, enum pb_TimerID timer) +{ + /* Stop the currently running timer */ + /*if (timers->current != pb_TimerID_NONE) { + struct pb_SubTimer *currSubTimer = NULL; + struct pb_SubTimerList *subtimerlist = timers->sub_timer_list[timers->current]; + + if ( subtimerlist != NULL) { + currSubTimer = timers->sub_timer_list[timers->current]->current; + } + if ( currSubTimer!= NULL) { + pb_StopTimerAndSubTimer(&timers->timers[timers->current], &currSubTimer->timer); + } else { + pb_StopTimer(&timers->timers[timers->current]); + } + } + + timers->current = timer; + + if (timer != pb_TimerID_NONE) { + pb_StartTimer(&timers->timers[timer]); + }*/ +} + +void +pb_PrintTimerSet(struct pb_TimerSet *timers) +{ + + pb_Timestamp wall_end = get_time(); + + struct pb_Timer *t = timers->timers; + struct pb_SubTimer* sub = NULL; + + int maxSubLength; + + const char *categories[] = { + "IO", "Kernel", "Copy", "Driver", "Copy Async", "Compute" + }; + + const int maxCategoryLength = 10; + + int i; + for(i = 1; i < pb_TimerID_LAST-1; ++i) { // exclude NONE and OVRELAP from this format + if(pb_GetElapsedTime(&t[i]) != 0) { + + // Print Category Timer + printf("%-*s: %f\n", maxCategoryLength, categories[i-1], pb_GetElapsedTime(&t[i])); + + if (timers->sub_timer_list[i] != NULL) { + sub = timers->sub_timer_list[i]->subtimer_list; + maxSubLength = 0; + while (sub != NULL) { + // Find longest SubTimer label + if (strlen(sub->label) > maxSubLength) { + maxSubLength = strlen(sub->label); + } + sub = sub->next; + } + + // Fit to Categories + if (maxSubLength <= maxCategoryLength) { + maxSubLength = maxCategoryLength; + } + + sub = timers->sub_timer_list[i]->subtimer_list; + + // Print SubTimers + while (sub != NULL) { + printf(" -%-*s: %f\n", maxSubLength, sub->label, pb_GetElapsedTime(&sub->timer)); + sub = sub->next; + } + } + } + } + + if(pb_GetElapsedTime(&t[pb_TimerID_OVERLAP]) != 0) + printf("CPU/Kernel Overlap: %f\n", pb_GetElapsedTime(&t[pb_TimerID_OVERLAP])); + + float walltime = (wall_end - timers->wall_begin)/ 1e6; + printf("Timer Wall Time: %f\n", walltime); + +} + +void pb_DestroyTimerSet(struct pb_TimerSet * timers) +{ + /* clean up all of the async event markers */ + struct pb_async_time_marker_list ** event = &(timers->async_markers); + while( *event != NULL) { + struct pb_async_time_marker_list ** next = &((*event)->next); + free(*event); + (*event) = NULL; + event = next; + } + + int i = 0; + for(i = 0; i < pb_TimerID_LAST; ++i) { + if (timers->sub_timer_list[i] != NULL) { + struct pb_SubTimer *subtimer = timers->sub_timer_list[i]->subtimer_list; + struct pb_SubTimer *prev = NULL; + while (subtimer != NULL) { + free(subtimer->label); + prev = subtimer; + subtimer = subtimer->next; + free(prev); + } + free(timers->sub_timer_list[i]); + } + } +} + + diff --git a/benchmarks/opencl/sad/parboil.h b/benchmarks/opencl/sad/parboil.h new file mode 100644 index 00000000..4c9a8b5e --- /dev/null +++ b/benchmarks/opencl/sad/parboil.h @@ -0,0 +1,348 @@ +/* + * (c) 2010 The Board of Trustees of the University of Illinois. + */ +#ifndef PARBOIL_HEADER +#define PARBOIL_HEADER + +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +#include + +/* A platform as specified by the user on the command line */ +struct pb_PlatformParam { + char *name; /* The platform name. This string is owned. */ + char *version; /* The platform version; may be NULL. + * This string is owned. */ +}; + +/* Create a PlatformParam from the given strings. + * 'name' must not be NULL. 'version' may be NULL. + * If not NULL, the strings should have been allocated by malloc(), + * and they will be owned by the returned object. + */ +struct pb_PlatformParam * +pb_PlatformParam(char *name, char *version); + +void +pb_FreePlatformParam(struct pb_PlatformParam *); + +/* A criterion for how to select a device */ +enum pb_DeviceSelectionCriterion { + pb_Device_INDEX, /* Enumerate the devices and select one + * by its number */ + pb_Device_CPU, /* Select a CPU device */ + pb_Device_GPU, /* Select a GPU device */ + pb_Device_ACCELERATOR, /* Select an accelerator device */ + pb_Device_NAME /* Select a device by name */ +}; + +/* A device as specified by the user on the command line */ +struct pb_DeviceParam { + enum pb_DeviceSelectionCriterion criterion; + union { + int index; /* If criterion == pb_Device_INDEX, + * the index of the device */ + char *name; /* If criterion == pb_Device_NAME, + * the name of the device. + * This string is owned. */ + }; +}; + +struct pb_DeviceParam * +pb_DeviceParam_index(int index); + +struct pb_DeviceParam * +pb_DeviceParam_cpu(void); + +struct pb_DeviceParam * +pb_DeviceParam_gpu(void); + +struct pb_DeviceParam * +pb_DeviceParam_accelerator(void); + +/* Create a by-name device selection criterion. + * The string should have been allocated by malloc(), and it will will be + * owned by the returned object. + */ +struct pb_DeviceParam * +pb_DeviceParam_name(char *name); + +void +pb_FreeDeviceParam(struct pb_DeviceParam *); + +/* Command line parameters for benchmarks */ +struct pb_Parameters { + char *outFile; /* If not NULL, the raw output of the + * computation should be saved to this + * file. The string is owned. */ + char **inpFiles; /* A NULL-terminated array of strings + * holding the input file(s) for the + * computation. The array and strings + * are owned. */ + struct pb_PlatformParam *platform; /* If not NULL, the platform + * specified on the command line. */ + struct pb_DeviceParam *device; /* If not NULL, the device + * specified on the command line. */ +}; + +/* Read command-line parameters. + * + * The argc and argv parameters to main are read, and any parameters + * interpreted by this function are removed from the argument list. + * + * A new instance of struct pb_Parameters is returned. + * If there is an error, then an error message is printed on stderr + * and NULL is returned. + */ +struct pb_Parameters * +pb_ReadParameters(int *_argc, char **argv); + +/* Free an instance of struct pb_Parameters. + */ +void +pb_FreeParameters(struct pb_Parameters *p); + +void +pb_FreeStringArray(char **); + +/* Count the number of input files in a pb_Parameters instance. + */ +int +pb_Parameters_CountInputs(struct pb_Parameters *p); + +/* A time or duration. */ +//#if _POSIX_VERSION >= 200112L +typedef unsigned long long pb_Timestamp; /* time in microseconds */ +//#else +//# error "Timestamps not implemented" +//#endif + +enum pb_TimerState { + pb_Timer_STOPPED, + pb_Timer_RUNNING, +}; + +struct pb_Timer { + enum pb_TimerState state; + pb_Timestamp elapsed; /* Amount of time elapsed so far */ + pb_Timestamp init; /* Beginning of the current time interval, + * if state is RUNNING. End of the last + * recorded time interfal otherwise. */ +}; + +/* Reset a timer. + * Use this to initialize a timer or to clear + * its elapsed time. The reset timer is stopped. + */ +void +pb_ResetTimer(struct pb_Timer *timer); + +/* Start a timer. The timer is set to RUNNING mode and + * time elapsed while the timer is running is added to + * the timer. + * The timer should not already be running. + */ +void +pb_StartTimer(struct pb_Timer *timer); + +/* Stop a timer. + * This stops adding elapsed time to the timer. + * The timer should not already be stopped. + */ +void +pb_StopTimer(struct pb_Timer *timer); + +/* Get the elapsed time in seconds. */ +double +pb_GetElapsedTime(struct pb_Timer *timer); + +/* Execution time is assigned to one of these categories. */ +enum pb_TimerID { + pb_TimerID_NONE = 0, + pb_TimerID_IO, /* Time spent in input/output */ + pb_TimerID_KERNEL, /* Time spent computing on the device, + * recorded asynchronously */ + pb_TimerID_COPY, /* Time spent synchronously moving data + * to/from device and allocating/freeing + * memory on the device */ + pb_TimerID_DRIVER, /* Time spent in the host interacting with the + * driver, primarily for recording the time + * spent queueing asynchronous operations */ + pb_TimerID_COPY_ASYNC, /* Time spent in asynchronous transfers */ + pb_TimerID_COMPUTE, /* Time for all program execution other + * than parsing command line arguments, + * I/O, kernel, and copy */ + pb_TimerID_OVERLAP, /* Time double-counted in asynchronous and + * host activity: automatically filled in, + * not intended for direct usage */ + pb_TimerID_LAST /* Number of timer IDs */ +}; + +/* Dynamic list of asynchronously tracked times between events */ +struct pb_async_time_marker_list { + char *label; // actually just a pointer to a string + enum pb_TimerID timerID; /* The ID to which the interval beginning + * with this marker should be attributed */ + void * marker; + //cudaEvent_t marker; /* The driver event for this marker */ + struct pb_async_time_marker_list *next; +}; + +struct pb_SubTimer { + char *label; + struct pb_Timer timer; + struct pb_SubTimer *next; +}; + +struct pb_SubTimerList { + struct pb_SubTimer *current; + struct pb_SubTimer *subtimer_list; +}; + +/* A set of timers for recording execution times. */ +struct pb_TimerSet { + enum pb_TimerID current; + struct pb_async_time_marker_list* async_markers; + pb_Timestamp async_begin; + pb_Timestamp wall_begin; + struct pb_Timer timers[pb_TimerID_LAST]; + struct pb_SubTimerList *sub_timer_list[pb_TimerID_LAST]; +}; + +/* Reset all timers in the set. */ +void +pb_InitializeTimerSet(struct pb_TimerSet *timers); + +void +pb_AddSubTimer(struct pb_TimerSet *timers, char *label, enum pb_TimerID pb_Category); + +/* Select which timer the next interval of time should be accounted + * to. The selected timer is started and other timers are stopped. + * Using pb_TimerID_NONE stops all timers. */ +void +pb_SwitchToTimer(struct pb_TimerSet *timers, enum pb_TimerID timer); + +void +pb_SwitchToSubTimer(struct pb_TimerSet *timers, char *label, enum pb_TimerID category); + +/* Print timer values to standard output. */ +void +pb_PrintTimerSet(struct pb_TimerSet *timers); + +/* Release timer resources */ +void +pb_DestroyTimerSet(struct pb_TimerSet * timers); + +void +pb_SetOpenCL(void *clContextPtr, void *clCommandQueuePtr); + + +typedef struct pb_Device_tag { + char* name; + void* clDevice; + int id; + unsigned int in_use; + unsigned int available; +} pb_Device; + +struct pb_Context_tag; +typedef struct pb_Context_tag pb_Context; + +typedef struct pb_Platform_tag { + char* name; + char* version; + void* clPlatform; + unsigned int in_use; + pb_Context** contexts; + pb_Device** devices; +} pb_Platform; + +struct pb_Context_tag { + void* clPlatformId; + void* clContext; + void* clDeviceId; + pb_Platform* pb_platform; + pb_Device* pb_device; +}; + +// verbosely print out list of platforms and their devices to the console. +pb_Platform** +pb_GetPlatforms(); + +// Choose a platform according to the given platform specification +pb_Platform* +pb_GetPlatform(struct pb_PlatformParam *platform); + +// choose a platform: by name, name & version +pb_Platform* +pb_GetPlatformByName(const char* name); + +pb_Platform* +pb_GetPlatformByNameAndVersion(const char* name, const char* version); + +// Choose a device according to the given device specification +pb_Device* +pb_GetDevice(pb_Platform* pb_platform, struct pb_DeviceParam *device); + +pb_Device** +pb_GetDevices(pb_Platform* pb_platform); + +// choose a device by name. +pb_Device* +pb_GetDeviceByName(pb_Platform* pb_platform, const char* name); + +pb_Platform* +pb_GetPlatformByEnvVars(); + +pb_Context* +pb_InitOpenCLContext(struct pb_Parameters* parameters); + +void +pb_ReleasePlatforms(); + +void +pb_ReleaseContext(pb_Context* c); + +void +pb_PrintPlatformInfo(pb_Context* c); + +void +perf_init(); + +//#define MEASURE_KERNEL_TIME + +#include + +#ifdef MEASURE_KERNEL_TIME +#define clEnqueueNDRangeKernel(q,k,d,o,dg,db,a,b,c) pb_clEnqueueNDRangeKernel((q), (k), (d), (o), (dg), (db), (a), (b), (c)) +cl_int +pb_clEnqueueNDRangeKernel(cl_command_queue /* command_queue */, + cl_kernel /* kernel */, + cl_uint /* work_dim */, + const size_t * /* global_work_offset */, + const size_t * /* global_work_size */, + const size_t * /* local_work_size */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */); +#endif + +enum { T_FLOAT, T_DOUBLE, T_SHORT, T_INT, T_UCHAR }; +void pb_sig_float(char*, float*, int); +void pb_sig_double(char*, double*, int); +void pb_sig_short(char*, short*, int); +void pb_sig_int(char*, int*, int); +void pb_sig_uchar(char*, unsigned char*, unsigned int); +void pb_sig_clmem(char*, cl_command_queue, cl_mem, int); + +#ifdef __cplusplus +} +#endif + +#endif //PARBOIL_HEADER + diff --git a/benchmarks/opencl/sad/parboil_opencl.c b/benchmarks/opencl/sad/parboil_opencl.c new file mode 100644 index 00000000..a4db1680 --- /dev/null +++ b/benchmarks/opencl/sad/parboil_opencl.c @@ -0,0 +1,1394 @@ +/* + * (c) 2007 The Board of Trustees of the University of Illinois. + */ + +#include +#include +#include +#include +#include +#include + +#if _POSIX_VERSION >= 200112L +# include +#endif + +//#include "perfmon.h" + +cl_context *clContextPtr; +cl_command_queue *clCommandQueuePtr; + +// #define DISABLE_PARBOIL_TIMER + +/*****************************************************************************/ +/* Timer routines */ + +static int is_async(enum pb_TimerID timer) +{ + return (timer == pb_TimerID_KERNEL) || + (timer == pb_TimerID_COPY_ASYNC); +} + +static int is_blocking(enum pb_TimerID timer) +{ + return (timer == pb_TimerID_COPY) || (timer == pb_TimerID_NONE); +} + +#define INVALID_TIMERID pb_TimerID_LAST + +static int asyncs_outstanding(struct pb_TimerSet* timers) +{ + return (timers->async_markers != NULL) && + (timers->async_markers->timerID != INVALID_TIMERID); +} + +static struct pb_async_time_marker_list * +get_last_async(struct pb_TimerSet* timers) +{ + /* Find the last event recorded thus far */ + struct pb_async_time_marker_list * last_event = timers->async_markers; + if(last_event != NULL && last_event->timerID != INVALID_TIMERID) { + while(last_event->next != NULL && + last_event->next->timerID != INVALID_TIMERID) + last_event = last_event->next; + return last_event; + } else + return NULL; +} + +static void insert_marker(struct pb_TimerSet* tset, enum pb_TimerID timer) +{ + cl_int ciErrNum = CL_SUCCESS; + struct pb_async_time_marker_list ** new_event = &(tset->async_markers); + + while(*new_event != NULL && (*new_event)->timerID != INVALID_TIMERID) { + new_event = &((*new_event)->next); + } + + if(*new_event == NULL) { + *new_event = (struct pb_async_time_marker_list *) + malloc(sizeof(struct pb_async_time_marker_list)); + (*new_event)->marker = calloc(1, sizeof(cl_event)); + /* + // I don't think this is needed at all. I believe clEnqueueMarker 'creates' the event +#if ( __OPENCL_VERSION__ >= CL_VERSION_1_1 ) +fprintf(stderr, "Creating Marker [%d]\n", timer); + *((cl_event *)((*new_event)->marker)) = clCreateUserEvent(*clContextPtr, &ciErrNum); + if (ciErrNum != CL_SUCCESS) { + fprintf(stderr, "Error Creating User Event Object!\n"); + } + ciErrNum = clSetUserEventStatus(*((cl_event *)((*new_event)->marker)), CL_QUEUED); + if (ciErrNum != CL_SUCCESS) { + fprintf(stderr, "Error Setting User Event Status!\n"); + } +#endif +*/ + (*new_event)->next = NULL; + } + + /* valid event handle now aquired: insert the event record */ + (*new_event)->label = NULL; + (*new_event)->timerID = timer; + ciErrNum = clEnqueueMarker(*clCommandQueuePtr, (cl_event *)(*new_event)->marker); + if (ciErrNum != CL_SUCCESS) { + fprintf(stderr, "Error Enqueueing Marker!\n"); + } + +} + +static void insert_submarker(struct pb_TimerSet* tset, char *label, enum pb_TimerID timer) +{ + cl_int ciErrNum = CL_SUCCESS; + struct pb_async_time_marker_list ** new_event = &(tset->async_markers); + + while(*new_event != NULL && (*new_event)->timerID != INVALID_TIMERID) { + new_event = &((*new_event)->next); + } + + if(*new_event == NULL) { + *new_event = (struct pb_async_time_marker_list *) + malloc(sizeof(struct pb_async_time_marker_list)); + (*new_event)->marker = calloc(1, sizeof(cl_event)); + /* +#if ( __OPENCL_VERSION__ >= CL_VERSION_1_1 ) +fprintf(stderr, "Creating SubMarker %s[%d]\n", label, timer); + *((cl_event *)((*new_event)->marker)) = clCreateUserEvent(*clContextPtr, &ciErrNum); + if (ciErrNum != CL_SUCCESS) { + fprintf(stderr, "Error Creating User Event Object!\n"); + } + ciErrNum = clSetUserEventStatus(*((cl_event *)((*new_event)->marker)), CL_QUEUED); + if (ciErrNum != CL_SUCCESS) { + fprintf(stderr, "Error Setting User Event Status!\n"); + } +#endif +*/ + (*new_event)->next = NULL; + } + + /* valid event handle now aquired: insert the event record */ + (*new_event)->label = label; + (*new_event)->timerID = timer; + ciErrNum = clEnqueueMarker(*clCommandQueuePtr, (cl_event *)(*new_event)->marker); + if (ciErrNum != CL_SUCCESS) { + fprintf(stderr, "Error Enqueueing Marker!\n"); + } + +} + + +/* Assumes that all recorded events have completed */ +static pb_Timestamp record_async_times(struct pb_TimerSet* tset) +{ + struct pb_async_time_marker_list * next_interval = NULL; + struct pb_async_time_marker_list * last_marker = get_last_async(tset); + pb_Timestamp total_async_time = 0; + enum pb_TimerID timer; + + for(next_interval = tset->async_markers; next_interval != last_marker; + next_interval = next_interval->next) { + cl_ulong command_start=0, command_end=0; + cl_int ciErrNum = CL_SUCCESS; + + ciErrNum = clGetEventProfilingInfo(*((cl_event *)next_interval->marker), CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &command_start, NULL); + if (ciErrNum != CL_SUCCESS) { + fprintf(stderr, "Error getting first EventProfilingInfo: %d\n", ciErrNum); + } + + ciErrNum = clGetEventProfilingInfo(*((cl_event *)next_interval->next->marker), CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &command_end, NULL); + if (ciErrNum != CL_SUCCESS) { + fprintf(stderr, "Error getting second EventProfilingInfo: %d\n", ciErrNum); + } + + pb_Timestamp interval = (pb_Timestamp) (((double)(command_end - command_start)) / 1e3); + tset->timers[next_interval->timerID].elapsed += interval; + if (next_interval->label != NULL) { + struct pb_SubTimer *subtimer = tset->sub_timer_list[next_interval->timerID]->subtimer_list; + while (subtimer != NULL) { + if ( strcmp(subtimer->label, next_interval->label) == 0) { + subtimer->timer.elapsed += interval; + break; + } + subtimer = subtimer->next; + } + } + total_async_time += interval; + next_interval->timerID = INVALID_TIMERID; + } + + if(next_interval != NULL) + next_interval->timerID = INVALID_TIMERID; + + return total_async_time; +} + +static void +accumulate_time(pb_Timestamp *accum, + pb_Timestamp start, + pb_Timestamp end) +{ +//#if _POSIX_VERSION >= 200112L + *accum += end - start; +//#else +//# error "Timestamps not implemented for this system" +//#endif +} + +//#if _POSIX_VERSION >= 200112L +static pb_Timestamp get_time() +{ + //struct timeval tv; + //gettimeofday(&tv, NULL); + //return (pb_Timestamp) (tv.tv_sec * 1000000LL + tv.tv_usec); + return 0; +} +//#else +//# error "no supported time libraries are available on this platform" +//#endif + +void +pb_ResetTimer(struct pb_Timer *timer) +{ +//#ifndef DISABLE_PARBOIL_TIMER + timer->state = pb_Timer_STOPPED; + +//#if _POSIX_VERSION >= 200112L + timer->elapsed = 0; +//#else +//# error "pb_ResetTimer: not implemented for this system" +//#endif +//#endif +} + +void +pb_StartTimer(struct pb_Timer *timer) +{ +/*#ifndef DISABLE_PARBOIL_TIMER + if (timer->state != pb_Timer_STOPPED) { + fputs("Ignoring attempt to start a running timer\n", stderr); + return; + } + + timer->state = pb_Timer_RUNNING; + +#if _POSIX_VERSION >= 200112L + { + struct timeval tv; + gettimeofday(&tv, NULL); + timer->init = tv.tv_sec * 1000000LL + tv.tv_usec; + } +#else +# error "pb_StartTimer: not implemented for this system" +#endif +#endif*/ +} + +void +pb_StartTimerAndSubTimer(struct pb_Timer *timer, struct pb_Timer *subtimer) +{ +/*#ifndef DISABLE_PARBOIL_TIMER + + unsigned int numNotStopped = 0x3; // 11 + if (timer->state != pb_Timer_STOPPED) { + fputs("Warning: Timer was not stopped\n", stderr); + numNotStopped &= 0x1; // Zero out 2^1 + } + if (subtimer->state != pb_Timer_STOPPED) { + fputs("Warning: Subtimer was not stopped\n", stderr); + numNotStopped &= 0x2; // Zero out 2^0 + } + if (numNotStopped == 0x0) { + fputs("Ignoring attempt to start running timer and subtimer\n", stderr); + return; + } + + timer->state = pb_Timer_RUNNING; + subtimer->state = pb_Timer_RUNNING; + +#if _POSIX_VERSION >= 200112L + { + struct timeval tv; + gettimeofday(&tv, NULL); + + if (numNotStopped & 0x2) { + timer->init = tv.tv_sec * 1000000LL + tv.tv_usec; + } + + if (numNotStopped & 0x1) { + subtimer->init = tv.tv_sec * 1000000LL + tv.tv_usec; + } + } +#else +# error "pb_StartTimer: not implemented for this system" +#endif + +#endif*/ +} + +void +pb_StopTimer(struct pb_Timer *timer) +{ +/*#ifndef DISABLE_PARBOIL_TIMER + + pb_Timestamp fini; + + if (timer->state != pb_Timer_RUNNING) { + fputs("Ignoring attempt to stop a stopped timer\n", stderr); + return; + } + + timer->state = pb_Timer_STOPPED; + +#if _POSIX_VERSION >= 200112L + { + struct timeval tv; + gettimeofday(&tv, NULL); + fini = tv.tv_sec * 1000000LL + tv.tv_usec; + } +#else +# error "pb_StopTimer: not implemented for this system" +#endif + + accumulate_time(&timer->elapsed, timer->init, fini); + timer->init = fini; + +#endif*/ +} + +void pb_StopTimerAndSubTimer(struct pb_Timer *timer, struct pb_Timer *subtimer) { +/*#ifndef DISABLE_PARBOIL_TIMER + + pb_Timestamp fini; + + unsigned int numNotRunning = 0x3; // 11 + if (timer->state != pb_Timer_RUNNING) { + fputs("Warning: Timer was not running\n", stderr); + numNotRunning &= 0x1; // Zero out 2^1 + } + if (subtimer->state != pb_Timer_RUNNING) { + fputs("Warning: Subtimer was not running\n", stderr); + numNotRunning &= 0x2; // Zero out 2^0 + } + if (numNotRunning == 0x0) { + fputs("Ignoring attempt to stop stopped timer and subtimer\n", stderr); + return; + } + + + timer->state = pb_Timer_STOPPED; + subtimer->state = pb_Timer_STOPPED; + +#if _POSIX_VERSION >= 200112L + { + struct timeval tv; + gettimeofday(&tv, NULL); + fini = tv.tv_sec * 1000000LL + tv.tv_usec; + } +#else +# error "pb_StopTimer: not implemented for this system" +#endif + + if (numNotRunning & 0x2) { + accumulate_time(&timer->elapsed, timer->init, fini); + timer->init = fini; + } + + if (numNotRunning & 0x1) { + accumulate_time(&subtimer->elapsed, subtimer->init, fini); + subtimer->init = fini; + } + +#endif*/ +} + +/* Get the elapsed time in seconds. */ +double +pb_GetElapsedTime(struct pb_Timer *timer) +{ + /*double ret; +#ifndef DISABLE_PARBOIL_TIMER + + if (timer->state != pb_Timer_STOPPED) { + fputs("Elapsed time from a running timer is inaccurate\n", stderr); + } + +#if _POSIX_VERSION >= 200112L + ret = timer->elapsed / 1e6; +#else +# error "pb_GetElapsedTime: not implemented for this system" +#endif +#endif + return ret;*/ + return 0; +} + +void +pb_InitializeTimerSet(struct pb_TimerSet *timers) +{ +/*#ifndef DISABLE_PARBOIL_TIMER + int n; + + timers->wall_begin = 0; //get_time(); + timers->current = pb_TimerID_NONE; + + timers->async_markers = NULL; + + for (n = 0; n < pb_TimerID_LAST; n++) { + pb_ResetTimer(&timers->timers[n]); + timers->sub_timer_list[n] = NULL; + } +#endif*/ +} + +void pb_SetOpenCL(void *p_clContextPtr, void *p_clCommandQueuePtr) { + clContextPtr = ((cl_context *)p_clContextPtr); + clCommandQueuePtr = ((cl_command_queue *)p_clCommandQueuePtr); +} + +void +pb_AddSubTimer(struct pb_TimerSet *timers, char *label, enum pb_TimerID pb_Category) { +/*#ifndef DISABLE_PARBOIL_TIMER + + struct pb_SubTimer *subtimer = (struct pb_SubTimer *) malloc + (sizeof(struct pb_SubTimer)); + + int len = strlen(label); + + subtimer->label = (char *) malloc (sizeof(char)*(len+1)); + sprintf(subtimer->label, "%s\0", label); + + pb_ResetTimer(&subtimer->timer); + subtimer->next = NULL; + + struct pb_SubTimerList *subtimerlist = timers->sub_timer_list[pb_Category]; + if (subtimerlist == NULL) { + subtimerlist = (struct pb_SubTimerList *) calloc + (1, sizeof(struct pb_SubTimerList)); + subtimerlist->subtimer_list = subtimer; + timers->sub_timer_list[pb_Category] = subtimerlist; + } else { + // Append to list + struct pb_SubTimer *element = subtimerlist->subtimer_list; + while (element->next != NULL) { + element = element->next; + } + element->next = subtimer; + } + +#endif*/ +} + +void +pb_SwitchToTimer(struct pb_TimerSet *timers, enum pb_TimerID timer) +{ +#if 0 +#ifndef DISABLE_PARBOIL_TIMER + + /* Stop the currently running timer */ + if (timers->current != pb_TimerID_NONE) { + struct pb_SubTimerList *subtimerlist = timers->sub_timer_list[timers->current]; + struct pb_SubTimer *currSubTimer = (subtimerlist != NULL) ? subtimerlist->current : NULL; + + if (!is_async(timers->current) ) { + if (timers->current != timer) { + if (currSubTimer != NULL) { + pb_StopTimerAndSubTimer(&timers->timers[timers->current], &currSubTimer->timer); + } else { + pb_StopTimer(&timers->timers[timers->current]); + } + } else { + if (currSubTimer != NULL) { + pb_StopTimer(&currSubTimer->timer); + } + } + } else { + insert_marker(timers, timer); + if (!is_async(timer)) { // if switching to async too, keep driver going + pb_StopTimer(&timers->timers[pb_TimerID_DRIVER]); + } + } + } + + pb_Timestamp currentTime = 0; //get_time(); + + /* The only cases we check for asynchronous task completion is + * when an overlapping CPU operation completes, or the next + * segment blocks on completion of previous async operations */ + if( asyncs_outstanding(timers) && + (!is_async(timers->current) || is_blocking(timer) ) ) { + + struct pb_async_time_marker_list * last_event = get_last_async(timers); + /* CL_COMPLETE if completed */ + + cl_int ciErrNum = CL_SUCCESS; + cl_int async_done = CL_COMPLETE; + + ciErrNum = clGetEventInfo(*((cl_event *)last_event->marker), CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(cl_int), &async_done, NULL); + if (ciErrNum != CL_SUCCESS) { + fprintf(stderr, "Error Querying EventInfo!\n"); + } + + + if(is_blocking(timer)) { + /* Async operations completed after previous CPU operations: + * overlapped time is the total CPU time since this set of async + * operations were first issued */ + + // timer to switch to is COPY or NONE + if(async_done != CL_COMPLETE) { + accumulate_time(&(timers->timers[pb_TimerID_OVERLAP].elapsed), + timers->async_begin,currentTime); + } + + /* Wait on async operation completion */ + ciErrNum = clWaitForEvents(1, (cl_event *)last_event->marker); + if (ciErrNum != CL_SUCCESS) { + fprintf(stderr, "Error Waiting for Events!\n"); + } + + pb_Timestamp total_async_time = record_async_times(timers); + + /* Async operations completed before previous CPU operations: + * overlapped time is the total async time */ + if(async_done == CL_COMPLETE) { + //fprintf(stderr, "Async_done: total_async_type = %lld\n", total_async_time); + timers->timers[pb_TimerID_OVERLAP].elapsed += total_async_time; + } + + } else + /* implies (!is_async(timers->current) && asyncs_outstanding(timers)) */ + // i.e. Current Not Async (not KERNEL/COPY_ASYNC) but there are outstanding + // so something is deeper in stack + if(async_done == CL_COMPLETE ) { + /* Async operations completed before previous CPU operations: + * overlapped time is the total async time */ + timers->timers[pb_TimerID_OVERLAP].elapsed += record_async_times(timers); + } + } + + /* Start the new timer */ + if (timer != pb_TimerID_NONE) { + if(!is_async(timer)) { + pb_StartTimer(&timers->timers[timer]); + } else { + // toSwitchTo Is Async (KERNEL/COPY_ASYNC) + if (!asyncs_outstanding(timers)) { + /* No asyncs outstanding, insert a fresh async marker */ + + insert_marker(timers, timer); + timers->async_begin = currentTime; + } else if(!is_async(timers->current)) { + /* Previous asyncs still in flight, but a previous SwitchTo + * already marked the end of the most recent async operation, + * so we can rename that marker as the beginning of this async + * operation */ + + struct pb_async_time_marker_list * last_event = get_last_async(timers); + last_event->label = NULL; + last_event->timerID = timer; + } + if (!is_async(timers->current)) { + pb_StartTimer(&timers->timers[pb_TimerID_DRIVER]); + } + } + } + timers->current = timer; + +#endif +#endif +} + +void +pb_SwitchToSubTimer(struct pb_TimerSet *timers, char *label, enum pb_TimerID category) +{ +#if 0 +#ifndef DISABLE_PARBOIL_TIMER + struct pb_SubTimerList *subtimerlist = timers->sub_timer_list[timers->current]; + struct pb_SubTimer *curr = (subtimerlist != NULL) ? subtimerlist->current : NULL; + + if (timers->current != pb_TimerID_NONE) { + if (!is_async(timers->current) ) { + if (timers->current != category) { + if (curr != NULL) { + pb_StopTimerAndSubTimer(&timers->timers[timers->current], &curr->timer); + } else { + pb_StopTimer(&timers->timers[timers->current]); + } + } else { + if (curr != NULL) { + pb_StopTimer(&curr->timer); + } + } + } else { + insert_submarker(timers, label, category); + if (!is_async(category)) { // if switching to async too, keep driver going + pb_StopTimer(&timers->timers[pb_TimerID_DRIVER]); + } + } + } + + pb_Timestamp currentTime = 0; //get_time(); + + /* The only cases we check for asynchronous task completion is + * when an overlapping CPU operation completes, or the next + * segment blocks on completion of previous async operations */ + if( asyncs_outstanding(timers) && + (!is_async(timers->current) || is_blocking(category) ) ) { + + struct pb_async_time_marker_list * last_event = get_last_async(timers); + /* CL_COMPLETE if completed */ + + cl_int ciErrNum = CL_SUCCESS; + cl_int async_done = CL_COMPLETE; + + ciErrNum = clGetEventInfo(*((cl_event *)last_event->marker), CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(cl_int), &async_done, NULL); + if (ciErrNum != CL_SUCCESS) { + fprintf(stderr, "Error Querying EventInfo!\n"); + } + + if(is_blocking(category)) { + /* Async operations completed after previous CPU operations: + * overlapped time is the total CPU time since this set of async + * operations were first issued */ + + // timer to switch to is COPY or NONE + // if it hasn't already finished, then just take now and use that as the elapsed time in OVERLAP + // anything happening after now isn't OVERLAP because everything is being stopped to wait for synchronization + // it seems that the extra sync wall time isn't being recorded anywhere + if(async_done != CL_COMPLETE) + accumulate_time(&(timers->timers[pb_TimerID_OVERLAP].elapsed), + timers->async_begin,currentTime); + + /* Wait on async operation completion */ + ciErrNum = clWaitForEvents(1, (cl_event *)last_event->marker); + if (ciErrNum != CL_SUCCESS) { + fprintf(stderr, "Error Waiting for Events!\n"); + } + pb_Timestamp total_async_time = record_async_times(timers); + + /* Async operations completed before previous CPU operations: + * overlapped time is the total async time */ + // If it did finish, then accumulate all the async time that did happen into OVERLAP + // the immediately preceding EventSynchronize theoretically didn't have any effect since it was already completed. + if(async_done == CL_COMPLETE /*cudaSuccess*/) + timers->timers[pb_TimerID_OVERLAP].elapsed += total_async_time; + + } else + /* implies (!is_async(timers->current) && asyncs_outstanding(timers)) */ + // i.e. Current Not Async (not KERNEL/COPY_ASYNC) but there are outstanding + // so something is deeper in stack + if(async_done == CL_COMPLETE /*cudaSuccess*/) { + /* Async operations completed before previous CPU operations: + * overlapped time is the total async time */ + timers->timers[pb_TimerID_OVERLAP].elapsed += record_async_times(timers); + } + // else, this isn't blocking, so just check the next time around + } + + subtimerlist = timers->sub_timer_list[category]; + struct pb_SubTimer *subtimer = NULL; + + if (label != NULL) { + subtimer = subtimerlist->subtimer_list; + while (subtimer != NULL) { + if (strcmp(subtimer->label, label) == 0) { + break; + } else { + subtimer = subtimer->next; + } + } + } + + /* Start the new timer */ + if (category != pb_TimerID_NONE) { + if(!is_async(category)) { + if (subtimerlist != NULL) { + subtimerlist->current = subtimer; + } + + if (category != timers->current && subtimer != NULL) { + pb_StartTimerAndSubTimer(&timers->timers[category], &subtimer->timer); + } else if (subtimer != NULL) { + pb_StartTimer(&subtimer->timer); + } else { + pb_StartTimer(&timers->timers[category]); + } + } else { + if (subtimerlist != NULL) { + subtimerlist->current = subtimer; + } + + // toSwitchTo Is Async (KERNEL/COPY_ASYNC) + if (!asyncs_outstanding(timers)) { + /* No asyncs outstanding, insert a fresh async marker */ + insert_submarker(timers, label, category); + timers->async_begin = currentTime; + } else if(!is_async(timers->current)) { + /* Previous asyncs still in flight, but a previous SwitchTo + * already marked the end of the most recent async operation, + * so we can rename that marker as the beginning of this async + * operation */ + + struct pb_async_time_marker_list * last_event = get_last_async(timers); + last_event->timerID = category; + last_event->label = label; + } // else, marker for switchToThis was already inserted + + //toSwitchto is already asynchronous, but if current/prev state is async too, then DRIVER is already running + if (!is_async(timers->current)) { + pb_StartTimer(&timers->timers[pb_TimerID_DRIVER]); + } + } + } + + timers->current = category; +#endif +#endif +} + +void +pb_PrintTimerSet(struct pb_TimerSet *timers) +{ +#if 0 +#ifndef DISABLE_PARBOIL_TIMER + pb_Timestamp wall_end = 0; //get_time(); + + struct pb_Timer *t = timers->timers; + struct pb_SubTimer* sub = NULL; + + int maxSubLength; + + const char *categories[] = { + "IO", "Kernel", "Copy", "Driver", "Copy Async", "Compute" + }; + + const int maxCategoryLength = 10; + + int i; + for(i = 1; i < pb_TimerID_LAST-1; ++i) { // exclude NONE and OVRELAP from this format + if(pb_GetElapsedTime(&t[i]) != 0) { + + // Print Category Timer + printf("%-*s: %f\n", maxCategoryLength, categories[i-1], pb_GetElapsedTime(&t[i])); + + if (timers->sub_timer_list[i] != NULL) { + sub = timers->sub_timer_list[i]->subtimer_list; + maxSubLength = 0; + while (sub != NULL) { + // Find longest SubTimer label + if (strlen(sub->label) > maxSubLength) { + maxSubLength = strlen(sub->label); + } + sub = sub->next; + } + + // Fit to Categories + if (maxSubLength <= maxCategoryLength) { + maxSubLength = maxCategoryLength; + } + + sub = timers->sub_timer_list[i]->subtimer_list; + + // Print SubTimers + while (sub != NULL) { + printf(" -%-*s: %f\n", maxSubLength, sub->label, pb_GetElapsedTime(&sub->timer)); + sub = sub->next; + } + } + } + } + + if(pb_GetElapsedTime(&t[pb_TimerID_OVERLAP]) != 0) + printf("CPU/Kernel Overlap: %f\n", pb_GetElapsedTime(&t[pb_TimerID_OVERLAP])); + + float walltime = (wall_end - timers->wall_begin)/ 1e6; + printf("Timer Wall Time: %f\n", walltime); + +#endif +#endif +} + +void pb_DestroyTimerSet(struct pb_TimerSet * timers) +{ +#ifndef DISABLE_PARBOIL_TIMER + /* clean up all of the async event markers */ + struct pb_async_time_marker_list* event = timers->async_markers; + while(event != NULL) { + + cl_int ciErrNum = CL_SUCCESS; + ciErrNum = clWaitForEvents(1, (cl_event *)(event)->marker); + if (ciErrNum != CL_SUCCESS) { + //fprintf(stderr, "Error Waiting for Events!\n"); + } + + ciErrNum = clReleaseEvent( *((cl_event *)(event)->marker) ); + if (ciErrNum != CL_SUCCESS) { + fprintf(stderr, "Error Release Events!\n"); + } + + free((event)->marker); + struct pb_async_time_marker_list* next = ((event)->next); + + free(event); + + // (*event) = NULL; + event = next; + } + + int i = 0; + for(i = 0; i < pb_TimerID_LAST; ++i) { + if (timers->sub_timer_list[i] != NULL) { + struct pb_SubTimer *subtimer = timers->sub_timer_list[i]->subtimer_list; + struct pb_SubTimer *prev = NULL; + while (subtimer != NULL) { + free(subtimer->label); + prev = subtimer; + subtimer = subtimer->next; + free(prev); + } + free(timers->sub_timer_list[i]); + } + } +#endif +} + +static pb_Platform** ptr = NULL; + +// verbosely print out list of platforms and their devices to the console. +pb_Platform** +pb_GetPlatforms() { + if (ptr == NULL) { + cl_uint num_platforms; + clGetPlatformIDs(0, NULL, &num_platforms); + if (num_platforms == 0) return NULL; + + ptr = (pb_Platform **) malloc(sizeof(pb_Platform *) * (num_platforms + 1)); + cl_platform_id* ids = (cl_platform_id *) malloc(num_platforms * sizeof(cl_platform_id)); + clGetPlatformIDs(num_platforms, ids, NULL); + + unsigned int i; + for (i = 0; i < num_platforms; i++) { + ptr[i] = (pb_Platform *) malloc(sizeof(pb_Platform)); + ptr[i]->clPlatform = ids[i]; + ptr[i]->contexts = NULL; + ptr[i]->in_use = 0; + ptr[i]->devices = NULL; + + size_t sz; + clGetPlatformInfo(ids[i], CL_PLATFORM_NAME, 0, NULL, &sz); + char* name = (char *) malloc(sz + 1); + clGetPlatformInfo(ids[i], CL_PLATFORM_NAME, sz, name, NULL); + name[sz] = '\0'; + ptr[i]->name = name; + + clGetPlatformInfo(ids[i], CL_PLATFORM_VERSION, 0, NULL, &sz); + char* version = (char *) malloc(sz + 1); + clGetPlatformInfo(ids[i], CL_PLATFORM_VERSION, sz, version, NULL); + version[sz] = '\0'; + ptr[i]->version = version; + } + ptr[i] = NULL; + + free(ids); + } + + return (pb_Platform**) ptr; +} + +pb_Context* +createContext(pb_Platform* pb_platform, pb_Device* pb_device) { + pb_Context* c = (pb_Context*) malloc(sizeof(pb_Context)); + cl_int clStatus; + cl_context_properties clCps[3] = { + CL_CONTEXT_PLATFORM, (cl_context_properties)(pb_platform->clPlatform), 0 + }; + c->clContext = + clCreateContext(clCps, 1, (cl_device_id*)&pb_device->clDevice, NULL, NULL, &clStatus); + c->clPlatformId = pb_platform->clPlatform; + c->clDeviceId = pb_device->clDevice; + c->pb_platform = pb_platform; + c->pb_device = pb_device; + pb_platform->in_use = 1; + pb_device->in_use = 1; + unsigned int i = 0; + if (pb_platform->contexts == NULL) { + pb_platform->contexts = (pb_Context**) malloc(2*sizeof(pb_Context*)); + } else { + for (i = 0; pb_platform->contexts[i] != NULL; i++) {}; + pb_platform->contexts = (pb_Context**) realloc(pb_platform->contexts, + (i+1)*sizeof(pb_Context*)); + } + pb_platform->contexts[i+1] = NULL; + pb_platform->contexts[i] = c; + return c; +} + +// choose a platform by name. +pb_Platform* +pb_GetPlatformByName(const char* name) { + pb_Platform** ps = (pb_Platform **) pb_GetPlatforms(); + if (ps == NULL) return NULL; + if (name == NULL) { + return *ps; + } + + while (*ps) { + if (strstr((*ps)->name, name)) break; + ps++; + } + return (pb_Platform*) *ps; +} + +pb_Device** +pb_GetDevices(pb_Platform* pb_platform) { + if (pb_platform->devices == NULL) { + cl_uint num_devs; + cl_device_id* dev_ids; + clGetDeviceIDs((cl_platform_id) pb_platform->clPlatform, + CL_DEVICE_TYPE_ALL, 0, NULL, &num_devs); + if (num_devs == 0) return NULL; + + pb_platform->devices = + (pb_Device **) malloc((num_devs + 1) * sizeof(pb_Device *)); + dev_ids = (cl_device_id *) malloc(sizeof(cl_device_id) * num_devs); + clGetDeviceIDs((cl_platform_id) pb_platform->clPlatform, + CL_DEVICE_TYPE_ALL, num_devs, dev_ids, NULL); + + unsigned int i; + for (i = 0; i < num_devs; i++) { + pb_platform->devices[i] = (pb_Device *) malloc(sizeof(pb_Device)); + + pb_platform->devices[i]->clDevice = dev_ids[i]; + pb_platform->devices[i]->id = i; + + size_t sz; + clGetDeviceInfo(dev_ids[i], CL_DEVICE_NAME, 0, NULL, &sz); + char* name = (char *) malloc(sz + 1); + clGetDeviceInfo(dev_ids[i], CL_DEVICE_NAME, sz, name, NULL); + name[sz] = '\0'; + pb_platform->devices[i]->name = (char *) name; + + cl_bool available; + clGetDeviceInfo(dev_ids[i], CL_DEVICE_AVAILABLE, sizeof(cl_bool), &available, NULL); + pb_platform->devices[i]->available = (int) available; + + pb_platform->devices[i]->in_use = 0; + } + pb_platform->devices[i] = NULL; + } + return (pb_Device **) pb_platform->devices; +} + +// choose a device by name. +static pb_Device* +pb_SelectDeviceByName(pb_Device **ds, const char* name) { + if (ds == NULL) return NULL; + if (name == NULL) return *ds; + while (*ds) { + if (strstr((*ds)->name, name)) break; + ds++; + } + + return *ds; +} + +// choose a device by name and set the device's 'in_use' flag. +pb_Device* +pb_GetDeviceByName(pb_Platform* pb_platform, const char* name) { + pb_Device** ds = (pb_Device **) pb_GetDevices(pb_platform); + pb_Device *d = pb_SelectDeviceByName(ds, name); + + if (d) d->in_use = 1; + + return d; +} + +void +pb_ReleasePlatforms() { + if (!ptr) return; + pb_Platform** cur_ptr = ptr; + while (*cur_ptr) { + pb_Platform* pfptr = *cur_ptr++; + if (pfptr->devices) { + pb_Device** dvptr = pfptr->devices; + while (*dvptr) { + pb_Device* d = *dvptr++; + free(d->name); + free(d); + } + free(pfptr->devices); + } + if (pfptr->contexts) { + pb_Context** cptr = pfptr->contexts; + while (*cptr) { + free(*cptr++); + } + free(pfptr->contexts); + } + free(pfptr->name); + free(pfptr); + } + free(ptr); + ptr = NULL; +} + +pb_Platform* +pb_GetPlatformByNameAndVersion(const char* name, const char* version) { + pb_Platform** ps = (pb_Platform **) pb_GetPlatforms(); + if (ps == NULL) return NULL; + if (name == NULL) return *ps; + while (*ps) { + if (strstr((*ps)->name, name) && strstr((*ps)->version, version)) break; + ps++; + } + return (pb_Platform*) *ps; +} + +/* Return a pointer to the device at the specified index, or NULL. + * Used by pb_GetDevice. */ +static pb_Device * +select_device_by_index(pb_Device** ds, int id) +{ + int i = 0; + pb_Device** p = ds; + while (*p && (i < id)) { p++; i++; } + return *p; +} + +/* Return a pointer to the device with the specified type, or NULL. + * Used by pb_GetDevice. */ +static pb_Device * +select_device_by_type(pb_Device** ds, + enum pb_DeviceSelectionCriterion criterion) +{ + cl_device_type sought_type; + + /* Determine the OpenCL device type to search for */ + switch(criterion) { + case pb_Device_CPU: + sought_type = CL_DEVICE_TYPE_CPU; + break; + case pb_Device_GPU: + sought_type = CL_DEVICE_TYPE_GPU; + break; + case pb_Device_ACCELERATOR: + sought_type = CL_DEVICE_TYPE_ACCELERATOR; + break; + default: + fprintf(stderr, "pb_GetDevice: Invalid device type"); + exit(-1); + } + + /* Find the device */ + { + pb_Device** p = ds; + cl_device_type type; + while (*p) { + clGetDeviceInfo(((cl_device_id) ((*p)->clDevice)), CL_DEVICE_TYPE, + sizeof(cl_device_type), &type, NULL); + if (type == sought_type) break; + } + + return *p; + } +} + +pb_Device* +pb_GetDevice(pb_Platform* pb_platform, struct pb_DeviceParam *device) +{ + pb_Device** ds = (pb_Device **) pb_GetDevices(pb_platform); + + // The list of devices must be nonempty + if (ds == NULL || *ds == NULL) { + fprintf(stderr, "Error: No device is found in platform: name = %s, version = %s\n.", pb_platform->name, pb_platform->version); + exit(-1); + } + + pb_Device *selected_device = NULL; + + if (device != NULL) { + /* Use 'device' to select and return a device. + * If unable to select a device, fall + * back on the default selection mechanism. */ + switch(device->criterion) { + case pb_Device_INDEX: + selected_device = select_device_by_index(ds, device->index); + break; + case pb_Device_GPU: + case pb_Device_CPU: + case pb_Device_ACCELERATOR: + selected_device = select_device_by_type(ds, device->criterion); + break; + case pb_Device_NAME: + selected_device = pb_SelectDeviceByName(ds, device->name); + break; + default: + fprintf(stderr, "pb_GetDevice: Invalid argument"); + exit(-1); + } + } + + /* By default or if user-specified selection failed, + * select the first device */ + if (selected_device == NULL) + selected_device = *ds; + + /* Set the in_use flag */ + selected_device->in_use = 1; + + return selected_device; +} + +pb_Device* +pb_GetDeviceByEnvVars(pb_Platform* pb_platform) { + + /* Convert environment variables to a 'pb_DeviceParam' */ + struct pb_DeviceParam *param = NULL; + + char* device_num = getenv("PARBOIL_DEVICE_NUMBER"); + if (device_num && strcmp(device_num, "")) { + int id = atoi(device_num); + param = pb_DeviceParam_index(id); + } + else { + char* device_name = getenv("PARBOIL_DEVICE_NAME"); + if (device_name && strcmp(device_name, "")) { + param = pb_DeviceParam_name(strdup(device_name)); + } + else { + char* device_type = getenv("PARBOIL_DEVICE_TYPE"); + if (device_type && strcmp(device_type, "")) { + if (strcmp(device_type, "CPU") == 0) + param = pb_DeviceParam_cpu(); + else if (strcmp(device_type, "GPU") == 0) + param = pb_DeviceParam_gpu(); + else if (strcmp(device_type, "ACCELERATOR") == 0) + param = pb_DeviceParam_accelerator(); + } + } + } + + /* Get a device */ + pb_Device *d = pb_GetDevice(pb_platform, param); + pb_FreeDeviceParam(param); + + return d; +} + +pb_Platform* +pb_GetPlatformByEnvVars() { + char* name = getenv("PARBOIL_PLATFORM_NAME"); + char* version = getenv("PARBOIL_PLATFORM_VERSION"); + + /* Create a pb_PlatformParam object (or NULL) representing the data from the + * environment variables */ + struct pb_PlatformParam *platform; + + if (name) { + if (version) { + platform = pb_PlatformParam(strdup(name), strdup(version)); + } + else { + platform = pb_PlatformParam(strdup(name), NULL); + } + } + else { + platform = NULL; + } + + /* Convert to a platform */ + pb_Platform *p = pb_GetPlatform(platform); + pb_FreePlatformParam(platform); + + return p; +} + +/* Choose an OpenCL platform based on the given command-line parameters. + * If NULL, use the default OpenCL platform. */ +pb_Platform* +pb_GetPlatform(struct pb_PlatformParam *platform) { + if (platform != NULL) { + /* Try to use command-line parameters to choose platform */ + char *name = platform->name; + char *version = platform->version; + + if (!name) { + fprintf(stderr, "Internal error: NULL pointer"); + exit(-1); + } + + if (version) { + pb_Platform* p = pb_GetPlatformByNameAndVersion(name, version); + if (p) return p; + } + + pb_Platform* p = pb_GetPlatformByName(name); + if (p) return p; + } + + pb_Platform* p = pb_GetPlatformByName(NULL); + if (p == NULL) { + fprintf(stderr, "Error: No OpenCL platform in this system. Exiting."); + exit(-1); + } + return p; +} + +//extern void perf_init(); +//extern void mxpa_scheduler_init(); + +pb_Context* +pb_InitOpenCLContext(struct pb_Parameters* parameters) { +#if 0 + pb_Platform* ps = pb_GetPlatform(parameters->platform); + if (!ps) return NULL; + pb_Device* ds = pb_GetDevice(ps, parameters->device); + if (!ds) return NULL; + + /* HERE INITIALIZE TIMER */ + //perf_init(); + //mxpa_scheduler_init(); + + pb_Context* c = createContext(ps, ds); + pb_PrintPlatformInfo(c); + return c; +#endif + cl_int _err; + cl_platform_id platform_id; + cl_device_id device_id; + cl_context context; + clGetPlatformIDs(1, &platform_id, NULL); + clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_DEFAULT, 1, &device_id, NULL); + context = clCreateContext(NULL, 1, &device_id, NULL, NULL, &_err); + + pb_Context* c = (pb_Context*)malloc(sizeof(pb_Context)); + c->clContext = context; + c->clDeviceId = device_id; + c->clPlatformId = platform_id; + c->pb_platform = (pb_Platform*)malloc(sizeof(pb_Platform)); + c->pb_device = (pb_Device*)malloc(sizeof(pb_Device)); + c->pb_platform->devices = (pb_Device**)malloc(sizeof(pb_Device*) * 2); + c->pb_platform->devices[0] = c->pb_device; + c->pb_platform->devices[1] = NULL; + c->pb_platform->contexts = (pb_Context**)malloc(sizeof(pb_Context*) * 2); + c->pb_platform->contexts[0] = c; + c->pb_platform->contexts[1] = NULL; + c->pb_platform->in_use = 1; + c->pb_device->in_use = 1; + return c; +} + +void +pb_ReleaseOpenCLContext(pb_Context* c) { + pb_ReleasePlatforms(); +} + +void +pb_PrintPlatformInfo(pb_Context* c) { + /*pb_Platform** ps = pb_GetPlatforms(); + if (!ps) { + fprintf (stderr, "No platform found"); + return; + } + + printf ("********************************************************\n"); + printf ("DETECTED OPENCL PLATFORMS AND DEVICES:\n"); + printf ("--------------------------------------------------------\n"); + + while (*ps) { + printf ("PLATFORM = %s, %s", (*ps)->name, (*ps)->version); + if (c->pb_platform == *ps) printf (" (SELECTED)"); + printf ("\n"); + + pb_Device** ds = (pb_Device **) pb_GetDevices((*ps)); + if (ds == NULL) { + printf (" + (No devices)\n"); + } else { + while (*ds) { + printf (" + %d: %s", (*ds)->id, (*ds)->name); + if (c->pb_device == *ds) printf (" (SELECTED)"); + printf ("\n"); + ds++; + } + } + + ps++; + } + printf ("********************************************************\n");*/ +} + +#ifdef MEASURE_KERNEL_TIME + +#undef clEnqueueNDRangeKernel + +//extern void pin_trace_enable(char*); +//extern void pin_trace_disable(char*); + +cl_int +pb_clEnqueueNDRangeKernel(cl_command_queue q/* command_queue */, + cl_kernel k/* kernel */, + cl_uint d/* work_dim */, + const size_t * o/* global_work_offset */, + const size_t * gws/* global_work_size */, + const size_t * lws/* local_work_size */, + cl_uint n/* num_events_in_wait_list */, + const cl_event * w/* event_wait_list */, + cl_event * e/* event */) { + + char buf[128]; + struct timeval begin, end; + clGetKernelInfo(k, CL_KERNEL_FUNCTION_NAME, 128, buf, NULL); + +#if 0 + int i; + for (i = 0; i < d; i++) { + printf ("%s: %d: %d / %d\n", buf, i, gws[i], (lws == NULL ? 0 : lws[i])); + } +#endif + + clFinish(q); clFlush(q); + //pin_trace_enable(buf); + //gettimeofday(&begin, NULL); + cl_int result = clEnqueueNDRangeKernel(q, k, d, o, gws, lws, n, w, e); + clFinish(q); clFlush(q); + //gettimeofday(&end, NULL); + //pin_trace_disable(buf); + //float t = (float)(end.tv_sec - begin.tv_sec) + (end.tv_usec - begin.tv_usec) / 1000000.0f; + fflush(stdout); + fflush(stderr); + //printf ("PBTIMER: %s: %f\n", buf, t); + return result; +} + +#endif + +void +pb_sig_float(char* c, float* p, int sz) { + int i; + double s = 0.0; + for (i = 0; i < sz; i++) s += p[i] * (float)(i+1); + printf ("[Signature] %s = %lf\n", c, s); +} + +void +pb_sig_double(char* c, double* p, int sz) { + int i; + double s = 0.0; + for (i = 0; i < sz; i++) s += p[i]; + printf ("[Signature] %s = %lf\n", c, s); +} + +void +pb_sig_short(char* c, short* p, int sz) { + int i; + long long int s = 0; + for (i = 0; i < sz; i++) s += p[i]; + printf ("[Signature] %s = %lld\n", c, s); +} + +void +pb_sig_int(char* c, int* p, int sz) { + int i; + long long int s = 0; + for (i = 0; i < sz; i++) s += p[i]; + printf ("[Signature] %s = %lld\n", c, s); +} + +void +pb_sig_uchar(char* c, unsigned char* p, unsigned int sz) { + int i; + unsigned long long int s = 0; + for (i = 0; i < sz; i++) s += p[i]; + printf ("[Signature] %s = %lld\n", c, s); +} + +void pb_sig_clmem(char* s, cl_command_queue command_queue, cl_mem memobj, int ty) { + size_t sz; + if (clGetMemObjectInfo(memobj, CL_MEM_SIZE, sizeof(size_t), &sz, NULL) != CL_SUCCESS) { + printf ("Something wrong.\n"); + assert(0); + } else { + printf ("size = %d\n", sz); + } + char* hp; // = (char*) malloc(sz); + //posix_memalign((void**)&hp, 64, sz); + hp = (char*)malloc(sz); + + clEnqueueReadBuffer (command_queue, + memobj, + CL_TRUE, + 0, + sz, + hp, + 0, + NULL, + NULL); + + if (ty == T_FLOAT) pb_sig_float(s, (float*)hp, sz/sizeof(float)); + if (ty == T_DOUBLE) pb_sig_double(s, (double*)hp, sz/sizeof(double)); + if (ty == T_INT) pb_sig_int(s, (int*)hp, sz/sizeof(int)); + if (ty == T_SHORT) pb_sig_short(s, (short*)hp, sz/sizeof(short)); + if (ty == T_UCHAR) pb_sig_uchar(s, (unsigned char*)hp, sz/sizeof(char)); + + free(hp); +} + diff --git a/benchmarks/opencl/sad/reference.bin b/benchmarks/opencl/sad/reference.bin new file mode 100755 index 00000000..94fb04f5 Binary files /dev/null and b/benchmarks/opencl/sad/reference.bin differ diff --git a/benchmarks/opencl/sad/sad.h b/benchmarks/opencl/sad/sad.h new file mode 100644 index 00000000..3374fa04 --- /dev/null +++ b/benchmarks/opencl/sad/sad.h @@ -0,0 +1,83 @@ +/*************************************************************************** + *cr + *cr (C) Copyright 2007 The Board of Trustees of the + *cr University of Illinois + *cr All Rights Reserved + *cr + ***************************************************************************/ + +/* Search offsets within 16 pixels of (0,0) */ +#define SEARCH_RANGE 16 + +/* The total search area is 33 pixels square */ +#define SEARCH_DIMENSION (2*SEARCH_RANGE+1) + +/* The total number of search positions is 33^2 */ +#define MAX_POS 1089 + +/* This is padded to a multiple of 8 when allocating memory */ +#define MAX_POS_PADDED 1096 + +/* VBSME block indices in the SAD array for different + * block sizes. The index is computed from the + * image size in macroblocks. Block sizes are (height, width): + * 1: 16 by 16 pixels, one block per macroblock + * 2: 8 by 16 pixels, 2 blocks per macroblock + * 3: 16 by 8 pixels, 2 blocks per macroblock + * 4: 8 by 8 pixels, 4 blocks per macroblock + * 5: 4 by 8 pixels, 8 blocks per macroblock + * 6: 8 by 4 pixels, 8 blocks per macroblock + * 7: 4 by 4 pixels, 16 blocks per macroblock + */ +#define SAD_TYPE_1_IX(image_size) 0 +#define SAD_TYPE_2_IX(image_size) ((image_size)*MAX_POS_PADDED) +#define SAD_TYPE_3_IX(image_size) ((image_size)*(3*MAX_POS_PADDED)) +#define SAD_TYPE_4_IX(image_size) ((image_size)*(5*MAX_POS_PADDED)) +#define SAD_TYPE_5_IX(image_size) ((image_size)*(9*MAX_POS_PADDED)) +#define SAD_TYPE_6_IX(image_size) ((image_size)*(17*MAX_POS_PADDED)) +#define SAD_TYPE_7_IX(image_size) ((image_size)*(25*MAX_POS_PADDED)) + +#define SAD_TYPE_IX(n, image_size) \ + ((n == 1) ? SAD_TYPE_1_IX(image_size) : \ + ((n == 2) ? SAD_TYPE_2_IX(image_size) : \ + ((n == 3) ? SAD_TYPE_3_IX(image_size) : \ + ((n == 4) ? SAD_TYPE_4_IX(image_size) : \ + ((n == 5) ? SAD_TYPE_5_IX(image_size) : \ + ((n == 6) ? SAD_TYPE_6_IX(image_size) : \ + (SAD_TYPE_7_IX(image_size) \ + ))))))) + +#define SAD_TYPE_1_CT 1 +#define SAD_TYPE_2_CT 2 +#define SAD_TYPE_3_CT 2 +#define SAD_TYPE_4_CT 4 +#define SAD_TYPE_5_CT 8 +#define SAD_TYPE_6_CT 8 +#define SAD_TYPE_7_CT 16 + +#define SAD_TYPE_CT(n) \ + ((n == 1) ? SAD_TYPE_1_CT : \ + ((n == 2) ? SAD_TYPE_2_CT : \ + ((n == 3) ? SAD_TYPE_3_CT : \ + ((n == 4) ? SAD_TYPE_4_CT : \ + ((n == 5) ? SAD_TYPE_5_CT : \ + ((n == 6) ? SAD_TYPE_6_CT : \ + (SAD_TYPE_7_CT \ + ))))))) + +#ifdef __cplusplus +extern "C" { +#endif + +void sad4_cpu(unsigned short *blk_sad, + unsigned short *frame, + unsigned short *ref, + int mb_width, + int mb_height); + +void larger_sads(unsigned short *sads, + int mbs); + +#ifdef __cplusplus +} +#endif diff --git a/benchmarks/opencl/sad/sad_kernel.h b/benchmarks/opencl/sad/sad_kernel.h new file mode 100644 index 00000000..4fbf23ef --- /dev/null +++ b/benchmarks/opencl/sad/sad_kernel.h @@ -0,0 +1,57 @@ +/*************************************************************************** + *cr + *cr (C) Copyright 2007 The Board of Trustees of the + *cr University of Illinois + *cr All Rights Reserved + *cr + ***************************************************************************/ + +/* Integer ceiling division. This computes ceil(x / y) */ +#define CEIL(x,y) (((x) + ((y) - 1)) / (y)) + +/* Fast multiplication by 33 */ +#define TIMES_DIM_POS(x) (((x) << 5) + (x)) + +/* Amount of dynamically allocated local storage + * measured in bytes, 2-byte words, and 8-byte words */ +#define SAD_LOC_SIZE_ELEMS (THREADS_W * THREADS_H * MAX_POS_PADDED) +#define SAD_LOC_SIZE_BYTES (SAD_LOC_SIZE_ELEMS * sizeof(unsigned short)) +#define SAD_LOC_SIZE_8B (SAD_LOC_SIZE_BYTES / sizeof(vec8b)) + +/* The search position index space is distributed across threads + * and across time. */ +/* This many search positions are calculated by each thread. + * Note: the optimized kernel requires that this number is + * divisible by 3. */ +#define POS_PER_THREAD 18 + +/* The width and height (in number of 4x4 blocks) of a tile from the + * current frame that is computed in a single thread block. */ +#define THREADS_W 1 +#define THREADS_H 1 + +// #define TIMES_THREADS_W(x) (((x) << 1) + (x)) +#define TIMES_THREADS_W(x) ((x) * THREADS_W) + +/* This structure is used for vector load/store operations. */ + +struct vec8b { + int fst; + int snd; +} __attribute__ ((aligned(8))); + + + +/* 4-by-4 SAD computation on the device. */ +/* +extern "C" __global__ void mb_sad_calc(unsigned short*, + unsigned short*, + int, int); +*/ +/* A function to get a reference to the "ref" texture, because sharing + * of textures between files isn't really supported. */ + /* +texture &get_ref(void); + +extern "C" __global__ void larger_sad_calc_8(unsigned short*, int, int); +extern "C" __global__ void larger_sad_calc_16(unsigned short*, int, int);*/