sad
This commit is contained in:
1
benchmarks/opencl/sad/DESCRIPTION
Executable file
1
benchmarks/opencl/sad/DESCRIPTION
Executable file
@@ -0,0 +1 @@
|
|||||||
|
Inputs: reference.bin frame.bin
|
||||||
68
benchmarks/opencl/sad/Makefile
Normal file
68
benchmarks/opencl/sad/Makefile
Normal file
@@ -0,0 +1,68 @@
|
|||||||
|
RISCV_TOOL_PATH = $(wildcard ~/dev/riscv-gnu-toolchain/drops)
|
||||||
|
POCL_CC_PATH = $(wildcard ~/dev/pocl/drops_riscv_cc)
|
||||||
|
POCL_INC_PATH = $(wildcard ../include)
|
||||||
|
POCL_LIB_PATH = $(wildcard ../lib)
|
||||||
|
VX_RT_PATH = $(wildcard ../../../runtime)
|
||||||
|
VX_SIMX_PATH = $(wildcard ../../../simX/obj_dir)
|
||||||
|
|
||||||
|
CC = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-gcc
|
||||||
|
CXX = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-g++
|
||||||
|
DMP = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-objdump
|
||||||
|
HEX = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-objcopy
|
||||||
|
GDB = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-gdb
|
||||||
|
|
||||||
|
VX_SRCS = $(VX_RT_PATH)/newlib/newlib.c
|
||||||
|
VX_SRCS += $(VX_RT_PATH)/startup/vx_start.s
|
||||||
|
VX_SRCS += $(VX_RT_PATH)/intrinsics/vx_intrinsics.s
|
||||||
|
VX_SRCS += $(VX_RT_PATH)/io/vx_io.s $(VX_RT_PATH)/io/vx_io.c
|
||||||
|
VX_SRCS += $(VX_RT_PATH)/fileio/fileio.s
|
||||||
|
VX_SRCS += $(VX_RT_PATH)/tests/tests.c
|
||||||
|
VX_SRCS += $(VX_RT_PATH)/vx_api/vx_api.c
|
||||||
|
VX_SRCS += $(VX_STR) $(VX_FIO) $(VX_NEWLIB) $(VX_INT) $(VX_IO) $(VX_API) $(VX_TEST)
|
||||||
|
|
||||||
|
VX_CFLAGS = -nostartfiles -Wl,-Bstatic,-T,$(VX_RT_PATH)/mains/vortex_link.ld
|
||||||
|
|
||||||
|
CXXFLAGS = -g -O0 -march=rv32im -mabi=ilp32
|
||||||
|
CXXFLAGS += -ffreestanding # program may not begin at main()
|
||||||
|
CXXFLAGS += -Wl,--gc-sections # enable garbage collection of unused input sections
|
||||||
|
CXXFLAGS += -fno-rtti -fno-non-call-exceptions # disable RTTI and exceptions
|
||||||
|
CXXFLAGS += -I$(POCL_INC_PATH) -I.
|
||||||
|
|
||||||
|
VX_LIBS = -Wl,--whole-archive lib$(PROJECT).a -Wl,--no-whole-archive $(POCL_LIB_PATH)/libOpenCL.a
|
||||||
|
QEMU_LIBS = -Wl,--whole-archive lib$(PROJECT).a -Wl,--no-whole-archive $(POCL_LIB_PATH)/qemu/libOpenCL.a
|
||||||
|
|
||||||
|
PROJECT = sad
|
||||||
|
|
||||||
|
SRCS = main.cc args.c parboil_opencl.c ocl.c gpu_info.c file.c image.c OpenCL_common.cpp
|
||||||
|
|
||||||
|
all: $(PROJECT).dump $(PROJECT).hex
|
||||||
|
|
||||||
|
lib$(PROJECT).a: kernel.cl
|
||||||
|
POCL_DEBUG=all POCL_DEBUG_LLVM_PASSES=1 LD_LIBRARY_PATH=$(RISCV_TOOL_PATH)/lib:$(POCL_CC_PATH)/lib $(POCL_CC_PATH)/bin/poclcc -o lib$(PROJECT).a kernel.cl
|
||||||
|
|
||||||
|
$(PROJECT).elf: $(SRCS) lib$(PROJECT).a
|
||||||
|
$(CXX) $(CXXFLAGS) $(VX_CFLAGS) $(VX_SRCS) $(SRCS) $(VX_LIBS) -o $(PROJECT).elf
|
||||||
|
|
||||||
|
$(PROJECT).qemu: $(SRCS) lib$(PROJECT).a
|
||||||
|
$(CXX) $(CXXFLAGS) $(SRCS) $(QEMU_LIBS) -o $(PROJECT).qemu
|
||||||
|
|
||||||
|
$(PROJECT).hex: $(PROJECT).elf
|
||||||
|
$(HEX) -O ihex $(PROJECT).elf $(PROJECT).hex
|
||||||
|
|
||||||
|
$(PROJECT).dump: $(PROJECT).elf
|
||||||
|
$(DMP) -D $(PROJECT).elf > $(PROJECT).dump
|
||||||
|
|
||||||
|
run: $(PROJECT).hex
|
||||||
|
POCL_DEBUG=all $(VX_SIMX_PATH)/Vcache_simX -E -a rv32i --core $(PROJECT).hex -s -b 1> emulator.debug
|
||||||
|
|
||||||
|
qemu: $(PROJECT).qemu
|
||||||
|
POCL_DEBUG=all $(RISCV_TOOL_PATH)/bin/qemu-riscv32 -d in_asm -D debug.log $(PROJECT).qemu
|
||||||
|
|
||||||
|
gdb-s: $(PROJECT).qemu
|
||||||
|
POCL_DEBUG=all $(RISCV_TOOL_PATH)/bin/qemu-riscv32 -g 1234 -d in_asm -D debug.log $(PROJECT).qemu
|
||||||
|
|
||||||
|
gdb-c: $(PROJECT).qemu
|
||||||
|
$(GDB) $(PROJECT).qemu
|
||||||
|
|
||||||
|
clean:
|
||||||
|
rm -rf *.o *.elf *.dump *.hex *.qemu *.log *.debug
|
||||||
298
benchmarks/opencl/sad/OpenCL_common.cpp
Normal file
298
benchmarks/opencl/sad/OpenCL_common.cpp
Normal file
@@ -0,0 +1,298 @@
|
|||||||
|
|
||||||
|
|
||||||
|
#include "OpenCL_common.h"
|
||||||
|
#include <stdlib.h>
|
||||||
|
#include <string.h>
|
||||||
|
|
||||||
|
// -1 for NO suitable device found, 0 if an appropriate device was found
|
||||||
|
int getOpenCLDevice(cl_platform_id *platform, cl_device_id *device, cl_device_type *reqDeviceType, int numRequests, ...) {
|
||||||
|
|
||||||
|
// Supported Device Requests (anything that returns cl_bool)
|
||||||
|
// CL_DEVICE_IMAGE_SUPPORT
|
||||||
|
// CL_DEVICE_HOST_UNIFIED_MEMORY
|
||||||
|
// CL_DEVICE_ERROR_CORRECTION_SUPPORT
|
||||||
|
// CL_DEVICE_AVAILABLE
|
||||||
|
// CL_DEVICE_COMPILER_AVAILABLE
|
||||||
|
|
||||||
|
cl_uint numEntries = 16;
|
||||||
|
cl_platform_id clPlatforms[numEntries];
|
||||||
|
cl_uint numPlatforms;
|
||||||
|
|
||||||
|
cl_device_id clDevices[numEntries];
|
||||||
|
cl_uint numDevices;
|
||||||
|
|
||||||
|
OCL_ERRCK_RETVAL ( clGetPlatformIDs(numEntries, clPlatforms, &numPlatforms) );
|
||||||
|
//fprintf(stderr, "Number of Platforms found: %d\n", numPlatforms);
|
||||||
|
bool needDevice = true;
|
||||||
|
|
||||||
|
for (int ip = 0; ip < numPlatforms && needDevice; ++ip) {
|
||||||
|
|
||||||
|
cl_platform_id clPlatform = clPlatforms[ip];
|
||||||
|
|
||||||
|
OCL_ERRCK_RETVAL ( clGetDeviceIDs(clPlatform, CL_DEVICE_TYPE_ALL, numEntries, clDevices, &numDevices) );
|
||||||
|
//fprintf(stderr, " Number of Devices found for Platform %d: %d\n", ip, numDevices);
|
||||||
|
|
||||||
|
for (int id = 0; (id < numDevices) && needDevice ; ++id) {
|
||||||
|
cl_device_id clDevice = clDevices[id];
|
||||||
|
cl_device_type clDeviceType;
|
||||||
|
|
||||||
|
bool canSatisfy = true;
|
||||||
|
|
||||||
|
if (reqDeviceType != NULL) {
|
||||||
|
OCL_ERRCK_RETVAL( clGetDeviceInfo(clDevice, CL_DEVICE_TYPE, sizeof(cl_device_type), &clDeviceType, NULL));
|
||||||
|
if (*reqDeviceType != CL_DEVICE_TYPE_ALL) {
|
||||||
|
if (*reqDeviceType != clDeviceType) {
|
||||||
|
canSatisfy = false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
va_list paramList;
|
||||||
|
va_start(paramList, numRequests);
|
||||||
|
for (int i = 0; (i < numRequests) && canSatisfy ; ++i) {
|
||||||
|
|
||||||
|
cl_device_info devReq = va_arg( paramList, cl_device_info );
|
||||||
|
cl_bool clInfoBool;
|
||||||
|
size_t infoRetSize = sizeof(cl_bool);
|
||||||
|
|
||||||
|
OCL_ERRCK_RETVAL( clGetDeviceInfo(clDevice, devReq, infoRetSize, &clInfoBool, NULL));
|
||||||
|
if (clInfoBool != true) {
|
||||||
|
canSatisfy = false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
va_end(paramList);
|
||||||
|
if (canSatisfy) {
|
||||||
|
*device = clDevice;
|
||||||
|
*platform = clPlatform;
|
||||||
|
needDevice = false;
|
||||||
|
if (reqDeviceType != NULL && (*reqDeviceType == CL_DEVICE_TYPE_ALL)) {
|
||||||
|
*reqDeviceType = clDeviceType;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} // End checking all devices for a platform
|
||||||
|
} // End checking all platforms
|
||||||
|
|
||||||
|
int retVal = -1;
|
||||||
|
if (needDevice) {
|
||||||
|
retVal = -1;
|
||||||
|
} else {
|
||||||
|
retVal = 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
return retVal;
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
const char* oclErrorString(cl_int error)
|
||||||
|
{
|
||||||
|
// From NVIDIA SDK
|
||||||
|
static const char* errorString[] = {
|
||||||
|
"CL_SUCCESS",
|
||||||
|
"CL_DEVICE_NOT_FOUND",
|
||||||
|
"CL_DEVICE_NOT_AVAILABLE",
|
||||||
|
"CL_COMPILER_NOT_AVAILABLE",
|
||||||
|
"CL_MEM_OBJECT_ALLOCATION_FAILURE",
|
||||||
|
"CL_OUT_OF_RESOURCES",
|
||||||
|
"CL_OUT_OF_HOST_MEMORY",
|
||||||
|
"CL_PROFILING_INFO_NOT_AVAILABLE",
|
||||||
|
"CL_MEM_COPY_OVERLAP",
|
||||||
|
"CL_IMAGE_FORMAT_MISMATCH",
|
||||||
|
"CL_IMAGE_FORMAT_NOT_SUPPORTED",
|
||||||
|
"CL_BUILD_PROGRAM_FAILURE",
|
||||||
|
"CL_MAP_FAILURE",
|
||||||
|
"",
|
||||||
|
"",
|
||||||
|
"",
|
||||||
|
"",
|
||||||
|
"",
|
||||||
|
"",
|
||||||
|
"",
|
||||||
|
"",
|
||||||
|
"",
|
||||||
|
"",
|
||||||
|
"",
|
||||||
|
"",
|
||||||
|
"",
|
||||||
|
"",
|
||||||
|
"",
|
||||||
|
"",
|
||||||
|
"",
|
||||||
|
"CL_INVALID_VALUE",
|
||||||
|
"CL_INVALID_DEVICE_TYPE",
|
||||||
|
"CL_INVALID_PLATFORM",
|
||||||
|
"CL_INVALID_DEVICE",
|
||||||
|
"CL_INVALID_CONTEXT",
|
||||||
|
"CL_INVALID_QUEUE_PROPERTIES",
|
||||||
|
"CL_INVALID_COMMAND_QUEUE",
|
||||||
|
"CL_INVALID_HOST_PTR",
|
||||||
|
"CL_INVALID_MEM_OBJECT",
|
||||||
|
"CL_INVALID_IMAGE_FORMAT_DESCRIPTOR",
|
||||||
|
"CL_INVALID_IMAGE_SIZE",
|
||||||
|
"CL_INVALID_SAMPLER",
|
||||||
|
"CL_INVALID_BINARY",
|
||||||
|
"CL_INVALID_BUILD_OPTIONS",
|
||||||
|
"CL_INVALID_PROGRAM",
|
||||||
|
"CL_INVALID_PROGRAM_EXECUTABLE",
|
||||||
|
"CL_INVALID_KERNEL_NAME",
|
||||||
|
"CL_INVALID_KERNEL_DEFINITION",
|
||||||
|
"CL_INVALID_KERNEL",
|
||||||
|
"CL_INVALID_ARG_INDEX",
|
||||||
|
"CL_INVALID_ARG_VALUE",
|
||||||
|
"CL_INVALID_ARG_SIZE",
|
||||||
|
"CL_INVALID_KERNEL_ARGS",
|
||||||
|
"CL_INVALID_WORK_DIMENSION",
|
||||||
|
"CL_INVALID_WORK_GROUP_SIZE",
|
||||||
|
"CL_INVALID_WORK_ITEM_SIZE",
|
||||||
|
"CL_INVALID_GLOBAL_OFFSET",
|
||||||
|
"CL_INVALID_EVENT_WAIT_LIST",
|
||||||
|
"CL_INVALID_EVENT",
|
||||||
|
"CL_INVALID_OPERATION",
|
||||||
|
"CL_INVALID_GL_OBJECT",
|
||||||
|
"CL_INVALID_BUFFER_SIZE",
|
||||||
|
"CL_INVALID_MIP_LEVEL",
|
||||||
|
"CL_INVALID_GLOBAL_WORK_SIZE",
|
||||||
|
};
|
||||||
|
|
||||||
|
const int errorCount = sizeof(errorString) / sizeof(errorString[0]);
|
||||||
|
|
||||||
|
const int index = -error;
|
||||||
|
|
||||||
|
return (index >= 0 && index < errorCount) ? errorString[index] : "";
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
const char* oclDebugErrString(cl_int error, cl_device_id device)
|
||||||
|
{
|
||||||
|
// From NVIDIA SDK
|
||||||
|
static const char* errorString[] = {
|
||||||
|
"CL_SUCCESS",
|
||||||
|
"CL_DEVICE_NOT_FOUND",
|
||||||
|
"CL_DEVICE_NOT_AVAILABLE",
|
||||||
|
"CL_COMPILER_NOT_AVAILABLE",
|
||||||
|
"CL_MEM_OBJECT_ALLOCATION_FAILURE",
|
||||||
|
"CL_OUT_OF_RESOURCES",
|
||||||
|
"CL_OUT_OF_HOST_MEMORY",
|
||||||
|
"CL_PROFILING_INFO_NOT_AVAILABLE",
|
||||||
|
"CL_MEM_COPY_OVERLAP",
|
||||||
|
"CL_IMAGE_FORMAT_MISMATCH",
|
||||||
|
"CL_IMAGE_FORMAT_NOT_SUPPORTED",
|
||||||
|
"CL_BUILD_PROGRAM_FAILURE",
|
||||||
|
"CL_MAP_FAILURE",
|
||||||
|
"",
|
||||||
|
"",
|
||||||
|
"",
|
||||||
|
"",
|
||||||
|
"",
|
||||||
|
"",
|
||||||
|
"",
|
||||||
|
"",
|
||||||
|
"",
|
||||||
|
"",
|
||||||
|
"",
|
||||||
|
"",
|
||||||
|
"",
|
||||||
|
"",
|
||||||
|
"",
|
||||||
|
"",
|
||||||
|
"",
|
||||||
|
"CL_INVALID_VALUE",
|
||||||
|
"CL_INVALID_DEVICE_TYPE",
|
||||||
|
"CL_INVALID_PLATFORM",
|
||||||
|
"CL_INVALID_DEVICE",
|
||||||
|
"CL_INVALID_CONTEXT",
|
||||||
|
"CL_INVALID_QUEUE_PROPERTIES",
|
||||||
|
"CL_INVALID_COMMAND_QUEUE",
|
||||||
|
"CL_INVALID_HOST_PTR",
|
||||||
|
"CL_INVALID_MEM_OBJECT",
|
||||||
|
"CL_INVALID_IMAGE_FORMAT_DESCRIPTOR",
|
||||||
|
"CL_INVALID_IMAGE_SIZE",
|
||||||
|
"CL_INVALID_SAMPLER",
|
||||||
|
"CL_INVALID_BINARY",
|
||||||
|
"CL_INVALID_BUILD_OPTIONS",
|
||||||
|
"CL_INVALID_PROGRAM",
|
||||||
|
"CL_INVALID_PROGRAM_EXECUTABLE",
|
||||||
|
"CL_INVALID_KERNEL_NAME",
|
||||||
|
"CL_INVALID_KERNEL_DEFINITION",
|
||||||
|
"CL_INVALID_KERNEL",
|
||||||
|
"CL_INVALID_ARG_INDEX",
|
||||||
|
"CL_INVALID_ARG_VALUE",
|
||||||
|
"CL_INVALID_ARG_SIZE",
|
||||||
|
"CL_INVALID_KERNEL_ARGS",
|
||||||
|
"CL_INVALID_WORK_DIMENSION",
|
||||||
|
"CL_INVALID_WORK_GROUP_SIZE",
|
||||||
|
"CL_INVALID_WORK_ITEM_SIZE",
|
||||||
|
"CL_INVALID_GLOBAL_OFFSET",
|
||||||
|
"CL_INVALID_EVENT_WAIT_LIST",
|
||||||
|
"CL_INVALID_EVENT",
|
||||||
|
"CL_INVALID_OPERATION",
|
||||||
|
"CL_INVALID_GL_OBJECT",
|
||||||
|
"CL_INVALID_BUFFER_SIZE",
|
||||||
|
"CL_INVALID_MIP_LEVEL",
|
||||||
|
"CL_INVALID_GLOBAL_WORK_SIZE",
|
||||||
|
};
|
||||||
|
|
||||||
|
const int errorCount = sizeof(errorString) / sizeof(errorString[0]);
|
||||||
|
|
||||||
|
const int index = -error;
|
||||||
|
|
||||||
|
if (index == 4) {
|
||||||
|
cl_uint maxMemAlloc = 0;
|
||||||
|
|
||||||
|
OCL_ERRCK_RETVAL ( clGetDeviceInfo( device, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(cl_ulong), &maxMemAlloc, NULL) );
|
||||||
|
|
||||||
|
|
||||||
|
fprintf(stderr, " Device Maximum block allocation size: %lu\n", maxMemAlloc);
|
||||||
|
}
|
||||||
|
|
||||||
|
return (index >= 0 && index < errorCount) ? errorString[index] : "";
|
||||||
|
}
|
||||||
|
|
||||||
|
char* oclLoadProgSource(const char* cFilename, const char* cPreamble, size_t* szFinalLength)
|
||||||
|
{
|
||||||
|
// locals
|
||||||
|
FILE* pFileStream = NULL;
|
||||||
|
size_t szSourceLength;
|
||||||
|
|
||||||
|
// open the OpenCL source code file
|
||||||
|
#ifdef _WIN32 // Windows version
|
||||||
|
if(fopen_s(&pFileStream, cFilename, "rb") != 0)
|
||||||
|
{
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
#else // Linux version
|
||||||
|
pFileStream = fopen(cFilename, "rb");
|
||||||
|
if(pFileStream == 0)
|
||||||
|
{
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
size_t szPreambleLength = strlen(cPreamble);
|
||||||
|
szPreambleLength = 0;
|
||||||
|
|
||||||
|
// get the length of the source code
|
||||||
|
fseek(pFileStream, 0, SEEK_END);
|
||||||
|
szSourceLength = ftell(pFileStream);
|
||||||
|
fseek(pFileStream, 0, SEEK_SET);
|
||||||
|
|
||||||
|
// allocate a buffer for the source code string and read it in
|
||||||
|
char* cSourceString = (char *)malloc(szSourceLength + szPreambleLength + 1);
|
||||||
|
memcpy(cSourceString, cPreamble, szPreambleLength);
|
||||||
|
if (fread((cSourceString) + szPreambleLength, szSourceLength, 1, pFileStream) != 1)
|
||||||
|
{
|
||||||
|
fclose(pFileStream);
|
||||||
|
free(cSourceString);
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
// close the file and return the total length of the combined (preamble + source) string
|
||||||
|
fclose(pFileStream);
|
||||||
|
if(szFinalLength != 0)
|
||||||
|
{
|
||||||
|
*szFinalLength = szSourceLength + szPreambleLength;
|
||||||
|
}
|
||||||
|
cSourceString[szSourceLength + szPreambleLength] = '\0';
|
||||||
|
|
||||||
|
return cSourceString;
|
||||||
|
}
|
||||||
22
benchmarks/opencl/sad/OpenCL_common.h
Normal file
22
benchmarks/opencl/sad/OpenCL_common.h
Normal file
@@ -0,0 +1,22 @@
|
|||||||
|
|
||||||
|
#ifndef __OPENCL_COMMON_H_
|
||||||
|
#define __OPENCL_COMMON_H_
|
||||||
|
|
||||||
|
#include <stdio.h>
|
||||||
|
#include <stdarg.h>
|
||||||
|
#include <CL/cl.h>
|
||||||
|
|
||||||
|
int getOpenCLDevice(cl_platform_id *platform, cl_device_id *device, cl_device_type *reqDeviceType, int numRequests, ...);
|
||||||
|
const char* oclErrorString(cl_int error);
|
||||||
|
const char* oclDebugErrString(cl_int error, cl_device_id device);
|
||||||
|
|
||||||
|
#define OCL_ERRCK_VAR(var) \
|
||||||
|
{ if (var != CL_SUCCESS) fprintf(stderr, "OpenCL Error (%s: %d): %s\n", __FILE__, __LINE__, oclErrorString(var)); }
|
||||||
|
|
||||||
|
#define OCL_ERRCK_RETVAL(s) \
|
||||||
|
{ cl_int clerr = (s);\
|
||||||
|
if (clerr != CL_SUCCESS) fprintf(stderr, "OpenCL Error (%s: %d): %s\n", __FILE__, __LINE__, oclErrorString(clerr)); }
|
||||||
|
|
||||||
|
char* oclLoadProgSource(const char* cFilename, const char* cPreamble, size_t* szFinalLength);
|
||||||
|
|
||||||
|
#endif
|
||||||
617
benchmarks/opencl/sad/args.c
Normal file
617
benchmarks/opencl/sad/args.c
Normal file
@@ -0,0 +1,617 @@
|
|||||||
|
|
||||||
|
#include <parboil.h>
|
||||||
|
#include <errno.h>
|
||||||
|
#include <limits.h>
|
||||||
|
#include <stdlib.h>
|
||||||
|
#include <string.h>
|
||||||
|
#include <stdio.h>
|
||||||
|
|
||||||
|
/*****************************************************************************/
|
||||||
|
/* Memory management routines */
|
||||||
|
|
||||||
|
/* Free an array of owned strings. */
|
||||||
|
void
|
||||||
|
pb_FreeStringArray(char **string_array)
|
||||||
|
{
|
||||||
|
char **p;
|
||||||
|
|
||||||
|
if (!string_array) return;
|
||||||
|
for (p = string_array; *p; p++) free(*p);
|
||||||
|
free(string_array);
|
||||||
|
}
|
||||||
|
|
||||||
|
struct pb_PlatformParam *
|
||||||
|
pb_PlatformParam(char *name, char *version)
|
||||||
|
{
|
||||||
|
if (name == NULL) {
|
||||||
|
fprintf(stderr, "pb_PlatformParam: Invalid argument\n");
|
||||||
|
exit(-1);
|
||||||
|
}
|
||||||
|
|
||||||
|
struct pb_PlatformParam *ret =
|
||||||
|
(struct pb_PlatformParam *)malloc(sizeof (struct pb_PlatformParam));
|
||||||
|
|
||||||
|
ret->name = name;
|
||||||
|
ret->version = version;
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
|
void
|
||||||
|
pb_FreePlatformParam(struct pb_PlatformParam *p)
|
||||||
|
{
|
||||||
|
if (p == NULL) return;
|
||||||
|
|
||||||
|
free(p->name);
|
||||||
|
free(p->version);
|
||||||
|
free(p);
|
||||||
|
}
|
||||||
|
|
||||||
|
struct pb_DeviceParam *
|
||||||
|
pb_DeviceParam_index(int index)
|
||||||
|
{
|
||||||
|
struct pb_DeviceParam *ret =
|
||||||
|
(struct pb_DeviceParam *)malloc(sizeof (struct pb_DeviceParam));
|
||||||
|
ret->criterion = pb_Device_INDEX;
|
||||||
|
ret->index = index;
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
|
struct pb_DeviceParam *
|
||||||
|
pb_DeviceParam_cpu(void)
|
||||||
|
{
|
||||||
|
struct pb_DeviceParam *ret =
|
||||||
|
(struct pb_DeviceParam *)malloc(sizeof (struct pb_DeviceParam));
|
||||||
|
ret->criterion = pb_Device_CPU;
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
|
struct pb_DeviceParam *
|
||||||
|
pb_DeviceParam_gpu(void)
|
||||||
|
{
|
||||||
|
struct pb_DeviceParam *ret =
|
||||||
|
(struct pb_DeviceParam *)malloc(sizeof (struct pb_DeviceParam));
|
||||||
|
ret->criterion = pb_Device_GPU;
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
|
struct pb_DeviceParam *
|
||||||
|
pb_DeviceParam_accelerator(void)
|
||||||
|
{
|
||||||
|
struct pb_DeviceParam *ret =
|
||||||
|
(struct pb_DeviceParam *)malloc(sizeof (struct pb_DeviceParam));
|
||||||
|
ret->criterion = pb_Device_ACCELERATOR;
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
|
struct pb_DeviceParam *
|
||||||
|
pb_DeviceParam_name(char *name)
|
||||||
|
{
|
||||||
|
struct pb_DeviceParam *ret =
|
||||||
|
(struct pb_DeviceParam *)malloc(sizeof (struct pb_DeviceParam));
|
||||||
|
ret->criterion = pb_Device_NAME;
|
||||||
|
ret->name = name;
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
|
void
|
||||||
|
pb_FreeDeviceParam(struct pb_DeviceParam *p)
|
||||||
|
{
|
||||||
|
if (p == NULL) return;
|
||||||
|
|
||||||
|
switch(p->criterion) {
|
||||||
|
case pb_Device_NAME:
|
||||||
|
free(p->name);
|
||||||
|
break;
|
||||||
|
case pb_Device_INDEX:
|
||||||
|
case pb_Device_CPU:
|
||||||
|
case pb_Device_ACCELERATOR:
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
fprintf(stderr, "pb_FreeDeviceParam: Invalid argument\n");
|
||||||
|
exit(-1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void
|
||||||
|
pb_FreeParameters(struct pb_Parameters *p)
|
||||||
|
{
|
||||||
|
free(p->outFile);
|
||||||
|
pb_FreeStringArray(p->inpFiles);
|
||||||
|
pb_FreePlatformParam(p->platform);
|
||||||
|
pb_FreeDeviceParam(p->device);
|
||||||
|
free(p);
|
||||||
|
}
|
||||||
|
|
||||||
|
/*****************************************************************************/
|
||||||
|
|
||||||
|
/* Parse a comma-delimited list of strings into an
|
||||||
|
* array of strings. */
|
||||||
|
static char **
|
||||||
|
read_string_array(char *in)
|
||||||
|
{
|
||||||
|
char **ret;
|
||||||
|
int i;
|
||||||
|
int count; /* Number of items in the input */
|
||||||
|
char *substring; /* Current substring within 'in' */
|
||||||
|
|
||||||
|
/* Count the number of items in the string */
|
||||||
|
count = 1;
|
||||||
|
for (i = 0; in[i]; i++) if (in[i] == ',') count++;
|
||||||
|
|
||||||
|
/* Allocate storage */
|
||||||
|
ret = (char **)malloc((count + 1) * sizeof(char *));
|
||||||
|
|
||||||
|
/* Create copies of the strings from the list */
|
||||||
|
substring = in;
|
||||||
|
for (i = 0; i < count; i++) {
|
||||||
|
char *substring_end;
|
||||||
|
int substring_length;
|
||||||
|
|
||||||
|
/* Find length of substring */
|
||||||
|
for (substring_end = substring;
|
||||||
|
(*substring_end != ',') && (*substring_end != 0);
|
||||||
|
substring_end++);
|
||||||
|
|
||||||
|
substring_length = substring_end - substring;
|
||||||
|
|
||||||
|
/* Allocate memory and copy the substring */
|
||||||
|
ret[i] = (char *)malloc(substring_length + 1);
|
||||||
|
memcpy(ret[i], substring, substring_length);
|
||||||
|
ret[i][substring_length] = 0;
|
||||||
|
|
||||||
|
/* go to next substring */
|
||||||
|
substring = substring_end + 1;
|
||||||
|
}
|
||||||
|
ret[i] = NULL; /* Write the sentinel value */
|
||||||
|
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
|
static void
|
||||||
|
report_parse_error(const char *str)
|
||||||
|
{
|
||||||
|
fputs(str, stderr);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Interpret a string as a 'pb_DeviceParam' value.
|
||||||
|
* Return a pointer to a new value, or NULL on failure.
|
||||||
|
*/
|
||||||
|
static struct pb_DeviceParam *
|
||||||
|
read_device_param(char *str)
|
||||||
|
{
|
||||||
|
/* Try different ways of interpreting 'device_string' until one works */
|
||||||
|
|
||||||
|
/* If argument is an integer, then interpret it as a device index */
|
||||||
|
errno = 0;
|
||||||
|
char *end;
|
||||||
|
long device_int = strtol(str, &end, 10);
|
||||||
|
if (!errno) {
|
||||||
|
/* Negative numbers are not valid */
|
||||||
|
if (device_int < 0 || device_int > INT_MAX) return NULL;
|
||||||
|
|
||||||
|
return pb_DeviceParam_index(device_int);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Match against predefined strings */
|
||||||
|
if (strcmp(str, "CPU") == 0)
|
||||||
|
return pb_DeviceParam_cpu();
|
||||||
|
if (strcmp(str, "GPU") == 0)
|
||||||
|
return pb_DeviceParam_gpu();
|
||||||
|
if (strcmp(str, "ACCELERATOR") == 0)
|
||||||
|
return pb_DeviceParam_accelerator();
|
||||||
|
|
||||||
|
/* Assume any other string is a device name */
|
||||||
|
return pb_DeviceParam_name(strdup(str));
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Interpret a string as a 'pb_PlatformParam' value.
|
||||||
|
* Return a pointer to a new value, or NULL on failure.
|
||||||
|
*/
|
||||||
|
static struct pb_PlatformParam *
|
||||||
|
read_platform_param(char *str)
|
||||||
|
{
|
||||||
|
int separator_index; /* Index of the '-' character separating
|
||||||
|
* name and version number. It's -1 if
|
||||||
|
* there's no '-' character. */
|
||||||
|
|
||||||
|
/* Find the last occurrence of '-' in 'str' */
|
||||||
|
{
|
||||||
|
char *cur;
|
||||||
|
separator_index = -1;
|
||||||
|
for (cur = str; *cur; cur++) {
|
||||||
|
if (*cur == '-') separator_index = cur - str;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/* The platform name is either the entire string, or all characters before
|
||||||
|
* the separator */
|
||||||
|
int name_length = separator_index == -1 ? strlen(str) : separator_index;
|
||||||
|
char *name_str = (char *)malloc(name_length + 1);
|
||||||
|
memcpy(name_str, str, name_length);
|
||||||
|
name_str[name_length] = 0;
|
||||||
|
|
||||||
|
/* The version is either NULL, or all characters after the separator */
|
||||||
|
char *version_str;
|
||||||
|
if (separator_index == -1) {
|
||||||
|
version_str = NULL;
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
const char *version_input_str = str + separator_index + 1;
|
||||||
|
int version_length = strlen(version_input_str);
|
||||||
|
|
||||||
|
version_str = (char *)malloc(version_length + 1);
|
||||||
|
memcpy(version_str, version_input_str, version_length);
|
||||||
|
version_str[version_length] = 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Create output structure */
|
||||||
|
return pb_PlatformParam(name_str, version_str);
|
||||||
|
}
|
||||||
|
|
||||||
|
/****************************************************************************/
|
||||||
|
/* Argument parsing state */
|
||||||
|
|
||||||
|
/* Argument parsing state.
|
||||||
|
*
|
||||||
|
* Arguments that are interpreted by the argument parser are removed from
|
||||||
|
* the list. Variables 'argc' and 'argn' do not count arguments that have
|
||||||
|
* been removed.
|
||||||
|
*
|
||||||
|
* During argument parsing, the array of arguments is compacted, overwriting
|
||||||
|
* the erased arguments. Variable 'argv_put' points to the array element
|
||||||
|
* where the next argument will be written. Variable 'argv_get' points to
|
||||||
|
* the array element where the next argument will be read from.
|
||||||
|
*/
|
||||||
|
struct argparse {
|
||||||
|
int argc; /* Number of arguments. Mutable. */
|
||||||
|
int argn; /* Current argument index. */
|
||||||
|
char **argv_get; /* Argument value being read. */
|
||||||
|
char **argv_put; /* Argument value being written.
|
||||||
|
* argv_put <= argv_get. */
|
||||||
|
};
|
||||||
|
|
||||||
|
static void
|
||||||
|
initialize_argparse(struct argparse *ap, int argc, char **argv)
|
||||||
|
{
|
||||||
|
ap->argc = argc;
|
||||||
|
ap->argn = 0;
|
||||||
|
ap->argv_get = ap->argv_put = argv;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Finish argument parsing, without processing the remaining arguments.
|
||||||
|
* Write new argument count into _argc. */
|
||||||
|
static void
|
||||||
|
finalize_argparse(struct argparse *ap, int *_argc, char **argv)
|
||||||
|
{
|
||||||
|
/* Move the remaining arguments */
|
||||||
|
for(; ap->argn < ap->argc; ap->argn++)
|
||||||
|
*ap->argv_put++ = *ap->argv_get++;
|
||||||
|
|
||||||
|
/* Update the argument count */
|
||||||
|
*_argc = ap->argc;
|
||||||
|
|
||||||
|
/* Insert a terminating NULL */
|
||||||
|
argv[ap->argc] = NULL;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Delete the current argument. The argument will not be visible
|
||||||
|
* when argument parsing is done. */
|
||||||
|
static void
|
||||||
|
delete_argument(struct argparse *ap)
|
||||||
|
{
|
||||||
|
if (ap->argn >= ap->argc) {
|
||||||
|
fprintf(stderr, "delete_argument\n");
|
||||||
|
}
|
||||||
|
ap->argc--;
|
||||||
|
ap->argv_get++;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Go to the next argument. Also, move the current argument to its
|
||||||
|
* final location in argv. */
|
||||||
|
static void
|
||||||
|
next_argument(struct argparse *ap)
|
||||||
|
{
|
||||||
|
if (ap->argn >= ap->argc) {
|
||||||
|
fprintf(stderr, "next_argument\n");
|
||||||
|
}
|
||||||
|
/* Move argument to its new location. */
|
||||||
|
*ap->argv_put++ = *ap->argv_get++;
|
||||||
|
ap->argn++;
|
||||||
|
}
|
||||||
|
|
||||||
|
static int
|
||||||
|
is_end_of_arguments(struct argparse *ap)
|
||||||
|
{
|
||||||
|
return ap->argn == ap->argc;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Get the current argument */
|
||||||
|
static char *
|
||||||
|
get_argument(struct argparse *ap)
|
||||||
|
{
|
||||||
|
return *ap->argv_get;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Get the current argument, and also delete it */
|
||||||
|
static char *
|
||||||
|
consume_argument(struct argparse *ap)
|
||||||
|
{
|
||||||
|
char *ret = get_argument(ap);
|
||||||
|
delete_argument(ap);
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
|
/****************************************************************************/
|
||||||
|
|
||||||
|
/* The result of parsing a command-line argument */
|
||||||
|
typedef enum {
|
||||||
|
ARGPARSE_OK, /* Success */
|
||||||
|
ARGPARSE_ERROR, /* Error */
|
||||||
|
ARGPARSE_DONE /* Success, and do not continue parsing */
|
||||||
|
} result;
|
||||||
|
|
||||||
|
typedef result parse_action(struct argparse *ap, struct pb_Parameters *params);
|
||||||
|
|
||||||
|
|
||||||
|
/* A command-line option */
|
||||||
|
struct option {
|
||||||
|
char short_name; /* If not 0, the one-character
|
||||||
|
* name of this option */
|
||||||
|
const char *long_name; /* If not NULL, the long name of this option */
|
||||||
|
parse_action *action; /* What to do when this option occurs.
|
||||||
|
* Sentinel value is NULL.
|
||||||
|
*/
|
||||||
|
};
|
||||||
|
|
||||||
|
/* Output file
|
||||||
|
*
|
||||||
|
* -o FILE
|
||||||
|
*/
|
||||||
|
static result
|
||||||
|
parse_output_file(struct argparse *ap, struct pb_Parameters *params)
|
||||||
|
{
|
||||||
|
if (is_end_of_arguments(ap))
|
||||||
|
{
|
||||||
|
report_parse_error("Expecting file name after '-o'\n");
|
||||||
|
return ARGPARSE_ERROR;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Replace the output file name */
|
||||||
|
free(params->outFile);
|
||||||
|
params->outFile = strdup(consume_argument(ap));
|
||||||
|
|
||||||
|
return ARGPARSE_OK;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Input files
|
||||||
|
*
|
||||||
|
* -i FILE,FILE,...
|
||||||
|
*/
|
||||||
|
static result
|
||||||
|
parse_input_files(struct argparse *ap, struct pb_Parameters *params)
|
||||||
|
{
|
||||||
|
if (is_end_of_arguments(ap))
|
||||||
|
{
|
||||||
|
report_parse_error("Expecting file name after '-i'\n");
|
||||||
|
return ARGPARSE_ERROR;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Replace the input file list */
|
||||||
|
pb_FreeStringArray(params->inpFiles);
|
||||||
|
params->inpFiles = read_string_array(consume_argument(ap));
|
||||||
|
return ARGPARSE_OK;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* End of options
|
||||||
|
*
|
||||||
|
* --
|
||||||
|
*/
|
||||||
|
|
||||||
|
static result
|
||||||
|
parse_end_options(struct argparse *ap, struct pb_Parameters *params)
|
||||||
|
{
|
||||||
|
return ARGPARSE_DONE;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* OpenCL device
|
||||||
|
*
|
||||||
|
* --device X
|
||||||
|
*/
|
||||||
|
|
||||||
|
static result
|
||||||
|
parse_device(struct argparse *ap, struct pb_Parameters *params)
|
||||||
|
{
|
||||||
|
/* Read the next argument, which specifies a device */
|
||||||
|
|
||||||
|
if (is_end_of_arguments(ap))
|
||||||
|
{
|
||||||
|
report_parse_error("Expecting device specification after '--device'\n");
|
||||||
|
return ARGPARSE_ERROR;
|
||||||
|
}
|
||||||
|
|
||||||
|
char *device_string = consume_argument(ap);
|
||||||
|
struct pb_DeviceParam *device_param = read_device_param(device_string);
|
||||||
|
|
||||||
|
if (!device_param) {
|
||||||
|
report_parse_error("Unrecognized device specification format on command line\n");
|
||||||
|
return ARGPARSE_ERROR;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Save the result */
|
||||||
|
pb_FreeDeviceParam(params->device);
|
||||||
|
params->device = device_param;
|
||||||
|
|
||||||
|
return ARGPARSE_OK;
|
||||||
|
}
|
||||||
|
|
||||||
|
static result
|
||||||
|
parse_platform(struct argparse *ap, struct pb_Parameters *params)
|
||||||
|
{
|
||||||
|
/* Read the next argument, which specifies a platform */
|
||||||
|
|
||||||
|
if (is_end_of_arguments(ap))
|
||||||
|
{
|
||||||
|
report_parse_error("Expecting device specification after '--platform'\n");
|
||||||
|
return ARGPARSE_ERROR;
|
||||||
|
}
|
||||||
|
|
||||||
|
char *platform_string = consume_argument(ap);
|
||||||
|
struct pb_PlatformParam *platform_param = read_platform_param(platform_string);
|
||||||
|
|
||||||
|
if (!platform_param) {
|
||||||
|
report_parse_error("Unrecognized platform specification format on command line\n");
|
||||||
|
return ARGPARSE_ERROR;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Save the result */
|
||||||
|
pb_FreePlatformParam(params->platform);
|
||||||
|
params->platform = platform_param;
|
||||||
|
|
||||||
|
return ARGPARSE_OK;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
static struct option options[] = {
|
||||||
|
{ 'o', NULL, &parse_output_file },
|
||||||
|
{ 'i', NULL, &parse_input_files },
|
||||||
|
{ '-', NULL, &parse_end_options },
|
||||||
|
{ 0, "device", &parse_device },
|
||||||
|
{ 0, "platform", &parse_platform },
|
||||||
|
{ 0, NULL, NULL }
|
||||||
|
};
|
||||||
|
|
||||||
|
static int
|
||||||
|
is_last_option(struct option *op)
|
||||||
|
{
|
||||||
|
return op->action == NULL;
|
||||||
|
}
|
||||||
|
|
||||||
|
/****************************************************************************/
|
||||||
|
|
||||||
|
/* Parse command-line parameters.
|
||||||
|
* Return zero on error, nonzero otherwise.
|
||||||
|
* On error, the other outputs may be invalid.
|
||||||
|
*
|
||||||
|
* The information collected from parameters is used to update
|
||||||
|
* 'ret'. 'ret' should be initialized.
|
||||||
|
*
|
||||||
|
* '_argc' and 'argv' are updated to contain only the unprocessed arguments.
|
||||||
|
*/
|
||||||
|
static int
|
||||||
|
pb_ParseParameters (struct pb_Parameters *ret, int *_argc, char **argv)
|
||||||
|
{
|
||||||
|
char *err_message;
|
||||||
|
struct argparse ap;
|
||||||
|
|
||||||
|
/* Each argument */
|
||||||
|
initialize_argparse(&ap, *_argc, argv);
|
||||||
|
while(!is_end_of_arguments(&ap)) {
|
||||||
|
result arg_result; /* Result of parsing this option */
|
||||||
|
char *arg = get_argument(&ap);
|
||||||
|
|
||||||
|
/* Process this argument */
|
||||||
|
if (arg[0] == '-') {
|
||||||
|
/* Single-character flag */
|
||||||
|
if ((arg[1] != 0) && (arg[2] == 0)) {
|
||||||
|
delete_argument(&ap); /* This argument is consumed here */
|
||||||
|
|
||||||
|
/* Find a matching short option */
|
||||||
|
struct option *op;
|
||||||
|
for (op = options; !is_last_option(op); op++) {
|
||||||
|
if (op->short_name == arg[1]) {
|
||||||
|
arg_result = (*op->action)(&ap, ret);
|
||||||
|
goto option_was_processed;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/* No option matches */
|
||||||
|
report_parse_error("Unexpected command-line parameter\n");
|
||||||
|
arg_result = ARGPARSE_ERROR;
|
||||||
|
goto option_was_processed;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Long flag */
|
||||||
|
if (arg[1] == '-') {
|
||||||
|
delete_argument(&ap); /* This argument is consumed here */
|
||||||
|
|
||||||
|
/* Find a matching long option */
|
||||||
|
struct option *op;
|
||||||
|
for (op = options; !is_last_option(op); op++) {
|
||||||
|
if (op->long_name && strcmp(&arg[2], op->long_name) == 0) {
|
||||||
|
arg_result = (*op->action)(&ap, ret);
|
||||||
|
goto option_was_processed;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/* No option matches */
|
||||||
|
report_parse_error("Unexpected command-line parameter\n");
|
||||||
|
arg_result = ARGPARSE_ERROR;
|
||||||
|
goto option_was_processed;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
/* Other arguments are ignored */
|
||||||
|
next_argument(&ap);
|
||||||
|
arg_result = ARGPARSE_OK;
|
||||||
|
goto option_was_processed;
|
||||||
|
}
|
||||||
|
|
||||||
|
option_was_processed:
|
||||||
|
/* Decide what to do next based on 'arg_result' */
|
||||||
|
switch(arg_result) {
|
||||||
|
case ARGPARSE_OK:
|
||||||
|
/* Continue processing */
|
||||||
|
break;
|
||||||
|
|
||||||
|
case ARGPARSE_ERROR:
|
||||||
|
/* Error exit from the function */
|
||||||
|
return 0;
|
||||||
|
|
||||||
|
case ARGPARSE_DONE:
|
||||||
|
/* Normal exit from the argument parsing loop */
|
||||||
|
goto end_of_options;
|
||||||
|
}
|
||||||
|
} /* end for each argument */
|
||||||
|
|
||||||
|
/* If all arguments were processed, then normal exit from the loop */
|
||||||
|
|
||||||
|
end_of_options:
|
||||||
|
finalize_argparse(&ap, _argc, argv);
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*****************************************************************************/
|
||||||
|
/* Other exported functions */
|
||||||
|
|
||||||
|
struct pb_Parameters *
|
||||||
|
pb_ReadParameters(int *_argc, char **argv)
|
||||||
|
{
|
||||||
|
struct pb_Parameters *ret =
|
||||||
|
(struct pb_Parameters *)malloc(sizeof(struct pb_Parameters));
|
||||||
|
|
||||||
|
/* Initialize the parameters structure */
|
||||||
|
ret->outFile = NULL;
|
||||||
|
ret->inpFiles = (char **)malloc(sizeof(char *));
|
||||||
|
ret->inpFiles[0] = NULL;
|
||||||
|
ret->platform = NULL;
|
||||||
|
ret->device = NULL;
|
||||||
|
|
||||||
|
/* Read parameters and update _argc, argv */
|
||||||
|
if (!pb_ParseParameters(ret, _argc, argv)) {
|
||||||
|
/* Parse error */
|
||||||
|
pb_FreeParameters(ret);
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
|
int
|
||||||
|
pb_Parameters_CountInputs(struct pb_Parameters *p)
|
||||||
|
{
|
||||||
|
int n;
|
||||||
|
|
||||||
|
for (n = 0; p->inpFiles[n]; n++);
|
||||||
|
return n;
|
||||||
|
}
|
||||||
|
|
||||||
55
benchmarks/opencl/sad/file.c
Normal file
55
benchmarks/opencl/sad/file.c
Normal file
@@ -0,0 +1,55 @@
|
|||||||
|
/***************************************************************************
|
||||||
|
*cr
|
||||||
|
*cr (C) Copyright 2007 The Board of Trustees of the
|
||||||
|
*cr University of Illinois
|
||||||
|
*cr All Rights Reserved
|
||||||
|
*cr
|
||||||
|
***************************************************************************/
|
||||||
|
|
||||||
|
#include <stdio.h>
|
||||||
|
#include "file.h"
|
||||||
|
|
||||||
|
unsigned short
|
||||||
|
read16u(FILE *f)
|
||||||
|
{
|
||||||
|
int n;
|
||||||
|
|
||||||
|
n = fgetc(f);
|
||||||
|
n += fgetc(f) << 8;
|
||||||
|
|
||||||
|
return n;
|
||||||
|
}
|
||||||
|
|
||||||
|
short
|
||||||
|
read16i(FILE *f)
|
||||||
|
{
|
||||||
|
int n;
|
||||||
|
|
||||||
|
n = fgetc(f);
|
||||||
|
n += fgetc(f) << 8;
|
||||||
|
|
||||||
|
return n;
|
||||||
|
}
|
||||||
|
|
||||||
|
void
|
||||||
|
write32u(FILE *f, unsigned int i)
|
||||||
|
{
|
||||||
|
putc(i, f);
|
||||||
|
putc(i >> 8, f);
|
||||||
|
putc(i >> 16, f);
|
||||||
|
putc(i >> 24, f);
|
||||||
|
}
|
||||||
|
|
||||||
|
void
|
||||||
|
write16u(FILE *f, unsigned short h)
|
||||||
|
{
|
||||||
|
putc(h, f);
|
||||||
|
putc(h >> 8, f);
|
||||||
|
}
|
||||||
|
|
||||||
|
void
|
||||||
|
write16i(FILE *f, short h)
|
||||||
|
{
|
||||||
|
putc(h, f);
|
||||||
|
putc(h >> 8, f);
|
||||||
|
}
|
||||||
22
benchmarks/opencl/sad/file.h
Normal file
22
benchmarks/opencl/sad/file.h
Normal file
@@ -0,0 +1,22 @@
|
|||||||
|
/***************************************************************************
|
||||||
|
*cr
|
||||||
|
*cr (C) Copyright 2007 The Board of Trustees of the
|
||||||
|
*cr University of Illinois
|
||||||
|
*cr All Rights Reserved
|
||||||
|
*cr
|
||||||
|
***************************************************************************/
|
||||||
|
|
||||||
|
#ifdef __cplusplus
|
||||||
|
extern "C" {
|
||||||
|
#endif
|
||||||
|
|
||||||
|
unsigned short read16u(FILE *f);
|
||||||
|
short read16i(FILE *f);
|
||||||
|
|
||||||
|
void write32u(FILE *f, unsigned int i);
|
||||||
|
void write16u(FILE *f, unsigned short h);
|
||||||
|
void write16i(FILE *f, short h);
|
||||||
|
|
||||||
|
#ifdef __cplusplus
|
||||||
|
}
|
||||||
|
#endif
|
||||||
BIN
benchmarks/opencl/sad/frame.bin
Executable file
BIN
benchmarks/opencl/sad/frame.bin
Executable file
Binary file not shown.
55
benchmarks/opencl/sad/gpu_info.c
Normal file
55
benchmarks/opencl/sad/gpu_info.c
Normal file
@@ -0,0 +1,55 @@
|
|||||||
|
/***************************************************************************
|
||||||
|
*cr
|
||||||
|
*cr (C) Copyright 2010 The Board of Trustees of the
|
||||||
|
*cr University of Illinois
|
||||||
|
*cr All Rights Reserved
|
||||||
|
*cr
|
||||||
|
***************************************************************************/
|
||||||
|
//#include <endian.h>
|
||||||
|
#include <stdlib.h>
|
||||||
|
#include <malloc.h>
|
||||||
|
#include <stdio.h>
|
||||||
|
#include <inttypes.h>
|
||||||
|
|
||||||
|
#include "gpu_info.h"
|
||||||
|
|
||||||
|
void compute_active_thread(size_t *thread,
|
||||||
|
size_t *grid,
|
||||||
|
int task,
|
||||||
|
int pad,
|
||||||
|
int major,
|
||||||
|
int minor,
|
||||||
|
int sm)
|
||||||
|
{
|
||||||
|
int max_thread;
|
||||||
|
int max_block=8;
|
||||||
|
if(major==1)
|
||||||
|
{
|
||||||
|
if(minor>=2)
|
||||||
|
max_thread=1024;
|
||||||
|
else
|
||||||
|
max_thread=768;
|
||||||
|
}
|
||||||
|
else if(major==2)
|
||||||
|
max_thread=1536;
|
||||||
|
else
|
||||||
|
//newer GPU //keep using 2.0
|
||||||
|
max_thread=1536;
|
||||||
|
|
||||||
|
int _grid;
|
||||||
|
int _thread;
|
||||||
|
|
||||||
|
if(task*pad>sm*max_thread)
|
||||||
|
{
|
||||||
|
_thread=max_thread/max_block;
|
||||||
|
_grid = ((task*pad+_thread-1)/_thread)*_thread;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
_thread=pad;
|
||||||
|
_grid=task*pad;
|
||||||
|
}
|
||||||
|
|
||||||
|
thread[0]=_thread;
|
||||||
|
grid[0]=_grid;
|
||||||
|
}
|
||||||
20
benchmarks/opencl/sad/gpu_info.h
Normal file
20
benchmarks/opencl/sad/gpu_info.h
Normal file
@@ -0,0 +1,20 @@
|
|||||||
|
/***************************************************************************
|
||||||
|
*cr
|
||||||
|
*cr (C) Copyright 2010 The Board of Trustees of the
|
||||||
|
*cr University of Illinois
|
||||||
|
*cr All Rights Reserved
|
||||||
|
*cr
|
||||||
|
***************************************************************************/
|
||||||
|
|
||||||
|
#ifndef __GPUINFOH__
|
||||||
|
#define __GPUINFOH__
|
||||||
|
|
||||||
|
void compute_active_thread(size_t *thread,
|
||||||
|
size_t *grid,
|
||||||
|
int task,
|
||||||
|
int pad,
|
||||||
|
int major,
|
||||||
|
int minor,
|
||||||
|
int sm);
|
||||||
|
|
||||||
|
#endif
|
||||||
56
benchmarks/opencl/sad/image.c
Normal file
56
benchmarks/opencl/sad/image.c
Normal file
@@ -0,0 +1,56 @@
|
|||||||
|
/***************************************************************************
|
||||||
|
*cr
|
||||||
|
*cr (C) Copyright 2007 The Board of Trustees of the
|
||||||
|
*cr University of Illinois
|
||||||
|
*cr All Rights Reserved
|
||||||
|
*cr
|
||||||
|
***************************************************************************/
|
||||||
|
|
||||||
|
#include <stdio.h>
|
||||||
|
#include <stdlib.h>
|
||||||
|
#include "file.h"
|
||||||
|
#include "image.h"
|
||||||
|
|
||||||
|
struct image_i16 *
|
||||||
|
load_image(char *filename)
|
||||||
|
{
|
||||||
|
FILE *infile;
|
||||||
|
short *data;
|
||||||
|
int w;
|
||||||
|
int h;
|
||||||
|
|
||||||
|
infile = fopen(filename, "r");
|
||||||
|
|
||||||
|
if (!infile)
|
||||||
|
{
|
||||||
|
fprintf(stderr, "Cannot find file '%s'\n", filename);
|
||||||
|
exit(-1);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Read image dimensions */
|
||||||
|
w = read16u(infile);
|
||||||
|
h = read16u(infile);
|
||||||
|
|
||||||
|
/* Read image contents */
|
||||||
|
data = (short *)malloc(w * h * sizeof(short));
|
||||||
|
fread(data, sizeof(short), w * h, infile);
|
||||||
|
|
||||||
|
fclose(infile);
|
||||||
|
|
||||||
|
/* Create the return data structure */
|
||||||
|
{
|
||||||
|
struct image_i16 *ret =
|
||||||
|
(struct image_i16 *)malloc(sizeof(struct image_i16));
|
||||||
|
ret->width = w;
|
||||||
|
ret->height = h;
|
||||||
|
ret->data = data;
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void
|
||||||
|
free_image(struct image_i16 *img)
|
||||||
|
{
|
||||||
|
free(img->data);
|
||||||
|
free(img);
|
||||||
|
}
|
||||||
25
benchmarks/opencl/sad/image.h
Normal file
25
benchmarks/opencl/sad/image.h
Normal file
@@ -0,0 +1,25 @@
|
|||||||
|
/***************************************************************************
|
||||||
|
*cr
|
||||||
|
*cr (C) Copyright 2007 The Board of Trustees of the
|
||||||
|
*cr University of Illinois
|
||||||
|
*cr All Rights Reserved
|
||||||
|
*cr
|
||||||
|
***************************************************************************/
|
||||||
|
|
||||||
|
struct image_i16
|
||||||
|
{
|
||||||
|
int width;
|
||||||
|
int height;
|
||||||
|
short *data;
|
||||||
|
};
|
||||||
|
|
||||||
|
#ifdef __cplusplus
|
||||||
|
extern "C" {
|
||||||
|
#endif
|
||||||
|
|
||||||
|
struct image_i16 * load_image(char *filename);
|
||||||
|
void free_image(struct image_i16 *);
|
||||||
|
|
||||||
|
#ifdef __cplusplus
|
||||||
|
}
|
||||||
|
#endif
|
||||||
326
benchmarks/opencl/sad/kernel.cl
Normal file
326
benchmarks/opencl/sad/kernel.cl
Normal file
@@ -0,0 +1,326 @@
|
|||||||
|
/***************************************************************************
|
||||||
|
*cr
|
||||||
|
*cr (C) Copyright 2007 The Board of Trustees of the
|
||||||
|
*cr University of Illinois
|
||||||
|
*cr All Rights Reserved
|
||||||
|
*cr
|
||||||
|
***************************************************************************/
|
||||||
|
|
||||||
|
#ifndef MAX_POS
|
||||||
|
#define MAX_POS 1089
|
||||||
|
#define CEIL_POS 61
|
||||||
|
#define POS_PER_THREAD 18
|
||||||
|
#define MAX_POS_PADDED 1096
|
||||||
|
#define THREADS_W 1
|
||||||
|
#define THREADS_H 1
|
||||||
|
#define SEARCH_RANGE 16
|
||||||
|
#define SEARCH_DIMENSION 33
|
||||||
|
#endif
|
||||||
|
|
||||||
|
/* The compute kernel. */
|
||||||
|
/* The macros THREADS_W and THREADS_H specify the width and height of the
|
||||||
|
* area to be processed by one thread, measured in 4-by-4 pixel blocks.
|
||||||
|
* Larger numbers mean more computation per thread block.
|
||||||
|
*
|
||||||
|
* The macro POS_PER_THREAD specifies the number of search positions for which
|
||||||
|
* an SAD is computed. A larger value indicates more computation per thread,
|
||||||
|
* and fewer threads per thread block. It must be a multiple of 3 and also
|
||||||
|
* must be at most 33 because the loop to copy from shared memory uses
|
||||||
|
* 32 threads per 4-by-4 pixel block.
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
|
||||||
|
// AMD OpenCL fails UINT_CUDA_V
|
||||||
|
#define SHORT2_V 0
|
||||||
|
#define UINT_CUDA_V 0
|
||||||
|
|
||||||
|
// Either works
|
||||||
|
#define VEC_LOAD 0
|
||||||
|
|
||||||
|
// CAST_STORE is only method that works for all implementations of OpenCL tested
|
||||||
|
#define VEC_STORE 0
|
||||||
|
#define CAST_STORE 0
|
||||||
|
#define SCALAR_STORE 1
|
||||||
|
|
||||||
|
__kernel void mb_sad_calc(__global unsigned short *blk_sad,
|
||||||
|
__global unsigned short *frame,
|
||||||
|
int mb_width,
|
||||||
|
int mb_height,
|
||||||
|
__global unsigned short* img_ref) // __read_only image2d_t img_ref)
|
||||||
|
{
|
||||||
|
int tx = (get_local_id(0) / CEIL_POS) % THREADS_W;
|
||||||
|
int ty = (get_local_id(0) / CEIL_POS) / THREADS_W;
|
||||||
|
int bx = get_group_id(0);
|
||||||
|
int by = get_group_id(1);
|
||||||
|
int img_width = mb_width*16;
|
||||||
|
int lidx = get_local_id(0);
|
||||||
|
|
||||||
|
// Macroblock and sub-block coordinates
|
||||||
|
int mb_x = (tx + bx * THREADS_W) >> 2;
|
||||||
|
int mb_y = (ty + by * THREADS_H) >> 2;
|
||||||
|
int block_x = (tx + bx * THREADS_W) & 0x03;
|
||||||
|
int block_y = (ty + by * THREADS_H) & 0x03;
|
||||||
|
|
||||||
|
// If this thread is assigned to an invalid 4x4 block, do nothing
|
||||||
|
if ((mb_x < mb_width) && (mb_y < mb_height))
|
||||||
|
{
|
||||||
|
// Pixel offset of the origin of the current 4x4 block
|
||||||
|
int frame_x = ((mb_x << 2) + block_x) << 2;
|
||||||
|
int frame_y = ((mb_y << 2) + block_y) << 2;
|
||||||
|
|
||||||
|
// Origin of the search area for this 4x4 block
|
||||||
|
int ref_x = frame_x - SEARCH_RANGE;
|
||||||
|
int ref_y = frame_y - SEARCH_RANGE;
|
||||||
|
|
||||||
|
// Origin in the current frame for this 4x4 block
|
||||||
|
int cur_o = frame_y * img_width + frame_x;
|
||||||
|
|
||||||
|
int search_pos;
|
||||||
|
int search_pos_base =
|
||||||
|
(lidx % CEIL_POS) * POS_PER_THREAD;
|
||||||
|
int search_pos_end = search_pos_base + POS_PER_THREAD;
|
||||||
|
|
||||||
|
// Don't go past bounds
|
||||||
|
if (search_pos_end > MAX_POS) {
|
||||||
|
search_pos_end = MAX_POS;
|
||||||
|
}
|
||||||
|
|
||||||
|
// For each search position, within the range allocated to this thread
|
||||||
|
for (search_pos = search_pos_base;
|
||||||
|
search_pos < search_pos_end;
|
||||||
|
search_pos++) {
|
||||||
|
unsigned short sad4x4 = 0;
|
||||||
|
int search_off_x = ref_x + (search_pos % SEARCH_DIMENSION);
|
||||||
|
int search_off_y = ref_y + (search_pos / SEARCH_DIMENSION);
|
||||||
|
|
||||||
|
// 4x4 SAD computation
|
||||||
|
for(int y=0; y<4; y++) {
|
||||||
|
for (int x=0; x<4; x++) {
|
||||||
|
|
||||||
|
// ([unsigned] short)read_imageui or
|
||||||
|
// read_imagei is required for correct calculation.
|
||||||
|
// Though read_imagei() is shorter, its results are undefined by specification since the input
|
||||||
|
// is an unsigned type, CL_UNSIGNED_INT16
|
||||||
|
|
||||||
|
int sx = search_off_x + x;
|
||||||
|
sx = (sx < 0) ? 0 : sx;
|
||||||
|
sx = (sx >= img_width) ? img_width - 1 : sx;
|
||||||
|
int sy = search_off_y + y;
|
||||||
|
sy = (sy < 0) ? 0 : sy;
|
||||||
|
sy = (sy >= mb_height * 16) ? mb_height * 16 - 1 : sy;
|
||||||
|
sad4x4 += abs((unsigned short) img_ref[(sx) + (sy) * img_width] -
|
||||||
|
frame[cur_o + y * img_width + x]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Save this value into the local SAD array
|
||||||
|
blk_sad[mb_width * mb_height * MAX_POS_PADDED * (9 + 16) +
|
||||||
|
(mb_y * mb_width + mb_x) * MAX_POS_PADDED * 16 +
|
||||||
|
(4 * block_y + block_x) * MAX_POS_PADDED+search_pos] = sad4x4;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
//typedef unsigned int uint;
|
||||||
|
|
||||||
|
__kernel void larger_sad_calc_8(__global unsigned short *blk_sad,
|
||||||
|
int mb_width,
|
||||||
|
int mb_height)
|
||||||
|
{
|
||||||
|
int tx = get_local_id(1) & 1;
|
||||||
|
int ty = get_local_id(1) >> 1;
|
||||||
|
|
||||||
|
// Macroblock and sub-block coordinates
|
||||||
|
int mb_x = get_group_id(0);
|
||||||
|
int mb_y = get_group_id(1);
|
||||||
|
int lidx = get_local_id(0);
|
||||||
|
|
||||||
|
// Number of macroblocks in a frame
|
||||||
|
int macroblocks = mul24(mb_width, mb_height);
|
||||||
|
int macroblock_index = (mul24(mb_y, mb_width) + mb_x) * MAX_POS_PADDED;
|
||||||
|
|
||||||
|
__global unsigned short *bi;
|
||||||
|
__global unsigned short *bo_6, *bo_5, *bo_4;
|
||||||
|
|
||||||
|
// MXPA
|
||||||
|
bo_4 = (__global unsigned short *) tx;
|
||||||
|
bo_5 = (__global unsigned short *) tx;
|
||||||
|
|
||||||
|
|
||||||
|
bi = blk_sad
|
||||||
|
+ (mul24(macroblocks, 25) + (ty * 8 + tx * 2)) * MAX_POS_PADDED
|
||||||
|
+ macroblock_index * 16;
|
||||||
|
|
||||||
|
// Block type 6: 4x8
|
||||||
|
bo_6 = blk_sad
|
||||||
|
+ ((macroblocks << 4) + macroblocks + (ty * 4 + tx * 2)) * MAX_POS_PADDED
|
||||||
|
+ macroblock_index * 8;
|
||||||
|
|
||||||
|
if (ty < 100) // always true, but improves register allocation
|
||||||
|
{
|
||||||
|
// Block type 5: 8x4
|
||||||
|
bo_5 = blk_sad
|
||||||
|
+ ((macroblocks << 3) + macroblocks + (ty * 4 + tx)) * MAX_POS_PADDED
|
||||||
|
+ macroblock_index * 8;
|
||||||
|
|
||||||
|
// Block type 4: 8x8
|
||||||
|
bo_4 = blk_sad
|
||||||
|
+ ((macroblocks << 2) + macroblocks + (ty * 2 + tx)) * MAX_POS_PADDED
|
||||||
|
+ macroblock_index * 4;
|
||||||
|
}
|
||||||
|
|
||||||
|
for (int search_pos = lidx; search_pos < (MAX_POS+1)/2; search_pos += 32)
|
||||||
|
{
|
||||||
|
#if SHORT2_V
|
||||||
|
#if VEC_LOAD
|
||||||
|
ushort2 s00 = vload2(search_pos, bi);
|
||||||
|
ushort2 s01 = vload2(search_pos+ MAX_POS_PADDED/2, bi);
|
||||||
|
ushort2 s10 = vload2(search_pos+4*MAX_POS_PADDED/2, bi);
|
||||||
|
ushort2 s11 = vload2(search_pos+5*MAX_POS_PADDED/2, bi);
|
||||||
|
#else
|
||||||
|
ushort2 s00 = (ushort2) (bi[search_pos*2], bi[search_pos*2+1]);
|
||||||
|
ushort2 s01 = (ushort2) (bi[(search_pos + MAX_POS_PADDED/2)*2], bi[(search_pos + MAX_POS_PADDED/2)*2+1]);
|
||||||
|
ushort2 s10 = (ushort2) (bi[(search_pos + 4*MAX_POS_PADDED/2)*2], bi[(search_pos + 4*MAX_POS_PADDED/2)*2+1]);
|
||||||
|
ushort2 s11 = (ushort2) (bi[(search_pos + 5*MAX_POS_PADDED/2)*2], bi[(search_pos + 5*MAX_POS_PADDED/2)*2+1]);
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#if VEC_STORE
|
||||||
|
ushort2 s0010 = s00 + s10;
|
||||||
|
ushort2 s0111 = s01 + s11;
|
||||||
|
ushort2 s0001 = s00 + s01;
|
||||||
|
ushort2 s1011 = s10 + s11;
|
||||||
|
ushort2 s00011011 = s0001 + s1011;
|
||||||
|
|
||||||
|
vstore2(s0010, search_pos, bo_6);
|
||||||
|
vstore2(s0111, search_pos+MAX_POS_PADDED/2, bo_6);
|
||||||
|
vstore2(s0001, search_pos, bo_5);
|
||||||
|
vstore2(s1011, search_pos+2*MAX_POS_PADDED/2, bo_5);
|
||||||
|
vstore2(s00011011, search_pos, bo_4);
|
||||||
|
#elif CAST_STORE
|
||||||
|
((__global ushort2 *)bo_6)[search_pos] = s00 + s10;
|
||||||
|
((__global ushort2 *)bo_6)[search_pos+MAX_POS_PADDED/2] = s01 + s11;
|
||||||
|
((__global ushort2 *)bo_5)[search_pos] = s00 + s01;
|
||||||
|
((__global ushort2 *)bo_5)[search_pos+2*MAX_POS_PADDED/2] = s10 + s11;
|
||||||
|
((__global ushort2 *)bo_4)[search_pos] = (s00 + s01) + (s10 + s11);
|
||||||
|
#else // SCALAR_STORE
|
||||||
|
bo_6[search_pos*2] = s00.x + s10.x;
|
||||||
|
bo_6[search_pos*2+1] = s00.y + s10.y;
|
||||||
|
bo_6[(search_pos+MAX_POS_PADDED/2)*2] = s01.x + s11.x;
|
||||||
|
bo_6[(search_pos+MAX_POS_PADDED/2)*2+1] = s01.y + s11.y;
|
||||||
|
bo_5[search_pos*2] = s00.x + s01.x;
|
||||||
|
bo_5[search_pos*2+1] = s00.y + s01.y;
|
||||||
|
bo_5[(search_pos+2*MAX_POS_PADDED/2)*2] = s10.x + s11.x;
|
||||||
|
bo_5[(search_pos+2*MAX_POS_PADDED/2)*2+1] = s10.y + s11.y;
|
||||||
|
bo_4[search_pos*2] = (s00.x + s01.x) + (s10.x + s11.x);
|
||||||
|
bo_4[search_pos*2+1] = (s00.y + s01.y) + (s10.y + s11.y);
|
||||||
|
#endif
|
||||||
|
#else // UINT_CUDA_V
|
||||||
|
uint i00 = ((__global uint *)bi)[search_pos];
|
||||||
|
uint i01 = ((__global uint *)bi)[search_pos + MAX_POS_PADDED/2];
|
||||||
|
uint i10 = ((__global uint *)bi)[search_pos + 4*MAX_POS_PADDED/2];
|
||||||
|
uint i11 = ((__global uint *)bi)[search_pos + 5*MAX_POS_PADDED/2];
|
||||||
|
|
||||||
|
((__global uint *)bo_6)[search_pos] = i00 + i10;
|
||||||
|
((__global uint *)bo_6)[search_pos+MAX_POS_PADDED/2] = i01 + i11;
|
||||||
|
((__global uint *)bo_5)[search_pos] = i00 + i01;
|
||||||
|
((__global uint *)bo_5)[search_pos+2*MAX_POS_PADDED/2] = i10 + i11;
|
||||||
|
((__global uint *)bo_4)[search_pos] = (i00 + i01) + (i10 + i11);
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
__kernel void larger_sad_calc_16(__global unsigned short *blk_sad,
|
||||||
|
int mb_width,
|
||||||
|
int mb_height)
|
||||||
|
{
|
||||||
|
// Macroblock coordinates
|
||||||
|
int mb_x = get_group_id(0);
|
||||||
|
int mb_y = get_group_id(1);
|
||||||
|
int search_pos = get_local_id(0);
|
||||||
|
|
||||||
|
// Number of macroblocks in a frame
|
||||||
|
int macroblocks = mul24(mb_width, mb_height) * MAX_POS_PADDED;
|
||||||
|
int macroblock_index = (mul24(mb_y, mb_width) + mb_x) * MAX_POS_PADDED;
|
||||||
|
|
||||||
|
__global unsigned short *bi;
|
||||||
|
__global unsigned short *bo_3, *bo_2, *bo_1;
|
||||||
|
|
||||||
|
//bi = blk_sad + macroblocks * 5 + macroblock_index * 4;
|
||||||
|
bi = blk_sad + ((macroblocks + macroblock_index) << 2) + macroblocks;
|
||||||
|
|
||||||
|
// Block type 3: 8x16
|
||||||
|
//bo_3 = blk_sad + macroblocks * 3 + macroblock_index * 2;
|
||||||
|
bo_3 = blk_sad + ((macroblocks + macroblock_index) << 1) + macroblocks;
|
||||||
|
|
||||||
|
// Block type 5: 8x4
|
||||||
|
bo_2 = blk_sad + macroblocks + macroblock_index * 2;
|
||||||
|
|
||||||
|
// Block type 4: 8x8
|
||||||
|
bo_1 = blk_sad + macroblock_index;
|
||||||
|
|
||||||
|
for ( ; search_pos < (MAX_POS+1)/2; search_pos += 32)
|
||||||
|
{
|
||||||
|
#if SHORT2_V
|
||||||
|
#if VEC_LOAD
|
||||||
|
ushort2 s00 = vload2(search_pos, bi);
|
||||||
|
ushort2 s01 = vload2(search_pos+ MAX_POS_PADDED/2, bi);
|
||||||
|
ushort2 s10 = vload2(search_pos+2*MAX_POS_PADDED/2, bi);
|
||||||
|
ushort2 s11 = vload2(search_pos+3*MAX_POS_PADDED/2, bi);
|
||||||
|
#else
|
||||||
|
ushort2 s00 = (ushort2) (bi[search_pos*2], bi[search_pos*2+1]);
|
||||||
|
ushort2 s01 = (ushort2) (bi[(search_pos + MAX_POS_PADDED/2)*2], bi[(search_pos + MAX_POS_PADDED/2)*2+1]);
|
||||||
|
ushort2 s10 = (ushort2) (bi[(search_pos + 2*MAX_POS_PADDED/2)*2], bi[(search_pos + 2*MAX_POS_PADDED/2)*2+1]);
|
||||||
|
ushort2 s11 = (ushort2) (bi[(search_pos + 3*MAX_POS_PADDED/2)*2], bi[(search_pos + 3*MAX_POS_PADDED/2)*2+1]);
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#if VEC_STORE
|
||||||
|
ushort2 s0010 = s00 + s10;
|
||||||
|
ushort2 s0111 = s01 + s11;
|
||||||
|
ushort2 s0001 = s00 + s01;
|
||||||
|
ushort2 s1011 = s10 + s11;
|
||||||
|
ushort2 s00011011 = s0001 + s1011;
|
||||||
|
|
||||||
|
vstore2(s0010, search_pos, bo_3);
|
||||||
|
vstore2(s0111, search_pos+MAX_POS_PADDED/2, bo_3);
|
||||||
|
vstore2(s0001, search_pos, bo_2);
|
||||||
|
vstore2(s1011, search_pos+MAX_POS_PADDED/2, bo_2);
|
||||||
|
vstore2(s00011011, search_pos, bo_1);
|
||||||
|
#elif CAST_STORE
|
||||||
|
((__global ushort2 *)bo_3)[search_pos] = s00 + s10;
|
||||||
|
((__global ushort2 *)bo_3)[search_pos+MAX_POS_PADDED/2] = s01 + s11;
|
||||||
|
((__global ushort2 *)bo_2)[search_pos] = s00 + s01;
|
||||||
|
((__global ushort2 *)bo_2)[search_pos+MAX_POS_PADDED/2] = s10 + s11;
|
||||||
|
((__global ushort2 *)bo_1)[search_pos] = (s00 + s01) + (s10 + s11);
|
||||||
|
#else // SCALAR_STORE
|
||||||
|
bo_3[search_pos*2] = s00.x + s10.x;
|
||||||
|
bo_3[search_pos*2+1] = s00.y + s10.y;
|
||||||
|
bo_3[(search_pos+MAX_POS_PADDED/2)*2] = s01.x + s11.x;
|
||||||
|
bo_3[(search_pos+MAX_POS_PADDED/2)*2+1] = s01.y + s11.y;
|
||||||
|
bo_2[search_pos*2] = s00.x + s01.x;
|
||||||
|
bo_2[search_pos*2+1] = s00.y + s01.y;
|
||||||
|
bo_2[(search_pos+MAX_POS_PADDED/2)*2] = s10.x + s11.x;
|
||||||
|
bo_2[(search_pos+MAX_POS_PADDED/2)*2+1] = s10.y + s11.y;
|
||||||
|
bo_1[search_pos*2] = (s00.x + s01.x) + (s10.x + s11.x);
|
||||||
|
bo_1[search_pos*2+1] = (s00.y + s01.y) + (s10.y + s11.y);
|
||||||
|
#endif
|
||||||
|
#else // UINT_CUDA_V
|
||||||
|
uint i00 = ((__global uint *)bi)[search_pos];
|
||||||
|
uint i01 = ((__global uint *)bi)[search_pos + MAX_POS_PADDED/2];
|
||||||
|
uint i10 = ((__global uint *)bi)[search_pos + 2*MAX_POS_PADDED/2];
|
||||||
|
uint i11 = ((__global uint *)bi)[search_pos + 3*MAX_POS_PADDED/2];
|
||||||
|
|
||||||
|
((__global uint *)bo_3)[search_pos] = i00 + i10;
|
||||||
|
((__global uint *)bo_3)[search_pos+MAX_POS_PADDED/2] = i01 + i11;
|
||||||
|
((__global uint *)bo_2)[search_pos] = i00 + i01;
|
||||||
|
((__global uint *)bo_2)[search_pos+MAX_POS_PADDED/2] = i10 + i11;
|
||||||
|
((__global uint *)bo_1)[search_pos] = (i00 + i01) + (i10 + i11);
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
BIN
benchmarks/opencl/sad/libsad.a
Normal file
BIN
benchmarks/opencl/sad/libsad.a
Normal file
Binary file not shown.
545
benchmarks/opencl/sad/main.cc
Normal file
545
benchmarks/opencl/sad/main.cc
Normal file
@@ -0,0 +1,545 @@
|
|||||||
|
/***************************************************************************
|
||||||
|
*cr
|
||||||
|
*cr (C) Copyright 2007 The Board of Trustees of the
|
||||||
|
*cr University of Illinois
|
||||||
|
*cr All Rights Reserved
|
||||||
|
*cr
|
||||||
|
***************************************************************************/
|
||||||
|
|
||||||
|
#include <CL/cl.h>
|
||||||
|
#include <inttypes.h>
|
||||||
|
#include <parboil.h>
|
||||||
|
#include <stdio.h>
|
||||||
|
#include <stdlib.h>
|
||||||
|
#include <string.h>
|
||||||
|
#include <sys/time.h>
|
||||||
|
|
||||||
|
#include "OpenCL_common.h"
|
||||||
|
#include "file.h"
|
||||||
|
#include "image.h"
|
||||||
|
#include "sad.h"
|
||||||
|
#include "sad_kernel.h"
|
||||||
|
|
||||||
|
static unsigned short *load_sads(char *filename);
|
||||||
|
static void write_sads(char *filename, int image_width_macroblocks,
|
||||||
|
int image_height_macroblocks, unsigned short *sads);
|
||||||
|
static void write_sads_directly(char *filename, int width, int height,
|
||||||
|
unsigned short *sads);
|
||||||
|
|
||||||
|
/* FILE I/O */
|
||||||
|
|
||||||
|
unsigned short *load_sads(char *filename) {
|
||||||
|
FILE *infile;
|
||||||
|
unsigned short *sads;
|
||||||
|
int w;
|
||||||
|
int h;
|
||||||
|
int sads_per_block;
|
||||||
|
|
||||||
|
infile = fopen(filename, "r");
|
||||||
|
|
||||||
|
if (!infile) {
|
||||||
|
fprintf(stderr, "Cannot find file '%s'\n", filename);
|
||||||
|
exit(-1);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Read image dimensions (measured in macroblocks) */
|
||||||
|
w = read16u(infile);
|
||||||
|
h = read16u(infile);
|
||||||
|
|
||||||
|
/* Read SAD values. Only interested in the 4x4 SAD values, which are
|
||||||
|
* at the end of the file. */
|
||||||
|
sads_per_block = MAX_POS_PADDED * (w * h);
|
||||||
|
fseek(infile, 25 * sads_per_block * sizeof(unsigned short), SEEK_CUR);
|
||||||
|
|
||||||
|
sads = (unsigned short *)malloc(sads_per_block * 16 * sizeof(unsigned short));
|
||||||
|
fread(sads, sizeof(unsigned short), sads_per_block * 16, infile);
|
||||||
|
fclose(infile);
|
||||||
|
|
||||||
|
return sads;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Compare the reference SADs to the expected SADs.
|
||||||
|
*/
|
||||||
|
void check_sads(unsigned short *sads_reference, unsigned short *sads_computed,
|
||||||
|
int image_size_macroblocks) {
|
||||||
|
int block;
|
||||||
|
|
||||||
|
/* Check the 4x4 SAD values. These are in sads_reference.
|
||||||
|
* Ignore the data at the beginning of sads_computed. */
|
||||||
|
sads_computed += 25 * MAX_POS_PADDED * image_size_macroblocks;
|
||||||
|
|
||||||
|
for (block = 0; block < image_size_macroblocks; block++) {
|
||||||
|
int subblock;
|
||||||
|
|
||||||
|
for (subblock = 0; subblock < 16; subblock++) {
|
||||||
|
int sad_index;
|
||||||
|
|
||||||
|
for (sad_index = 0; sad_index < MAX_POS; sad_index++) {
|
||||||
|
int index = (block * 16 + subblock) * MAX_POS_PADDED + sad_index;
|
||||||
|
|
||||||
|
if (sads_reference[index] != sads_computed[index]) {
|
||||||
|
#if 0
|
||||||
|
/* Print exactly where the mismatch was seen */
|
||||||
|
printf("M %3d %2d %4d (%d = %d)\n", block, subblock, sad_index, sads_reference[index], sads_computed[index]);
|
||||||
|
#else
|
||||||
|
goto mismatch;
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
printf("Success.\n");
|
||||||
|
return;
|
||||||
|
|
||||||
|
mismatch:
|
||||||
|
printf("Computed SADs do not match expected values.\n");
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Extract the SAD data for a particular block type for a particular
|
||||||
|
* macroblock from the array of SADs of that block type. */
|
||||||
|
static inline void write_subblocks(FILE *outfile,
|
||||||
|
unsigned short *subblock_array,
|
||||||
|
int macroblock, int count) {
|
||||||
|
int block;
|
||||||
|
int pos;
|
||||||
|
|
||||||
|
for (block = 0; block < count; block++) {
|
||||||
|
unsigned short *vec =
|
||||||
|
subblock_array + (block + macroblock * count) * MAX_POS_PADDED;
|
||||||
|
|
||||||
|
/* Write all SADs for this sub-block */
|
||||||
|
for (pos = 0; pos < MAX_POS; pos++)
|
||||||
|
write16u(outfile, *vec++);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Write some SAD data to a file for output checking.
|
||||||
|
*
|
||||||
|
* All SAD values for six rows of macroblocks are written.
|
||||||
|
* The six rows consist of the top two, middle two, and bottom two image rows.
|
||||||
|
*/
|
||||||
|
void write_sads(char *filename, int mb_width, int mb_height,
|
||||||
|
unsigned short *sads) {
|
||||||
|
FILE *outfile = fopen(filename, "w");
|
||||||
|
int mbs = mb_width * mb_height;
|
||||||
|
int row_indir;
|
||||||
|
int row_indices[6] = {
|
||||||
|
0, 1, mb_height / 2 - 1, mb_height / 2, mb_height - 2, mb_height - 1};
|
||||||
|
|
||||||
|
if (outfile == NULL) {
|
||||||
|
fprintf(stderr, "Cannot open output file\n");
|
||||||
|
exit(-1);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Write the number of output macroblocks */
|
||||||
|
write32u(outfile, mb_width * 6);
|
||||||
|
|
||||||
|
/* Write zeros */
|
||||||
|
write32u(outfile, 0);
|
||||||
|
|
||||||
|
/* Each row */
|
||||||
|
for (row_indir = 0; row_indir < 6; row_indir++) {
|
||||||
|
int row = row_indices[row_indir];
|
||||||
|
|
||||||
|
/* Each block in row */
|
||||||
|
int block;
|
||||||
|
for (block = mb_width * row; block < mb_width * (row + 1); block++) {
|
||||||
|
int blocktype;
|
||||||
|
|
||||||
|
/* Write SADs for all sub-block types */
|
||||||
|
for (blocktype = 1; blocktype <= 7; blocktype++)
|
||||||
|
write_subblocks(outfile, sads + SAD_TYPE_IX(blocktype, mbs), block,
|
||||||
|
SAD_TYPE_CT(blocktype));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fclose(outfile);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* FILE I/O for debugging */
|
||||||
|
|
||||||
|
static void write_sads_directly(char *filename, int width, int height,
|
||||||
|
unsigned short *sads) {
|
||||||
|
FILE *f = fopen(filename, "w");
|
||||||
|
int n;
|
||||||
|
|
||||||
|
write16u(f, width);
|
||||||
|
write16u(f, height);
|
||||||
|
for (n = 0; n < 41 * MAX_POS_PADDED * (width * height); n++) {
|
||||||
|
write16u(f, sads[n]);
|
||||||
|
}
|
||||||
|
fclose(f);
|
||||||
|
}
|
||||||
|
|
||||||
|
static void print_test_sad_vector(unsigned short *base, int macroblock,
|
||||||
|
int count) {
|
||||||
|
int n;
|
||||||
|
int searchpos = 17 * 33 + 17;
|
||||||
|
for (n = 0; n < count; n++)
|
||||||
|
printf(" %d", base[(count * macroblock + n) * MAX_POS_PADDED + searchpos]);
|
||||||
|
}
|
||||||
|
|
||||||
|
static void print_test_sads(unsigned short *sads_computed, int mbs) {
|
||||||
|
int macroblock = 5;
|
||||||
|
int blocktype;
|
||||||
|
|
||||||
|
for (blocktype = 1; blocktype <= 7; blocktype++) {
|
||||||
|
printf("%d:", blocktype);
|
||||||
|
print_test_sad_vector(sads_computed + SAD_TYPE_IX(blocktype, mbs),
|
||||||
|
macroblock, SAD_TYPE_CT(blocktype));
|
||||||
|
puts("\n");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/* MAIN */
|
||||||
|
|
||||||
|
int main(int argc, char **argv) {
|
||||||
|
struct image_i16 *ref_image;
|
||||||
|
struct image_i16 *cur_image;
|
||||||
|
unsigned short *sads_computed; /* SADs generated by the program */
|
||||||
|
|
||||||
|
int image_size_bytes;
|
||||||
|
int image_width_macroblocks, image_height_macroblocks;
|
||||||
|
int image_size_macroblocks;
|
||||||
|
|
||||||
|
struct pb_TimerSet timers;
|
||||||
|
struct pb_Parameters *params;
|
||||||
|
|
||||||
|
char oclOverhead[] = "OpenCL Overhead";
|
||||||
|
|
||||||
|
pb_InitializeTimerSet(&timers);
|
||||||
|
pb_AddSubTimer(&timers, oclOverhead, pb_TimerID_KERNEL);
|
||||||
|
|
||||||
|
params = pb_ReadParameters(&argc, argv);
|
||||||
|
params->inpFiles = (char **)malloc(sizeof(char *) * 3);
|
||||||
|
params->inpFiles[0] = (char *)malloc(100);
|
||||||
|
params->inpFiles[1] = (char *)malloc(100);
|
||||||
|
params->inpFiles[2] = NULL;
|
||||||
|
strncpy(params->inpFiles[0], "reference.bin", 100);
|
||||||
|
strncpy(params->inpFiles[1], "frame.bin", 100);
|
||||||
|
|
||||||
|
if (pb_Parameters_CountInputs(params) != 2) {
|
||||||
|
fprintf(stderr, "Expecting two input filenames\n");
|
||||||
|
exit(-1);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Read input files */
|
||||||
|
pb_SwitchToTimer(&timers, pb_TimerID_IO);
|
||||||
|
ref_image = load_image(params->inpFiles[0]);
|
||||||
|
cur_image = load_image(params->inpFiles[1]);
|
||||||
|
pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
|
||||||
|
|
||||||
|
printf("Ok\n");
|
||||||
|
|
||||||
|
if ((ref_image->width != cur_image->width) ||
|
||||||
|
(ref_image->height != cur_image->height)) {
|
||||||
|
fprintf(stderr, "Input images must be the same size\n");
|
||||||
|
exit(-1);
|
||||||
|
}
|
||||||
|
if ((ref_image->width % 16) || (ref_image->height % 16)) {
|
||||||
|
fprintf(stderr, "Input image size must be an integral multiple of 16\n");
|
||||||
|
exit(-1);
|
||||||
|
}
|
||||||
|
|
||||||
|
printf("Ok\n");
|
||||||
|
|
||||||
|
/* Compute parameters, allocate memory */
|
||||||
|
image_size_bytes = ref_image->width * ref_image->height * sizeof(short);
|
||||||
|
image_width_macroblocks = ref_image->width >> 4;
|
||||||
|
image_height_macroblocks = ref_image->height >> 4;
|
||||||
|
image_size_macroblocks = image_width_macroblocks * image_height_macroblocks;
|
||||||
|
|
||||||
|
sads_computed = (unsigned short *)malloc(
|
||||||
|
41 * MAX_POS_PADDED * image_size_macroblocks * sizeof(short));
|
||||||
|
|
||||||
|
// Run the kernel code
|
||||||
|
// ************************************************************************
|
||||||
|
|
||||||
|
cl_int ciErrNum;
|
||||||
|
cl_command_queue clCommandQueue;
|
||||||
|
|
||||||
|
cl_kernel mb_sad_calc;
|
||||||
|
cl_kernel larger_sad_calc_8;
|
||||||
|
cl_kernel larger_sad_calc_16;
|
||||||
|
|
||||||
|
cl_mem imgRef; /* Reference image on the device */
|
||||||
|
cl_mem d_cur_image; /* Current image on the device */
|
||||||
|
cl_mem d_sads; /* SADs on the device */
|
||||||
|
|
||||||
|
// x : image_width_macroblocks
|
||||||
|
// y : image_height_macroblocks
|
||||||
|
|
||||||
|
pb_Context *pb_context;
|
||||||
|
pb_context = pb_InitOpenCLContext(params);
|
||||||
|
if (pb_context == NULL) {
|
||||||
|
fprintf(stderr, "Error: No OpenCL platform/device can be found.");
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
|
||||||
|
printf("Ok+\n");
|
||||||
|
|
||||||
|
cl_int clStatus;
|
||||||
|
cl_device_id clDevice = (cl_device_id)pb_context->clDeviceId;
|
||||||
|
cl_platform_id clPlatform = (cl_platform_id)pb_context->clPlatformId;
|
||||||
|
cl_context clContext = (cl_context)pb_context->clContext;
|
||||||
|
|
||||||
|
clCommandQueue = clCreateCommandQueue(clContext, clDevice,
|
||||||
|
CL_QUEUE_PROFILING_ENABLE, &ciErrNum);
|
||||||
|
OCL_ERRCK_VAR(ciErrNum);
|
||||||
|
|
||||||
|
printf("Ok!\n");
|
||||||
|
|
||||||
|
pb_SetOpenCL(&clContext, &clCommandQueue);
|
||||||
|
|
||||||
|
printf("Ok!\n");
|
||||||
|
|
||||||
|
pb_SwitchToSubTimer(&timers, oclOverhead, pb_TimerID_KERNEL);
|
||||||
|
|
||||||
|
// Read Source Code File
|
||||||
|
/*size_t program_length;
|
||||||
|
const char* source_path = "src/opencl_base/kernel.cl";
|
||||||
|
char* source = oclLoadProgSource(source_path, "", &program_length);
|
||||||
|
if(!source) {
|
||||||
|
fprintf(stderr, "Could not load program source\n"); exit(1);
|
||||||
|
}
|
||||||
|
|
||||||
|
cl_program clProgram = clCreateProgramWithSource(clContext, 1, (const char
|
||||||
|
**)&source, &program_length, &ciErrNum);*/
|
||||||
|
printf("Ok//-\n");
|
||||||
|
cl_program clProgram = clCreateProgramWithBuiltInKernels(
|
||||||
|
clContext, 1, &clDevice, "mb_sad_calc;larger_sad_calc_8;larger_sad_calc_16", &ciErrNum);
|
||||||
|
printf("Ok//+\n");
|
||||||
|
OCL_ERRCK_VAR(ciErrNum);
|
||||||
|
|
||||||
|
printf("Ok+\n");
|
||||||
|
|
||||||
|
//free(source);
|
||||||
|
|
||||||
|
// JIT Compilation Options
|
||||||
|
char compileOptions[1024];
|
||||||
|
// -cl-nv-verbose
|
||||||
|
sprintf(compileOptions, "\
|
||||||
|
-D MAX_POS=%u -D CEIL_POS=%u\
|
||||||
|
-D POS_PER_THREAD=%u -D MAX_POS_PADDED=%u\
|
||||||
|
-D THREADS_W=%u -D THREADS_H=%u\
|
||||||
|
-D SEARCH_RANGE=%u -D SEARCH_DIMENSION=%u\
|
||||||
|
\0",
|
||||||
|
MAX_POS, CEIL(MAX_POS, POS_PER_THREAD), POS_PER_THREAD,
|
||||||
|
MAX_POS_PADDED, THREADS_W, THREADS_H, SEARCH_RANGE, SEARCH_DIMENSION);
|
||||||
|
printf("options = %s\n", compileOptions);
|
||||||
|
|
||||||
|
OCL_ERRCK_RETVAL(
|
||||||
|
clBuildProgram(clProgram, 1, &clDevice, compileOptions, NULL, NULL));
|
||||||
|
|
||||||
|
/*
|
||||||
|
char *build_log;
|
||||||
|
size_t ret_val_size;
|
||||||
|
OCL_ERRCK_RETVAL( clGetProgramBuildInfo(clProgram, clDevice,
|
||||||
|
CL_PROGRAM_BUILD_LOG, 0, NULL, &ret_val_size) );
|
||||||
|
build_log = (char *)malloc(ret_val_size+1);
|
||||||
|
OCL_ERRCK_RETVAL( clGetProgramBuildInfo(clProgram, clDevice,
|
||||||
|
CL_PROGRAM_BUILD_LOG, ret_val_size, build_log, NULL) );
|
||||||
|
|
||||||
|
// Null terminate (original writer wasn't sure)
|
||||||
|
build_log[ret_val_size] = '\0';
|
||||||
|
|
||||||
|
fprintf(stderr, "%s\n", build_log );
|
||||||
|
*/
|
||||||
|
|
||||||
|
mb_sad_calc = clCreateKernel(clProgram, "mb_sad_calc", &ciErrNum);
|
||||||
|
OCL_ERRCK_VAR(ciErrNum);
|
||||||
|
larger_sad_calc_8 = clCreateKernel(clProgram, "larger_sad_calc_8", &ciErrNum);
|
||||||
|
OCL_ERRCK_VAR(ciErrNum);
|
||||||
|
larger_sad_calc_16 =
|
||||||
|
clCreateKernel(clProgram, "larger_sad_calc_16", &ciErrNum);
|
||||||
|
OCL_ERRCK_VAR(ciErrNum);
|
||||||
|
|
||||||
|
size_t wgSize;
|
||||||
|
size_t comp_wgSize[3];
|
||||||
|
cl_ulong localMemSize;
|
||||||
|
size_t prefwgSizeMult;
|
||||||
|
cl_ulong privateMemSize;
|
||||||
|
|
||||||
|
pb_SwitchToTimer(&timers, pb_TimerID_COPY);
|
||||||
|
|
||||||
|
printf("Ok++\n");
|
||||||
|
|
||||||
|
#if 0
|
||||||
|
cl_image_format img_format;
|
||||||
|
img_format.image_channel_order = CL_R;
|
||||||
|
img_format.image_channel_data_type = CL_UNSIGNED_INT16;
|
||||||
|
|
||||||
|
/* Transfer reference image to device */
|
||||||
|
imgRef = clCreateImage2D(clContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, &img_format,
|
||||||
|
ref_image->width /** sizeof(unsigned short)*/, // width
|
||||||
|
ref_image->height, // height
|
||||||
|
ref_image->width * sizeof(unsigned short), // row_pitch
|
||||||
|
ref_image->data, &ciErrNum);
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#if 1
|
||||||
|
imgRef = clCreateBuffer(clContext, CL_MEM_READ_ONLY,
|
||||||
|
ref_image->width * ref_image->height *
|
||||||
|
sizeof(unsigned short),
|
||||||
|
NULL, &ciErrNum);
|
||||||
|
OCL_ERRCK_VAR(ciErrNum);
|
||||||
|
OCL_ERRCK_RETVAL(clEnqueueWriteBuffer(clCommandQueue, imgRef, CL_TRUE, 0,
|
||||||
|
ref_image->width * ref_image->height *
|
||||||
|
sizeof(unsigned short),
|
||||||
|
ref_image->data, 0, NULL, NULL));
|
||||||
|
#else
|
||||||
|
imgRef = clCreateBuffer(clContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
|
||||||
|
ref_image->width * ref_image->height *
|
||||||
|
sizeof(unsigned short),
|
||||||
|
ref_image->data, &ciErrNum);
|
||||||
|
printf("Allocating %d bytes\n",
|
||||||
|
ref_image->width * ref_image->height * sizeof(unsigned short));
|
||||||
|
|
||||||
|
#endif
|
||||||
|
OCL_ERRCK_VAR(ciErrNum);
|
||||||
|
|
||||||
|
/* Allocate SAD data on the device */
|
||||||
|
|
||||||
|
unsigned short *tmpZero = (unsigned short *)calloc(
|
||||||
|
41 * MAX_POS_PADDED * image_size_macroblocks, sizeof(unsigned short));
|
||||||
|
|
||||||
|
/*
|
||||||
|
size_t max_alloc_size = 0;
|
||||||
|
clGetDeviceInfo(clDevice, CL_DEVICE_MAX_MEM_ALLOC_SIZE,
|
||||||
|
sizeof(max_alloc_size), &max_alloc_size, NULL);
|
||||||
|
if (max_alloc_size < (41 * MAX_POS_PADDED *
|
||||||
|
image_size_macroblocks * sizeof(unsigned short))) {
|
||||||
|
fprintf(stderr, "Can't allocate sad buffer: max alloc size is %dMB\n",
|
||||||
|
(int) (max_alloc_size >> 20));
|
||||||
|
exit(-1);
|
||||||
|
}
|
||||||
|
*/
|
||||||
|
|
||||||
|
d_sads = clCreateBuffer(clContext, CL_MEM_COPY_HOST_PTR,
|
||||||
|
41 * MAX_POS_PADDED * image_size_macroblocks *
|
||||||
|
sizeof(unsigned short),
|
||||||
|
tmpZero, &ciErrNum);
|
||||||
|
OCL_ERRCK_VAR(ciErrNum);
|
||||||
|
free(tmpZero);
|
||||||
|
|
||||||
|
d_cur_image =
|
||||||
|
clCreateBuffer(clContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
|
||||||
|
image_size_bytes, cur_image->data, &ciErrNum);
|
||||||
|
OCL_ERRCK_VAR(ciErrNum);
|
||||||
|
|
||||||
|
/* Set Kernel Parameters */
|
||||||
|
|
||||||
|
OCL_ERRCK_RETVAL(
|
||||||
|
clSetKernelArg(mb_sad_calc, 0, sizeof(cl_mem), (void *)&d_sads));
|
||||||
|
OCL_ERRCK_RETVAL(
|
||||||
|
clSetKernelArg(mb_sad_calc, 1, sizeof(cl_mem), (void *)&d_cur_image));
|
||||||
|
OCL_ERRCK_RETVAL(
|
||||||
|
clSetKernelArg(mb_sad_calc, 2, sizeof(int), &image_width_macroblocks));
|
||||||
|
OCL_ERRCK_RETVAL(
|
||||||
|
clSetKernelArg(mb_sad_calc, 3, sizeof(int), &image_height_macroblocks));
|
||||||
|
OCL_ERRCK_RETVAL(
|
||||||
|
clSetKernelArg(mb_sad_calc, 4, sizeof(cl_mem), (void *)&imgRef));
|
||||||
|
|
||||||
|
OCL_ERRCK_RETVAL(
|
||||||
|
clSetKernelArg(larger_sad_calc_8, 0, sizeof(cl_mem), (void *)&d_sads));
|
||||||
|
OCL_ERRCK_RETVAL(clSetKernelArg(larger_sad_calc_8, 1, sizeof(int),
|
||||||
|
&image_width_macroblocks));
|
||||||
|
OCL_ERRCK_RETVAL(clSetKernelArg(larger_sad_calc_8, 2, sizeof(int),
|
||||||
|
&image_height_macroblocks));
|
||||||
|
|
||||||
|
OCL_ERRCK_RETVAL(
|
||||||
|
clSetKernelArg(larger_sad_calc_16, 0, sizeof(cl_mem), (void *)&d_sads));
|
||||||
|
OCL_ERRCK_RETVAL(clSetKernelArg(larger_sad_calc_16, 1, sizeof(int),
|
||||||
|
&image_width_macroblocks));
|
||||||
|
OCL_ERRCK_RETVAL(clSetKernelArg(larger_sad_calc_16, 2, sizeof(int),
|
||||||
|
&image_height_macroblocks));
|
||||||
|
|
||||||
|
size_t mb_sad_calc_localWorkSize[2] = {
|
||||||
|
CEIL(MAX_POS, POS_PER_THREAD) * THREADS_W * THREADS_H, 1};
|
||||||
|
size_t mb_sad_calc_globalWorkSize[2] = {
|
||||||
|
mb_sad_calc_localWorkSize[0] * CEIL(ref_image->width / 4, THREADS_W),
|
||||||
|
mb_sad_calc_localWorkSize[1] * CEIL(ref_image->height / 4, THREADS_H)};
|
||||||
|
|
||||||
|
size_t larger_sad_calc_8_localWorkSize[2] = {32, 4};
|
||||||
|
size_t larger_sad_calc_8_globalWorkSize[2] = {image_width_macroblocks * 32,
|
||||||
|
image_height_macroblocks * 4};
|
||||||
|
|
||||||
|
size_t larger_sad_calc_16_localWorkSize[2] = {32, 1};
|
||||||
|
size_t larger_sad_calc_16_globalWorkSize[2] = {image_width_macroblocks * 32,
|
||||||
|
image_height_macroblocks * 1};
|
||||||
|
|
||||||
|
printf("Ok+++\n");
|
||||||
|
|
||||||
|
pb_SwitchToTimer(&timers, pb_TimerID_KERNEL);
|
||||||
|
|
||||||
|
/* Run the 4x4 kernel */
|
||||||
|
OCL_ERRCK_RETVAL(clEnqueueNDRangeKernel(clCommandQueue, mb_sad_calc, 2, 0,
|
||||||
|
mb_sad_calc_globalWorkSize,
|
||||||
|
mb_sad_calc_localWorkSize, 0, 0, 0));
|
||||||
|
|
||||||
|
/* Run the larger-blocks kernels */
|
||||||
|
OCL_ERRCK_RETVAL(clEnqueueNDRangeKernel(
|
||||||
|
clCommandQueue, larger_sad_calc_8, 2, 0, larger_sad_calc_8_globalWorkSize,
|
||||||
|
larger_sad_calc_8_localWorkSize, 0, 0, 0));
|
||||||
|
|
||||||
|
OCL_ERRCK_RETVAL(clEnqueueNDRangeKernel(clCommandQueue, larger_sad_calc_16, 2,
|
||||||
|
0, larger_sad_calc_16_globalWorkSize,
|
||||||
|
larger_sad_calc_16_localWorkSize, 0,
|
||||||
|
0, 0));
|
||||||
|
|
||||||
|
OCL_ERRCK_RETVAL(clFinish(clCommandQueue));
|
||||||
|
pb_SwitchToTimer(&timers, pb_TimerID_COPY);
|
||||||
|
|
||||||
|
/* Transfer SAD data to the host */
|
||||||
|
OCL_ERRCK_RETVAL(clEnqueueReadBuffer(
|
||||||
|
clCommandQueue, d_sads, CL_TRUE, 0,
|
||||||
|
41 * MAX_POS_PADDED * image_size_macroblocks * sizeof(unsigned short),
|
||||||
|
sads_computed, 0, NULL, NULL));
|
||||||
|
|
||||||
|
/* Free GPU memory */
|
||||||
|
OCL_ERRCK_RETVAL(clReleaseKernel(larger_sad_calc_8));
|
||||||
|
OCL_ERRCK_RETVAL(clReleaseKernel(larger_sad_calc_16));
|
||||||
|
OCL_ERRCK_RETVAL(clReleaseProgram(clProgram));
|
||||||
|
|
||||||
|
OCL_ERRCK_RETVAL(clReleaseMemObject(d_sads));
|
||||||
|
OCL_ERRCK_RETVAL(clReleaseMemObject(imgRef));
|
||||||
|
OCL_ERRCK_RETVAL(clReleaseMemObject(d_cur_image));
|
||||||
|
|
||||||
|
OCL_ERRCK_RETVAL(clFinish(clCommandQueue));
|
||||||
|
pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
|
||||||
|
|
||||||
|
// ************************************************************************
|
||||||
|
// End GPU Code
|
||||||
|
|
||||||
|
/* Print output */
|
||||||
|
if (params->outFile) {
|
||||||
|
pb_SwitchToTimer(&timers, pb_TimerID_IO);
|
||||||
|
write_sads(params->outFile, image_width_macroblocks,
|
||||||
|
image_height_macroblocks, sads_computed);
|
||||||
|
pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
|
||||||
|
}
|
||||||
|
|
||||||
|
#if 0 /* Debugging */
|
||||||
|
print_test_sads(sads_computed, image_size_macroblocks);
|
||||||
|
write_sads_directly("sad-debug.bin",
|
||||||
|
ref_image->width / 16, ref_image->height / 16,
|
||||||
|
sads_computed);
|
||||||
|
#endif
|
||||||
|
|
||||||
|
/* Free memory */
|
||||||
|
free(sads_computed);
|
||||||
|
free_image(ref_image);
|
||||||
|
free_image(cur_image);
|
||||||
|
|
||||||
|
pb_SwitchToTimer(&timers, pb_TimerID_NONE);
|
||||||
|
pb_PrintTimerSet(&timers);
|
||||||
|
pb_FreeParameters(params);
|
||||||
|
|
||||||
|
OCL_ERRCK_RETVAL(clReleaseCommandQueue(clCommandQueue));
|
||||||
|
OCL_ERRCK_RETVAL(clReleaseContext(clContext));
|
||||||
|
|
||||||
|
pb_DestroyTimerSet(&timers);
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
50
benchmarks/opencl/sad/ocl.c
Normal file
50
benchmarks/opencl/sad/ocl.c
Normal file
@@ -0,0 +1,50 @@
|
|||||||
|
#include <CL/cl.h>
|
||||||
|
#include <stdlib.h>
|
||||||
|
#include <stdio.h>
|
||||||
|
#include <string.h>
|
||||||
|
#include "ocl.h"
|
||||||
|
|
||||||
|
char* readFile(const char* fileName)
|
||||||
|
{
|
||||||
|
FILE* fp;
|
||||||
|
fp = fopen(fileName,"r");
|
||||||
|
if(fp == NULL)
|
||||||
|
{
|
||||||
|
printf("Error 1!\n");
|
||||||
|
exit(1);
|
||||||
|
}
|
||||||
|
|
||||||
|
fseek(fp,0,SEEK_END);
|
||||||
|
long size = ftell(fp);
|
||||||
|
rewind(fp);
|
||||||
|
|
||||||
|
char* buffer = (char*)malloc(sizeof(char)*(size+1));
|
||||||
|
if(buffer == NULL)
|
||||||
|
{
|
||||||
|
printf("Error 2!\n");
|
||||||
|
fclose(fp);
|
||||||
|
exit(1);
|
||||||
|
}
|
||||||
|
|
||||||
|
size_t res = fread(buffer,1,size,fp);
|
||||||
|
if(res != size)
|
||||||
|
{
|
||||||
|
printf("Error 3!\n");
|
||||||
|
fclose(fp);
|
||||||
|
exit(1);
|
||||||
|
}
|
||||||
|
|
||||||
|
buffer[size] = 0;
|
||||||
|
fclose(fp);
|
||||||
|
return buffer;
|
||||||
|
}
|
||||||
|
|
||||||
|
void clMemSet(cl_command_queue clCommandQueue, cl_mem buf, int val, size_t size)
|
||||||
|
{
|
||||||
|
cl_int clStatus;
|
||||||
|
char* temp = (char*)malloc(size);
|
||||||
|
memset(temp,val,size);
|
||||||
|
clStatus = clEnqueueWriteBuffer(clCommandQueue,buf,CL_TRUE,0,size,temp,0,NULL,NULL);
|
||||||
|
CHECK_ERROR("clEnqueueWriteBuffer")
|
||||||
|
free(temp);
|
||||||
|
}
|
||||||
21
benchmarks/opencl/sad/ocl.h
Normal file
21
benchmarks/opencl/sad/ocl.h
Normal file
@@ -0,0 +1,21 @@
|
|||||||
|
#ifndef __OCLH__
|
||||||
|
#define __OCLH__
|
||||||
|
|
||||||
|
typedef struct {
|
||||||
|
cl_uint major;
|
||||||
|
cl_uint minor;
|
||||||
|
cl_uint multiProcessorCount;
|
||||||
|
} OpenCLDeviceProp;
|
||||||
|
|
||||||
|
void clMemSet(cl_command_queue, cl_mem, int, size_t);
|
||||||
|
char* readFile(const char*);
|
||||||
|
|
||||||
|
#define CHECK_ERROR(errorMessage) \
|
||||||
|
if(clStatus != CL_SUCCESS) \
|
||||||
|
{ \
|
||||||
|
printf("Error: %s!\n",errorMessage); \
|
||||||
|
printf("Line: %d\n",__LINE__); \
|
||||||
|
exit(1); \
|
||||||
|
}
|
||||||
|
|
||||||
|
#endif
|
||||||
427
benchmarks/opencl/sad/parboil.c
Normal file
427
benchmarks/opencl/sad/parboil.c
Normal file
@@ -0,0 +1,427 @@
|
|||||||
|
/*
|
||||||
|
* (c) 2007 The Board of Trustees of the University of Illinois.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include <parboil.h>
|
||||||
|
#include <stdlib.h>
|
||||||
|
#include <string.h>
|
||||||
|
#include <stdio.h>
|
||||||
|
|
||||||
|
#if _POSIX_VERSION >= 200112L
|
||||||
|
# include <sys/time.h>
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
|
/*****************************************************************************/
|
||||||
|
/* Timer routines */
|
||||||
|
|
||||||
|
static void
|
||||||
|
accumulate_time(pb_Timestamp *accum,
|
||||||
|
pb_Timestamp start,
|
||||||
|
pb_Timestamp end)
|
||||||
|
{
|
||||||
|
#if _POSIX_VERSION >= 200112L
|
||||||
|
*accum += end - start;
|
||||||
|
#else
|
||||||
|
# error "Timestamps not implemented for this system"
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
#if _POSIX_VERSION >= 200112L
|
||||||
|
static pb_Timestamp get_time()
|
||||||
|
{
|
||||||
|
struct timeval tv;
|
||||||
|
gettimeofday(&tv, NULL);
|
||||||
|
return (pb_Timestamp) (tv.tv_sec * 1000000LL + tv.tv_usec);
|
||||||
|
}
|
||||||
|
#else
|
||||||
|
# error "no supported time libraries are available on this platform"
|
||||||
|
#endif
|
||||||
|
|
||||||
|
void
|
||||||
|
pb_ResetTimer(struct pb_Timer *timer)
|
||||||
|
{
|
||||||
|
timer->state = pb_Timer_STOPPED;
|
||||||
|
|
||||||
|
#if _POSIX_VERSION >= 200112L
|
||||||
|
timer->elapsed = 0;
|
||||||
|
#else
|
||||||
|
# error "pb_ResetTimer: not implemented for this system"
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
void
|
||||||
|
pb_StartTimer(struct pb_Timer *timer)
|
||||||
|
{
|
||||||
|
if (timer->state != pb_Timer_STOPPED) {
|
||||||
|
fputs("Ignoring attempt to start a running timer\n", stderr);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
timer->state = pb_Timer_RUNNING;
|
||||||
|
|
||||||
|
#if _POSIX_VERSION >= 200112L
|
||||||
|
{
|
||||||
|
struct timeval tv;
|
||||||
|
gettimeofday(&tv, NULL);
|
||||||
|
timer->init = tv.tv_sec * 1000000LL + tv.tv_usec;
|
||||||
|
}
|
||||||
|
#else
|
||||||
|
# error "pb_StartTimer: not implemented for this system"
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
void
|
||||||
|
pb_StartTimerAndSubTimer(struct pb_Timer *timer, struct pb_Timer *subtimer)
|
||||||
|
{
|
||||||
|
unsigned int numNotStopped = 0x3; // 11
|
||||||
|
if (timer->state != pb_Timer_STOPPED) {
|
||||||
|
fputs("Warning: Timer was not stopped\n", stderr);
|
||||||
|
numNotStopped &= 0x1; // Zero out 2^1
|
||||||
|
}
|
||||||
|
if (subtimer->state != pb_Timer_STOPPED) {
|
||||||
|
fputs("Warning: Subtimer was not stopped\n", stderr);
|
||||||
|
numNotStopped &= 0x2; // Zero out 2^0
|
||||||
|
}
|
||||||
|
if (numNotStopped == 0x0) {
|
||||||
|
fputs("Ignoring attempt to start running timer and subtimer\n", stderr);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
timer->state = pb_Timer_RUNNING;
|
||||||
|
subtimer->state = pb_Timer_RUNNING;
|
||||||
|
|
||||||
|
#if _POSIX_VERSION >= 200112L
|
||||||
|
{
|
||||||
|
struct timeval tv;
|
||||||
|
gettimeofday(&tv, NULL);
|
||||||
|
|
||||||
|
if (numNotStopped & 0x2) {
|
||||||
|
timer->init = tv.tv_sec * 1000000LL + tv.tv_usec;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (numNotStopped & 0x1) {
|
||||||
|
subtimer->init = tv.tv_sec * 1000000LL + tv.tv_usec;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
#else
|
||||||
|
# error "pb_StartTimer: not implemented for this system"
|
||||||
|
#endif
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
void
|
||||||
|
pb_StopTimer(struct pb_Timer *timer)
|
||||||
|
{
|
||||||
|
|
||||||
|
pb_Timestamp fini;
|
||||||
|
|
||||||
|
if (timer->state != pb_Timer_RUNNING) {
|
||||||
|
fputs("Ignoring attempt to stop a stopped timer\n", stderr);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
timer->state = pb_Timer_STOPPED;
|
||||||
|
|
||||||
|
#if _POSIX_VERSION >= 200112L
|
||||||
|
{
|
||||||
|
struct timeval tv;
|
||||||
|
gettimeofday(&tv, NULL);
|
||||||
|
fini = tv.tv_sec * 1000000LL + tv.tv_usec;
|
||||||
|
}
|
||||||
|
#else
|
||||||
|
# error "pb_StopTimer: not implemented for this system"
|
||||||
|
#endif
|
||||||
|
|
||||||
|
accumulate_time(&timer->elapsed, timer->init, fini);
|
||||||
|
timer->init = fini;
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
void pb_StopTimerAndSubTimer(struct pb_Timer *timer, struct pb_Timer *subtimer) {
|
||||||
|
|
||||||
|
pb_Timestamp fini;
|
||||||
|
|
||||||
|
unsigned int numNotRunning = 0x3; // 0b11
|
||||||
|
if (timer->state != pb_Timer_RUNNING) {
|
||||||
|
fputs("Warning: Timer was not running\n", stderr);
|
||||||
|
numNotRunning &= 0x1; // Zero out 2^1
|
||||||
|
}
|
||||||
|
if (subtimer->state != pb_Timer_RUNNING) {
|
||||||
|
fputs("Warning: Subtimer was not running\n", stderr);
|
||||||
|
numNotRunning &= 0x2; // Zero out 2^0
|
||||||
|
}
|
||||||
|
if (numNotRunning == 0x0) {
|
||||||
|
fputs("Ignoring attempt to stop stopped timer and subtimer\n", stderr);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
timer->state = pb_Timer_STOPPED;
|
||||||
|
subtimer->state = pb_Timer_STOPPED;
|
||||||
|
|
||||||
|
#if _POSIX_VERSION >= 200112L
|
||||||
|
{
|
||||||
|
struct timeval tv;
|
||||||
|
gettimeofday(&tv, NULL);
|
||||||
|
fini = tv.tv_sec * 1000000LL + tv.tv_usec;
|
||||||
|
}
|
||||||
|
#else
|
||||||
|
# error "pb_StopTimer: not implemented for this system"
|
||||||
|
#endif
|
||||||
|
|
||||||
|
if (numNotRunning & 0x2) {
|
||||||
|
accumulate_time(&timer->elapsed, timer->init, fini);
|
||||||
|
timer->init = fini;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (numNotRunning & 0x1) {
|
||||||
|
accumulate_time(&subtimer->elapsed, subtimer->init, fini);
|
||||||
|
subtimer->init = fini;
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Get the elapsed time in seconds. */
|
||||||
|
double
|
||||||
|
pb_GetElapsedTime(struct pb_Timer *timer)
|
||||||
|
{
|
||||||
|
double ret;
|
||||||
|
|
||||||
|
if (timer->state != pb_Timer_STOPPED) {
|
||||||
|
fputs("Elapsed time from a running timer is inaccurate\n", stderr);
|
||||||
|
}
|
||||||
|
|
||||||
|
#if _POSIX_VERSION >= 200112L
|
||||||
|
ret = timer->elapsed / 1e6;
|
||||||
|
#else
|
||||||
|
# error "pb_GetElapsedTime: not implemented for this system"
|
||||||
|
#endif
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
|
void
|
||||||
|
pb_InitializeTimerSet(struct pb_TimerSet *timers)
|
||||||
|
{
|
||||||
|
int n;
|
||||||
|
|
||||||
|
timers->wall_begin = get_time();
|
||||||
|
|
||||||
|
timers->current = pb_TimerID_NONE;
|
||||||
|
|
||||||
|
timers->async_markers = NULL;
|
||||||
|
|
||||||
|
|
||||||
|
for (n = 0; n < pb_TimerID_LAST; n++) {
|
||||||
|
pb_ResetTimer(&timers->timers[n]);
|
||||||
|
timers->sub_timer_list[n] = NULL; // free first?
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void
|
||||||
|
pb_AddSubTimer(struct pb_TimerSet *timers, char *label, enum pb_TimerID pb_Category) {
|
||||||
|
|
||||||
|
struct pb_SubTimer *subtimer = (struct pb_SubTimer *) malloc
|
||||||
|
(sizeof(struct pb_SubTimer));
|
||||||
|
|
||||||
|
int len = strlen(label);
|
||||||
|
|
||||||
|
subtimer->label = (char *) malloc (sizeof(char)*(len+1));
|
||||||
|
sprintf(subtimer->label, "%s\0", label);
|
||||||
|
|
||||||
|
pb_ResetTimer(&subtimer->timer);
|
||||||
|
subtimer->next = NULL;
|
||||||
|
|
||||||
|
struct pb_SubTimerList *subtimerlist = timers->sub_timer_list[pb_Category];
|
||||||
|
if (subtimerlist == NULL) {
|
||||||
|
subtimerlist = (struct pb_SubTimerList *) malloc
|
||||||
|
(sizeof(struct pb_SubTimerList));
|
||||||
|
subtimerlist->subtimer_list = subtimer;
|
||||||
|
timers->sub_timer_list[pb_Category] = subtimerlist;
|
||||||
|
} else {
|
||||||
|
// Append to list
|
||||||
|
struct pb_SubTimer *element = subtimerlist->subtimer_list;
|
||||||
|
while (element->next != NULL) {
|
||||||
|
element = element->next;
|
||||||
|
}
|
||||||
|
element->next = subtimer;
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
void
|
||||||
|
pb_SwitchToSubTimer(struct pb_TimerSet *timers, char *label, enum pb_TimerID category)
|
||||||
|
{
|
||||||
|
|
||||||
|
// switchToSub( NULL, NONE
|
||||||
|
// switchToSub( NULL, some
|
||||||
|
// switchToSub( some, some
|
||||||
|
// switchToSub( some, NONE -- tries to find "some" in NONE's sublist, which won't be printed
|
||||||
|
|
||||||
|
struct pb_Timer *topLevelToStop = NULL;
|
||||||
|
if (timers->current != category && timers->current != pb_TimerID_NONE) {
|
||||||
|
// Switching to subtimer in a different category needs to stop the top-level current, different categoried timer.
|
||||||
|
// NONE shouldn't have a timer associated with it, so exclude from branch
|
||||||
|
topLevelToStop = &timers->timers[timers->current];
|
||||||
|
}
|
||||||
|
|
||||||
|
struct pb_SubTimerList *subtimerlist = timers->sub_timer_list[timers->current];
|
||||||
|
struct pb_SubTimer *curr = (subtimerlist == NULL) ? NULL : subtimerlist->current;
|
||||||
|
|
||||||
|
if (timers->current != pb_TimerID_NONE) {
|
||||||
|
if (curr != NULL && topLevelToStop != NULL) {
|
||||||
|
pb_StopTimerAndSubTimer(topLevelToStop, &curr->timer);
|
||||||
|
} else if (curr != NULL) {
|
||||||
|
pb_StopTimer(&curr->timer);
|
||||||
|
} else {
|
||||||
|
pb_StopTimer(topLevelToStop);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
subtimerlist = timers->sub_timer_list[category];
|
||||||
|
struct pb_SubTimer *subtimer = NULL;
|
||||||
|
|
||||||
|
if (label != NULL) {
|
||||||
|
subtimer = subtimerlist->subtimer_list;
|
||||||
|
while (subtimer != NULL) {
|
||||||
|
if (strcmp(subtimer->label, label) == 0) {
|
||||||
|
break;
|
||||||
|
} else {
|
||||||
|
subtimer = subtimer->next;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (category != pb_TimerID_NONE) {
|
||||||
|
|
||||||
|
if (subtimerlist != NULL) {
|
||||||
|
subtimerlist->current = subtimer;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (category != timers->current && subtimer != NULL) {
|
||||||
|
pb_StartTimerAndSubTimer(&timers->timers[category], &subtimer->timer);
|
||||||
|
} else if (subtimer != NULL) {
|
||||||
|
// Same category, different non-NULL subtimer
|
||||||
|
pb_StartTimer(&subtimer->timer);
|
||||||
|
} else{
|
||||||
|
// Different category, but no subtimer (not found or specified as NULL) -- unprefered way of setting topLevel timer
|
||||||
|
pb_StartTimer(&timers->timers[category]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
timers->current = category;
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
void
|
||||||
|
pb_SwitchToTimer(struct pb_TimerSet *timers, enum pb_TimerID timer)
|
||||||
|
{
|
||||||
|
/* Stop the currently running timer */
|
||||||
|
/*if (timers->current != pb_TimerID_NONE) {
|
||||||
|
struct pb_SubTimer *currSubTimer = NULL;
|
||||||
|
struct pb_SubTimerList *subtimerlist = timers->sub_timer_list[timers->current];
|
||||||
|
|
||||||
|
if ( subtimerlist != NULL) {
|
||||||
|
currSubTimer = timers->sub_timer_list[timers->current]->current;
|
||||||
|
}
|
||||||
|
if ( currSubTimer!= NULL) {
|
||||||
|
pb_StopTimerAndSubTimer(&timers->timers[timers->current], &currSubTimer->timer);
|
||||||
|
} else {
|
||||||
|
pb_StopTimer(&timers->timers[timers->current]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
timers->current = timer;
|
||||||
|
|
||||||
|
if (timer != pb_TimerID_NONE) {
|
||||||
|
pb_StartTimer(&timers->timers[timer]);
|
||||||
|
}*/
|
||||||
|
}
|
||||||
|
|
||||||
|
void
|
||||||
|
pb_PrintTimerSet(struct pb_TimerSet *timers)
|
||||||
|
{
|
||||||
|
|
||||||
|
pb_Timestamp wall_end = get_time();
|
||||||
|
|
||||||
|
struct pb_Timer *t = timers->timers;
|
||||||
|
struct pb_SubTimer* sub = NULL;
|
||||||
|
|
||||||
|
int maxSubLength;
|
||||||
|
|
||||||
|
const char *categories[] = {
|
||||||
|
"IO", "Kernel", "Copy", "Driver", "Copy Async", "Compute"
|
||||||
|
};
|
||||||
|
|
||||||
|
const int maxCategoryLength = 10;
|
||||||
|
|
||||||
|
int i;
|
||||||
|
for(i = 1; i < pb_TimerID_LAST-1; ++i) { // exclude NONE and OVRELAP from this format
|
||||||
|
if(pb_GetElapsedTime(&t[i]) != 0) {
|
||||||
|
|
||||||
|
// Print Category Timer
|
||||||
|
printf("%-*s: %f\n", maxCategoryLength, categories[i-1], pb_GetElapsedTime(&t[i]));
|
||||||
|
|
||||||
|
if (timers->sub_timer_list[i] != NULL) {
|
||||||
|
sub = timers->sub_timer_list[i]->subtimer_list;
|
||||||
|
maxSubLength = 0;
|
||||||
|
while (sub != NULL) {
|
||||||
|
// Find longest SubTimer label
|
||||||
|
if (strlen(sub->label) > maxSubLength) {
|
||||||
|
maxSubLength = strlen(sub->label);
|
||||||
|
}
|
||||||
|
sub = sub->next;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Fit to Categories
|
||||||
|
if (maxSubLength <= maxCategoryLength) {
|
||||||
|
maxSubLength = maxCategoryLength;
|
||||||
|
}
|
||||||
|
|
||||||
|
sub = timers->sub_timer_list[i]->subtimer_list;
|
||||||
|
|
||||||
|
// Print SubTimers
|
||||||
|
while (sub != NULL) {
|
||||||
|
printf(" -%-*s: %f\n", maxSubLength, sub->label, pb_GetElapsedTime(&sub->timer));
|
||||||
|
sub = sub->next;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if(pb_GetElapsedTime(&t[pb_TimerID_OVERLAP]) != 0)
|
||||||
|
printf("CPU/Kernel Overlap: %f\n", pb_GetElapsedTime(&t[pb_TimerID_OVERLAP]));
|
||||||
|
|
||||||
|
float walltime = (wall_end - timers->wall_begin)/ 1e6;
|
||||||
|
printf("Timer Wall Time: %f\n", walltime);
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
void pb_DestroyTimerSet(struct pb_TimerSet * timers)
|
||||||
|
{
|
||||||
|
/* clean up all of the async event markers */
|
||||||
|
struct pb_async_time_marker_list ** event = &(timers->async_markers);
|
||||||
|
while( *event != NULL) {
|
||||||
|
struct pb_async_time_marker_list ** next = &((*event)->next);
|
||||||
|
free(*event);
|
||||||
|
(*event) = NULL;
|
||||||
|
event = next;
|
||||||
|
}
|
||||||
|
|
||||||
|
int i = 0;
|
||||||
|
for(i = 0; i < pb_TimerID_LAST; ++i) {
|
||||||
|
if (timers->sub_timer_list[i] != NULL) {
|
||||||
|
struct pb_SubTimer *subtimer = timers->sub_timer_list[i]->subtimer_list;
|
||||||
|
struct pb_SubTimer *prev = NULL;
|
||||||
|
while (subtimer != NULL) {
|
||||||
|
free(subtimer->label);
|
||||||
|
prev = subtimer;
|
||||||
|
subtimer = subtimer->next;
|
||||||
|
free(prev);
|
||||||
|
}
|
||||||
|
free(timers->sub_timer_list[i]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
348
benchmarks/opencl/sad/parboil.h
Normal file
348
benchmarks/opencl/sad/parboil.h
Normal file
@@ -0,0 +1,348 @@
|
|||||||
|
/*
|
||||||
|
* (c) 2010 The Board of Trustees of the University of Illinois.
|
||||||
|
*/
|
||||||
|
#ifndef PARBOIL_HEADER
|
||||||
|
#define PARBOIL_HEADER
|
||||||
|
|
||||||
|
#include <stdio.h>
|
||||||
|
#include <string.h>
|
||||||
|
|
||||||
|
#ifdef __cplusplus
|
||||||
|
extern "C" {
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#include <unistd.h>
|
||||||
|
|
||||||
|
/* A platform as specified by the user on the command line */
|
||||||
|
struct pb_PlatformParam {
|
||||||
|
char *name; /* The platform name. This string is owned. */
|
||||||
|
char *version; /* The platform version; may be NULL.
|
||||||
|
* This string is owned. */
|
||||||
|
};
|
||||||
|
|
||||||
|
/* Create a PlatformParam from the given strings.
|
||||||
|
* 'name' must not be NULL. 'version' may be NULL.
|
||||||
|
* If not NULL, the strings should have been allocated by malloc(),
|
||||||
|
* and they will be owned by the returned object.
|
||||||
|
*/
|
||||||
|
struct pb_PlatformParam *
|
||||||
|
pb_PlatformParam(char *name, char *version);
|
||||||
|
|
||||||
|
void
|
||||||
|
pb_FreePlatformParam(struct pb_PlatformParam *);
|
||||||
|
|
||||||
|
/* A criterion for how to select a device */
|
||||||
|
enum pb_DeviceSelectionCriterion {
|
||||||
|
pb_Device_INDEX, /* Enumerate the devices and select one
|
||||||
|
* by its number */
|
||||||
|
pb_Device_CPU, /* Select a CPU device */
|
||||||
|
pb_Device_GPU, /* Select a GPU device */
|
||||||
|
pb_Device_ACCELERATOR, /* Select an accelerator device */
|
||||||
|
pb_Device_NAME /* Select a device by name */
|
||||||
|
};
|
||||||
|
|
||||||
|
/* A device as specified by the user on the command line */
|
||||||
|
struct pb_DeviceParam {
|
||||||
|
enum pb_DeviceSelectionCriterion criterion;
|
||||||
|
union {
|
||||||
|
int index; /* If criterion == pb_Device_INDEX,
|
||||||
|
* the index of the device */
|
||||||
|
char *name; /* If criterion == pb_Device_NAME,
|
||||||
|
* the name of the device.
|
||||||
|
* This string is owned. */
|
||||||
|
};
|
||||||
|
};
|
||||||
|
|
||||||
|
struct pb_DeviceParam *
|
||||||
|
pb_DeviceParam_index(int index);
|
||||||
|
|
||||||
|
struct pb_DeviceParam *
|
||||||
|
pb_DeviceParam_cpu(void);
|
||||||
|
|
||||||
|
struct pb_DeviceParam *
|
||||||
|
pb_DeviceParam_gpu(void);
|
||||||
|
|
||||||
|
struct pb_DeviceParam *
|
||||||
|
pb_DeviceParam_accelerator(void);
|
||||||
|
|
||||||
|
/* Create a by-name device selection criterion.
|
||||||
|
* The string should have been allocated by malloc(), and it will will be
|
||||||
|
* owned by the returned object.
|
||||||
|
*/
|
||||||
|
struct pb_DeviceParam *
|
||||||
|
pb_DeviceParam_name(char *name);
|
||||||
|
|
||||||
|
void
|
||||||
|
pb_FreeDeviceParam(struct pb_DeviceParam *);
|
||||||
|
|
||||||
|
/* Command line parameters for benchmarks */
|
||||||
|
struct pb_Parameters {
|
||||||
|
char *outFile; /* If not NULL, the raw output of the
|
||||||
|
* computation should be saved to this
|
||||||
|
* file. The string is owned. */
|
||||||
|
char **inpFiles; /* A NULL-terminated array of strings
|
||||||
|
* holding the input file(s) for the
|
||||||
|
* computation. The array and strings
|
||||||
|
* are owned. */
|
||||||
|
struct pb_PlatformParam *platform; /* If not NULL, the platform
|
||||||
|
* specified on the command line. */
|
||||||
|
struct pb_DeviceParam *device; /* If not NULL, the device
|
||||||
|
* specified on the command line. */
|
||||||
|
};
|
||||||
|
|
||||||
|
/* Read command-line parameters.
|
||||||
|
*
|
||||||
|
* The argc and argv parameters to main are read, and any parameters
|
||||||
|
* interpreted by this function are removed from the argument list.
|
||||||
|
*
|
||||||
|
* A new instance of struct pb_Parameters is returned.
|
||||||
|
* If there is an error, then an error message is printed on stderr
|
||||||
|
* and NULL is returned.
|
||||||
|
*/
|
||||||
|
struct pb_Parameters *
|
||||||
|
pb_ReadParameters(int *_argc, char **argv);
|
||||||
|
|
||||||
|
/* Free an instance of struct pb_Parameters.
|
||||||
|
*/
|
||||||
|
void
|
||||||
|
pb_FreeParameters(struct pb_Parameters *p);
|
||||||
|
|
||||||
|
void
|
||||||
|
pb_FreeStringArray(char **);
|
||||||
|
|
||||||
|
/* Count the number of input files in a pb_Parameters instance.
|
||||||
|
*/
|
||||||
|
int
|
||||||
|
pb_Parameters_CountInputs(struct pb_Parameters *p);
|
||||||
|
|
||||||
|
/* A time or duration. */
|
||||||
|
//#if _POSIX_VERSION >= 200112L
|
||||||
|
typedef unsigned long long pb_Timestamp; /* time in microseconds */
|
||||||
|
//#else
|
||||||
|
//# error "Timestamps not implemented"
|
||||||
|
//#endif
|
||||||
|
|
||||||
|
enum pb_TimerState {
|
||||||
|
pb_Timer_STOPPED,
|
||||||
|
pb_Timer_RUNNING,
|
||||||
|
};
|
||||||
|
|
||||||
|
struct pb_Timer {
|
||||||
|
enum pb_TimerState state;
|
||||||
|
pb_Timestamp elapsed; /* Amount of time elapsed so far */
|
||||||
|
pb_Timestamp init; /* Beginning of the current time interval,
|
||||||
|
* if state is RUNNING. End of the last
|
||||||
|
* recorded time interfal otherwise. */
|
||||||
|
};
|
||||||
|
|
||||||
|
/* Reset a timer.
|
||||||
|
* Use this to initialize a timer or to clear
|
||||||
|
* its elapsed time. The reset timer is stopped.
|
||||||
|
*/
|
||||||
|
void
|
||||||
|
pb_ResetTimer(struct pb_Timer *timer);
|
||||||
|
|
||||||
|
/* Start a timer. The timer is set to RUNNING mode and
|
||||||
|
* time elapsed while the timer is running is added to
|
||||||
|
* the timer.
|
||||||
|
* The timer should not already be running.
|
||||||
|
*/
|
||||||
|
void
|
||||||
|
pb_StartTimer(struct pb_Timer *timer);
|
||||||
|
|
||||||
|
/* Stop a timer.
|
||||||
|
* This stops adding elapsed time to the timer.
|
||||||
|
* The timer should not already be stopped.
|
||||||
|
*/
|
||||||
|
void
|
||||||
|
pb_StopTimer(struct pb_Timer *timer);
|
||||||
|
|
||||||
|
/* Get the elapsed time in seconds. */
|
||||||
|
double
|
||||||
|
pb_GetElapsedTime(struct pb_Timer *timer);
|
||||||
|
|
||||||
|
/* Execution time is assigned to one of these categories. */
|
||||||
|
enum pb_TimerID {
|
||||||
|
pb_TimerID_NONE = 0,
|
||||||
|
pb_TimerID_IO, /* Time spent in input/output */
|
||||||
|
pb_TimerID_KERNEL, /* Time spent computing on the device,
|
||||||
|
* recorded asynchronously */
|
||||||
|
pb_TimerID_COPY, /* Time spent synchronously moving data
|
||||||
|
* to/from device and allocating/freeing
|
||||||
|
* memory on the device */
|
||||||
|
pb_TimerID_DRIVER, /* Time spent in the host interacting with the
|
||||||
|
* driver, primarily for recording the time
|
||||||
|
* spent queueing asynchronous operations */
|
||||||
|
pb_TimerID_COPY_ASYNC, /* Time spent in asynchronous transfers */
|
||||||
|
pb_TimerID_COMPUTE, /* Time for all program execution other
|
||||||
|
* than parsing command line arguments,
|
||||||
|
* I/O, kernel, and copy */
|
||||||
|
pb_TimerID_OVERLAP, /* Time double-counted in asynchronous and
|
||||||
|
* host activity: automatically filled in,
|
||||||
|
* not intended for direct usage */
|
||||||
|
pb_TimerID_LAST /* Number of timer IDs */
|
||||||
|
};
|
||||||
|
|
||||||
|
/* Dynamic list of asynchronously tracked times between events */
|
||||||
|
struct pb_async_time_marker_list {
|
||||||
|
char *label; // actually just a pointer to a string
|
||||||
|
enum pb_TimerID timerID; /* The ID to which the interval beginning
|
||||||
|
* with this marker should be attributed */
|
||||||
|
void * marker;
|
||||||
|
//cudaEvent_t marker; /* The driver event for this marker */
|
||||||
|
struct pb_async_time_marker_list *next;
|
||||||
|
};
|
||||||
|
|
||||||
|
struct pb_SubTimer {
|
||||||
|
char *label;
|
||||||
|
struct pb_Timer timer;
|
||||||
|
struct pb_SubTimer *next;
|
||||||
|
};
|
||||||
|
|
||||||
|
struct pb_SubTimerList {
|
||||||
|
struct pb_SubTimer *current;
|
||||||
|
struct pb_SubTimer *subtimer_list;
|
||||||
|
};
|
||||||
|
|
||||||
|
/* A set of timers for recording execution times. */
|
||||||
|
struct pb_TimerSet {
|
||||||
|
enum pb_TimerID current;
|
||||||
|
struct pb_async_time_marker_list* async_markers;
|
||||||
|
pb_Timestamp async_begin;
|
||||||
|
pb_Timestamp wall_begin;
|
||||||
|
struct pb_Timer timers[pb_TimerID_LAST];
|
||||||
|
struct pb_SubTimerList *sub_timer_list[pb_TimerID_LAST];
|
||||||
|
};
|
||||||
|
|
||||||
|
/* Reset all timers in the set. */
|
||||||
|
void
|
||||||
|
pb_InitializeTimerSet(struct pb_TimerSet *timers);
|
||||||
|
|
||||||
|
void
|
||||||
|
pb_AddSubTimer(struct pb_TimerSet *timers, char *label, enum pb_TimerID pb_Category);
|
||||||
|
|
||||||
|
/* Select which timer the next interval of time should be accounted
|
||||||
|
* to. The selected timer is started and other timers are stopped.
|
||||||
|
* Using pb_TimerID_NONE stops all timers. */
|
||||||
|
void
|
||||||
|
pb_SwitchToTimer(struct pb_TimerSet *timers, enum pb_TimerID timer);
|
||||||
|
|
||||||
|
void
|
||||||
|
pb_SwitchToSubTimer(struct pb_TimerSet *timers, char *label, enum pb_TimerID category);
|
||||||
|
|
||||||
|
/* Print timer values to standard output. */
|
||||||
|
void
|
||||||
|
pb_PrintTimerSet(struct pb_TimerSet *timers);
|
||||||
|
|
||||||
|
/* Release timer resources */
|
||||||
|
void
|
||||||
|
pb_DestroyTimerSet(struct pb_TimerSet * timers);
|
||||||
|
|
||||||
|
void
|
||||||
|
pb_SetOpenCL(void *clContextPtr, void *clCommandQueuePtr);
|
||||||
|
|
||||||
|
|
||||||
|
typedef struct pb_Device_tag {
|
||||||
|
char* name;
|
||||||
|
void* clDevice;
|
||||||
|
int id;
|
||||||
|
unsigned int in_use;
|
||||||
|
unsigned int available;
|
||||||
|
} pb_Device;
|
||||||
|
|
||||||
|
struct pb_Context_tag;
|
||||||
|
typedef struct pb_Context_tag pb_Context;
|
||||||
|
|
||||||
|
typedef struct pb_Platform_tag {
|
||||||
|
char* name;
|
||||||
|
char* version;
|
||||||
|
void* clPlatform;
|
||||||
|
unsigned int in_use;
|
||||||
|
pb_Context** contexts;
|
||||||
|
pb_Device** devices;
|
||||||
|
} pb_Platform;
|
||||||
|
|
||||||
|
struct pb_Context_tag {
|
||||||
|
void* clPlatformId;
|
||||||
|
void* clContext;
|
||||||
|
void* clDeviceId;
|
||||||
|
pb_Platform* pb_platform;
|
||||||
|
pb_Device* pb_device;
|
||||||
|
};
|
||||||
|
|
||||||
|
// verbosely print out list of platforms and their devices to the console.
|
||||||
|
pb_Platform**
|
||||||
|
pb_GetPlatforms();
|
||||||
|
|
||||||
|
// Choose a platform according to the given platform specification
|
||||||
|
pb_Platform*
|
||||||
|
pb_GetPlatform(struct pb_PlatformParam *platform);
|
||||||
|
|
||||||
|
// choose a platform: by name, name & version
|
||||||
|
pb_Platform*
|
||||||
|
pb_GetPlatformByName(const char* name);
|
||||||
|
|
||||||
|
pb_Platform*
|
||||||
|
pb_GetPlatformByNameAndVersion(const char* name, const char* version);
|
||||||
|
|
||||||
|
// Choose a device according to the given device specification
|
||||||
|
pb_Device*
|
||||||
|
pb_GetDevice(pb_Platform* pb_platform, struct pb_DeviceParam *device);
|
||||||
|
|
||||||
|
pb_Device**
|
||||||
|
pb_GetDevices(pb_Platform* pb_platform);
|
||||||
|
|
||||||
|
// choose a device by name.
|
||||||
|
pb_Device*
|
||||||
|
pb_GetDeviceByName(pb_Platform* pb_platform, const char* name);
|
||||||
|
|
||||||
|
pb_Platform*
|
||||||
|
pb_GetPlatformByEnvVars();
|
||||||
|
|
||||||
|
pb_Context*
|
||||||
|
pb_InitOpenCLContext(struct pb_Parameters* parameters);
|
||||||
|
|
||||||
|
void
|
||||||
|
pb_ReleasePlatforms();
|
||||||
|
|
||||||
|
void
|
||||||
|
pb_ReleaseContext(pb_Context* c);
|
||||||
|
|
||||||
|
void
|
||||||
|
pb_PrintPlatformInfo(pb_Context* c);
|
||||||
|
|
||||||
|
void
|
||||||
|
perf_init();
|
||||||
|
|
||||||
|
//#define MEASURE_KERNEL_TIME
|
||||||
|
|
||||||
|
#include <CL/cl.h>
|
||||||
|
|
||||||
|
#ifdef MEASURE_KERNEL_TIME
|
||||||
|
#define clEnqueueNDRangeKernel(q,k,d,o,dg,db,a,b,c) pb_clEnqueueNDRangeKernel((q), (k), (d), (o), (dg), (db), (a), (b), (c))
|
||||||
|
cl_int
|
||||||
|
pb_clEnqueueNDRangeKernel(cl_command_queue /* command_queue */,
|
||||||
|
cl_kernel /* kernel */,
|
||||||
|
cl_uint /* work_dim */,
|
||||||
|
const size_t * /* global_work_offset */,
|
||||||
|
const size_t * /* global_work_size */,
|
||||||
|
const size_t * /* local_work_size */,
|
||||||
|
cl_uint /* num_events_in_wait_list */,
|
||||||
|
const cl_event * /* event_wait_list */,
|
||||||
|
cl_event * /* event */);
|
||||||
|
#endif
|
||||||
|
|
||||||
|
enum { T_FLOAT, T_DOUBLE, T_SHORT, T_INT, T_UCHAR };
|
||||||
|
void pb_sig_float(char*, float*, int);
|
||||||
|
void pb_sig_double(char*, double*, int);
|
||||||
|
void pb_sig_short(char*, short*, int);
|
||||||
|
void pb_sig_int(char*, int*, int);
|
||||||
|
void pb_sig_uchar(char*, unsigned char*, unsigned int);
|
||||||
|
void pb_sig_clmem(char*, cl_command_queue, cl_mem, int);
|
||||||
|
|
||||||
|
#ifdef __cplusplus
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#endif //PARBOIL_HEADER
|
||||||
|
|
||||||
1394
benchmarks/opencl/sad/parboil_opencl.c
Normal file
1394
benchmarks/opencl/sad/parboil_opencl.c
Normal file
File diff suppressed because it is too large
Load Diff
BIN
benchmarks/opencl/sad/reference.bin
Executable file
BIN
benchmarks/opencl/sad/reference.bin
Executable file
Binary file not shown.
83
benchmarks/opencl/sad/sad.h
Normal file
83
benchmarks/opencl/sad/sad.h
Normal file
@@ -0,0 +1,83 @@
|
|||||||
|
/***************************************************************************
|
||||||
|
*cr
|
||||||
|
*cr (C) Copyright 2007 The Board of Trustees of the
|
||||||
|
*cr University of Illinois
|
||||||
|
*cr All Rights Reserved
|
||||||
|
*cr
|
||||||
|
***************************************************************************/
|
||||||
|
|
||||||
|
/* Search offsets within 16 pixels of (0,0) */
|
||||||
|
#define SEARCH_RANGE 16
|
||||||
|
|
||||||
|
/* The total search area is 33 pixels square */
|
||||||
|
#define SEARCH_DIMENSION (2*SEARCH_RANGE+1)
|
||||||
|
|
||||||
|
/* The total number of search positions is 33^2 */
|
||||||
|
#define MAX_POS 1089
|
||||||
|
|
||||||
|
/* This is padded to a multiple of 8 when allocating memory */
|
||||||
|
#define MAX_POS_PADDED 1096
|
||||||
|
|
||||||
|
/* VBSME block indices in the SAD array for different
|
||||||
|
* block sizes. The index is computed from the
|
||||||
|
* image size in macroblocks. Block sizes are (height, width):
|
||||||
|
* 1: 16 by 16 pixels, one block per macroblock
|
||||||
|
* 2: 8 by 16 pixels, 2 blocks per macroblock
|
||||||
|
* 3: 16 by 8 pixels, 2 blocks per macroblock
|
||||||
|
* 4: 8 by 8 pixels, 4 blocks per macroblock
|
||||||
|
* 5: 4 by 8 pixels, 8 blocks per macroblock
|
||||||
|
* 6: 8 by 4 pixels, 8 blocks per macroblock
|
||||||
|
* 7: 4 by 4 pixels, 16 blocks per macroblock
|
||||||
|
*/
|
||||||
|
#define SAD_TYPE_1_IX(image_size) 0
|
||||||
|
#define SAD_TYPE_2_IX(image_size) ((image_size)*MAX_POS_PADDED)
|
||||||
|
#define SAD_TYPE_3_IX(image_size) ((image_size)*(3*MAX_POS_PADDED))
|
||||||
|
#define SAD_TYPE_4_IX(image_size) ((image_size)*(5*MAX_POS_PADDED))
|
||||||
|
#define SAD_TYPE_5_IX(image_size) ((image_size)*(9*MAX_POS_PADDED))
|
||||||
|
#define SAD_TYPE_6_IX(image_size) ((image_size)*(17*MAX_POS_PADDED))
|
||||||
|
#define SAD_TYPE_7_IX(image_size) ((image_size)*(25*MAX_POS_PADDED))
|
||||||
|
|
||||||
|
#define SAD_TYPE_IX(n, image_size) \
|
||||||
|
((n == 1) ? SAD_TYPE_1_IX(image_size) : \
|
||||||
|
((n == 2) ? SAD_TYPE_2_IX(image_size) : \
|
||||||
|
((n == 3) ? SAD_TYPE_3_IX(image_size) : \
|
||||||
|
((n == 4) ? SAD_TYPE_4_IX(image_size) : \
|
||||||
|
((n == 5) ? SAD_TYPE_5_IX(image_size) : \
|
||||||
|
((n == 6) ? SAD_TYPE_6_IX(image_size) : \
|
||||||
|
(SAD_TYPE_7_IX(image_size) \
|
||||||
|
)))))))
|
||||||
|
|
||||||
|
#define SAD_TYPE_1_CT 1
|
||||||
|
#define SAD_TYPE_2_CT 2
|
||||||
|
#define SAD_TYPE_3_CT 2
|
||||||
|
#define SAD_TYPE_4_CT 4
|
||||||
|
#define SAD_TYPE_5_CT 8
|
||||||
|
#define SAD_TYPE_6_CT 8
|
||||||
|
#define SAD_TYPE_7_CT 16
|
||||||
|
|
||||||
|
#define SAD_TYPE_CT(n) \
|
||||||
|
((n == 1) ? SAD_TYPE_1_CT : \
|
||||||
|
((n == 2) ? SAD_TYPE_2_CT : \
|
||||||
|
((n == 3) ? SAD_TYPE_3_CT : \
|
||||||
|
((n == 4) ? SAD_TYPE_4_CT : \
|
||||||
|
((n == 5) ? SAD_TYPE_5_CT : \
|
||||||
|
((n == 6) ? SAD_TYPE_6_CT : \
|
||||||
|
(SAD_TYPE_7_CT \
|
||||||
|
)))))))
|
||||||
|
|
||||||
|
#ifdef __cplusplus
|
||||||
|
extern "C" {
|
||||||
|
#endif
|
||||||
|
|
||||||
|
void sad4_cpu(unsigned short *blk_sad,
|
||||||
|
unsigned short *frame,
|
||||||
|
unsigned short *ref,
|
||||||
|
int mb_width,
|
||||||
|
int mb_height);
|
||||||
|
|
||||||
|
void larger_sads(unsigned short *sads,
|
||||||
|
int mbs);
|
||||||
|
|
||||||
|
#ifdef __cplusplus
|
||||||
|
}
|
||||||
|
#endif
|
||||||
57
benchmarks/opencl/sad/sad_kernel.h
Normal file
57
benchmarks/opencl/sad/sad_kernel.h
Normal file
@@ -0,0 +1,57 @@
|
|||||||
|
/***************************************************************************
|
||||||
|
*cr
|
||||||
|
*cr (C) Copyright 2007 The Board of Trustees of the
|
||||||
|
*cr University of Illinois
|
||||||
|
*cr All Rights Reserved
|
||||||
|
*cr
|
||||||
|
***************************************************************************/
|
||||||
|
|
||||||
|
/* Integer ceiling division. This computes ceil(x / y) */
|
||||||
|
#define CEIL(x,y) (((x) + ((y) - 1)) / (y))
|
||||||
|
|
||||||
|
/* Fast multiplication by 33 */
|
||||||
|
#define TIMES_DIM_POS(x) (((x) << 5) + (x))
|
||||||
|
|
||||||
|
/* Amount of dynamically allocated local storage
|
||||||
|
* measured in bytes, 2-byte words, and 8-byte words */
|
||||||
|
#define SAD_LOC_SIZE_ELEMS (THREADS_W * THREADS_H * MAX_POS_PADDED)
|
||||||
|
#define SAD_LOC_SIZE_BYTES (SAD_LOC_SIZE_ELEMS * sizeof(unsigned short))
|
||||||
|
#define SAD_LOC_SIZE_8B (SAD_LOC_SIZE_BYTES / sizeof(vec8b))
|
||||||
|
|
||||||
|
/* The search position index space is distributed across threads
|
||||||
|
* and across time. */
|
||||||
|
/* This many search positions are calculated by each thread.
|
||||||
|
* Note: the optimized kernel requires that this number is
|
||||||
|
* divisible by 3. */
|
||||||
|
#define POS_PER_THREAD 18
|
||||||
|
|
||||||
|
/* The width and height (in number of 4x4 blocks) of a tile from the
|
||||||
|
* current frame that is computed in a single thread block. */
|
||||||
|
#define THREADS_W 1
|
||||||
|
#define THREADS_H 1
|
||||||
|
|
||||||
|
// #define TIMES_THREADS_W(x) (((x) << 1) + (x))
|
||||||
|
#define TIMES_THREADS_W(x) ((x) * THREADS_W)
|
||||||
|
|
||||||
|
/* This structure is used for vector load/store operations. */
|
||||||
|
|
||||||
|
struct vec8b {
|
||||||
|
int fst;
|
||||||
|
int snd;
|
||||||
|
} __attribute__ ((aligned(8)));
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
/* 4-by-4 SAD computation on the device. */
|
||||||
|
/*
|
||||||
|
extern "C" __global__ void mb_sad_calc(unsigned short*,
|
||||||
|
unsigned short*,
|
||||||
|
int, int);
|
||||||
|
*/
|
||||||
|
/* A function to get a reference to the "ref" texture, because sharing
|
||||||
|
* of textures between files isn't really supported. */
|
||||||
|
/*
|
||||||
|
texture<unsigned short, 2, cudaReadModeElementType> &get_ref(void);
|
||||||
|
|
||||||
|
extern "C" __global__ void larger_sad_calc_8(unsigned short*, int, int);
|
||||||
|
extern "C" __global__ void larger_sad_calc_16(unsigned short*, int, int);*/
|
||||||
Reference in New Issue
Block a user