diff --git a/benchmarks/opencl/stencil/128x128x32.bin b/benchmarks/opencl/stencil/128x128x32.bin new file mode 100644 index 00000000..aea52650 Binary files /dev/null and b/benchmarks/opencl/stencil/128x128x32.bin differ diff --git a/benchmarks/opencl/stencil/Makefile b/benchmarks/opencl/stencil/Makefile new file mode 100644 index 00000000..af727f48 --- /dev/null +++ b/benchmarks/opencl/stencil/Makefile @@ -0,0 +1,69 @@ +RISCV_TOOL_PATH = $(wildcard ~/dev/riscv-gnu-toolchain/drops) +POCL_CC_PATH = $(wildcard ~/dev/pocl/drops_riscv_cc) +POCL_INC_PATH = $(wildcard ../include) +POCL_LIB_PATH = $(wildcard ../lib) +VX_RT_PATH = $(wildcard ../../../runtime) +VX_SIMX_PATH = $(wildcard ../../../simX/obj_dir) + +CC = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-gcc +CXX = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-g++ +DMP = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-objdump +HEX = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-objcopy +GDB = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-gdb + +VX_SRCS = $(VX_RT_PATH)/newlib/newlib.c +VX_SRCS += $(VX_RT_PATH)/startup/vx_start.s +VX_SRCS += $(VX_RT_PATH)/intrinsics/vx_intrinsics.s +VX_SRCS += $(VX_RT_PATH)/io/vx_io.s $(VX_RT_PATH)/io/vx_io.c +VX_SRCS += $(VX_RT_PATH)/fileio/fileio.s +VX_SRCS += $(VX_RT_PATH)/tests/tests.c +VX_SRCS += $(VX_RT_PATH)/vx_api/vx_api.c +VX_SRCS += $(VX_STR) $(VX_FIO) $(VX_NEWLIB) $(VX_INT) $(VX_IO) $(VX_API) $(VX_TEST) + +VX_CFLAGS = -nostartfiles -Wl,-Bstatic,-T,$(VX_RT_PATH)/mains/vortex_link.ld + +CXXFLAGS = -g -O0 -march=rv32im -mabi=ilp32 +CXXFLAGS += -ffreestanding # program may not begin at main() +CXXFLAGS += -Wl,--gc-sections # enable garbage collection of unused input sections +CXXFLAGS += -fno-rtti -fno-non-call-exceptions # disable RTTI and exceptions +CXXFLAGS += -I$(POCL_INC_PATH) -I. + +VX_LIBS = -Wl,--whole-archive lib$(PROJECT).a -Wl,--no-whole-archive $(POCL_LIB_PATH)/libOpenCL.a +QEMU_LIBS = -Wl,--whole-archive lib$(PROJECT).a -Wl,--no-whole-archive $(POCL_LIB_PATH)/qemu/libOpenCL.a + +PROJECT = stencil + +SRCS = main.cc args.c parboil_opencl.c ocl.c gpu_info.c file.c +#convert_dataset.c mmio.c + +all: $(PROJECT).dump $(PROJECT).hex + +lib$(PROJECT).a: kernel.cl + POCL_DEBUG=all POCL_DEBUG_LLVM_PASSES=1 LD_LIBRARY_PATH=$(RISCV_TOOL_PATH)/lib:$(POCL_CC_PATH)/lib $(POCL_CC_PATH)/bin/poclcc -o lib$(PROJECT).a kernel.cl + +$(PROJECT).elf: $(SRCS) lib$(PROJECT).a + $(CXX) $(CXXFLAGS) $(VX_CFLAGS) $(VX_SRCS) $(SRCS) $(VX_LIBS) -o $(PROJECT).elf + +$(PROJECT).qemu: $(SRCS) lib$(PROJECT).a + $(CXX) $(CXXFLAGS) $(SRCS) $(QEMU_LIBS) -o $(PROJECT).qemu + +$(PROJECT).hex: $(PROJECT).elf + $(HEX) -O ihex $(PROJECT).elf $(PROJECT).hex + +$(PROJECT).dump: $(PROJECT).elf + $(DMP) -D $(PROJECT).elf > $(PROJECT).dump + +run: $(PROJECT).hex + POCL_DEBUG=all $(VX_SIMX_PATH)/Vcache_simX -E -a rv32i --core $(PROJECT).hex -s -b 1> emulator.debug + +qemu: $(PROJECT).qemu + POCL_DEBUG=all $(RISCV_TOOL_PATH)/bin/qemu-riscv32 -d in_asm -D debug.log $(PROJECT).qemu + +gdb-s: $(PROJECT).qemu + POCL_DEBUG=all $(RISCV_TOOL_PATH)/bin/qemu-riscv32 -g 1234 -d in_asm -D debug.log $(PROJECT).qemu + +gdb-c: $(PROJECT).qemu + $(GDB) $(PROJECT).qemu + +clean: + rm -rf *.o *.elf *.dump *.hex *.qemu *.log *.debug diff --git a/benchmarks/opencl/stencil/args.c b/benchmarks/opencl/stencil/args.c new file mode 100644 index 00000000..9d751e29 --- /dev/null +++ b/benchmarks/opencl/stencil/args.c @@ -0,0 +1,617 @@ + +#include +#include +#include +#include +#include +#include + +/*****************************************************************************/ +/* Memory management routines */ + +/* Free an array of owned strings. */ +void +pb_FreeStringArray(char **string_array) +{ + char **p; + + if (!string_array) return; + for (p = string_array; *p; p++) free(*p); + free(string_array); +} + +struct pb_PlatformParam * +pb_PlatformParam(char *name, char *version) +{ + if (name == NULL) { + fprintf(stderr, "pb_PlatformParam: Invalid argument\n"); + exit(-1); + } + + struct pb_PlatformParam *ret = + (struct pb_PlatformParam *)malloc(sizeof (struct pb_PlatformParam)); + + ret->name = name; + ret->version = version; + return ret; +} + +void +pb_FreePlatformParam(struct pb_PlatformParam *p) +{ + if (p == NULL) return; + + free(p->name); + free(p->version); + free(p); +} + +struct pb_DeviceParam * +pb_DeviceParam_index(int index) +{ + struct pb_DeviceParam *ret = + (struct pb_DeviceParam *)malloc(sizeof (struct pb_DeviceParam)); + ret->criterion = pb_Device_INDEX; + ret->index = index; + return ret; +} + +struct pb_DeviceParam * +pb_DeviceParam_cpu(void) +{ + struct pb_DeviceParam *ret = + (struct pb_DeviceParam *)malloc(sizeof (struct pb_DeviceParam)); + ret->criterion = pb_Device_CPU; + return ret; +} + +struct pb_DeviceParam * +pb_DeviceParam_gpu(void) +{ + struct pb_DeviceParam *ret = + (struct pb_DeviceParam *)malloc(sizeof (struct pb_DeviceParam)); + ret->criterion = pb_Device_GPU; + return ret; +} + +struct pb_DeviceParam * +pb_DeviceParam_accelerator(void) +{ + struct pb_DeviceParam *ret = + (struct pb_DeviceParam *)malloc(sizeof (struct pb_DeviceParam)); + ret->criterion = pb_Device_ACCELERATOR; + return ret; +} + +struct pb_DeviceParam * +pb_DeviceParam_name(char *name) +{ + struct pb_DeviceParam *ret = + (struct pb_DeviceParam *)malloc(sizeof (struct pb_DeviceParam)); + ret->criterion = pb_Device_NAME; + ret->name = name; + return ret; +} + +void +pb_FreeDeviceParam(struct pb_DeviceParam *p) +{ + if (p == NULL) return; + + switch(p->criterion) { + case pb_Device_NAME: + free(p->name); + break; + case pb_Device_INDEX: + case pb_Device_CPU: + case pb_Device_ACCELERATOR: + break; + default: + fprintf(stderr, "pb_FreeDeviceParam: Invalid argument\n"); + exit(-1); + } +} + +void +pb_FreeParameters(struct pb_Parameters *p) +{ + free(p->outFile); + pb_FreeStringArray(p->inpFiles); + pb_FreePlatformParam(p->platform); + pb_FreeDeviceParam(p->device); + free(p); +} + +/*****************************************************************************/ + +/* Parse a comma-delimited list of strings into an + * array of strings. */ +static char ** +read_string_array(char *in) +{ + char **ret; + int i; + int count; /* Number of items in the input */ + char *substring; /* Current substring within 'in' */ + + /* Count the number of items in the string */ + count = 1; + for (i = 0; in[i]; i++) if (in[i] == ',') count++; + + /* Allocate storage */ + ret = (char **)malloc((count + 1) * sizeof(char *)); + + /* Create copies of the strings from the list */ + substring = in; + for (i = 0; i < count; i++) { + char *substring_end; + int substring_length; + + /* Find length of substring */ + for (substring_end = substring; + (*substring_end != ',') && (*substring_end != 0); + substring_end++); + + substring_length = substring_end - substring; + + /* Allocate memory and copy the substring */ + ret[i] = (char *)malloc(substring_length + 1); + memcpy(ret[i], substring, substring_length); + ret[i][substring_length] = 0; + + /* go to next substring */ + substring = substring_end + 1; + } + ret[i] = NULL; /* Write the sentinel value */ + + return ret; +} + +static void +report_parse_error(const char *str) +{ + fputs(str, stderr); +} + +/* Interpret a string as a 'pb_DeviceParam' value. + * Return a pointer to a new value, or NULL on failure. + */ +static struct pb_DeviceParam * +read_device_param(char *str) +{ + /* Try different ways of interpreting 'device_string' until one works */ + + /* If argument is an integer, then interpret it as a device index */ + errno = 0; + char *end; + long device_int = strtol(str, &end, 10); + if (!errno) { + /* Negative numbers are not valid */ + if (device_int < 0 || device_int > INT_MAX) return NULL; + + return pb_DeviceParam_index(device_int); + } + + /* Match against predefined strings */ + if (strcmp(str, "CPU") == 0) + return pb_DeviceParam_cpu(); + if (strcmp(str, "GPU") == 0) + return pb_DeviceParam_gpu(); + if (strcmp(str, "ACCELERATOR") == 0) + return pb_DeviceParam_accelerator(); + + /* Assume any other string is a device name */ + return pb_DeviceParam_name(strdup(str)); +} + +/* Interpret a string as a 'pb_PlatformParam' value. + * Return a pointer to a new value, or NULL on failure. + */ +static struct pb_PlatformParam * +read_platform_param(char *str) +{ + int separator_index; /* Index of the '-' character separating + * name and version number. It's -1 if + * there's no '-' character. */ + + /* Find the last occurrence of '-' in 'str' */ + { + char *cur; + separator_index = -1; + for (cur = str; *cur; cur++) { + if (*cur == '-') separator_index = cur - str; + } + } + + /* The platform name is either the entire string, or all characters before + * the separator */ + int name_length = separator_index == -1 ? strlen(str) : separator_index; + char *name_str = (char *)malloc(name_length + 1); + memcpy(name_str, str, name_length); + name_str[name_length] = 0; + + /* The version is either NULL, or all characters after the separator */ + char *version_str; + if (separator_index == -1) { + version_str = NULL; + } + else { + const char *version_input_str = str + separator_index + 1; + int version_length = strlen(version_input_str); + + version_str = (char *)malloc(version_length + 1); + memcpy(version_str, version_input_str, version_length); + version_str[version_length] = 0; + } + + /* Create output structure */ + return pb_PlatformParam(name_str, version_str); +} + +/****************************************************************************/ +/* Argument parsing state */ + +/* Argument parsing state. + * + * Arguments that are interpreted by the argument parser are removed from + * the list. Variables 'argc' and 'argn' do not count arguments that have + * been removed. + * + * During argument parsing, the array of arguments is compacted, overwriting + * the erased arguments. Variable 'argv_put' points to the array element + * where the next argument will be written. Variable 'argv_get' points to + * the array element where the next argument will be read from. + */ +struct argparse { + int argc; /* Number of arguments. Mutable. */ + int argn; /* Current argument index. */ + char **argv_get; /* Argument value being read. */ + char **argv_put; /* Argument value being written. + * argv_put <= argv_get. */ +}; + +static void +initialize_argparse(struct argparse *ap, int argc, char **argv) +{ + ap->argc = argc; + ap->argn = 0; + ap->argv_get = ap->argv_put = argv; +} + +/* Finish argument parsing, without processing the remaining arguments. + * Write new argument count into _argc. */ +static void +finalize_argparse(struct argparse *ap, int *_argc, char **argv) +{ + /* Move the remaining arguments */ + for(; ap->argn < ap->argc; ap->argn++) + *ap->argv_put++ = *ap->argv_get++; + + /* Update the argument count */ + *_argc = ap->argc; + + /* Insert a terminating NULL */ + argv[ap->argc] = NULL; +} + +/* Delete the current argument. The argument will not be visible + * when argument parsing is done. */ +static void +delete_argument(struct argparse *ap) +{ + if (ap->argn >= ap->argc) { + fprintf(stderr, "delete_argument\n"); + } + ap->argc--; + ap->argv_get++; +} + +/* Go to the next argument. Also, move the current argument to its + * final location in argv. */ +static void +next_argument(struct argparse *ap) +{ + if (ap->argn >= ap->argc) { + fprintf(stderr, "next_argument\n"); + } + /* Move argument to its new location. */ + *ap->argv_put++ = *ap->argv_get++; + ap->argn++; +} + +static int +is_end_of_arguments(struct argparse *ap) +{ + return ap->argn == ap->argc; +} + +/* Get the current argument */ +static char * +get_argument(struct argparse *ap) +{ + return *ap->argv_get; +} + +/* Get the current argument, and also delete it */ +static char * +consume_argument(struct argparse *ap) +{ + char *ret = get_argument(ap); + delete_argument(ap); + return ret; +} + +/****************************************************************************/ + +/* The result of parsing a command-line argument */ +typedef enum { + ARGPARSE_OK, /* Success */ + ARGPARSE_ERROR, /* Error */ + ARGPARSE_DONE /* Success, and do not continue parsing */ +} result; + +typedef result parse_action(struct argparse *ap, struct pb_Parameters *params); + + +/* A command-line option */ +struct option { + char short_name; /* If not 0, the one-character + * name of this option */ + const char *long_name; /* If not NULL, the long name of this option */ + parse_action *action; /* What to do when this option occurs. + * Sentinel value is NULL. + */ +}; + +/* Output file + * + * -o FILE + */ +static result +parse_output_file(struct argparse *ap, struct pb_Parameters *params) +{ + if (is_end_of_arguments(ap)) + { + report_parse_error("Expecting file name after '-o'\n"); + return ARGPARSE_ERROR; + } + + /* Replace the output file name */ + free(params->outFile); + params->outFile = strdup(consume_argument(ap)); + + return ARGPARSE_OK; +} + +/* Input files + * + * -i FILE,FILE,... + */ +static result +parse_input_files(struct argparse *ap, struct pb_Parameters *params) +{ + if (is_end_of_arguments(ap)) + { + report_parse_error("Expecting file name after '-i'\n"); + return ARGPARSE_ERROR; + } + + /* Replace the input file list */ + pb_FreeStringArray(params->inpFiles); + params->inpFiles = read_string_array(consume_argument(ap)); + return ARGPARSE_OK; +} + +/* End of options + * + * -- + */ + +static result +parse_end_options(struct argparse *ap, struct pb_Parameters *params) +{ + return ARGPARSE_DONE; +} + +/* OpenCL device + * + * --device X + */ + +static result +parse_device(struct argparse *ap, struct pb_Parameters *params) +{ + /* Read the next argument, which specifies a device */ + + if (is_end_of_arguments(ap)) + { + report_parse_error("Expecting device specification after '--device'\n"); + return ARGPARSE_ERROR; + } + + char *device_string = consume_argument(ap); + struct pb_DeviceParam *device_param = read_device_param(device_string); + + if (!device_param) { + report_parse_error("Unrecognized device specification format on command line\n"); + return ARGPARSE_ERROR; + } + + /* Save the result */ + pb_FreeDeviceParam(params->device); + params->device = device_param; + + return ARGPARSE_OK; +} + +static result +parse_platform(struct argparse *ap, struct pb_Parameters *params) +{ + /* Read the next argument, which specifies a platform */ + + if (is_end_of_arguments(ap)) + { + report_parse_error("Expecting device specification after '--platform'\n"); + return ARGPARSE_ERROR; + } + + char *platform_string = consume_argument(ap); + struct pb_PlatformParam *platform_param = read_platform_param(platform_string); + + if (!platform_param) { + report_parse_error("Unrecognized platform specification format on command line\n"); + return ARGPARSE_ERROR; + } + + /* Save the result */ + pb_FreePlatformParam(params->platform); + params->platform = platform_param; + + return ARGPARSE_OK; +} + + +static struct option options[] = { + { 'o', NULL, &parse_output_file }, + { 'i', NULL, &parse_input_files }, + { '-', NULL, &parse_end_options }, + { 0, "device", &parse_device }, + { 0, "platform", &parse_platform }, + { 0, NULL, NULL } +}; + +static int +is_last_option(struct option *op) +{ + return op->action == NULL; +} + +/****************************************************************************/ + +/* Parse command-line parameters. + * Return zero on error, nonzero otherwise. + * On error, the other outputs may be invalid. + * + * The information collected from parameters is used to update + * 'ret'. 'ret' should be initialized. + * + * '_argc' and 'argv' are updated to contain only the unprocessed arguments. + */ +static int +pb_ParseParameters (struct pb_Parameters *ret, int *_argc, char **argv) +{ + char *err_message; + struct argparse ap; + + /* Each argument */ + initialize_argparse(&ap, *_argc, argv); + while(!is_end_of_arguments(&ap)) { + result arg_result; /* Result of parsing this option */ + char *arg = get_argument(&ap); + + /* Process this argument */ + if (arg[0] == '-') { + /* Single-character flag */ + if ((arg[1] != 0) && (arg[2] == 0)) { + delete_argument(&ap); /* This argument is consumed here */ + + /* Find a matching short option */ + struct option *op; + for (op = options; !is_last_option(op); op++) { + if (op->short_name == arg[1]) { + arg_result = (*op->action)(&ap, ret); + goto option_was_processed; + } + } + + /* No option matches */ + report_parse_error("Unexpected command-line parameter\n"); + arg_result = ARGPARSE_ERROR; + goto option_was_processed; + } + + /* Long flag */ + if (arg[1] == '-') { + delete_argument(&ap); /* This argument is consumed here */ + + /* Find a matching long option */ + struct option *op; + for (op = options; !is_last_option(op); op++) { + if (op->long_name && strcmp(&arg[2], op->long_name) == 0) { + arg_result = (*op->action)(&ap, ret); + goto option_was_processed; + } + } + + /* No option matches */ + report_parse_error("Unexpected command-line parameter\n"); + arg_result = ARGPARSE_ERROR; + goto option_was_processed; + } + } + else { + /* Other arguments are ignored */ + next_argument(&ap); + arg_result = ARGPARSE_OK; + goto option_was_processed; + } + + option_was_processed: + /* Decide what to do next based on 'arg_result' */ + switch(arg_result) { + case ARGPARSE_OK: + /* Continue processing */ + break; + + case ARGPARSE_ERROR: + /* Error exit from the function */ + return 0; + + case ARGPARSE_DONE: + /* Normal exit from the argument parsing loop */ + goto end_of_options; + } + } /* end for each argument */ + + /* If all arguments were processed, then normal exit from the loop */ + + end_of_options: + finalize_argparse(&ap, _argc, argv); + return 1; +} + +/*****************************************************************************/ +/* Other exported functions */ + +struct pb_Parameters * +pb_ReadParameters(int *_argc, char **argv) +{ + struct pb_Parameters *ret = + (struct pb_Parameters *)malloc(sizeof(struct pb_Parameters)); + + /* Initialize the parameters structure */ + ret->outFile = NULL; + ret->inpFiles = (char **)malloc(sizeof(char *)); + ret->inpFiles[0] = NULL; + ret->platform = NULL; + ret->device = NULL; + + /* Read parameters and update _argc, argv */ + if (!pb_ParseParameters(ret, _argc, argv)) { + /* Parse error */ + pb_FreeParameters(ret); + return NULL; + } + + return ret; +} + +int +pb_Parameters_CountInputs(struct pb_Parameters *p) +{ + int n; + + for (n = 0; p->inpFiles[n]; n++); + return n; +} + diff --git a/benchmarks/opencl/stencil/file.c b/benchmarks/opencl/stencil/file.c new file mode 100644 index 00000000..a98451e5 --- /dev/null +++ b/benchmarks/opencl/stencil/file.c @@ -0,0 +1,78 @@ +/*************************************************************************** + *cr + *cr (C) Copyright 2007 The Board of Trustees of the + *cr University of Illinois + *cr All Rights Reserved + *cr + ***************************************************************************/ + +//#include +#include +#include +#include +#include + +#if __BYTE_ORDER != __LITTLE_ENDIAN +# error "File I/O is not implemented for this system: wrong endianness." +#endif + +void inputData(char* fName, int* len, int* depth, int* dim,int *nzcnt_len,int *pad, + float** h_data, int** h_indices, int** h_ptr, + int** h_perm, int** h_nzcnt) +{ + FILE* fid = fopen(fName, "rb"); + + if (fid == NULL) + { + fprintf(stderr, "Cannot open input file\n"); + exit(-1); + } + + fscanf(fid, "%d %d %d %d %d\n",len,depth,nzcnt_len,dim,pad); + int _len=len[0]; + int _depth=depth[0]; + int _dim=dim[0]; + int _pad=pad[0]; + int _nzcnt_len=nzcnt_len[0]; + + *h_data = (float *) malloc(_len * sizeof (float)); + fread (*h_data, sizeof (float), _len, fid); + + *h_indices = (int *) malloc(_len * sizeof (int)); + fread (*h_indices, sizeof (int), _len, fid); + + *h_ptr = (int *) malloc(_depth * sizeof (int)); + fread (*h_ptr, sizeof (int), _depth, fid); + + *h_perm = (int *) malloc(_dim * sizeof (int)); + fread (*h_perm, sizeof (int), _dim, fid); + + *h_nzcnt = (int *) malloc(_nzcnt_len * sizeof (int)); + fread (*h_nzcnt, sizeof (int), _nzcnt_len, fid); + + fclose (fid); +} + +void input_vec(char *fName,float *h_vec,int dim) +{ + FILE* fid = fopen(fName, "rb"); + fread (h_vec, sizeof (float), dim, fid); + fclose(fid); + +} + +void outputData(char* fName, float *h_Ax_vector,int dim) +{ + FILE* fid = fopen(fName, "w"); + uint32_t tmp32; + if (fid == NULL) + { + fprintf(stderr, "Cannot open output file\n"); + exit(-1); + } + tmp32 = dim; + fwrite(&tmp32, sizeof(uint32_t), 1, fid); + fwrite(h_Ax_vector, sizeof(float), dim, fid); + + fclose (fid); +} diff --git a/benchmarks/opencl/stencil/file.h b/benchmarks/opencl/stencil/file.h new file mode 100644 index 00000000..65f80135 --- /dev/null +++ b/benchmarks/opencl/stencil/file.h @@ -0,0 +1,18 @@ +/*************************************************************************** + *cr + *cr (C) Copyright 2007 The Board of Trustees of the + *cr University of Illinois + *cr All Rights Reserved + *cr + ***************************************************************************/ +#ifndef __FILEH__ +#define __FILEH__ + +void inputData(char* fName, int* len, int* depth, int* dim,int *nzcnt_len,int *pad, + float** h_data, int** h_indices, int** h_ptr, + int** h_perm, int** h_nzcnt); + +void input_vec(char* fNanme, float *h_vec,int dim); +void outputData(char* fName, float *h_Ax_vector,int dim); + +#endif \ No newline at end of file diff --git a/benchmarks/opencl/stencil/gpu_info.c b/benchmarks/opencl/stencil/gpu_info.c new file mode 100644 index 00000000..4d641f81 --- /dev/null +++ b/benchmarks/opencl/stencil/gpu_info.c @@ -0,0 +1,55 @@ +/*************************************************************************** + *cr + *cr (C) Copyright 2010 The Board of Trustees of the + *cr University of Illinois + *cr All Rights Reserved + *cr + ***************************************************************************/ +//#include +#include +#include +#include +#include + +#include "gpu_info.h" + +void compute_active_thread(size_t *thread, + size_t *grid, + int task, + int pad, + int major, + int minor, + int sm) +{ + int max_thread; + int max_block=8; + if(major==1) + { + if(minor>=2) + max_thread=1024; + else + max_thread=768; + } + else if(major==2) + max_thread=1536; + else + //newer GPU //keep using 2.0 + max_thread=1536; + + int _grid; + int _thread; + + if(task*pad>sm*max_thread) + { + _thread=max_thread/max_block; + _grid = ((task*pad+_thread-1)/_thread)*_thread; + } + else + { + _thread=pad; + _grid=task*pad; + } + + thread[0]=_thread; + grid[0]=_grid; +} diff --git a/benchmarks/opencl/stencil/gpu_info.h b/benchmarks/opencl/stencil/gpu_info.h new file mode 100644 index 00000000..4219cda9 --- /dev/null +++ b/benchmarks/opencl/stencil/gpu_info.h @@ -0,0 +1,20 @@ +/*************************************************************************** + *cr + *cr (C) Copyright 2010 The Board of Trustees of the + *cr University of Illinois + *cr All Rights Reserved + *cr + ***************************************************************************/ + +#ifndef __GPUINFOH__ +#define __GPUINFOH__ + +void compute_active_thread(size_t *thread, + size_t *grid, + int task, + int pad, + int major, + int minor, + int sm); + +#endif diff --git a/benchmarks/opencl/stencil/kernel.cl b/benchmarks/opencl/stencil/kernel.cl new file mode 100644 index 00000000..f6a98fab --- /dev/null +++ b/benchmarks/opencl/stencil/kernel.cl @@ -0,0 +1,28 @@ +/*************************************************************************** + *cr + *cr (C) Copyright 2010 The Board of Trustees of the + *cr University of Illinois + *cr All Rights Reserved + *cr + ***************************************************************************/ + +#define Index3D(_nx,_ny,_i,_j,_k) ((_i)+_nx*((_j)+_ny*(_k))) + +__kernel void naive_kernel(float c0,float c1,__global float* A0,__global float *Anext,int nx,int ny,int nz) +{ + int i = get_global_id(0)+1; + int j = get_global_id(1)+1; + int k = get_global_id(2)+1; + +if(i +#include +#include +#include +#include +#include +//#include "file.h" +#include "ocl.h" +//#include "common.h" + +#define CHECK_ERROR(errorMessage) \ + if(clStatus != CL_SUCCESS) \ + { \ + printf("Error: %s!\n",errorMessage); \ + printf("Line: %d\n",__LINE__); \ + exit(1); \ + } + +static int read_data(float *A0, int nx,int ny,int nz,FILE *fp) +{ + int s=0; + int i,j,k; + for(i=0;i\n"); + + //parameters = pb_ReadParameters(&argc, argv); + + /*parameters->inpFiles = (char **)malloc(sizeof(char *) * 2); + parameters->inpFiles[0] = (char *)malloc(100); + parameters->inpFiles[1] = NULL; + strncpy(parameters->inpFiles[0], "128x128x32.bin", 100);*/ + + pb_InitializeTimerSet(&timers); + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); + + //declaration + int nx,ny,nz; + int size; + int iteration; + float c0=1.0f/6.0f; + float c1=1.0f/6.0f/6.0f; + + /*if (argc<5) + { + printf("Usage: probe nx ny nz t\n" + "nx: the grid size x\n" + "ny: the grid size y\n" + "nz: the grid size z\n" + "t: the iteration time\n"); + return -1; + } + + nx = atoi(argv[1]); + if (nx<1) + return -1; + ny = atoi(argv[2]); + if (ny<1) + return -1; + nz = atoi(argv[3]); + if (nz<1) + return -1; + iteration = atoi(argv[4]); + if(iteration<1) + return -1;*/ + + nx = 64; + ny = 64; + nz = 4; + iteration = 2; + + cl_int clStatus; + cl_context clContext; + cl_device_id clDevice; + cl_platform_id clPlatform; + + // Below is the new interface, coupled with Parboil runtime. + pb_Context* pb_context; + pb_context = pb_InitOpenCLContext(parameters); + if (pb_context == NULL) { + fprintf (stderr, "Error: No OpenCL platform/device can be found."); + return -1; + } + + printf("OK\n"); + + // okay, let's deliver actual variables + clPlatform = (cl_platform_id) pb_context->clPlatformId; + clContext = (cl_context) pb_context->clContext; + clDevice = (cl_device_id) pb_context->clDeviceId; + + cl_command_queue clCommandQueue = clCreateCommandQueue(clContext,clDevice,CL_QUEUE_PROFILING_ENABLE,&clStatus); + CHECK_ERROR("clCreateCommandQueue") + + printf("OK\n"); + + pb_SetOpenCL(&clContext, &clCommandQueue); + + //const char* clSource[] = {readFile("src/opencl_base/kernel.cl")}; + //cl_program clProgram = clCreateProgramWithSource(clContext,1,clSource,NULL,&clStatus); + cl_program clProgram = clCreateProgramWithBuiltInKernels( + clContext, 1, &clDevice, "naive_kernel", &clStatus); + CHECK_ERROR("clCreateProgramWithSource") + + char clOptions[50]; + sprintf(clOptions,"-I src/opencl_base"); + clStatus = clBuildProgram(clProgram,1,&clDevice,clOptions,NULL,NULL); + CHECK_ERROR("clBuildProgram") + + cl_kernel clKernel = clCreateKernel(clProgram,"naive_kernel",&clStatus); + CHECK_ERROR("clCreateKernel") + + printf("OK+\n"); + + //host data + float *h_A0; + float *h_Anext; + + //device + cl_mem d_A0; + cl_mem d_Anext; + + //load data from files + + size=nx*ny*nz; + + h_A0=(float*)malloc(sizeof(float)*size); + h_Anext=(float*)malloc(sizeof(float)*size); + pb_SwitchToTimer(&timers, pb_TimerID_IO); + //FILE *fp = fopen(parameters->inpFiles[0], "rb"); + printf("OK+\n"); + read_data(h_A0, nx,ny,nz,NULL); + printf("OK+\n"); + //fclose(fp); + memcpy (h_Anext,h_A0,sizeof(float)*size); + + pb_SwitchToTimer(&timers, pb_TimerID_COPY); + + printf("OK+\n"); + + //memory allocation + d_A0 = clCreateBuffer(clContext,CL_MEM_READ_WRITE,size*sizeof(float),NULL,&clStatus); + CHECK_ERROR("clCreateBuffer") + d_Anext = clCreateBuffer(clContext,CL_MEM_READ_WRITE,size*sizeof(float),NULL,&clStatus); + CHECK_ERROR("clCreateBuffer") + + //memory copy + clStatus = clEnqueueWriteBuffer(clCommandQueue,d_A0,CL_FALSE,0,size*sizeof(float),h_A0,0,NULL,NULL); + CHECK_ERROR("clEnqueueWriteBuffer") + clStatus = clEnqueueWriteBuffer(clCommandQueue,d_Anext,CL_TRUE,0,size*sizeof(float),h_Anext,0,NULL,NULL); + CHECK_ERROR("clEnqueueWriteBuffer") + + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); + + printf("OK+\n"); + + //only use 1D thread block + int tx = 256; + size_t block[3] = {tx,1,1}; + size_t grid[3] = {(nx-2+tx-1)/tx*tx,ny-2,nz-2}; + //size_t grid[3] = {nx-2,ny-2,nz-2}; + size_t offset[3] = {1,1,1}; + printf("block size in x/y/z = %d %d %d\n",block[0],block[1],block[2]); + printf("grid size in x/y/z = %d %d %d\n",grid[0],grid[1],grid[2]); + + printf ("blocks = %d\n", (grid[0]/block[0])*(grid[1]/block[1])*(grid[2]*block[2])); + + clStatus = clSetKernelArg(clKernel,0,sizeof(float),(void*)&c0); + clStatus = clSetKernelArg(clKernel,1,sizeof(float),(void*)&c1); + clStatus = clSetKernelArg(clKernel,2,sizeof(cl_mem),(void*)&d_A0); + clStatus = clSetKernelArg(clKernel,3,sizeof(cl_mem),(void*)&d_Anext); + clStatus = clSetKernelArg(clKernel,4,sizeof(int),(void*)&nx); + clStatus = clSetKernelArg(clKernel,5,sizeof(int),(void*)&ny); + clStatus = clSetKernelArg(clKernel,6,sizeof(int),(void*)&nz); + CHECK_ERROR("clSetKernelArg") + + //main execution + pb_SwitchToTimer(&timers, pb_TimerID_KERNEL); + + int t; + for(t=0;toutFile) { + pb_SwitchToTimer(&timers, pb_TimerID_IO); + //outputData(parameters->outFile,h_Anext,nx,ny,nz); + } + pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); + + //free((void*)clSource[0]); + + free(h_A0); + free(h_Anext); + pb_SwitchToTimer(&timers, pb_TimerID_NONE); + + pb_PrintTimerSet(&timers); + pb_FreeParameters(parameters); + + return 0; +} diff --git a/benchmarks/opencl/stencil/ocl.c b/benchmarks/opencl/stencil/ocl.c new file mode 100644 index 00000000..9ce9a2f5 --- /dev/null +++ b/benchmarks/opencl/stencil/ocl.c @@ -0,0 +1,50 @@ +#include +#include +#include +#include +#include "ocl.h" + +char* readFile(const char* fileName) +{ + FILE* fp; + fp = fopen(fileName,"r"); + if(fp == NULL) + { + printf("Error 1!\n"); + exit(1); + } + + fseek(fp,0,SEEK_END); + long size = ftell(fp); + rewind(fp); + + char* buffer = (char*)malloc(sizeof(char)*(size+1)); + if(buffer == NULL) + { + printf("Error 2!\n"); + fclose(fp); + exit(1); + } + + size_t res = fread(buffer,1,size,fp); + if(res != size) + { + printf("Error 3!\n"); + fclose(fp); + exit(1); + } + + buffer[size] = 0; + fclose(fp); + return buffer; +} + +void clMemSet(cl_command_queue clCommandQueue, cl_mem buf, int val, size_t size) +{ + cl_int clStatus; + char* temp = (char*)malloc(size); + memset(temp,val,size); + clStatus = clEnqueueWriteBuffer(clCommandQueue,buf,CL_TRUE,0,size,temp,0,NULL,NULL); + CHECK_ERROR("clEnqueueWriteBuffer") + free(temp); +} diff --git a/benchmarks/opencl/stencil/ocl.h b/benchmarks/opencl/stencil/ocl.h new file mode 100644 index 00000000..8840a868 --- /dev/null +++ b/benchmarks/opencl/stencil/ocl.h @@ -0,0 +1,21 @@ +#ifndef __OCLH__ +#define __OCLH__ + +typedef struct { + cl_uint major; + cl_uint minor; + cl_uint multiProcessorCount; +} OpenCLDeviceProp; + +void clMemSet(cl_command_queue, cl_mem, int, size_t); +char* readFile(const char*); + +#define CHECK_ERROR(errorMessage) \ + if(clStatus != CL_SUCCESS) \ + { \ + printf("Error: %s!\n",errorMessage); \ + printf("Line: %d\n",__LINE__); \ + exit(1); \ + } + +#endif diff --git a/benchmarks/opencl/stencil/parboil.c b/benchmarks/opencl/stencil/parboil.c new file mode 100644 index 00000000..54fca9d0 --- /dev/null +++ b/benchmarks/opencl/stencil/parboil.c @@ -0,0 +1,427 @@ +/* + * (c) 2007 The Board of Trustees of the University of Illinois. + */ + +#include +#include +#include +#include + +#if _POSIX_VERSION >= 200112L +# include +#endif + + +/*****************************************************************************/ +/* Timer routines */ + +static void +accumulate_time(pb_Timestamp *accum, + pb_Timestamp start, + pb_Timestamp end) +{ +#if _POSIX_VERSION >= 200112L + *accum += end - start; +#else +# error "Timestamps not implemented for this system" +#endif +} + +#if _POSIX_VERSION >= 200112L +static pb_Timestamp get_time() +{ + struct timeval tv; + gettimeofday(&tv, NULL); + return (pb_Timestamp) (tv.tv_sec * 1000000LL + tv.tv_usec); +} +#else +# error "no supported time libraries are available on this platform" +#endif + +void +pb_ResetTimer(struct pb_Timer *timer) +{ + timer->state = pb_Timer_STOPPED; + +#if _POSIX_VERSION >= 200112L + timer->elapsed = 0; +#else +# error "pb_ResetTimer: not implemented for this system" +#endif +} + +void +pb_StartTimer(struct pb_Timer *timer) +{ + if (timer->state != pb_Timer_STOPPED) { + fputs("Ignoring attempt to start a running timer\n", stderr); + return; + } + + timer->state = pb_Timer_RUNNING; + +#if _POSIX_VERSION >= 200112L + { + struct timeval tv; + gettimeofday(&tv, NULL); + timer->init = tv.tv_sec * 1000000LL + tv.tv_usec; + } +#else +# error "pb_StartTimer: not implemented for this system" +#endif +} + +void +pb_StartTimerAndSubTimer(struct pb_Timer *timer, struct pb_Timer *subtimer) +{ + unsigned int numNotStopped = 0x3; // 11 + if (timer->state != pb_Timer_STOPPED) { + fputs("Warning: Timer was not stopped\n", stderr); + numNotStopped &= 0x1; // Zero out 2^1 + } + if (subtimer->state != pb_Timer_STOPPED) { + fputs("Warning: Subtimer was not stopped\n", stderr); + numNotStopped &= 0x2; // Zero out 2^0 + } + if (numNotStopped == 0x0) { + fputs("Ignoring attempt to start running timer and subtimer\n", stderr); + return; + } + + timer->state = pb_Timer_RUNNING; + subtimer->state = pb_Timer_RUNNING; + +#if _POSIX_VERSION >= 200112L + { + struct timeval tv; + gettimeofday(&tv, NULL); + + if (numNotStopped & 0x2) { + timer->init = tv.tv_sec * 1000000LL + tv.tv_usec; + } + + if (numNotStopped & 0x1) { + subtimer->init = tv.tv_sec * 1000000LL + tv.tv_usec; + } + } +#else +# error "pb_StartTimer: not implemented for this system" +#endif + +} + +void +pb_StopTimer(struct pb_Timer *timer) +{ + + pb_Timestamp fini; + + if (timer->state != pb_Timer_RUNNING) { + fputs("Ignoring attempt to stop a stopped timer\n", stderr); + return; + } + + timer->state = pb_Timer_STOPPED; + +#if _POSIX_VERSION >= 200112L + { + struct timeval tv; + gettimeofday(&tv, NULL); + fini = tv.tv_sec * 1000000LL + tv.tv_usec; + } +#else +# error "pb_StopTimer: not implemented for this system" +#endif + + accumulate_time(&timer->elapsed, timer->init, fini); + timer->init = fini; + +} + +void pb_StopTimerAndSubTimer(struct pb_Timer *timer, struct pb_Timer *subtimer) { + + pb_Timestamp fini; + + unsigned int numNotRunning = 0x3; // 0b11 + if (timer->state != pb_Timer_RUNNING) { + fputs("Warning: Timer was not running\n", stderr); + numNotRunning &= 0x1; // Zero out 2^1 + } + if (subtimer->state != pb_Timer_RUNNING) { + fputs("Warning: Subtimer was not running\n", stderr); + numNotRunning &= 0x2; // Zero out 2^0 + } + if (numNotRunning == 0x0) { + fputs("Ignoring attempt to stop stopped timer and subtimer\n", stderr); + return; + } + + + timer->state = pb_Timer_STOPPED; + subtimer->state = pb_Timer_STOPPED; + +#if _POSIX_VERSION >= 200112L + { + struct timeval tv; + gettimeofday(&tv, NULL); + fini = tv.tv_sec * 1000000LL + tv.tv_usec; + } +#else +# error "pb_StopTimer: not implemented for this system" +#endif + + if (numNotRunning & 0x2) { + accumulate_time(&timer->elapsed, timer->init, fini); + timer->init = fini; + } + + if (numNotRunning & 0x1) { + accumulate_time(&subtimer->elapsed, subtimer->init, fini); + subtimer->init = fini; + } + +} + +/* Get the elapsed time in seconds. */ +double +pb_GetElapsedTime(struct pb_Timer *timer) +{ + double ret; + + if (timer->state != pb_Timer_STOPPED) { + fputs("Elapsed time from a running timer is inaccurate\n", stderr); + } + +#if _POSIX_VERSION >= 200112L + ret = timer->elapsed / 1e6; +#else +# error "pb_GetElapsedTime: not implemented for this system" +#endif + return ret; +} + +void +pb_InitializeTimerSet(struct pb_TimerSet *timers) +{ + int n; + + timers->wall_begin = get_time(); + + timers->current = pb_TimerID_NONE; + + timers->async_markers = NULL; + + + for (n = 0; n < pb_TimerID_LAST; n++) { + pb_ResetTimer(&timers->timers[n]); + timers->sub_timer_list[n] = NULL; // free first? + } +} + +void +pb_AddSubTimer(struct pb_TimerSet *timers, char *label, enum pb_TimerID pb_Category) { + + struct pb_SubTimer *subtimer = (struct pb_SubTimer *) malloc + (sizeof(struct pb_SubTimer)); + + int len = strlen(label); + + subtimer->label = (char *) malloc (sizeof(char)*(len+1)); + sprintf(subtimer->label, "%s\0", label); + + pb_ResetTimer(&subtimer->timer); + subtimer->next = NULL; + + struct pb_SubTimerList *subtimerlist = timers->sub_timer_list[pb_Category]; + if (subtimerlist == NULL) { + subtimerlist = (struct pb_SubTimerList *) malloc + (sizeof(struct pb_SubTimerList)); + subtimerlist->subtimer_list = subtimer; + timers->sub_timer_list[pb_Category] = subtimerlist; + } else { + // Append to list + struct pb_SubTimer *element = subtimerlist->subtimer_list; + while (element->next != NULL) { + element = element->next; + } + element->next = subtimer; + } + +} + +void +pb_SwitchToSubTimer(struct pb_TimerSet *timers, char *label, enum pb_TimerID category) +{ + +// switchToSub( NULL, NONE +// switchToSub( NULL, some +// switchToSub( some, some +// switchToSub( some, NONE -- tries to find "some" in NONE's sublist, which won't be printed + + struct pb_Timer *topLevelToStop = NULL; + if (timers->current != category && timers->current != pb_TimerID_NONE) { + // Switching to subtimer in a different category needs to stop the top-level current, different categoried timer. + // NONE shouldn't have a timer associated with it, so exclude from branch + topLevelToStop = &timers->timers[timers->current]; + } + + struct pb_SubTimerList *subtimerlist = timers->sub_timer_list[timers->current]; + struct pb_SubTimer *curr = (subtimerlist == NULL) ? NULL : subtimerlist->current; + + if (timers->current != pb_TimerID_NONE) { + if (curr != NULL && topLevelToStop != NULL) { + pb_StopTimerAndSubTimer(topLevelToStop, &curr->timer); + } else if (curr != NULL) { + pb_StopTimer(&curr->timer); + } else { + pb_StopTimer(topLevelToStop); + } + } + + subtimerlist = timers->sub_timer_list[category]; + struct pb_SubTimer *subtimer = NULL; + + if (label != NULL) { + subtimer = subtimerlist->subtimer_list; + while (subtimer != NULL) { + if (strcmp(subtimer->label, label) == 0) { + break; + } else { + subtimer = subtimer->next; + } + } + } + + if (category != pb_TimerID_NONE) { + + if (subtimerlist != NULL) { + subtimerlist->current = subtimer; + } + + if (category != timers->current && subtimer != NULL) { + pb_StartTimerAndSubTimer(&timers->timers[category], &subtimer->timer); + } else if (subtimer != NULL) { + // Same category, different non-NULL subtimer + pb_StartTimer(&subtimer->timer); + } else{ + // Different category, but no subtimer (not found or specified as NULL) -- unprefered way of setting topLevel timer + pb_StartTimer(&timers->timers[category]); + } + } + + timers->current = category; + +} + +void +pb_SwitchToTimer(struct pb_TimerSet *timers, enum pb_TimerID timer) +{ + /* Stop the currently running timer */ + /*if (timers->current != pb_TimerID_NONE) { + struct pb_SubTimer *currSubTimer = NULL; + struct pb_SubTimerList *subtimerlist = timers->sub_timer_list[timers->current]; + + if ( subtimerlist != NULL) { + currSubTimer = timers->sub_timer_list[timers->current]->current; + } + if ( currSubTimer!= NULL) { + pb_StopTimerAndSubTimer(&timers->timers[timers->current], &currSubTimer->timer); + } else { + pb_StopTimer(&timers->timers[timers->current]); + } + } + + timers->current = timer; + + if (timer != pb_TimerID_NONE) { + pb_StartTimer(&timers->timers[timer]); + }*/ +} + +void +pb_PrintTimerSet(struct pb_TimerSet *timers) +{ + + pb_Timestamp wall_end = get_time(); + + struct pb_Timer *t = timers->timers; + struct pb_SubTimer* sub = NULL; + + int maxSubLength; + + const char *categories[] = { + "IO", "Kernel", "Copy", "Driver", "Copy Async", "Compute" + }; + + const int maxCategoryLength = 10; + + int i; + for(i = 1; i < pb_TimerID_LAST-1; ++i) { // exclude NONE and OVRELAP from this format + if(pb_GetElapsedTime(&t[i]) != 0) { + + // Print Category Timer + printf("%-*s: %f\n", maxCategoryLength, categories[i-1], pb_GetElapsedTime(&t[i])); + + if (timers->sub_timer_list[i] != NULL) { + sub = timers->sub_timer_list[i]->subtimer_list; + maxSubLength = 0; + while (sub != NULL) { + // Find longest SubTimer label + if (strlen(sub->label) > maxSubLength) { + maxSubLength = strlen(sub->label); + } + sub = sub->next; + } + + // Fit to Categories + if (maxSubLength <= maxCategoryLength) { + maxSubLength = maxCategoryLength; + } + + sub = timers->sub_timer_list[i]->subtimer_list; + + // Print SubTimers + while (sub != NULL) { + printf(" -%-*s: %f\n", maxSubLength, sub->label, pb_GetElapsedTime(&sub->timer)); + sub = sub->next; + } + } + } + } + + if(pb_GetElapsedTime(&t[pb_TimerID_OVERLAP]) != 0) + printf("CPU/Kernel Overlap: %f\n", pb_GetElapsedTime(&t[pb_TimerID_OVERLAP])); + + float walltime = (wall_end - timers->wall_begin)/ 1e6; + printf("Timer Wall Time: %f\n", walltime); + +} + +void pb_DestroyTimerSet(struct pb_TimerSet * timers) +{ + /* clean up all of the async event markers */ + struct pb_async_time_marker_list ** event = &(timers->async_markers); + while( *event != NULL) { + struct pb_async_time_marker_list ** next = &((*event)->next); + free(*event); + (*event) = NULL; + event = next; + } + + int i = 0; + for(i = 0; i < pb_TimerID_LAST; ++i) { + if (timers->sub_timer_list[i] != NULL) { + struct pb_SubTimer *subtimer = timers->sub_timer_list[i]->subtimer_list; + struct pb_SubTimer *prev = NULL; + while (subtimer != NULL) { + free(subtimer->label); + prev = subtimer; + subtimer = subtimer->next; + free(prev); + } + free(timers->sub_timer_list[i]); + } + } +} + + diff --git a/benchmarks/opencl/stencil/parboil.h b/benchmarks/opencl/stencil/parboil.h new file mode 100644 index 00000000..4c9a8b5e --- /dev/null +++ b/benchmarks/opencl/stencil/parboil.h @@ -0,0 +1,348 @@ +/* + * (c) 2010 The Board of Trustees of the University of Illinois. + */ +#ifndef PARBOIL_HEADER +#define PARBOIL_HEADER + +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +#include + +/* A platform as specified by the user on the command line */ +struct pb_PlatformParam { + char *name; /* The platform name. This string is owned. */ + char *version; /* The platform version; may be NULL. + * This string is owned. */ +}; + +/* Create a PlatformParam from the given strings. + * 'name' must not be NULL. 'version' may be NULL. + * If not NULL, the strings should have been allocated by malloc(), + * and they will be owned by the returned object. + */ +struct pb_PlatformParam * +pb_PlatformParam(char *name, char *version); + +void +pb_FreePlatformParam(struct pb_PlatformParam *); + +/* A criterion for how to select a device */ +enum pb_DeviceSelectionCriterion { + pb_Device_INDEX, /* Enumerate the devices and select one + * by its number */ + pb_Device_CPU, /* Select a CPU device */ + pb_Device_GPU, /* Select a GPU device */ + pb_Device_ACCELERATOR, /* Select an accelerator device */ + pb_Device_NAME /* Select a device by name */ +}; + +/* A device as specified by the user on the command line */ +struct pb_DeviceParam { + enum pb_DeviceSelectionCriterion criterion; + union { + int index; /* If criterion == pb_Device_INDEX, + * the index of the device */ + char *name; /* If criterion == pb_Device_NAME, + * the name of the device. + * This string is owned. */ + }; +}; + +struct pb_DeviceParam * +pb_DeviceParam_index(int index); + +struct pb_DeviceParam * +pb_DeviceParam_cpu(void); + +struct pb_DeviceParam * +pb_DeviceParam_gpu(void); + +struct pb_DeviceParam * +pb_DeviceParam_accelerator(void); + +/* Create a by-name device selection criterion. + * The string should have been allocated by malloc(), and it will will be + * owned by the returned object. + */ +struct pb_DeviceParam * +pb_DeviceParam_name(char *name); + +void +pb_FreeDeviceParam(struct pb_DeviceParam *); + +/* Command line parameters for benchmarks */ +struct pb_Parameters { + char *outFile; /* If not NULL, the raw output of the + * computation should be saved to this + * file. The string is owned. */ + char **inpFiles; /* A NULL-terminated array of strings + * holding the input file(s) for the + * computation. The array and strings + * are owned. */ + struct pb_PlatformParam *platform; /* If not NULL, the platform + * specified on the command line. */ + struct pb_DeviceParam *device; /* If not NULL, the device + * specified on the command line. */ +}; + +/* Read command-line parameters. + * + * The argc and argv parameters to main are read, and any parameters + * interpreted by this function are removed from the argument list. + * + * A new instance of struct pb_Parameters is returned. + * If there is an error, then an error message is printed on stderr + * and NULL is returned. + */ +struct pb_Parameters * +pb_ReadParameters(int *_argc, char **argv); + +/* Free an instance of struct pb_Parameters. + */ +void +pb_FreeParameters(struct pb_Parameters *p); + +void +pb_FreeStringArray(char **); + +/* Count the number of input files in a pb_Parameters instance. + */ +int +pb_Parameters_CountInputs(struct pb_Parameters *p); + +/* A time or duration. */ +//#if _POSIX_VERSION >= 200112L +typedef unsigned long long pb_Timestamp; /* time in microseconds */ +//#else +//# error "Timestamps not implemented" +//#endif + +enum pb_TimerState { + pb_Timer_STOPPED, + pb_Timer_RUNNING, +}; + +struct pb_Timer { + enum pb_TimerState state; + pb_Timestamp elapsed; /* Amount of time elapsed so far */ + pb_Timestamp init; /* Beginning of the current time interval, + * if state is RUNNING. End of the last + * recorded time interfal otherwise. */ +}; + +/* Reset a timer. + * Use this to initialize a timer or to clear + * its elapsed time. The reset timer is stopped. + */ +void +pb_ResetTimer(struct pb_Timer *timer); + +/* Start a timer. The timer is set to RUNNING mode and + * time elapsed while the timer is running is added to + * the timer. + * The timer should not already be running. + */ +void +pb_StartTimer(struct pb_Timer *timer); + +/* Stop a timer. + * This stops adding elapsed time to the timer. + * The timer should not already be stopped. + */ +void +pb_StopTimer(struct pb_Timer *timer); + +/* Get the elapsed time in seconds. */ +double +pb_GetElapsedTime(struct pb_Timer *timer); + +/* Execution time is assigned to one of these categories. */ +enum pb_TimerID { + pb_TimerID_NONE = 0, + pb_TimerID_IO, /* Time spent in input/output */ + pb_TimerID_KERNEL, /* Time spent computing on the device, + * recorded asynchronously */ + pb_TimerID_COPY, /* Time spent synchronously moving data + * to/from device and allocating/freeing + * memory on the device */ + pb_TimerID_DRIVER, /* Time spent in the host interacting with the + * driver, primarily for recording the time + * spent queueing asynchronous operations */ + pb_TimerID_COPY_ASYNC, /* Time spent in asynchronous transfers */ + pb_TimerID_COMPUTE, /* Time for all program execution other + * than parsing command line arguments, + * I/O, kernel, and copy */ + pb_TimerID_OVERLAP, /* Time double-counted in asynchronous and + * host activity: automatically filled in, + * not intended for direct usage */ + pb_TimerID_LAST /* Number of timer IDs */ +}; + +/* Dynamic list of asynchronously tracked times between events */ +struct pb_async_time_marker_list { + char *label; // actually just a pointer to a string + enum pb_TimerID timerID; /* The ID to which the interval beginning + * with this marker should be attributed */ + void * marker; + //cudaEvent_t marker; /* The driver event for this marker */ + struct pb_async_time_marker_list *next; +}; + +struct pb_SubTimer { + char *label; + struct pb_Timer timer; + struct pb_SubTimer *next; +}; + +struct pb_SubTimerList { + struct pb_SubTimer *current; + struct pb_SubTimer *subtimer_list; +}; + +/* A set of timers for recording execution times. */ +struct pb_TimerSet { + enum pb_TimerID current; + struct pb_async_time_marker_list* async_markers; + pb_Timestamp async_begin; + pb_Timestamp wall_begin; + struct pb_Timer timers[pb_TimerID_LAST]; + struct pb_SubTimerList *sub_timer_list[pb_TimerID_LAST]; +}; + +/* Reset all timers in the set. */ +void +pb_InitializeTimerSet(struct pb_TimerSet *timers); + +void +pb_AddSubTimer(struct pb_TimerSet *timers, char *label, enum pb_TimerID pb_Category); + +/* Select which timer the next interval of time should be accounted + * to. The selected timer is started and other timers are stopped. + * Using pb_TimerID_NONE stops all timers. */ +void +pb_SwitchToTimer(struct pb_TimerSet *timers, enum pb_TimerID timer); + +void +pb_SwitchToSubTimer(struct pb_TimerSet *timers, char *label, enum pb_TimerID category); + +/* Print timer values to standard output. */ +void +pb_PrintTimerSet(struct pb_TimerSet *timers); + +/* Release timer resources */ +void +pb_DestroyTimerSet(struct pb_TimerSet * timers); + +void +pb_SetOpenCL(void *clContextPtr, void *clCommandQueuePtr); + + +typedef struct pb_Device_tag { + char* name; + void* clDevice; + int id; + unsigned int in_use; + unsigned int available; +} pb_Device; + +struct pb_Context_tag; +typedef struct pb_Context_tag pb_Context; + +typedef struct pb_Platform_tag { + char* name; + char* version; + void* clPlatform; + unsigned int in_use; + pb_Context** contexts; + pb_Device** devices; +} pb_Platform; + +struct pb_Context_tag { + void* clPlatformId; + void* clContext; + void* clDeviceId; + pb_Platform* pb_platform; + pb_Device* pb_device; +}; + +// verbosely print out list of platforms and their devices to the console. +pb_Platform** +pb_GetPlatforms(); + +// Choose a platform according to the given platform specification +pb_Platform* +pb_GetPlatform(struct pb_PlatformParam *platform); + +// choose a platform: by name, name & version +pb_Platform* +pb_GetPlatformByName(const char* name); + +pb_Platform* +pb_GetPlatformByNameAndVersion(const char* name, const char* version); + +// Choose a device according to the given device specification +pb_Device* +pb_GetDevice(pb_Platform* pb_platform, struct pb_DeviceParam *device); + +pb_Device** +pb_GetDevices(pb_Platform* pb_platform); + +// choose a device by name. +pb_Device* +pb_GetDeviceByName(pb_Platform* pb_platform, const char* name); + +pb_Platform* +pb_GetPlatformByEnvVars(); + +pb_Context* +pb_InitOpenCLContext(struct pb_Parameters* parameters); + +void +pb_ReleasePlatforms(); + +void +pb_ReleaseContext(pb_Context* c); + +void +pb_PrintPlatformInfo(pb_Context* c); + +void +perf_init(); + +//#define MEASURE_KERNEL_TIME + +#include + +#ifdef MEASURE_KERNEL_TIME +#define clEnqueueNDRangeKernel(q,k,d,o,dg,db,a,b,c) pb_clEnqueueNDRangeKernel((q), (k), (d), (o), (dg), (db), (a), (b), (c)) +cl_int +pb_clEnqueueNDRangeKernel(cl_command_queue /* command_queue */, + cl_kernel /* kernel */, + cl_uint /* work_dim */, + const size_t * /* global_work_offset */, + const size_t * /* global_work_size */, + const size_t * /* local_work_size */, + cl_uint /* num_events_in_wait_list */, + const cl_event * /* event_wait_list */, + cl_event * /* event */); +#endif + +enum { T_FLOAT, T_DOUBLE, T_SHORT, T_INT, T_UCHAR }; +void pb_sig_float(char*, float*, int); +void pb_sig_double(char*, double*, int); +void pb_sig_short(char*, short*, int); +void pb_sig_int(char*, int*, int); +void pb_sig_uchar(char*, unsigned char*, unsigned int); +void pb_sig_clmem(char*, cl_command_queue, cl_mem, int); + +#ifdef __cplusplus +} +#endif + +#endif //PARBOIL_HEADER + diff --git a/benchmarks/opencl/stencil/parboil_opencl.c b/benchmarks/opencl/stencil/parboil_opencl.c new file mode 100644 index 00000000..acf81a7e --- /dev/null +++ b/benchmarks/opencl/stencil/parboil_opencl.c @@ -0,0 +1,1390 @@ +/* + * (c) 2007 The Board of Trustees of the University of Illinois. + */ + +#include +#include +#include +#include +#include +#include + +#if _POSIX_VERSION >= 200112L +# include +#endif + +//#include "perfmon.h" + +cl_context *clContextPtr; +cl_command_queue *clCommandQueuePtr; + +// #define DISABLE_PARBOIL_TIMER + +/*****************************************************************************/ +/* Timer routines */ + +static int is_async(enum pb_TimerID timer) +{ + return (timer == pb_TimerID_KERNEL) || + (timer == pb_TimerID_COPY_ASYNC); +} + +static int is_blocking(enum pb_TimerID timer) +{ + return (timer == pb_TimerID_COPY) || (timer == pb_TimerID_NONE); +} + +#define INVALID_TIMERID pb_TimerID_LAST + +static int asyncs_outstanding(struct pb_TimerSet* timers) +{ + return (timers->async_markers != NULL) && + (timers->async_markers->timerID != INVALID_TIMERID); +} + +static struct pb_async_time_marker_list * +get_last_async(struct pb_TimerSet* timers) +{ + /* Find the last event recorded thus far */ + struct pb_async_time_marker_list * last_event = timers->async_markers; + if(last_event != NULL && last_event->timerID != INVALID_TIMERID) { + while(last_event->next != NULL && + last_event->next->timerID != INVALID_TIMERID) + last_event = last_event->next; + return last_event; + } else + return NULL; +} + +static void insert_marker(struct pb_TimerSet* tset, enum pb_TimerID timer) +{ + cl_int ciErrNum = CL_SUCCESS; + struct pb_async_time_marker_list ** new_event = &(tset->async_markers); + + while(*new_event != NULL && (*new_event)->timerID != INVALID_TIMERID) { + new_event = &((*new_event)->next); + } + + if(*new_event == NULL) { + *new_event = (struct pb_async_time_marker_list *) + malloc(sizeof(struct pb_async_time_marker_list)); + (*new_event)->marker = calloc(1, sizeof(cl_event)); + /* + // I don't think this is needed at all. I believe clEnqueueMarker 'creates' the event +#if ( __OPENCL_VERSION__ >= CL_VERSION_1_1 ) +fprintf(stderr, "Creating Marker [%d]\n", timer); + *((cl_event *)((*new_event)->marker)) = clCreateUserEvent(*clContextPtr, &ciErrNum); + if (ciErrNum != CL_SUCCESS) { + fprintf(stderr, "Error Creating User Event Object!\n"); + } + ciErrNum = clSetUserEventStatus(*((cl_event *)((*new_event)->marker)), CL_QUEUED); + if (ciErrNum != CL_SUCCESS) { + fprintf(stderr, "Error Setting User Event Status!\n"); + } +#endif +*/ + (*new_event)->next = NULL; + } + + /* valid event handle now aquired: insert the event record */ + (*new_event)->label = NULL; + (*new_event)->timerID = timer; + ciErrNum = clEnqueueMarker(*clCommandQueuePtr, (cl_event *)(*new_event)->marker); + if (ciErrNum != CL_SUCCESS) { + fprintf(stderr, "Error Enqueueing Marker!\n"); + } + +} + +static void insert_submarker(struct pb_TimerSet* tset, char *label, enum pb_TimerID timer) +{ + cl_int ciErrNum = CL_SUCCESS; + struct pb_async_time_marker_list ** new_event = &(tset->async_markers); + + while(*new_event != NULL && (*new_event)->timerID != INVALID_TIMERID) { + new_event = &((*new_event)->next); + } + + if(*new_event == NULL) { + *new_event = (struct pb_async_time_marker_list *) + malloc(sizeof(struct pb_async_time_marker_list)); + (*new_event)->marker = calloc(1, sizeof(cl_event)); + /* +#if ( __OPENCL_VERSION__ >= CL_VERSION_1_1 ) +fprintf(stderr, "Creating SubMarker %s[%d]\n", label, timer); + *((cl_event *)((*new_event)->marker)) = clCreateUserEvent(*clContextPtr, &ciErrNum); + if (ciErrNum != CL_SUCCESS) { + fprintf(stderr, "Error Creating User Event Object!\n"); + } + ciErrNum = clSetUserEventStatus(*((cl_event *)((*new_event)->marker)), CL_QUEUED); + if (ciErrNum != CL_SUCCESS) { + fprintf(stderr, "Error Setting User Event Status!\n"); + } +#endif +*/ + (*new_event)->next = NULL; + } + + /* valid event handle now aquired: insert the event record */ + (*new_event)->label = label; + (*new_event)->timerID = timer; + ciErrNum = clEnqueueMarker(*clCommandQueuePtr, (cl_event *)(*new_event)->marker); + if (ciErrNum != CL_SUCCESS) { + fprintf(stderr, "Error Enqueueing Marker!\n"); + } + +} + + +/* Assumes that all recorded events have completed */ +static pb_Timestamp record_async_times(struct pb_TimerSet* tset) +{ + struct pb_async_time_marker_list * next_interval = NULL; + struct pb_async_time_marker_list * last_marker = get_last_async(tset); + pb_Timestamp total_async_time = 0; + enum pb_TimerID timer; + + for(next_interval = tset->async_markers; next_interval != last_marker; + next_interval = next_interval->next) { + cl_ulong command_start=0, command_end=0; + cl_int ciErrNum = CL_SUCCESS; + + ciErrNum = clGetEventProfilingInfo(*((cl_event *)next_interval->marker), CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &command_start, NULL); + if (ciErrNum != CL_SUCCESS) { + fprintf(stderr, "Error getting first EventProfilingInfo: %d\n", ciErrNum); + } + + ciErrNum = clGetEventProfilingInfo(*((cl_event *)next_interval->next->marker), CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &command_end, NULL); + if (ciErrNum != CL_SUCCESS) { + fprintf(stderr, "Error getting second EventProfilingInfo: %d\n", ciErrNum); + } + + pb_Timestamp interval = (pb_Timestamp) (((double)(command_end - command_start)) / 1e3); + tset->timers[next_interval->timerID].elapsed += interval; + if (next_interval->label != NULL) { + struct pb_SubTimer *subtimer = tset->sub_timer_list[next_interval->timerID]->subtimer_list; + while (subtimer != NULL) { + if ( strcmp(subtimer->label, next_interval->label) == 0) { + subtimer->timer.elapsed += interval; + break; + } + subtimer = subtimer->next; + } + } + total_async_time += interval; + next_interval->timerID = INVALID_TIMERID; + } + + if(next_interval != NULL) + next_interval->timerID = INVALID_TIMERID; + + return total_async_time; +} + +static void +accumulate_time(pb_Timestamp *accum, + pb_Timestamp start, + pb_Timestamp end) +{ +//#if _POSIX_VERSION >= 200112L + *accum += end - start; +//#else +//# error "Timestamps not implemented for this system" +//#endif +} + +//#if _POSIX_VERSION >= 200112L +static pb_Timestamp get_time() +{ + //struct timeval tv; + //gettimeofday(&tv, NULL); + //return (pb_Timestamp) (tv.tv_sec * 1000000LL + tv.tv_usec); + return 0; +} +//#else +//# error "no supported time libraries are available on this platform" +//#endif + +void +pb_ResetTimer(struct pb_Timer *timer) +{ +//#ifndef DISABLE_PARBOIL_TIMER + timer->state = pb_Timer_STOPPED; + +//#if _POSIX_VERSION >= 200112L + timer->elapsed = 0; +//#else +//# error "pb_ResetTimer: not implemented for this system" +//#endif +//#endif +} + +void +pb_StartTimer(struct pb_Timer *timer) +{ +/*#ifndef DISABLE_PARBOIL_TIMER + if (timer->state != pb_Timer_STOPPED) { + fputs("Ignoring attempt to start a running timer\n", stderr); + return; + } + + timer->state = pb_Timer_RUNNING; + +#if _POSIX_VERSION >= 200112L + { + struct timeval tv; + gettimeofday(&tv, NULL); + timer->init = tv.tv_sec * 1000000LL + tv.tv_usec; + } +#else +# error "pb_StartTimer: not implemented for this system" +#endif +#endif*/ +} + +void +pb_StartTimerAndSubTimer(struct pb_Timer *timer, struct pb_Timer *subtimer) +{ +/*#ifndef DISABLE_PARBOIL_TIMER + + unsigned int numNotStopped = 0x3; // 11 + if (timer->state != pb_Timer_STOPPED) { + fputs("Warning: Timer was not stopped\n", stderr); + numNotStopped &= 0x1; // Zero out 2^1 + } + if (subtimer->state != pb_Timer_STOPPED) { + fputs("Warning: Subtimer was not stopped\n", stderr); + numNotStopped &= 0x2; // Zero out 2^0 + } + if (numNotStopped == 0x0) { + fputs("Ignoring attempt to start running timer and subtimer\n", stderr); + return; + } + + timer->state = pb_Timer_RUNNING; + subtimer->state = pb_Timer_RUNNING; + +#if _POSIX_VERSION >= 200112L + { + struct timeval tv; + gettimeofday(&tv, NULL); + + if (numNotStopped & 0x2) { + timer->init = tv.tv_sec * 1000000LL + tv.tv_usec; + } + + if (numNotStopped & 0x1) { + subtimer->init = tv.tv_sec * 1000000LL + tv.tv_usec; + } + } +#else +# error "pb_StartTimer: not implemented for this system" +#endif + +#endif*/ +} + +void +pb_StopTimer(struct pb_Timer *timer) +{ +/*#ifndef DISABLE_PARBOIL_TIMER + + pb_Timestamp fini; + + if (timer->state != pb_Timer_RUNNING) { + fputs("Ignoring attempt to stop a stopped timer\n", stderr); + return; + } + + timer->state = pb_Timer_STOPPED; + +#if _POSIX_VERSION >= 200112L + { + struct timeval tv; + gettimeofday(&tv, NULL); + fini = tv.tv_sec * 1000000LL + tv.tv_usec; + } +#else +# error "pb_StopTimer: not implemented for this system" +#endif + + accumulate_time(&timer->elapsed, timer->init, fini); + timer->init = fini; + +#endif*/ +} + +void pb_StopTimerAndSubTimer(struct pb_Timer *timer, struct pb_Timer *subtimer) { +/*#ifndef DISABLE_PARBOIL_TIMER + + pb_Timestamp fini; + + unsigned int numNotRunning = 0x3; // 11 + if (timer->state != pb_Timer_RUNNING) { + fputs("Warning: Timer was not running\n", stderr); + numNotRunning &= 0x1; // Zero out 2^1 + } + if (subtimer->state != pb_Timer_RUNNING) { + fputs("Warning: Subtimer was not running\n", stderr); + numNotRunning &= 0x2; // Zero out 2^0 + } + if (numNotRunning == 0x0) { + fputs("Ignoring attempt to stop stopped timer and subtimer\n", stderr); + return; + } + + + timer->state = pb_Timer_STOPPED; + subtimer->state = pb_Timer_STOPPED; + +#if _POSIX_VERSION >= 200112L + { + struct timeval tv; + gettimeofday(&tv, NULL); + fini = tv.tv_sec * 1000000LL + tv.tv_usec; + } +#else +# error "pb_StopTimer: not implemented for this system" +#endif + + if (numNotRunning & 0x2) { + accumulate_time(&timer->elapsed, timer->init, fini); + timer->init = fini; + } + + if (numNotRunning & 0x1) { + accumulate_time(&subtimer->elapsed, subtimer->init, fini); + subtimer->init = fini; + } + +#endif*/ +} + +/* Get the elapsed time in seconds. */ +double +pb_GetElapsedTime(struct pb_Timer *timer) +{ + /*double ret; +#ifndef DISABLE_PARBOIL_TIMER + + if (timer->state != pb_Timer_STOPPED) { + fputs("Elapsed time from a running timer is inaccurate\n", stderr); + } + +#if _POSIX_VERSION >= 200112L + ret = timer->elapsed / 1e6; +#else +# error "pb_GetElapsedTime: not implemented for this system" +#endif +#endif + return ret;*/ + return 0; +} + +void +pb_InitializeTimerSet(struct pb_TimerSet *timers) +{ +/*#ifndef DISABLE_PARBOIL_TIMER + int n; + + timers->wall_begin = 0; //get_time(); + timers->current = pb_TimerID_NONE; + + timers->async_markers = NULL; + + for (n = 0; n < pb_TimerID_LAST; n++) { + pb_ResetTimer(&timers->timers[n]); + timers->sub_timer_list[n] = NULL; + } +#endif*/ +} + +void pb_SetOpenCL(void *p_clContextPtr, void *p_clCommandQueuePtr) { + clContextPtr = ((cl_context *)p_clContextPtr); + clCommandQueuePtr = ((cl_command_queue *)p_clCommandQueuePtr); +} + +void +pb_AddSubTimer(struct pb_TimerSet *timers, char *label, enum pb_TimerID pb_Category) { +/*#ifndef DISABLE_PARBOIL_TIMER + + struct pb_SubTimer *subtimer = (struct pb_SubTimer *) malloc + (sizeof(struct pb_SubTimer)); + + int len = strlen(label); + + subtimer->label = (char *) malloc (sizeof(char)*(len+1)); + sprintf(subtimer->label, "%s\0", label); + + pb_ResetTimer(&subtimer->timer); + subtimer->next = NULL; + + struct pb_SubTimerList *subtimerlist = timers->sub_timer_list[pb_Category]; + if (subtimerlist == NULL) { + subtimerlist = (struct pb_SubTimerList *) calloc + (1, sizeof(struct pb_SubTimerList)); + subtimerlist->subtimer_list = subtimer; + timers->sub_timer_list[pb_Category] = subtimerlist; + } else { + // Append to list + struct pb_SubTimer *element = subtimerlist->subtimer_list; + while (element->next != NULL) { + element = element->next; + } + element->next = subtimer; + } + +#endif*/ +} + +void +pb_SwitchToTimer(struct pb_TimerSet *timers, enum pb_TimerID timer) +{ +#if 0 +#ifndef DISABLE_PARBOIL_TIMER + + /* Stop the currently running timer */ + if (timers->current != pb_TimerID_NONE) { + struct pb_SubTimerList *subtimerlist = timers->sub_timer_list[timers->current]; + struct pb_SubTimer *currSubTimer = (subtimerlist != NULL) ? subtimerlist->current : NULL; + + if (!is_async(timers->current) ) { + if (timers->current != timer) { + if (currSubTimer != NULL) { + pb_StopTimerAndSubTimer(&timers->timers[timers->current], &currSubTimer->timer); + } else { + pb_StopTimer(&timers->timers[timers->current]); + } + } else { + if (currSubTimer != NULL) { + pb_StopTimer(&currSubTimer->timer); + } + } + } else { + insert_marker(timers, timer); + if (!is_async(timer)) { // if switching to async too, keep driver going + pb_StopTimer(&timers->timers[pb_TimerID_DRIVER]); + } + } + } + + pb_Timestamp currentTime = 0; //get_time(); + + /* The only cases we check for asynchronous task completion is + * when an overlapping CPU operation completes, or the next + * segment blocks on completion of previous async operations */ + if( asyncs_outstanding(timers) && + (!is_async(timers->current) || is_blocking(timer) ) ) { + + struct pb_async_time_marker_list * last_event = get_last_async(timers); + /* CL_COMPLETE if completed */ + + cl_int ciErrNum = CL_SUCCESS; + cl_int async_done = CL_COMPLETE; + + ciErrNum = clGetEventInfo(*((cl_event *)last_event->marker), CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(cl_int), &async_done, NULL); + if (ciErrNum != CL_SUCCESS) { + fprintf(stderr, "Error Querying EventInfo!\n"); + } + + + if(is_blocking(timer)) { + /* Async operations completed after previous CPU operations: + * overlapped time is the total CPU time since this set of async + * operations were first issued */ + + // timer to switch to is COPY or NONE + if(async_done != CL_COMPLETE) { + accumulate_time(&(timers->timers[pb_TimerID_OVERLAP].elapsed), + timers->async_begin,currentTime); + } + + /* Wait on async operation completion */ + ciErrNum = clWaitForEvents(1, (cl_event *)last_event->marker); + if (ciErrNum != CL_SUCCESS) { + fprintf(stderr, "Error Waiting for Events!\n"); + } + + pb_Timestamp total_async_time = record_async_times(timers); + + /* Async operations completed before previous CPU operations: + * overlapped time is the total async time */ + if(async_done == CL_COMPLETE) { + //fprintf(stderr, "Async_done: total_async_type = %lld\n", total_async_time); + timers->timers[pb_TimerID_OVERLAP].elapsed += total_async_time; + } + + } else + /* implies (!is_async(timers->current) && asyncs_outstanding(timers)) */ + // i.e. Current Not Async (not KERNEL/COPY_ASYNC) but there are outstanding + // so something is deeper in stack + if(async_done == CL_COMPLETE ) { + /* Async operations completed before previous CPU operations: + * overlapped time is the total async time */ + timers->timers[pb_TimerID_OVERLAP].elapsed += record_async_times(timers); + } + } + + /* Start the new timer */ + if (timer != pb_TimerID_NONE) { + if(!is_async(timer)) { + pb_StartTimer(&timers->timers[timer]); + } else { + // toSwitchTo Is Async (KERNEL/COPY_ASYNC) + if (!asyncs_outstanding(timers)) { + /* No asyncs outstanding, insert a fresh async marker */ + + insert_marker(timers, timer); + timers->async_begin = currentTime; + } else if(!is_async(timers->current)) { + /* Previous asyncs still in flight, but a previous SwitchTo + * already marked the end of the most recent async operation, + * so we can rename that marker as the beginning of this async + * operation */ + + struct pb_async_time_marker_list * last_event = get_last_async(timers); + last_event->label = NULL; + last_event->timerID = timer; + } + if (!is_async(timers->current)) { + pb_StartTimer(&timers->timers[pb_TimerID_DRIVER]); + } + } + } + timers->current = timer; + +#endif +#endif +} + +void +pb_SwitchToSubTimer(struct pb_TimerSet *timers, char *label, enum pb_TimerID category) +{ +#ifndef DISABLE_PARBOIL_TIMER + struct pb_SubTimerList *subtimerlist = timers->sub_timer_list[timers->current]; + struct pb_SubTimer *curr = (subtimerlist != NULL) ? subtimerlist->current : NULL; + + if (timers->current != pb_TimerID_NONE) { + if (!is_async(timers->current) ) { + if (timers->current != category) { + if (curr != NULL) { + pb_StopTimerAndSubTimer(&timers->timers[timers->current], &curr->timer); + } else { + pb_StopTimer(&timers->timers[timers->current]); + } + } else { + if (curr != NULL) { + pb_StopTimer(&curr->timer); + } + } + } else { + insert_submarker(timers, label, category); + if (!is_async(category)) { // if switching to async too, keep driver going + pb_StopTimer(&timers->timers[pb_TimerID_DRIVER]); + } + } + } + + pb_Timestamp currentTime = 0; //get_time(); + + /* The only cases we check for asynchronous task completion is + * when an overlapping CPU operation completes, or the next + * segment blocks on completion of previous async operations */ + if( asyncs_outstanding(timers) && + (!is_async(timers->current) || is_blocking(category) ) ) { + + struct pb_async_time_marker_list * last_event = get_last_async(timers); + /* CL_COMPLETE if completed */ + + cl_int ciErrNum = CL_SUCCESS; + cl_int async_done = CL_COMPLETE; + + ciErrNum = clGetEventInfo(*((cl_event *)last_event->marker), CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(cl_int), &async_done, NULL); + if (ciErrNum != CL_SUCCESS) { + fprintf(stderr, "Error Querying EventInfo!\n"); + } + + if(is_blocking(category)) { + /* Async operations completed after previous CPU operations: + * overlapped time is the total CPU time since this set of async + * operations were first issued */ + + // timer to switch to is COPY or NONE + // if it hasn't already finished, then just take now and use that as the elapsed time in OVERLAP + // anything happening after now isn't OVERLAP because everything is being stopped to wait for synchronization + // it seems that the extra sync wall time isn't being recorded anywhere + if(async_done != CL_COMPLETE) + accumulate_time(&(timers->timers[pb_TimerID_OVERLAP].elapsed), + timers->async_begin,currentTime); + + /* Wait on async operation completion */ + ciErrNum = clWaitForEvents(1, (cl_event *)last_event->marker); + if (ciErrNum != CL_SUCCESS) { + fprintf(stderr, "Error Waiting for Events!\n"); + } + pb_Timestamp total_async_time = record_async_times(timers); + + /* Async operations completed before previous CPU operations: + * overlapped time is the total async time */ + // If it did finish, then accumulate all the async time that did happen into OVERLAP + // the immediately preceding EventSynchronize theoretically didn't have any effect since it was already completed. + if(async_done == CL_COMPLETE /*cudaSuccess*/) + timers->timers[pb_TimerID_OVERLAP].elapsed += total_async_time; + + } else + /* implies (!is_async(timers->current) && asyncs_outstanding(timers)) */ + // i.e. Current Not Async (not KERNEL/COPY_ASYNC) but there are outstanding + // so something is deeper in stack + if(async_done == CL_COMPLETE /*cudaSuccess*/) { + /* Async operations completed before previous CPU operations: + * overlapped time is the total async time */ + timers->timers[pb_TimerID_OVERLAP].elapsed += record_async_times(timers); + } + // else, this isn't blocking, so just check the next time around + } + + subtimerlist = timers->sub_timer_list[category]; + struct pb_SubTimer *subtimer = NULL; + + if (label != NULL) { + subtimer = subtimerlist->subtimer_list; + while (subtimer != NULL) { + if (strcmp(subtimer->label, label) == 0) { + break; + } else { + subtimer = subtimer->next; + } + } + } + + /* Start the new timer */ + if (category != pb_TimerID_NONE) { + if(!is_async(category)) { + if (subtimerlist != NULL) { + subtimerlist->current = subtimer; + } + + if (category != timers->current && subtimer != NULL) { + pb_StartTimerAndSubTimer(&timers->timers[category], &subtimer->timer); + } else if (subtimer != NULL) { + pb_StartTimer(&subtimer->timer); + } else { + pb_StartTimer(&timers->timers[category]); + } + } else { + if (subtimerlist != NULL) { + subtimerlist->current = subtimer; + } + + // toSwitchTo Is Async (KERNEL/COPY_ASYNC) + if (!asyncs_outstanding(timers)) { + /* No asyncs outstanding, insert a fresh async marker */ + insert_submarker(timers, label, category); + timers->async_begin = currentTime; + } else if(!is_async(timers->current)) { + /* Previous asyncs still in flight, but a previous SwitchTo + * already marked the end of the most recent async operation, + * so we can rename that marker as the beginning of this async + * operation */ + + struct pb_async_time_marker_list * last_event = get_last_async(timers); + last_event->timerID = category; + last_event->label = label; + } // else, marker for switchToThis was already inserted + + //toSwitchto is already asynchronous, but if current/prev state is async too, then DRIVER is already running + if (!is_async(timers->current)) { + pb_StartTimer(&timers->timers[pb_TimerID_DRIVER]); + } + } + } + + timers->current = category; +#endif +} + +void +pb_PrintTimerSet(struct pb_TimerSet *timers) +{ +#ifndef DISABLE_PARBOIL_TIMER + pb_Timestamp wall_end = 0; //get_time(); + + struct pb_Timer *t = timers->timers; + struct pb_SubTimer* sub = NULL; + + int maxSubLength; + + const char *categories[] = { + "IO", "Kernel", "Copy", "Driver", "Copy Async", "Compute" + }; + + const int maxCategoryLength = 10; + + int i; + for(i = 1; i < pb_TimerID_LAST-1; ++i) { // exclude NONE and OVRELAP from this format + if(pb_GetElapsedTime(&t[i]) != 0) { + + // Print Category Timer + printf("%-*s: %f\n", maxCategoryLength, categories[i-1], pb_GetElapsedTime(&t[i])); + + if (timers->sub_timer_list[i] != NULL) { + sub = timers->sub_timer_list[i]->subtimer_list; + maxSubLength = 0; + while (sub != NULL) { + // Find longest SubTimer label + if (strlen(sub->label) > maxSubLength) { + maxSubLength = strlen(sub->label); + } + sub = sub->next; + } + + // Fit to Categories + if (maxSubLength <= maxCategoryLength) { + maxSubLength = maxCategoryLength; + } + + sub = timers->sub_timer_list[i]->subtimer_list; + + // Print SubTimers + while (sub != NULL) { + printf(" -%-*s: %f\n", maxSubLength, sub->label, pb_GetElapsedTime(&sub->timer)); + sub = sub->next; + } + } + } + } + + if(pb_GetElapsedTime(&t[pb_TimerID_OVERLAP]) != 0) + printf("CPU/Kernel Overlap: %f\n", pb_GetElapsedTime(&t[pb_TimerID_OVERLAP])); + + float walltime = (wall_end - timers->wall_begin)/ 1e6; + printf("Timer Wall Time: %f\n", walltime); + +#endif +} + +void pb_DestroyTimerSet(struct pb_TimerSet * timers) +{ +#ifndef DISABLE_PARBOIL_TIMER + /* clean up all of the async event markers */ + struct pb_async_time_marker_list* event = timers->async_markers; + while(event != NULL) { + + cl_int ciErrNum = CL_SUCCESS; + ciErrNum = clWaitForEvents(1, (cl_event *)(event)->marker); + if (ciErrNum != CL_SUCCESS) { + //fprintf(stderr, "Error Waiting for Events!\n"); + } + + ciErrNum = clReleaseEvent( *((cl_event *)(event)->marker) ); + if (ciErrNum != CL_SUCCESS) { + fprintf(stderr, "Error Release Events!\n"); + } + + free((event)->marker); + struct pb_async_time_marker_list* next = ((event)->next); + + free(event); + + // (*event) = NULL; + event = next; + } + + int i = 0; + for(i = 0; i < pb_TimerID_LAST; ++i) { + if (timers->sub_timer_list[i] != NULL) { + struct pb_SubTimer *subtimer = timers->sub_timer_list[i]->subtimer_list; + struct pb_SubTimer *prev = NULL; + while (subtimer != NULL) { + free(subtimer->label); + prev = subtimer; + subtimer = subtimer->next; + free(prev); + } + free(timers->sub_timer_list[i]); + } + } +#endif +} + +static pb_Platform** ptr = NULL; + +// verbosely print out list of platforms and their devices to the console. +pb_Platform** +pb_GetPlatforms() { + if (ptr == NULL) { + cl_uint num_platforms; + clGetPlatformIDs(0, NULL, &num_platforms); + if (num_platforms == 0) return NULL; + + ptr = (pb_Platform **) malloc(sizeof(pb_Platform *) * (num_platforms + 1)); + cl_platform_id* ids = (cl_platform_id *) malloc(num_platforms * sizeof(cl_platform_id)); + clGetPlatformIDs(num_platforms, ids, NULL); + + unsigned int i; + for (i = 0; i < num_platforms; i++) { + ptr[i] = (pb_Platform *) malloc(sizeof(pb_Platform)); + ptr[i]->clPlatform = ids[i]; + ptr[i]->contexts = NULL; + ptr[i]->in_use = 0; + ptr[i]->devices = NULL; + + size_t sz; + clGetPlatformInfo(ids[i], CL_PLATFORM_NAME, 0, NULL, &sz); + char* name = (char *) malloc(sz + 1); + clGetPlatformInfo(ids[i], CL_PLATFORM_NAME, sz, name, NULL); + name[sz] = '\0'; + ptr[i]->name = name; + + clGetPlatformInfo(ids[i], CL_PLATFORM_VERSION, 0, NULL, &sz); + char* version = (char *) malloc(sz + 1); + clGetPlatformInfo(ids[i], CL_PLATFORM_VERSION, sz, version, NULL); + version[sz] = '\0'; + ptr[i]->version = version; + } + ptr[i] = NULL; + + free(ids); + } + + return (pb_Platform**) ptr; +} + +pb_Context* +createContext(pb_Platform* pb_platform, pb_Device* pb_device) { + pb_Context* c = (pb_Context*) malloc(sizeof(pb_Context)); + cl_int clStatus; + cl_context_properties clCps[3] = { + CL_CONTEXT_PLATFORM, (cl_context_properties)(pb_platform->clPlatform), 0 + }; + c->clContext = + clCreateContext(clCps, 1, (cl_device_id*)&pb_device->clDevice, NULL, NULL, &clStatus); + c->clPlatformId = pb_platform->clPlatform; + c->clDeviceId = pb_device->clDevice; + c->pb_platform = pb_platform; + c->pb_device = pb_device; + pb_platform->in_use = 1; + pb_device->in_use = 1; + unsigned int i = 0; + if (pb_platform->contexts == NULL) { + pb_platform->contexts = (pb_Context**) malloc(2*sizeof(pb_Context*)); + } else { + for (i = 0; pb_platform->contexts[i] != NULL; i++) {}; + pb_platform->contexts = (pb_Context**) realloc(pb_platform->contexts, + (i+1)*sizeof(pb_Context*)); + } + pb_platform->contexts[i+1] = NULL; + pb_platform->contexts[i] = c; + return c; +} + +// choose a platform by name. +pb_Platform* +pb_GetPlatformByName(const char* name) { + pb_Platform** ps = (pb_Platform **) pb_GetPlatforms(); + if (ps == NULL) return NULL; + if (name == NULL) { + return *ps; + } + + while (*ps) { + if (strstr((*ps)->name, name)) break; + ps++; + } + return (pb_Platform*) *ps; +} + +pb_Device** +pb_GetDevices(pb_Platform* pb_platform) { + if (pb_platform->devices == NULL) { + cl_uint num_devs; + cl_device_id* dev_ids; + clGetDeviceIDs((cl_platform_id) pb_platform->clPlatform, + CL_DEVICE_TYPE_ALL, 0, NULL, &num_devs); + if (num_devs == 0) return NULL; + + pb_platform->devices = + (pb_Device **) malloc((num_devs + 1) * sizeof(pb_Device *)); + dev_ids = (cl_device_id *) malloc(sizeof(cl_device_id) * num_devs); + clGetDeviceIDs((cl_platform_id) pb_platform->clPlatform, + CL_DEVICE_TYPE_ALL, num_devs, dev_ids, NULL); + + unsigned int i; + for (i = 0; i < num_devs; i++) { + pb_platform->devices[i] = (pb_Device *) malloc(sizeof(pb_Device)); + + pb_platform->devices[i]->clDevice = dev_ids[i]; + pb_platform->devices[i]->id = i; + + size_t sz; + clGetDeviceInfo(dev_ids[i], CL_DEVICE_NAME, 0, NULL, &sz); + char* name = (char *) malloc(sz + 1); + clGetDeviceInfo(dev_ids[i], CL_DEVICE_NAME, sz, name, NULL); + name[sz] = '\0'; + pb_platform->devices[i]->name = (char *) name; + + cl_bool available; + clGetDeviceInfo(dev_ids[i], CL_DEVICE_AVAILABLE, sizeof(cl_bool), &available, NULL); + pb_platform->devices[i]->available = (int) available; + + pb_platform->devices[i]->in_use = 0; + } + pb_platform->devices[i] = NULL; + } + return (pb_Device **) pb_platform->devices; +} + +// choose a device by name. +static pb_Device* +pb_SelectDeviceByName(pb_Device **ds, const char* name) { + if (ds == NULL) return NULL; + if (name == NULL) return *ds; + while (*ds) { + if (strstr((*ds)->name, name)) break; + ds++; + } + + return *ds; +} + +// choose a device by name and set the device's 'in_use' flag. +pb_Device* +pb_GetDeviceByName(pb_Platform* pb_platform, const char* name) { + pb_Device** ds = (pb_Device **) pb_GetDevices(pb_platform); + pb_Device *d = pb_SelectDeviceByName(ds, name); + + if (d) d->in_use = 1; + + return d; +} + +void +pb_ReleasePlatforms() { + if (!ptr) return; + pb_Platform** cur_ptr = ptr; + while (*cur_ptr) { + pb_Platform* pfptr = *cur_ptr++; + if (pfptr->devices) { + pb_Device** dvptr = pfptr->devices; + while (*dvptr) { + pb_Device* d = *dvptr++; + free(d->name); + free(d); + } + free(pfptr->devices); + } + if (pfptr->contexts) { + pb_Context** cptr = pfptr->contexts; + while (*cptr) { + free(*cptr++); + } + free(pfptr->contexts); + } + free(pfptr->name); + free(pfptr); + } + free(ptr); + ptr = NULL; +} + +pb_Platform* +pb_GetPlatformByNameAndVersion(const char* name, const char* version) { + pb_Platform** ps = (pb_Platform **) pb_GetPlatforms(); + if (ps == NULL) return NULL; + if (name == NULL) return *ps; + while (*ps) { + if (strstr((*ps)->name, name) && strstr((*ps)->version, version)) break; + ps++; + } + return (pb_Platform*) *ps; +} + +/* Return a pointer to the device at the specified index, or NULL. + * Used by pb_GetDevice. */ +static pb_Device * +select_device_by_index(pb_Device** ds, int id) +{ + int i = 0; + pb_Device** p = ds; + while (*p && (i < id)) { p++; i++; } + return *p; +} + +/* Return a pointer to the device with the specified type, or NULL. + * Used by pb_GetDevice. */ +static pb_Device * +select_device_by_type(pb_Device** ds, + enum pb_DeviceSelectionCriterion criterion) +{ + cl_device_type sought_type; + + /* Determine the OpenCL device type to search for */ + switch(criterion) { + case pb_Device_CPU: + sought_type = CL_DEVICE_TYPE_CPU; + break; + case pb_Device_GPU: + sought_type = CL_DEVICE_TYPE_GPU; + break; + case pb_Device_ACCELERATOR: + sought_type = CL_DEVICE_TYPE_ACCELERATOR; + break; + default: + fprintf(stderr, "pb_GetDevice: Invalid device type"); + exit(-1); + } + + /* Find the device */ + { + pb_Device** p = ds; + cl_device_type type; + while (*p) { + clGetDeviceInfo(((cl_device_id) ((*p)->clDevice)), CL_DEVICE_TYPE, + sizeof(cl_device_type), &type, NULL); + if (type == sought_type) break; + } + + return *p; + } +} + +pb_Device* +pb_GetDevice(pb_Platform* pb_platform, struct pb_DeviceParam *device) +{ + pb_Device** ds = (pb_Device **) pb_GetDevices(pb_platform); + + // The list of devices must be nonempty + if (ds == NULL || *ds == NULL) { + fprintf(stderr, "Error: No device is found in platform: name = %s, version = %s\n.", pb_platform->name, pb_platform->version); + exit(-1); + } + + pb_Device *selected_device = NULL; + + if (device != NULL) { + /* Use 'device' to select and return a device. + * If unable to select a device, fall + * back on the default selection mechanism. */ + switch(device->criterion) { + case pb_Device_INDEX: + selected_device = select_device_by_index(ds, device->index); + break; + case pb_Device_GPU: + case pb_Device_CPU: + case pb_Device_ACCELERATOR: + selected_device = select_device_by_type(ds, device->criterion); + break; + case pb_Device_NAME: + selected_device = pb_SelectDeviceByName(ds, device->name); + break; + default: + fprintf(stderr, "pb_GetDevice: Invalid argument"); + exit(-1); + } + } + + /* By default or if user-specified selection failed, + * select the first device */ + if (selected_device == NULL) + selected_device = *ds; + + /* Set the in_use flag */ + selected_device->in_use = 1; + + return selected_device; +} + +pb_Device* +pb_GetDeviceByEnvVars(pb_Platform* pb_platform) { + + /* Convert environment variables to a 'pb_DeviceParam' */ + struct pb_DeviceParam *param = NULL; + + char* device_num = getenv("PARBOIL_DEVICE_NUMBER"); + if (device_num && strcmp(device_num, "")) { + int id = atoi(device_num); + param = pb_DeviceParam_index(id); + } + else { + char* device_name = getenv("PARBOIL_DEVICE_NAME"); + if (device_name && strcmp(device_name, "")) { + param = pb_DeviceParam_name(strdup(device_name)); + } + else { + char* device_type = getenv("PARBOIL_DEVICE_TYPE"); + if (device_type && strcmp(device_type, "")) { + if (strcmp(device_type, "CPU") == 0) + param = pb_DeviceParam_cpu(); + else if (strcmp(device_type, "GPU") == 0) + param = pb_DeviceParam_gpu(); + else if (strcmp(device_type, "ACCELERATOR") == 0) + param = pb_DeviceParam_accelerator(); + } + } + } + + /* Get a device */ + pb_Device *d = pb_GetDevice(pb_platform, param); + pb_FreeDeviceParam(param); + + return d; +} + +pb_Platform* +pb_GetPlatformByEnvVars() { + char* name = getenv("PARBOIL_PLATFORM_NAME"); + char* version = getenv("PARBOIL_PLATFORM_VERSION"); + + /* Create a pb_PlatformParam object (or NULL) representing the data from the + * environment variables */ + struct pb_PlatformParam *platform; + + if (name) { + if (version) { + platform = pb_PlatformParam(strdup(name), strdup(version)); + } + else { + platform = pb_PlatformParam(strdup(name), NULL); + } + } + else { + platform = NULL; + } + + /* Convert to a platform */ + pb_Platform *p = pb_GetPlatform(platform); + pb_FreePlatformParam(platform); + + return p; +} + +/* Choose an OpenCL platform based on the given command-line parameters. + * If NULL, use the default OpenCL platform. */ +pb_Platform* +pb_GetPlatform(struct pb_PlatformParam *platform) { + if (platform != NULL) { + /* Try to use command-line parameters to choose platform */ + char *name = platform->name; + char *version = platform->version; + + if (!name) { + fprintf(stderr, "Internal error: NULL pointer"); + exit(-1); + } + + if (version) { + pb_Platform* p = pb_GetPlatformByNameAndVersion(name, version); + if (p) return p; + } + + pb_Platform* p = pb_GetPlatformByName(name); + if (p) return p; + } + + pb_Platform* p = pb_GetPlatformByName(NULL); + if (p == NULL) { + fprintf(stderr, "Error: No OpenCL platform in this system. Exiting."); + exit(-1); + } + return p; +} + +//extern void perf_init(); +//extern void mxpa_scheduler_init(); + +pb_Context* +pb_InitOpenCLContext(struct pb_Parameters* parameters) { +#if 0 + pb_Platform* ps = pb_GetPlatform(parameters->platform); + if (!ps) return NULL; + pb_Device* ds = pb_GetDevice(ps, parameters->device); + if (!ds) return NULL; + + /* HERE INITIALIZE TIMER */ + //perf_init(); + //mxpa_scheduler_init(); + + pb_Context* c = createContext(ps, ds); + pb_PrintPlatformInfo(c); + return c; +#endif + cl_int _err; + cl_platform_id platform_id; + cl_device_id device_id; + cl_context context; + clGetPlatformIDs(1, &platform_id, NULL); + clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_DEFAULT, 1, &device_id, NULL); + context = clCreateContext(NULL, 1, &device_id, NULL, NULL, &_err); + + pb_Context* c = (pb_Context*)malloc(sizeof(pb_Context)); + c->clContext = context; + c->clDeviceId = device_id; + c->clPlatformId = platform_id; + c->pb_platform = (pb_Platform*)malloc(sizeof(pb_Platform)); + c->pb_device = (pb_Device*)malloc(sizeof(pb_Device)); + c->pb_platform->devices = (pb_Device**)malloc(sizeof(pb_Device*) * 2); + c->pb_platform->devices[0] = c->pb_device; + c->pb_platform->devices[1] = NULL; + c->pb_platform->contexts = (pb_Context**)malloc(sizeof(pb_Context*) * 2); + c->pb_platform->contexts[0] = c; + c->pb_platform->contexts[1] = NULL; + c->pb_platform->in_use = 1; + c->pb_device->in_use = 1; + return c; +} + +void +pb_ReleaseOpenCLContext(pb_Context* c) { + pb_ReleasePlatforms(); +} + +void +pb_PrintPlatformInfo(pb_Context* c) { + /*pb_Platform** ps = pb_GetPlatforms(); + if (!ps) { + fprintf (stderr, "No platform found"); + return; + } + + printf ("********************************************************\n"); + printf ("DETECTED OPENCL PLATFORMS AND DEVICES:\n"); + printf ("--------------------------------------------------------\n"); + + while (*ps) { + printf ("PLATFORM = %s, %s", (*ps)->name, (*ps)->version); + if (c->pb_platform == *ps) printf (" (SELECTED)"); + printf ("\n"); + + pb_Device** ds = (pb_Device **) pb_GetDevices((*ps)); + if (ds == NULL) { + printf (" + (No devices)\n"); + } else { + while (*ds) { + printf (" + %d: %s", (*ds)->id, (*ds)->name); + if (c->pb_device == *ds) printf (" (SELECTED)"); + printf ("\n"); + ds++; + } + } + + ps++; + } + printf ("********************************************************\n");*/ +} + +#ifdef MEASURE_KERNEL_TIME + +#undef clEnqueueNDRangeKernel + +//extern void pin_trace_enable(char*); +//extern void pin_trace_disable(char*); + +cl_int +pb_clEnqueueNDRangeKernel(cl_command_queue q/* command_queue */, + cl_kernel k/* kernel */, + cl_uint d/* work_dim */, + const size_t * o/* global_work_offset */, + const size_t * gws/* global_work_size */, + const size_t * lws/* local_work_size */, + cl_uint n/* num_events_in_wait_list */, + const cl_event * w/* event_wait_list */, + cl_event * e/* event */) { + + char buf[128]; + struct timeval begin, end; + clGetKernelInfo(k, CL_KERNEL_FUNCTION_NAME, 128, buf, NULL); + +#if 0 + int i; + for (i = 0; i < d; i++) { + printf ("%s: %d: %d / %d\n", buf, i, gws[i], (lws == NULL ? 0 : lws[i])); + } +#endif + + clFinish(q); clFlush(q); + //pin_trace_enable(buf); + //gettimeofday(&begin, NULL); + cl_int result = clEnqueueNDRangeKernel(q, k, d, o, gws, lws, n, w, e); + clFinish(q); clFlush(q); + //gettimeofday(&end, NULL); + //pin_trace_disable(buf); + //float t = (float)(end.tv_sec - begin.tv_sec) + (end.tv_usec - begin.tv_usec) / 1000000.0f; + fflush(stdout); + fflush(stderr); + //printf ("PBTIMER: %s: %f\n", buf, t); + return result; +} + +#endif + +void +pb_sig_float(char* c, float* p, int sz) { + int i; + double s = 0.0; + for (i = 0; i < sz; i++) s += p[i] * (float)(i+1); + printf ("[Signature] %s = %lf\n", c, s); +} + +void +pb_sig_double(char* c, double* p, int sz) { + int i; + double s = 0.0; + for (i = 0; i < sz; i++) s += p[i]; + printf ("[Signature] %s = %lf\n", c, s); +} + +void +pb_sig_short(char* c, short* p, int sz) { + int i; + long long int s = 0; + for (i = 0; i < sz; i++) s += p[i]; + printf ("[Signature] %s = %lld\n", c, s); +} + +void +pb_sig_int(char* c, int* p, int sz) { + int i; + long long int s = 0; + for (i = 0; i < sz; i++) s += p[i]; + printf ("[Signature] %s = %lld\n", c, s); +} + +void +pb_sig_uchar(char* c, unsigned char* p, unsigned int sz) { + int i; + unsigned long long int s = 0; + for (i = 0; i < sz; i++) s += p[i]; + printf ("[Signature] %s = %lld\n", c, s); +} + +void pb_sig_clmem(char* s, cl_command_queue command_queue, cl_mem memobj, int ty) { + size_t sz; + if (clGetMemObjectInfo(memobj, CL_MEM_SIZE, sizeof(size_t), &sz, NULL) != CL_SUCCESS) { + printf ("Something wrong.\n"); + assert(0); + } else { + printf ("size = %d\n", sz); + } + char* hp; // = (char*) malloc(sz); + //posix_memalign((void**)&hp, 64, sz); + hp = (char*)malloc(sz); + + clEnqueueReadBuffer (command_queue, + memobj, + CL_TRUE, + 0, + sz, + hp, + 0, + NULL, + NULL); + + if (ty == T_FLOAT) pb_sig_float(s, (float*)hp, sz/sizeof(float)); + if (ty == T_DOUBLE) pb_sig_double(s, (double*)hp, sz/sizeof(double)); + if (ty == T_INT) pb_sig_int(s, (int*)hp, sz/sizeof(int)); + if (ty == T_SHORT) pb_sig_short(s, (short*)hp, sz/sizeof(short)); + if (ty == T_UCHAR) pb_sig_uchar(s, (unsigned char*)hp, sz/sizeof(char)); + + free(hp); +} +