project directories reorganization

This commit is contained in:
Blaise Tine
2020-04-14 06:35:20 -04:00
parent 1de06fd9c0
commit fc155e1223
1056 changed files with 8120 additions and 8120 deletions

View File

@@ -0,0 +1,101 @@
/*
* Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*
*/
// Math wrappers used by the kernel code below. The first branch maps them to
// the native_* built-ins; it is disabled, so the standard exp/log/sqrt
// built-ins are used.
#if 0
    #define EXP(a)  native_exp(a)
    #define LOG(a)  native_log(a)
    #define SQRT(a) native_sqrt(a)
#else
    #define EXP(a)  exp(a)
    #define LOG(a)  log(a)
    #define SQRT(a) sqrt(a)
#endif
///////////////////////////////////////////////////////////////////////////////
// Predefine functions to avoid bug in OpenCL compiler on Mac OSX 10.7 systems
//
// Both helpers are defined further down in this file; these prototypes only
// make them visible before their first use.
///////////////////////////////////////////////////////////////////////////////
// Approximation of the cumulative normal distribution at d.
float CND(float d);
// Prices one option: writes the call price to *call and the put price to *put.
void BlackScholesBody(__global float *call, __global float *put, float S,
float X, float T, float R, float V);
///////////////////////////////////////////////////////////////////////////////
// Rational approximation of the cumulative normal distribution function.
//
//   d : point at which the distribution is evaluated
//
// The tail term is computed from |d|; for d > 0 the result is 1 - tail,
// otherwise the tail term itself is returned.
///////////////////////////////////////////////////////////////////////////////
float CND(float d){
    const float A1       =  0.31938153f;
    const float A2       = -0.356563782f;
    const float A3       =  1.781477937f;
    const float A4       = -1.821255978f;
    const float A5       =  1.330274429f;
    const float RSQRT2PI =  0.39894228040143267793994605993438f;

    // Polynomial in k, evaluated in nested (Horner) form.
    const float k    = 1.0f / (1.0f + 0.2316419f * fabs(d));
    const float poly = k * (A1 + k * (A2 + k * (A3 + k * (A4 + k * A5))));

    const float tail = RSQRT2PI * EXP(- 0.5f * d * d) * poly;

    return (d > 0) ? 1.0f - tail : tail;
}
///////////////////////////////////////////////////////////////////////////////
// Black-Scholes formula for one option, producing call and put together.
//
//   call, put : output locations for the two prices
//   S         : current stock price
//   X         : option strike price
//   T         : option years
//   R         : riskless rate of return
//   V         : stock volatility
///////////////////////////////////////////////////////////////////////////////
void BlackScholesBody(
    __global float *call, //Call option price
    __global float *put,  //Put option price
    float S,              //Current stock price
    float X,              //Option strike price
    float T,              //Option years
    float R,              //Riskless rate of return
    float V               //Stock volatility
){
    const float rootT    = SQRT(T);
    const float volRootT = V * rootT;

    const float d1 = (LOG(S / X) + (R + 0.5f * V * V) * T) / volRootT;
    const float d2 = d1 - volRootT;

    const float cndD1 = CND(d1);
    const float cndD2 = CND(d2);

    // Strike multiplied by exp(-R*T); shared by the call and the put.
    const float discountedStrike = X * EXP(- R * T);

    *call = (S * cndD1 - discountedStrike * cndD2);
    *put  = (discountedStrike * (1.0f - cndD2) - S * (1.0f - cndD1));
}
// Kernel entry point: prices optN options.
//
// Each work-item starts at its global id in dimension 0 and advances by the
// global size of dimension 0 until it runs past optN, so any global size
// covers the whole array.
//
//   d_Call, d_Put  : output arrays (call / put price per option)
//   d_S, d_X, d_T  : input arrays (stock price, strike price, years)
//   R, V           : riskless rate and volatility, shared by all options
//   optN           : number of options to process
__kernel void BlackScholes(
    __global float *d_Call, //Call option price
    __global float *d_Put,  //Put option price
    __global float *d_S,    //Current stock price
    __global float *d_X,    //Option strike price
    __global float *d_T,    //Option years
    float R,                //Riskless rate of return
    float V,                //Stock volatility
    unsigned int optN
){
    const unsigned int stride = get_global_size(0);
    unsigned int opt = get_global_id(0);

    while(opt < optN){
        BlackScholesBody(&d_Call[opt], &d_Put[opt], d_S[opt], d_X[opt], d_T[opt], R, V);
        opt += stride;
    }
}

View File

@@ -0,0 +1,66 @@
# Tool and library locations. Each is assigned with ?= so it can be supplied
# from the environment or the command line; $(wildcard ...) makes a path
# expand to empty when the directory does not exist.
RISCV_TOOL_PATH ?= $(wildcard ../../../../riscv-gnu-toolchain/drops)
POCL_CC_PATH ?= $(wildcard ../../../../pocl/drops_riscv_cc)
POCL_INC_PATH ?= $(wildcard ../include)
POCL_LIB_PATH ?= $(wildcard ../lib)
VX_RT_PATH ?= $(wildcard ../../../runtime)
VX_SIMX_PATH ?= $(wildcard ../../../simX/obj_dir)
# riscv32-unknown-elf cross tools taken from RISCV_TOOL_PATH.
CC = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-gcc
CXX = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-g++
DMP = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-objdump
HEX = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-objcopy
GDB = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-gdb
# Runtime sources under VX_RT_PATH that are compiled into the ELF image.
VX_SRCS = $(VX_RT_PATH)/newlib/newlib.c
VX_SRCS += $(VX_RT_PATH)/startup/vx_start.S
VX_SRCS += $(VX_RT_PATH)/intrinsics/vx_intrinsics.s
VX_SRCS += $(VX_RT_PATH)/io/vx_io.s $(VX_RT_PATH)/io/vx_io.c
VX_SRCS += $(VX_RT_PATH)/fileio/fileio.s
VX_SRCS += $(VX_RT_PATH)/tests/tests.c
VX_SRCS += $(VX_RT_PATH)/vx_api/vx_api.c
# NOTE(review): none of the variables on the next line are assigned in this
# Makefile as shown, so they expand to empty unless set externally.
VX_SRCS += $(VX_STR) $(VX_FIO) $(VX_NEWLIB) $(VX_INT) $(VX_IO) $(VX_API) $(VX_TEST)
# Link options for the ELF image: no default startup files, static linking,
# and the linker script from the runtime tree.
VX_CFLAGS = -nostartfiles -Wl,-Bstatic,-T,$(VX_RT_PATH)/mains/vortex_link.ld
CXXFLAGS = -g -O0 -march=rv32im -mabi=ilp32
CXXFLAGS += -ffreestanding # program may not begin at main()
CXXFLAGS += -Wl,--gc-sections # enable garbage collection of unused input sections
CXXFLAGS += -fno-rtti -fno-non-call-exceptions # disable RTTI and exceptions
CXXFLAGS += -I$(POCL_INC_PATH) -I.
# Library lists for the two link variants. Both reference $(PROJECT), which is
# only assigned below; this works because '=' defers expansion until use.
VX_LIBS = -Wl,--whole-archive lib$(PROJECT).a -Wl,--no-whole-archive $(POCL_LIB_PATH)/libOpenCL.a
QEMU_LIBS = $(VX_RT_PATH)/qemu/vx_api.c -Wl,--whole-archive lib$(PROJECT).a -Wl,--no-whole-archive $(POCL_LIB_PATH)/libOpenCL.a
# Base name of every generated artifact (lib$(PROJECT).a, $(PROJECT).elf, ...).
PROJECT=BlackScholes
# Goals that name actions rather than files; declared phony so a stray file
# called e.g. "clean" or "run" cannot mask them.
.PHONY: all run qemu gdb-s gdb-c clean

# Default goal: disassembly listing and Intel-hex image of the ELF.
all: $(PROJECT).dump $(PROJECT).hex

# Compile the OpenCL kernel into a static library with poclcc. The input is
# the rule's prerequisite ($<), so the file that triggers a rebuild is the
# file that gets compiled.
lib$(PROJECT).a: BlackScholes.cl
	POCL_DEBUG=all POCL_DEBUG_LLVM_PASSES=1 LD_LIBRARY_PATH=$(RISCV_TOOL_PATH)/lib:$(POCL_CC_PATH)/lib $(POCL_CC_PATH)/bin/poclcc -o $@ $<

# ELF image: host sources plus the runtime sources and the kernel library.
# Every project file named on the command line is also a prerequisite, so
# editing any of them rebuilds the image.
$(PROJECT).elf: main.cc oclBlackScholes_common.h oclBlackScholes_launcher.cpp oclBlackScholes_gold.cpp lib$(PROJECT).a
	$(CXX) $(CXXFLAGS) $(VX_CFLAGS) $(VX_SRCS) main.cc oclBlackScholes_common.h oclBlackScholes_launcher.cpp oclBlackScholes_gold.cpp $(VX_LIBS) -o $@

# Variant linked against the QEMU flavour of vx_api.c (see QEMU_LIBS).
$(PROJECT).qemu: main.cc lib$(PROJECT).a
	$(CXX) $(CXXFLAGS) main.cc $(QEMU_LIBS) -o $@

# Intel-hex image of the ELF.
$(PROJECT).hex: $(PROJECT).elf
	$(HEX) -O ihex $< $@

# Full disassembly of the ELF.
$(PROJECT).dump: $(PROJECT).elf
	$(DMP) -D $< > $@

# Run the hex image on the simulator; stdout is captured in emulator.debug.
run: $(PROJECT).hex
	POCL_DEBUG=all $(VX_SIMX_PATH)/Vcache_simX -E -a rv32i --core $< -s -b 1> emulator.debug

# Run the QEMU build, logging translated input assembly to debug.log.
qemu: $(PROJECT).qemu
	POCL_DEBUG=all $(RISCV_TOOL_PATH)/bin/qemu-riscv32 -d in_asm -D debug.log $<

# Same as 'qemu', but passes -g 1234 so a debugger can attach on that port.
gdb-s: $(PROJECT).qemu
	POCL_DEBUG=all $(RISCV_TOOL_PATH)/bin/qemu-riscv32 -g 1234 -d in_asm -D debug.log $<

# Start the cross gdb on the QEMU build.
gdb-c: $(PROJECT).qemu
	$(GDB) $<

# Remove the ELF, disassembly and hex outputs.
clean:
	rm -rf *.elf *.dump *.hex

View File

@@ -0,0 +1,248 @@
/*
* Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*
*/
// standard utilities and systems includes
#include <oclUtils.h>
#include <shrQATest.h>
#include "oclBlackScholes_common.h"
////////////////////////////////////////////////////////////////////////////////
// Helper functions
////////////////////////////////////////////////////////////////////////////////
// Returns the time, in seconds, between the profiling START and END
// timestamps of an OpenCL event.
//
//   event : event to query
//
// Returns 0.0 when either profiling query fails, so the result is never
// computed from uninitialized timestamps.
double executionTime(cl_event &event){
    cl_ulong start = 0, end = 0;

    cl_int status = clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &end, NULL);
    status |= clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &start, NULL);
    if(status != CL_SUCCESS)
        return 0.0;

    return (double)1.0e-9 * (end - start); // convert nanoseconds to seconds on return
}
////////////////////////////////////////////////////////////////////////////////
// Random float helper
//
// Draws one value from rand() and uses it to interpolate linearly between
// low and high.
////////////////////////////////////////////////////////////////////////////////
float randFloat(float low, float high){
    const float weight = (float)rand() / (float)RAND_MAX;
    const float lowPart = (1.0f - weight) * low;
    const float highPart = weight * high;
    return lowPart + highPart;
}
////////////////////////////////////////////////////////////////////////////////
// Main program
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv)
{
cl_platform_id cpPlatform; //OpenCL platform
cl_device_id* cdDevices = NULL; //OpenCL devices list (array)
cl_context cxGPUContext; //OpenCL context
cl_command_queue cqCommandQueue; //OpenCL command que
cl_mem //OpenCL memory buffer objects
d_Call,
d_Put,
d_S,
d_X,
d_T;
cl_int ciErrNum;
float
*h_CallCPU,
*h_PutCPU,
*h_CallGPU,
*h_PutGPU,
*h_S,
*h_X,
*h_T;
const unsigned int optionCount = 4000000;
const float R = 0.02f;
const float V = 0.30f;
shrQAStart(argc, argv);
// Get the NVIDIA platform
ciErrNum = oclGetPlatformID(&cpPlatform);
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL);
shrLog("clGetPlatformID...\n");
//Get all the devices
cl_uint uiNumDevices = 0; // Number of devices available
cl_uint uiTargetDevice = 0; // Default Device to compute on
cl_uint uiNumComputeUnits; // Number of compute units (SM's on NV GPU)
shrLog("Get the Device info and select Device...\n");
ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_DEFAULT, 0, NULL, &uiNumDevices);
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL);
cdDevices = (cl_device_id *)malloc(uiNumDevices * sizeof(cl_device_id) );
ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_DEFAULT, uiNumDevices, cdDevices, NULL);
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL);
// Get command line device options and config accordingly
shrLog(" # of Devices Available = %u\n", uiNumDevices);
if(shrGetCmdLineArgumentu(argc, (const char**)argv, "device", &uiTargetDevice)== shrTRUE)
{
uiTargetDevice = CLAMP(uiTargetDevice, 0, (uiNumDevices - 1));
}
shrLog(" Using Device %u: ", uiTargetDevice);
oclPrintDevName(LOGBOTH, cdDevices[uiTargetDevice]);
ciErrNum = clGetDeviceInfo(cdDevices[uiTargetDevice], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(uiNumComputeUnits), &uiNumComputeUnits, NULL);
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL);
shrLog("\n # of Compute Units = %u\n", uiNumComputeUnits);
// set logfile name and start logs
shrSetLogFileName ("oclBlackScholes.txt");
shrLog("%s Starting...\n\n", argv[0]);
shrLog("Allocating and initializing host memory...\n");
h_CallCPU = (float *)malloc(optionCount * sizeof(float));
h_PutCPU = (float *)malloc(optionCount * sizeof(float));
h_CallGPU = (float *)malloc(optionCount * sizeof(float));
h_PutGPU = (float *)malloc(optionCount * sizeof(float));
h_S = (float *)malloc(optionCount * sizeof(float));
h_X = (float *)malloc(optionCount * sizeof(float));
h_T = (float *)malloc(optionCount * sizeof(float));
srand(2009);
for(unsigned int i = 0; i < optionCount; i++){
h_CallCPU[i] = -1.0f;
h_PutCPU[i] = -1.0f;
h_S[i] = randFloat(5.0f, 30.0f);
h_X[i] = randFloat(1.0f, 100.0f);
h_T[i] = randFloat(0.25f, 10.0f);
}
shrLog("Initializing OpenCL...\n");
// Get the NVIDIA platform
ciErrNum = oclGetPlatformID(&cpPlatform);
//oclCheckError(ciErrNum, CL_SUCCESS);
// Get a GPU device
ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_DEFAULT, 1, &cdDevices[uiTargetDevice], NULL);
//oclCheckError(ciErrNum, CL_SUCCESS);
// Create the context
cxGPUContext = clCreateContext(0, 1, &cdDevices[uiTargetDevice], NULL, NULL, &ciErrNum);
//oclCheckError(ciErrNum, CL_SUCCESS);
//Create a command-queue
cqCommandQueue = clCreateCommandQueue(cxGPUContext, cdDevices[uiTargetDevice], CL_QUEUE_PROFILING_ENABLE, &ciErrNum);
//oclCheckError(ciErrNum, CL_SUCCESS);
shrLog("Creating OpenCL memory objects...\n");
d_Call = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE, optionCount * sizeof(float), NULL, &ciErrNum);
//oclCheckError(ciErrNum, CL_SUCCESS);
d_Put = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE, optionCount * sizeof(float), NULL, &ciErrNum);
//oclCheckError(ciErrNum, CL_SUCCESS);
d_S = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, optionCount * sizeof(float), h_S, &ciErrNum);
//oclCheckError(ciErrNum, CL_SUCCESS);
d_X = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, optionCount * sizeof(float), h_X, &ciErrNum);
//oclCheckError(ciErrNum, CL_SUCCESS);
d_T = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, optionCount * sizeof(float), h_T, &ciErrNum);
//oclCheckError(ciErrNum, CL_SUCCESS);
shrLog("Starting up BlackScholes...\n");
initBlackScholes(cxGPUContext, cqCommandQueue, (const char **)argv);
shrLog("Running OpenCL BlackScholes...\n\n");
//Just a single run or a warmup iteration
BlackScholes(
NULL,
d_Call,
d_Put,
d_S,
d_X,
d_T,
R,
V,
optionCount
);
#ifdef GPU_PROFILING
const int numIterations = 16;
cl_event startMark, endMark;
ciErrNum = clEnqueueMarker(cqCommandQueue, &startMark);
ciErrNum |= clFinish(cqCommandQueue);
shrCheckError(ciErrNum, CL_SUCCESS);
shrDeltaT(0);
for(int i = 0; i < numIterations; i++){
BlackScholes(
cqCommandQueue,
d_Call,
d_Put,
d_S,
d_X,
d_T,
R,
V,
optionCount
);
}
ciErrNum = clEnqueueMarker(cqCommandQueue, &endMark);
ciErrNum |= clFinish(cqCommandQueue);
shrCheckError(ciErrNum, CL_SUCCESS);
//Calculate performance metrics by wallclock time
double gpuTime = shrDeltaT(0) / numIterations;
shrLogEx(LOGBOTH | MASTER, 0, "oclBlackScholes, Throughput = %.4f GOptions/s, Time = %.5f s, Size = %u options, NumDevsUsed = %i, Workgroup = %u\n",
(double)(2.0 * optionCount * 1.0e-9)/gpuTime, gpuTime, (2 * optionCount), 1, 0);
//Get profiling info
cl_ulong startTime = 0, endTime = 0;
ciErrNum = clGetEventProfilingInfo(startMark, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &startTime, NULL);
ciErrNum |= clGetEventProfilingInfo(endMark, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &endTime, NULL);
shrCheckError(ciErrNum, CL_SUCCESS);
shrLog("\nOpenCL time: %.5f s\n\n", 1.0e-9 * ((double)endTime - (double)startTime) / (double)numIterations);
#endif
shrLog("\nReading back OpenCL BlackScholes results...\n");
ciErrNum = clEnqueueReadBuffer(cqCommandQueue, d_Call, CL_TRUE, 0, optionCount * sizeof(float), h_CallGPU, 0, NULL, NULL);
//oclCheckError(ciErrNum, CL_SUCCESS);
ciErrNum = clEnqueueReadBuffer(cqCommandQueue, d_Put, CL_TRUE, 0, optionCount * sizeof(float), h_PutGPU, 0, NULL, NULL);
//oclCheckError(ciErrNum, CL_SUCCESS);
shrLog("Comparing against Host/C++ computation...\n");
BlackScholesCPU(h_CallCPU, h_PutCPU, h_S, h_X, h_T, R, V, optionCount);
double deltaCall = 0, deltaPut = 0, sumCall = 0, sumPut = 0;
double L1call, L1put;
for(unsigned int i = 0; i < optionCount; i++)
{
sumCall += fabs(h_CallCPU[i]);
sumPut += fabs(h_PutCPU[i]);
deltaCall += fabs(h_CallCPU[i] - h_CallGPU[i]);
deltaPut += fabs(h_PutCPU[i] - h_PutGPU[i]);
}
L1call = deltaCall / sumCall;
L1put = deltaPut / sumPut;
shrLog("Relative L1 (call, put) = (%.3e, %.3e)\n\n", L1call, L1put);
shrLog("Shutting down...\n");
closeBlackScholes();
ciErrNum = clReleaseMemObject(d_T);
ciErrNum |= clReleaseMemObject(d_X);
ciErrNum |= clReleaseMemObject(d_S);
ciErrNum |= clReleaseMemObject(d_Put);
ciErrNum |= clReleaseMemObject(d_Call);
ciErrNum |= clReleaseCommandQueue(cqCommandQueue);
ciErrNum |= clReleaseContext(cxGPUContext);
//oclCheckError(ciErrNum, CL_SUCCESS);
free(h_T);
free(h_X);
free(h_S);
free(h_PutGPU);
free(h_CallGPU);
free(h_PutCPU);
free(h_CallCPU);
if(cdDevices)free(cdDevices);
shrQAFinishExit(argc, (const char **)argv, ((L1call < 1E-6) && (L1put < 1E-6)) ? QA_PASSED : QA_FAILED );
}

Binary file not shown.

View File

@@ -0,0 +1,50 @@
/*
* Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*
*/
#include <oclUtils.h>
////////////////////////////////////////////////////////////////////////////////
// Process an array of optN options on CPU
//
// Host-side counterpart of the OpenCL kernel: for each of the optionCount
// entries it reads h_S/h_X/h_T and writes h_Call/h_Put.
////////////////////////////////////////////////////////////////////////////////
extern "C" void BlackScholesCPU(
float *h_Call, //Call option price
float *h_Put, //Put option price
float *h_S, //Current stock price
float *h_X, //Option strike price
float *h_T, //Option years
float R, //Riskless rate of return
float V, //Stock volatility
unsigned int optionCount
);
////////////////////////////////////////////////////////////////////////////////
// OpenCL Black-Scholes kernel launcher
////////////////////////////////////////////////////////////////////////////////
// Creates the BlackScholes program and kernel for the given context and
// remembers cqParamCommandQueue as the default queue for later launches.
// argv[0] is used to locate BlackScholes.cl.
extern "C" void initBlackScholes(cl_context cxGPUContext, cl_command_queue cqParamCommandQueue, const char **argv);

// Releases the kernel and program created by initBlackScholes.
extern "C" void closeBlackScholes(void);

// Sets the kernel arguments and enqueues one BlackScholes launch.
// A NULL cqCommandQueue selects the queue passed to initBlackScholes.
extern "C" void BlackScholes(
    cl_command_queue cqCommandQueue,
    cl_mem d_Call,      //Call option price
    cl_mem d_Put,       //Put option price
    cl_mem d_S,         //Current stock price
    cl_mem d_X,         //Option strike price
    cl_mem d_T,         //Option years
    cl_float R,         //Riskless rate of return
    cl_float V,         //Stock volatility
    cl_uint optionCount
);

View File

@@ -0,0 +1,92 @@
/*
* Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*
*/
#include <math.h>
#include "oclBlackScholes_common.h"
///////////////////////////////////////////////////////////////////////////////
// Rational approximation of the cumulative normal distribution function,
// in double precision.
//
//   d : point at which the distribution is evaluated
//
// The tail term is computed from |d|; for d > 0 the result is 1 - tail,
// otherwise the tail term itself is returned.
///////////////////////////////////////////////////////////////////////////////
static double CND(double d){
    const double A1       =  0.31938153;
    const double A2       = -0.356563782;
    const double A3       =  1.781477937;
    const double A4       = -1.821255978;
    const double A5       =  1.330274429;
    const double RSQRT2PI =  0.39894228040143267793994605993438;

    // Polynomial in k, evaluated in nested (Horner) form.
    const double k    = 1.0 / (1.0 + 0.2316419 * fabs(d));
    const double poly = k * (A1 + k * (A2 + k * (A3 + k * (A4 + k * A5))));

    const double tail = RSQRT2PI * exp(- 0.5 * d * d) * poly;

    return (d > 0) ? 1.0 - tail : tail;
}
///////////////////////////////////////////////////////////////////////////////
// Black-Scholes formula for both call and put
///////////////////////////////////////////////////////////////////////////////
static void BlackScholesBodyCPU(
float& call, //Call option price
float& put, //Put option price
float Sf, //Current stock price
float Xf, //Option strike price
float Tf, //Option years
float Rf, //Riskless rate of return
float Vf //Stock volatility
){
double S = Sf, X = Xf, T = Tf, R = Rf, V = Vf;
double sqrtT = sqrt(T);
double d1 = (log(S / X) + (R + 0.5 * V * V) * T) / (V * sqrtT);
double d2 = d1 - V * sqrtT;
double CNDD1 = CND(d1);
double CNDD2 = CND(d2);
//Calculate Call and Put simultaneously
double expRT = exp(- R * T);
call = (float)(S * CNDD1 - X * expRT * CNDD2);
put = (float)(X * expRT * (1.0 - CNDD2) - S * (1.0 - CNDD1));
}
////////////////////////////////////////////////////////////////////////////////
// Process an array of optionCount options on the host.
//
// Entry i reads h_S[i], h_X[i] and h_T[i] and writes h_Call[i] and h_Put[i];
// R and V are shared by all entries.
////////////////////////////////////////////////////////////////////////////////
extern "C" void BlackScholesCPU(
    float *h_Call, //Call option price
    float *h_Put,  //Put option price
    float *h_S,    //Current stock price
    float *h_X,    //Option strike price
    float *h_T,    //Option years
    float R,       //Riskless rate of return
    float V,       //Stock volatility
    unsigned int optionCount
){
    unsigned int opt = 0;
    while(opt < optionCount){
        BlackScholesBodyCPU(h_Call[opt], h_Put[opt], h_S[opt], h_X[opt], h_T[opt], R, V);
        ++opt;
    }
}

View File

@@ -0,0 +1,125 @@
/*
* Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*
*/
#include <oclUtils.h>
#include "oclBlackScholes_common.h"
// File-scope launcher state: the program and kernel are assigned in
// initBlackScholes and released in closeBlackScholes.
static cl_program cpBlackScholes; //OpenCL program
static cl_kernel ckBlackScholes; //OpenCL kernel
// Queue passed to initBlackScholes; BlackScholes() falls back to it when
// called with a NULL queue.
static cl_command_queue cqDefaultCommandQueue;
// Creates the BlackScholes program and kernel and stores the default queue.
//
//   cxGPUContext        : context the program is created in
//   cqParamCommandQueue : queue used by BlackScholes() when it is given NULL
//   argv                : argv[0] is used to locate BlackScholes.cl
//
// BlackScholes.cl is still located and loaded, as before, but the program is
// created from the device's built-in "BlackScholes" kernel rather than from
// that source text. Any failure is reported through shrCheckError; a build
// failure logs the build log and exits with status 666.
extern "C" void initBlackScholes(cl_context cxGPUContext, cl_command_queue cqParamCommandQueue, const char **argv){
    cl_int ciErrNum = CL_SUCCESS;
    size_t kernelLength;

    shrLog("...loading BlackScholes.cl\n");
    char *cPathAndName = shrFindFilePath("BlackScholes.cl", argv[0]);
    shrCheckError(cPathAndName != NULL, shrTRUE);
    char *cBlackScholes = oclLoadProgSource(cPathAndName, "// My comment\n", &kernelLength);
    shrCheckError(cBlackScholes != NULL, shrTRUE);

    shrLog("...creating BlackScholes program\n");
    // Use the caller's context and its first device, and request the error
    // code so the check below tests a value that was actually written.
    cl_device_id cdDevice = oclGetFirstDev(cxGPUContext);
    cpBlackScholes = clCreateProgramWithBuiltInKernels(cxGPUContext, 1, &cdDevice, "BlackScholes", &ciErrNum);
    shrCheckError(ciErrNum, CL_SUCCESS);

    shrLog("...building BlackScholes program\n");
    ciErrNum = clBuildProgram(cpBlackScholes, 0, NULL, "-cl-fast-relaxed-math -Werror", NULL, NULL);
    // clBuildProgram returns an error code, so it is compared with CL_SUCCESS
    // (CL_BUILD_SUCCESS is a build-status value, not a return code).
    if(ciErrNum != CL_SUCCESS){
        shrLog("*** Compilation failure ***\n");

        // Fetch the build log: first its size, then its text.
        size_t logSize;
        char *logTxt;
        ciErrNum = clGetProgramBuildInfo(cpBlackScholes, cdDevice, CL_PROGRAM_BUILD_LOG, 0, NULL, &logSize);
        shrCheckError(ciErrNum, CL_SUCCESS);
        logTxt = (char *)malloc(logSize);
        shrCheckError(logTxt != NULL, shrTRUE);
        ciErrNum = clGetProgramBuildInfo(cpBlackScholes, cdDevice, CL_PROGRAM_BUILD_LOG, logSize, logTxt, NULL);
        shrCheckError(ciErrNum, CL_SUCCESS);
        shrLog("%s\n", logTxt);

        shrLog("*** Exiting ***\n");
        free(logTxt);
        exit(666);
    }

    //Save ptx code to separate file
    oclLogPtx(cpBlackScholes, cdDevice, "BlackScholes.ptx");

    shrLog("...creating BlackScholes kernels\n");
    ckBlackScholes = clCreateKernel(cpBlackScholes, "BlackScholes", &ciErrNum);
    shrCheckError(ciErrNum, CL_SUCCESS);

    cqDefaultCommandQueue = cqParamCommandQueue;
    free(cBlackScholes);
    free(cPathAndName);
}
extern "C" void closeBlackScholes(void){
cl_int ciErrNum;
ciErrNum = clReleaseKernel(ckBlackScholes);
ciErrNum |= clReleaseProgram(cpBlackScholes);
shrCheckError(ciErrNum, CL_SUCCESS);
}
////////////////////////////////////////////////////////////////////////////////
// OpenCL Black-Scholes kernel launcher
//
// Binds the eight kernel arguments in order and enqueues a one-dimensional
// launch with a global size of 60 * 1024 and a local size of 128. When
// cqCommandQueue is NULL the queue stored by initBlackScholes is used.
// Argument and enqueue failures are reported through shrCheckError.
////////////////////////////////////////////////////////////////////////////////
extern "C" void BlackScholes(
    cl_command_queue cqCommandQueue,
    cl_mem d_Call,      //Call option price
    cl_mem d_Put,       //Put option price
    cl_mem d_S,         //Current stock price
    cl_mem d_X,         //Option strike price
    cl_mem d_T,         //Option years
    cl_float R,         //Riskless rate of return
    cl_float V,         //Stock volatility
    cl_uint optionCount
){
    // One entry per kernel argument, in argument-index order.
    struct KernelArg { size_t size; const void *value; };
    const KernelArg kernelArgs[] = {
        { sizeof(cl_mem),   (void *)&d_Call },
        { sizeof(cl_mem),   (void *)&d_Put },
        { sizeof(cl_mem),   (void *)&d_S },
        { sizeof(cl_mem),   (void *)&d_X },
        { sizeof(cl_mem),   (void *)&d_T },
        { sizeof(cl_float), (void *)&R },
        { sizeof(cl_float), (void *)&V },
        { sizeof(cl_uint),  (void *)&optionCount }
    };
    const cl_uint argCount = (cl_uint)(sizeof(kernelArgs) / sizeof(kernelArgs[0]));

    if(cqCommandQueue == NULL)
        cqCommandQueue = cqDefaultCommandQueue;

    cl_int ciErrNum = CL_SUCCESS;
    for(cl_uint argIndex = 0; argIndex < argCount; ++argIndex)
        ciErrNum |= clSetKernelArg(ckBlackScholes, argIndex, kernelArgs[argIndex].size, kernelArgs[argIndex].value);
    shrCheckError(ciErrNum, CL_SUCCESS);

    //Run the kernel
    const size_t globalWorkSize = 60 * 1024;
    const size_t localWorkSize = 128;
    ciErrNum = clEnqueueNDRangeKernel(cqCommandQueue, ckBlackScholes, 1, NULL, &globalWorkSize, &localWorkSize, 0, NULL, NULL);
    shrCheckError(ciErrNum, CL_SUCCESS);
}

View File

@@ -0,0 +1,198 @@
/*
* Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*
*/
#ifndef OCL_UTILS_H
#define OCL_UTILS_H
// *********************************************************************
// Utilities specific to OpenCL samples in NVIDIA GPU Computing SDK
// *********************************************************************
// Common headers: Cross-API utilities and OpenCL header
#include <shrUtils.h>
// All OpenCL headers
#if defined (__APPLE__) || defined(MACOSX)
#include <OpenCL/opencl.h>
#else
#include <CL/opencl.h>
#endif
// Includes
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
// Fallback definitions of the cl_nv_device_attribute_query constants, copied
// from <CL/cl_ext.h>, for systems whose CL_EXT headers do not provide them.
// The extension has no functions, so there is no extension #define to test;
// the first constant is used as the guard instead.
#if !defined(CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV)
    #define CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV 0x4000
    #define CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV 0x4001
    #define CL_DEVICE_REGISTERS_PER_BLOCK_NV      0x4002
    #define CL_DEVICE_WARP_SIZE_NV                0x4003
    #define CL_DEVICE_GPU_OVERLAP_NV              0x4004
    #define CL_DEVICE_KERNEL_EXEC_TIMEOUT_NV      0x4005
    #define CL_DEVICE_INTEGRATED_MEMORY_NV        0x4006
#endif

// Reminders for the build output window and log (Windows builds only).
#if defined(_WIN32)
    #pragma message ("Note: including shrUtils.h")
    #pragma message ("Note: including opencl.h")
#endif

// SDK revision number
#define OCL_SDKREVISION "7027912"

// Error and exit handling macros
// *********************************************************************
// Full form: compares the sample value (a) with the reference value (b) and
// passes the optional Cleanup() callback (c), together with the call site's
// file and line, to the companion inline function lower on the page.
#define oclCheckErrorEX(a, b, c) __oclCheckErrorEX(a, b, c, __FILE__ , __LINE__)

// Short form without a Cleanup() callback pointer; both the input (a) and
// the reference (b) are still given as arguments.
#define oclCheckError(a, b) oclCheckErrorEX(a, b, 0)
//////////////////////////////////////////////////////////////////////////////
//! Gets the platform ID for NVIDIA if available, otherwise default to platform 0
//!
//! @return OpenCL status code (NOTE(review): presumably CL_SUCCESS on success; confirm against the implementation)
//! @param clSelectedPlatformID receives the selected OpenCL platform ID
//////////////////////////////////////////////////////////////////////////////
extern "C" cl_int oclGetPlatformID(cl_platform_id* clSelectedPlatformID);
//////////////////////////////////////////////////////////////////////////////
//! Print info about the device
//!
//! @param iLogMode enum LOGBOTH, LOGCONSOLE, LOGFILE
//! @param device OpenCL id of the device
//////////////////////////////////////////////////////////////////////////////
extern "C" void oclPrintDevInfo(int iLogMode, cl_device_id device);
//////////////////////////////////////////////////////////////////////////////
//! Get and return device capability
//!
//! @return the 2 digit integer representation of device Cap (major minor). return -1 if NA
//! @param device OpenCL id of the device
//////////////////////////////////////////////////////////////////////////////
extern "C" int oclGetDevCap(cl_device_id device);
//////////////////////////////////////////////////////////////////////////////
//! Print the device name
//!
//! @param iLogMode enum LOGBOTH, LOGCONSOLE, LOGFILE
//! @param device OpenCL id of the device
//////////////////////////////////////////////////////////////////////////////
extern "C" void oclPrintDevName(int iLogMode, cl_device_id device);
//////////////////////////////////////////////////////////////////////////////
//! Gets the id of the first device from the context
//!
//! @return the id
//! @param cxGPUContext OpenCL context
//////////////////////////////////////////////////////////////////////////////
extern "C" cl_device_id oclGetFirstDev(cl_context cxGPUContext);
//////////////////////////////////////////////////////////////////////////////
//! Gets the id of the nth device from the context
//!
//! @return the id or -1 when out of range
//!         (NOTE(review): the return type is cl_device_id, so "-1" must be a
//!         converted sentinel; confirm the exact value in the implementation)
//! @param cxGPUContext OpenCL context
//! @param device_idx index of the device of interest
//////////////////////////////////////////////////////////////////////////////
extern "C" cl_device_id oclGetDev(cl_context cxGPUContext, unsigned int device_idx);
//////////////////////////////////////////////////////////////////////////////
//! Gets the id of device with maximal FLOPS from the context
//!
//! @return the id
//! @param cxGPUContext OpenCL context
//////////////////////////////////////////////////////////////////////////////
extern "C" cl_device_id oclGetMaxFlopsDev(cl_context cxGPUContext);
//////////////////////////////////////////////////////////////////////////////
//! Loads a Program file and prepends the cPreamble to the code.
//!
//! @return the source string if succeeded, 0 otherwise
//! @param cFilename program filename
//! @param cPreamble code that is prepended to the loaded file, typically a set of #defines or a header
//! @param szFinalLength returned length of the code string
//////////////////////////////////////////////////////////////////////////////
extern "C" char* oclLoadProgSource(const char* cFilename, const char* cPreamble, size_t* szFinalLength);
//////////////////////////////////////////////////////////////////////////////
//! Get the binary (PTX) of the program associated with the device
//!
//! @param cpProgram OpenCL program
//! @param cdDevice device of interest
//! @param binary returned code
//! @param length length of returned code
//////////////////////////////////////////////////////////////////////////////
extern "C" void oclGetProgBinary( cl_program cpProgram, cl_device_id cdDevice, char** binary, size_t* length);
//////////////////////////////////////////////////////////////////////////////
//! Get and log the binary (PTX) from the OpenCL compiler for the requested program & device
//!
//! @param cpProgram OpenCL program
//! @param cdDevice device of interest
//! @param cPtxFileName optional PTX file name
//////////////////////////////////////////////////////////////////////////////
extern "C" void oclLogPtx(cl_program cpProgram, cl_device_id cdDevice, const char* cPtxFileName);
//////////////////////////////////////////////////////////////////////////////
//! Get and log the Build Log from the OpenCL compiler for the requested program & device
//!
//! @param cpProgram OpenCL program
//! @param cdDevice device of interest
//////////////////////////////////////////////////////////////////////////////
extern "C" void oclLogBuildInfo(cl_program cpProgram, cl_device_id cdDevice);
// Helper function for de-allocating cl objects
// (takes an array of memory objects and the number of entries in it)
// *********************************************************************
extern "C" void oclDeleteMemObjs(cl_mem* cmMemObjs, int iNumObjs);
// Helper function to get OpenCL error string from constant
// *********************************************************************
extern "C" const char* oclErrorString(cl_int error);
// Helper function to get OpenCL image format string (channel order and type) from constant
// *********************************************************************
extern "C" const char* oclImageFormatString(cl_uint uiImageFormat);
// Companion inline function for the oclCheckErrorEX / oclCheckError macros.
// *********************************************************************
// Does nothing when iSample equals iReference. Otherwise it logs the error
// with its file and line, then either calls pCleanup with the error code or,
// when no callback was supplied, logs "Exiting..." and exits the process
// using the error code as the exit status.
//
//   iSample    : value under test
//   iReference : expected value
//   pCleanup   : optional cleanup callback (may be NULL)
//   cFile      : source file of the check
//   iLine      : source line of the check
inline void __oclCheckErrorEX(cl_int iSample, cl_int iReference, void (*pCleanup)(int), const char* cFile, const int iLine)
{
    // Matching values mean there is nothing to report.
    if (iSample == iReference)
    {
        return;
    }

    // A mismatch is an error by definition, so a sample value of 0 is
    // replaced with a non-zero code before it is reported.
    if (iSample == 0)
    {
        iSample = -9999;
    }

    // Log the error info
    shrLog("\n !!! Error # %i (%s) at line %i , in file %s !!!\n\n", iSample, oclErrorString(iSample), iLine, cFile);

    // Without a cleanup callback, close the log and exit with the error code.
    if (pCleanup == NULL)
    {
        shrLogEx(LOGBOTH | CLOSELOG, 0, "Exiting...\n");
        exit(iSample);
    }

    pCleanup(iSample);
}
#endif

View File

@@ -0,0 +1,238 @@
/*
* Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*
*/
#ifndef SHR_QATEST_H
#define SHR_QATEST_H
// *********************************************************************
// Generic utilities for NVIDIA GPU Computing SDK
// *********************************************************************
// OS dependent includes
#ifdef _WIN32
#pragma message ("Note: including windows.h")
#pragma message ("Note: including math.h")
#pragma message ("Note: including assert.h")
#pragma message ("Note: including time.h")
// Headers needed for Windows
#include <windows.h>
#include <time.h>
#else
// Headers needed for Linux
#include <sys/stat.h>
#include <sys/types.h>
#include <sys/time.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdarg.h>
#include <unistd.h>
#include <time.h>
#endif
// Case-insensitive string compare: _stricmp on Windows, strcasecmp elsewhere.
// Left untouched when STRCASECMP has already been defined.
#ifndef STRCASECMP
#ifdef _WIN32
#define STRCASECMP _stricmp
#else
#define STRCASECMP strcasecmp
#endif
#endif
// Length-limited case-insensitive string compare: _strnicmp on Windows, strncasecmp elsewhere.
// Left untouched when STRNCASECMP has already been defined.
#ifndef STRNCASECMP
#ifdef _WIN32
#define STRNCASECMP _strnicmp
#else
#define STRNCASECMP strncasecmp
#endif
#endif
// Standardized QA Start/Finish for CUDA SDK tests
// Thin aliases that forward every argument unchanged to the inline functions defined below
#define shrQAStart(a, b) __shrQAStart(a, b)
#define shrQAFinish(a, b, c) __shrQAFinish(a, b, c)
#define shrQAFinish2(a, b, c, d) __shrQAFinish2(a, b, c, d)
// Locates where the bare executable name begins inside a path string.
//! @param exec_name NUL-terminated path (both '/' and '\\' are treated as separators)
//! @return index of the first character after the last separator, or 0 when the
//!         string contains no separator
inline int findExeNameStart(const char *exec_name)
{
    // Scan backwards, starting on the terminating NUL, until a separator is hit
    // or the first character is reached
    int pos = (int)strlen(exec_name);
    for (; pos > 0; --pos)
    {
        const char ch = exec_name[pos];
        if (ch == '\\' || ch == '/')
        {
            break;
        }
    }

    // pos is now either on a separator or on index 0 (which may itself be a separator)
    const char landed = exec_name[pos];
    const bool onSeparator = (landed == '/') || (landed == '\\');
    return onSeparator ? pos + 1 : pos;
}
// Prints the standardized QA start-of-test banner.
//! @param argc argument count as passed to main()
//! @param argv argument vector as passed to main(); argv[0] supplies the executable name
//! @return offset into argv[0] at which the executable name (without its path) starts
//!
//! When any argument equals "qatest" (leading '-' characters skipped, compared with STRCASECMP)
//! the machine-readable "&&&& RUNNING <exe> <args...>" line is printed; otherwise a
//! "[<exe>] starting..." line is printed.
inline int __shrQAStart(int argc, char **argv)
{
    // First clear the output buffer (the flush is issued twice, exactly as before)
    fflush(stdout);
    fflush(stdout);

    // Look for the qatest switch among the arguments
    bool qaMode = false;
    for (int arg = 1; arg < argc; ++arg) {
        char *token = argv[arg];
        while (*token == '-') {
            ++token;
        }
        if (STRCASECMP(token, "qatest") == 0) {
            qaMode = true;
        }
    }

    // We don't want to print the entire path, so we search for the start of the
    // executable name inside argv[0]
    const int nameOffset = findExeNameStart(argv[0]);
    char *exeName = argv[0] + nameOffset;

    if (!qaMode) {
        fprintf(stdout, "[%s] starting...\n", exeName);
    } else {
        fprintf(stdout, "&&&& RUNNING %s", exeName);
        for (int arg = 1; arg < argc; ++arg) {
            fprintf(stdout, " %s", argv[arg]);
        }
        fprintf(stdout, "\n");
    }

    fflush(stdout);
    printf("\n");
    fflush(stdout);
    return nameOffset;
}
// QA result codes. The numeric values line up with the { "FAILED", "PASSED", "WAIVED" }
// string tables indexed by iStatus in __shrQAFinish / __shrQAFinish2, so they must not be renumbered.
enum eQAstatus {
QA_FAILED = 0, // the only value shrQAFinishExit / shrQAFinishExit2 turn into EXIT_FAILURE
QA_PASSED = 1,
QA_WAIVED = 2 // non-zero, so shrQAFinishExit / shrQAFinishExit2 exit with EXIT_SUCCESS
};
// Prints a visible countdown on stdout and returns once the given number of seconds has passed.
//! @param seconds length of the delay in seconds; with 0 or a negative value the loop body
//!                never runs and the function returns right after printing
inline void __ExitInTime(int seconds)
{
    fprintf(stdout, "> exiting in %d seconds: ", seconds);
    fflush(stdout);
    time_t t;
    int count;
    // The deadline is wall-clock based, so the loop ends on time even if a sleep returns early
    for (t=time(0)+seconds, count=seconds; time(0) < t; count--) {
        fprintf(stdout, "%d...", count);
        // No newline is written here, so flush explicitly or a buffered stdout would show
        // the whole countdown only after the wait is over
        fflush(stdout);
        // Use _WIN32 (compiler-defined), the same macro that selects <windows.h> / <unistd.h>
        // at the top of this header, so the call below always matches the header included
#ifdef _WIN32
        Sleep(1000);
#else
        sleep(1);
#endif
    }
    fprintf(stdout,"done!\n\n");
    fflush(stdout);
}
// Prints the standardized QA end-of-test banner and then, depending on the command line,
// waits before returning.
//! @param argc    argument count as passed to main()
//! @param argv    argument vector as passed to main(); argv[0] supplies the executable name
//! @param iStatus result index: 0 = FAILED, 1 = PASSED, 2 = WAIVED; any other value is reported as FAILED
//!
//! Recognized arguments (leading '-' characters are skipped, compared with STRCASECMP):
//!   qatest          print the "&&&& <STATUS> <exe> <args...>" line instead of the "[<exe>] test results..." form
//!   noprompt / help return without countdown and without prompt
//!   prompt          wait for <Enter> instead of running the countdown
//! When none of noprompt / help / prompt is given, a 3 second countdown runs before returning.
inline void __shrQAFinish(int argc, const char **argv, int iStatus)
{
    // By default QATest is disabled and NoPrompt is Enabled (times out at seconds passed into __ExitInTime() )
    bool bQATest = false, bNoPrompt = true, bQuitInTime = true;
    const char *sStatus[] = { "FAILED", "PASSED", "WAIVED", NULL };
    // Number of real entries; the trailing NULL is only a terminator and must never reach "%s"
    const int iNumStatus = (int)(sizeof(sStatus) / sizeof(sStatus[0])) - 1;
    // Guard the table lookup: an unrecognized status is reported as FAILED instead of
    // reading outside the table
    const char *sResult = (iStatus >= 0 && iStatus < iNumStatus) ? sStatus[iStatus] : sStatus[0];

    for (int i=1; i < argc; i++) {
        int string_start = 0;
        while (argv[i][string_start] == '-')
            string_start++;
        const char *string_argv = &argv[i][string_start];
        if (!STRCASECMP(string_argv, "qatest")) {
            bQATest = true;
        }
        // For SDK individual samples that don't specify -noprompt or -prompt,
        // a 3 second delay will happen before exiting, giving a user time to view results
        if (!STRCASECMP(string_argv, "noprompt") || !STRCASECMP(string_argv, "help")) {
            bNoPrompt = true;
            bQuitInTime = false;
        }
        if (!STRCASECMP(string_argv, "prompt")) {
            bNoPrompt = false;
            bQuitInTime = false;
        }
    }

    // Only the executable name (without its path) is printed
    int exename_start = findExeNameStart(argv[0]);
    if (bQATest) {
        fprintf(stdout, "&&&& %s %s", sResult, &(argv[0][exename_start]));
        for (int i=1; i < argc; i++) fprintf(stdout, " %s", argv[i]);
        fprintf(stdout, "\n");
    } else {
        fprintf(stdout, "[%s] test results...\n%s\n", &(argv[0][exename_start]), sResult);
    }
    fflush(stdout);
    printf("\n"); fflush(stdout);

    if (bQuitInTime) {
        __ExitInTime(3);
    } else {
        if (!bNoPrompt) {
            fprintf(stdout, "\nPress <Enter> to exit...\n");
            fflush(stdout);
            getchar();
        }
    }
}
// Variant of __shrQAFinish in which the caller states explicitly whether QA output is wanted.
//! @param bQATest true to print the "&&&& <STATUS> <exe> <args...>" line, false for the
//!                "[<exe>] test results..." form
//! @param argc    argument count as passed to main()
//! @param argv    argument vector as passed to main(); argv[0] supplies the executable name
//! @param iStatus result index: 0 = FAILED, 1 = PASSED, 2 = WAIVED; any other value is reported as FAILED
//!
//! The arguments noprompt, help and prompt (leading '-' characters skipped, compared with
//! STRCASECMP) all suppress the 3 second countdown; this variant never waits for <Enter>.
inline void __shrQAFinish2(bool bQATest, int argc, const char **argv, int iStatus)
{
    bool bQuitInTime = true;
    const char *sStatus[] = { "FAILED", "PASSED", "WAIVED", NULL };
    // Number of real entries; the trailing NULL is only a terminator and must never reach "%s"
    const int iNumStatus = (int)(sizeof(sStatus) / sizeof(sStatus[0])) - 1;
    // Guard the table lookup: an unrecognized status is reported as FAILED instead of
    // reading outside the table
    const char *sResult = (iStatus >= 0 && iStatus < iNumStatus) ? sStatus[iStatus] : sStatus[0];

    for (int i=1; i < argc; i++) {
        int string_start = 0;
        while (argv[i][string_start] == '-')
            string_start++;
        const char *string_argv = &argv[i][string_start];
        // For SDK individual samples that don't specify -noprompt or -prompt,
        // a 3 second delay will happen before exiting, giving a user time to view results
        if (!STRCASECMP(string_argv, "noprompt") || !STRCASECMP(string_argv, "help")) {
            bQuitInTime = false;
        }
        if (!STRCASECMP(string_argv, "prompt")) {
            bQuitInTime = false;
        }
    }

    // Only the executable name (without its path) is printed
    int exename_start = findExeNameStart(argv[0]);
    if (bQATest) {
        fprintf(stdout, "&&&& %s %s", sResult, &(argv[0][exename_start]));
        for (int i=1; i < argc; i++) fprintf(stdout, " %s", argv[i]);
        fprintf(stdout, "\n");
    } else {
        fprintf(stdout, "[%s] test results...\n%s\n", &(argv[0][exename_start]), sResult);
    }
    fflush(stdout);

    if (bQuitInTime) {
        __ExitInTime(3);
    }
}
// Reports the QA result through __shrQAFinish and then terminates the process.
//! @param argc    argument count as passed to main()
//! @param argv    argument vector as passed to main()
//! @param iStatus QA status; 0 exits with EXIT_FAILURE, every non-zero value with EXIT_SUCCESS
inline void shrQAFinishExit(int argc, const char **argv, int iStatus)
{
    __shrQAFinish(argc, argv, iStatus);

    // Any non-zero status counts as a successful process exit
    if (iStatus) {
        exit(EXIT_SUCCESS);
    }
    exit(EXIT_FAILURE);
}
// Reports the QA result through __shrQAFinish2 and then terminates the process.
//! @param bQAtest forwarded to __shrQAFinish2 to select the QA output format
//! @param argc    argument count as passed to main()
//! @param argv    argument vector as passed to main()
//! @param iStatus QA status; 0 exits with EXIT_FAILURE, every non-zero value with EXIT_SUCCESS
inline void shrQAFinishExit2(bool bQAtest, int argc, const char **argv, int iStatus)
{
    __shrQAFinish2(bQAtest, argc, argv, iStatus);

    // Any non-zero status counts as a successful process exit
    if (iStatus) {
        exit(EXIT_SUCCESS);
    }
    exit(EXIT_FAILURE);
}
#endif

View File

@@ -0,0 +1,642 @@
/*
* Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*
*/
#ifndef SHR_UTILS_H
#define SHR_UTILS_H
// *********************************************************************
// Generic utilities for NVIDIA GPU Computing SDK
// *********************************************************************
// reminders for output window and build log
#ifdef _WIN32
#pragma message ("Note: including windows.h")
#pragma message ("Note: including math.h")
#pragma message ("Note: including assert.h")
#endif
// OS dependent includes
#ifdef _WIN32
// Headers needed for Windows
#include <windows.h>
#else
// Headers needed for Linux
#include <sys/stat.h>
#include <sys/types.h>
#include <sys/time.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdarg.h>
#endif
// Other headers needed for both Windows and Linux
#include <math.h>
#include <assert.h>
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
// Un-comment the following #define to enable profiling code in SDK apps
//#define GPU_PROFILING
// Beginning of GPU Architecture definitions
// Maps an SM (compute capability) version to the number of cores per SM.
//! @param major SM major version
//! @param minor SM minor version
//! @return cores per SM for a known version; -1 (after printing a message) for an unknown one
inline int ConvertSMVer2Cores(int major, int minor)
{
    // One table row per known architecture. The version is encoded as 0xMm
    // (hexadecimal notation), M = SM major version, m = SM minor version.
    struct SMCoreEntry {
        int smVersion;
        int coresPerSM;
    };
    const SMCoreEntry knownArchs[] = {
        { 0x10, 8 },   // Tesla Generation (SM 1.0) G80 class
        { 0x11, 8 },   // Tesla Generation (SM 1.1) G8x class
        { 0x12, 8 },   // Tesla Generation (SM 1.2) G9x class
        { 0x13, 8 },   // Tesla Generation (SM 1.3) GT200 class
        { 0x20, 32 },  // Fermi Generation (SM 2.0) GF100 class
        { 0x21, 48 },  // Fermi Generation (SM 2.1) GF10x class
        { 0x30, 192 }  // Kepler Generation (SM 3.0) GK10x class
    };
    const int numArchs = (int)(sizeof(knownArchs) / sizeof(knownArchs[0]));

    // Encode the requested version the same way as the table keys
    const int wanted = (major << 4) + minor;
    for (int i = 0; i < numArchs; ++i) {
        if (knownArchs[i].smVersion == wanted) {
            return knownArchs[i].coresPerSM;
        }
    }

    printf("MapSMtoCores SM %d.%d is undefined (please update to the latest SDK)!\n", major, minor);
    return -1;
}
// end of GPU Architecture definitions
// Defines and enum for use with logging functions
// *********************************************************************
// Log file names (presumably the defaults used by the logging implementation,
// which is not visible in this header)
#define DEFAULTLOGFILE "SdkConsoleLog.txt"
#define MASTERLOGFILE "SdkMasterLog.csv"
// Bit flags for the iLogMode argument of shrLogEx; combine them with |
// (for example LOGBOTH | CLOSELOG)
enum LOGMODES
{
LOGCONSOLE = 1, // bit to signal "log to console"
LOGFILE = 2, // bit to signal "log to file"
LOGBOTH = 3, // convenience union of first 2 bits to signal "log to both"
APPENDMODE = 4, // bit to set "file append" mode instead of "replace mode" on open
MASTER = 8, // bit to signal master .csv log output
ERRORMSG = 16, // bit to signal "pre-pend Error"
CLOSELOG = 32 // bit to close log file, if open, after any requested file write
};
#define HDASHLINE "-----------------------------------------------------------\n"
// Standardized boolean
// Return type of the shr* read/write/compare/command-line helpers declared in this header
enum shrBOOL
{
shrFALSE = 0,
shrTRUE = 1
};
// Standardized MAX, MIN and CLAMP
// Every macro argument is wrapped in parentheses so that expression arguments group as
// intended: without them MAX(x & 1, y) would expand to ((x & 1 > y) ? x & 1 : y).
// NOTE: these are macros, so an argument may be evaluated more than once; do not pass
// expressions with side effects (e.g. i++).
#define MAX(a, b) (((a) > (b)) ? (a) : (b))
#define MIN(a, b) (((a) < (b)) ? (a) : (b))
#define CLAMP(a, b, c) MIN(MAX((a), (b)), (c)) // double sided clip of input a (lower bound b, upper bound c)
#define TOPCLAMP(a, b) (((a) < (b)) ? (a) : (b)) // single top side clip of input a
// Error and Exit Handling Macros...
// *********************************************************************
// Full error handling macro with Cleanup() callback (if supplied)...
// (Companion Inline Function lower on page)
// Appends __FILE__ and __LINE__ of the call site to the three caller-supplied arguments
#define shrCheckErrorEX(a, b, c) __shrCheckErrorEX(a, b, c, __FILE__ , __LINE__)
// Short version without Cleanup() callback pointer
// Both Input (a) and Reference (b) are specified as args
// (passes 0 as the callback argument)
#define shrCheckError(a, b) shrCheckErrorEX(a, b, 0)
// Standardized Exit Macro for leaving main()... extended version
// (Companion Inline Function lower on page)
#define shrExitEX(a, b, c) __shrExitEX(a, b, c)
// Standardized Exit Macro for leaving main()... short version
// (Companion Inline Function lower on page)
// Same as shrExitEX with EXIT_SUCCESS as the third argument
#define shrEXIT(a, b) __shrExitEX(a, b, EXIT_SUCCESS)
// Simple argument checker macro
// Expands to a bare "if (...) return shrFALSE" with no braces and no trailing semicolon:
// only usable inside functions returning shrBOOL (or a compatible type), and it should
// not be placed directly in front of an "else"
#define ARGCHECK(a) if((a) != shrTRUE)return shrFALSE
// Define for user-customized error handling
// Expands to THREE comma-separated tokens (format string, __FILE__, __LINE__), meant to be
// used as the trailing arguments of a printf-style call such as
// shrLogEx(LOGBOTH | ERRORMSG, ciErrNum, STDERROR)
#define STDERROR "file %s, line %i\n\n" , __FILE__ , __LINE__
// Function to deallocate memory allocated within shrUtils
// *********************************************************************
extern "C" void shrFree(void* ptr);
// *********************************************************************
// Helper function to log standardized information to Console, to File or to both
//! Examples: shrLogEx(LOGBOTH, 0, "Function A\n");
//! : shrLogEx(LOGBOTH | ERRORMSG, ciErrNum, STDERROR);
//!
//! Automatically opens file and stores handle if needed and not done yet
//! Closes file and nulls handle on request
//!
//! @param 0 iLogMode: LOGCONSOLE, LOGFILE, LOGBOTH, APPENDMODE, MASTER, ERRORMSG, CLOSELOG.
//! LOGFILE and LOGBOTH may be | 'd with APPENDMODE to select file append mode instead of overwrite mode
//! LOGFILE and LOGBOTH may be | 'd with CLOSELOG to "write and close"
//! First 3 options may be | 'd with MASTER to enable independent write to master data log file
//! First 3 options may be | 'd with ERRORMSG to start line with standard error message
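//! @param 2 dValue:
//! Positive val = double value for time in secs to be formatted to 6 decimals.
//! Negative val is an error code and this gives error preformatting.
//! NOTE(review): the declaration below names its second parameter iErrNum (an int), not dValue -- confirm which description is current.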
//! @param 3 cFormatString: String with formatting specifiers like printf or fprintf.
//! ALL printf flags, width, precision and type specifiers are supported with this exception:
//! Wide char type specifiers intended for wprintf (%S and %C) are NOT supported
//! Single byte char type specifiers (%s and %c) ARE supported
//! @param 4... variable args: like printf or fprintf. Must match format specifer type above.
//! @return 0 if OK, negative value on error or if error occurs or was passed in.
// *********************************************************************
extern "C" int shrLogEx(int iLogMode, int iErrNum, const char* cFormatString, ...);
// Short version of shrLogEx, defaulting to shrLogEx(LOGBOTH, 0, cFormatString, ...)
// *********************************************************************
extern "C" int shrLog(const char* cFormatString, ...);
// *********************************************************************
// Delta timer function for up to 3 independent timers using host high performance counters
// Maintains state for 3 independent counters
//! Example: double dElapsedTime = shrDeltaT(0);
//!
//! @param 0 iCounterID: Which timer to check/reset. (0, 1, 2)
//! @return delta time of specified counter since last call in seconds. Otherwise -9999.0 if error
// *********************************************************************
extern "C" double shrDeltaT(int iCounterID);
// Optional LogFileNameOverride function
// *********************************************************************
extern "C" void shrSetLogFileName (const char* cOverRideName);
// Helper function to init data arrays
// *********************************************************************
extern "C" void shrFillArray(float* pfData, int iSize);
// Helper function to print data arrays
// *********************************************************************
extern "C" void shrPrintArray(float* pfData, int iSize);
////////////////////////////////////////////////////////////////////////////
//! Find the path for a filename
//! @return the path if succeeded, otherwise 0
//! @param filename name of the file
//! @param executablePath optional absolute path of the executable
////////////////////////////////////////////////////////////////////////////
extern "C" char* shrFindFilePath(const char* filename, const char* executablePath);
////////////////////////////////////////////////////////////////////////////
//! Read file \filename containing single precision floating point data
//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE
//! @param filename name of the source file
//! @param data uninitialized pointer, returned initialized and pointing to
//! the data read
//! @param len number of data elements in data, -1 on error
//! @note If a NULL pointer is passed to this function and it is initialized
//! within shrUtils, then free() has to be used to deallocate the memory
////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrReadFilef( const char* filename, float** data, unsigned int* len,
bool verbose = false);
////////////////////////////////////////////////////////////////////////////
//! Read file \filename containing double precision floating point data
//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE
//! @param filename name of the source file
//! @param data uninitialized pointer, returned initialized and pointing to
//! the data read
//! @param len number of data elements in data, -1 on error
//! @note If a NULL pointer is passed to this function and it is initialized
//! within shrUtils, then free() has to be used to deallocate the memory
////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrReadFiled( const char* filename, double** data, unsigned int* len,
bool verbose = false);
////////////////////////////////////////////////////////////////////////////
//! Read file \filename containing integer data
//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE
//! @param filename name of the source file
//! @param data uninitialized pointer, returned initialized and pointing to
//! the data read
//! @param len number of data elements in data, -1 on error
//! @note If a NULL pointer is passed to this function and it is initialized
//! within shrUtils, then free() has to be used to deallocate the memory
////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrReadFilei( const char* filename, int** data, unsigned int* len, bool verbose = false);
////////////////////////////////////////////////////////////////////////////
//! Read file \filename containing unsigned integer data
//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE
//! @param filename name of the source file
//! @param data uninitialized pointer, returned initialized and pointing to
//! the data read
//! @param len number of data elements in data, -1 on error
//! @note If a NULL pointer is passed to this function and it is initialized
//! within shrUtils, then free() has to be used to deallocate the memory
////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrReadFileui( const char* filename, unsigned int** data,
unsigned int* len, bool verbose = false);
////////////////////////////////////////////////////////////////////////////
//! Read file \filename containing char / byte data
//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE
//! @param filename name of the source file
//! @param data uninitialized pointer, returned initialized and pointing to
//! the data read
//! @param len number of data elements in data, -1 on error
//! @note If a NULL pointer is passed to this function and it is initialized
//! within shrUtils, then free() has to be used to deallocate the memory
////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrReadFileb( const char* filename, char** data, unsigned int* len,
bool verbose = false);
////////////////////////////////////////////////////////////////////////////
//! Read file \filename containing unsigned char / byte data
//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE
//! @param filename name of the source file
//! @param data uninitialized pointer, returned initialized and pointing to
//! the data read
//! @param len number of data elements in data, -1 on error
//! @note If a NULL pointer is passed to this function and it is initialized
//! within shrUtils, then free() has to be used to deallocate the memory
////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrReadFileub( const char* filename, unsigned char** data,
unsigned int* len, bool verbose = false);
////////////////////////////////////////////////////////////////////////////
//! Write a data file \filename containing single precision floating point
//! data
//! @return shrTRUE if writing the file succeeded, otherwise shrFALSE
//! @param filename name of the file to write
//! @param data pointer to data to write
//! @param len number of data elements in data, -1 on error
//! @param epsilon epsilon for comparison
////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrWriteFilef( const char* filename, const float* data, unsigned int len,
const float epsilon, bool verbose = false);
////////////////////////////////////////////////////////////////////////////
//! Write a data file \filename containing double precision floating point
//! data
//! @return shrTRUE if writing the file succeeded, otherwise shrFALSE
//! @param filename name of the file to write
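//! @param data pointer to data to write
//! NOTE(review): data is declared below as const float* although this function is documented as writing double precision data -- confirm against the implementation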
//! @param len number of data elements in data, -1 on error
//! @param epsilon epsilon for comparison
////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrWriteFiled( const char* filename, const float* data, unsigned int len,
const double epsilon, bool verbose = false);
////////////////////////////////////////////////////////////////////////////
//! Write a data file \filename containing integer data
//! @return shrTRUE if writing the file succeeded, otherwise shrFALSE
//! @param filename name of the file to write
//! @param data pointer to data to write
//! @param len number of data elements in data, -1 on error
////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrWriteFilei( const char* filename, const int* data, unsigned int len,
bool verbose = false);
////////////////////////////////////////////////////////////////////////////
//! Write a data file \filename containing unsigned integer data
//! @return shrTRUE if writing the file succeeded, otherwise shrFALSE
//! @param filename name of the file to write
//! @param data pointer to data to write
//! @param len number of data elements in data, -1 on error
////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrWriteFileui( const char* filename, const unsigned int* data,
unsigned int len, bool verbose = false);
////////////////////////////////////////////////////////////////////////////
//! Write a data file \filename containing char / byte data
//! @return shrTRUE if writing the file succeeded, otherwise shrFALSE
//! @param filename name of the file to write
//! @param data pointer to data to write
//! @param len number of data elements in data, -1 on error
////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrWriteFileb( const char* filename, const char* data, unsigned int len,
bool verbose = false);
////////////////////////////////////////////////////////////////////////////
//! Write a data file \filename containing unsigned char / byte data
//! @return shrTRUE if writing the file succeeded, otherwise shrFALSE
//! @param filename name of the file to write
//! @param data pointer to data to write
//! @param len number of data elements in data, -1 on error
////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrWriteFileub( const char* filename, const unsigned char* data,
unsigned int len, bool verbose = false);
////////////////////////////////////////////////////////////////////////////
//! Load PPM image file (with unsigned char as data element type), padding
//! 4th component
//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE
//! @param file name of the image file
//! @param OutData handle to the data read
//! @param w width of the image
//! @param h height of the image
//!
//! Note: If *OutData is NULL this function allocates buffer that must be freed by caller
////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrLoadPPM4ub(const char* file, unsigned char** OutData,
unsigned int *w, unsigned int *h);
////////////////////////////////////////////////////////////////////////////
//! Save PPM image file (with unsigned char as data element type, padded to
//! 4 bytes)
//! @return shrTRUE if saving the file succeeded, otherwise shrFALSE
//! @param file name of the image file
//! @param data handle to the data read
//! @param w width of the image
//! @param h height of the image
////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrSavePPM4ub( const char* file, unsigned char *data,
unsigned int w, unsigned int h);
////////////////////////////////////////////////////////////////////////////////
//! Save PGM image file (with unsigned char as data element type)
//! @return shrTRUE if saving the file succeeded, otherwise shrFALSE
//! @param file name of the image file
//! @param data handle to the data read
//! @param w width of the image
//! @param h height of the image
////////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrSavePGMub( const char* file, unsigned char *data,
unsigned int w, unsigned int h);
////////////////////////////////////////////////////////////////////////////
//! Load PGM image file (with unsigned char as data element type)
//! @return shrTRUE if loading the file succeeded, otherwise shrFALSE
//! @param file name of the image file
//! @param data handle to the data read
//! @param w width of the image
//! @param h height of the image
//! @note If a NULL pointer is passed to this function and it is initialized
//! within shrUtils, then free() has to be used to deallocate the memory
////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrLoadPGMub( const char* file, unsigned char** data,
unsigned int *w,unsigned int *h);
////////////////////////////////////////////////////////////////////////////
// Command line arguments: General notes
// * All command line arguments begin with '--' followed by the token;
// token and value are separated by '='; example --samples=50
// * Arrays have the form --model=[one.obj,two.obj,three.obj]
// (without whitespaces)
////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////
//! Check if command line argument \a flag_name is given
//! @return shrTRUE if command line argument \a flag_name has been given,
//! otherwise shrFALSE
//! @param argc argc as passed to main()
//! @param argv argv as passed to main()
//! @param flag_name name of command line flag
////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrCheckCmdLineFlag( const int argc, const char** argv,
const char* flag_name);
////////////////////////////////////////////////////////////////////////////
//! Get the value of a command line argument of type int
//! @return shrTRUE if command line argument \a arg_name has been given and
//! is of the requested type, otherwise shrFALSE
//! @param argc argc as passed to main()
//! @param argv argv as passed to main()
//! @param arg_name name of the command line argument
//! @param val value of the command line argument
////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrGetCmdLineArgumenti( const int argc, const char** argv,
const char* arg_name, int* val);
////////////////////////////////////////////////////////////////////////////
//! Get the value of a command line argument of type unsigned int
//! @return shrTRUE if command line argument \a arg_name has been given and
//! is of the requested type, otherwise shrFALSE
//! @param argc argc as passed to main()
//! @param argv argv as passed to main()
//! @param arg_name name of the command line argument
//! @param val value of the command line argument
////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrGetCmdLineArgumentu( const int argc, const char** argv,
const char* arg_name, unsigned int* val);
////////////////////////////////////////////////////////////////////////////
//! Get the value of a command line argument of type float
//! @return shrTRUE if command line argument \a arg_name has been given and
//! is of the requested type, otherwise shrFALSE
//! @param argc argc as passed to main()
//! @param argv argv as passed to main()
//! @param arg_name name of the command line argument
//! @param val value of the command line argument
////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrGetCmdLineArgumentf( const int argc, const char** argv,
const char* arg_name, float* val);
////////////////////////////////////////////////////////////////////////////
//! Get the value of a command line argument of type string
//! @return shrTRUE if command line argument \a arg_name has been given and
//! is of the requested type, otherwise shrFALSE
//! @param argc argc as passed to main()
//! @param argv argv as passed to main()
//! @param arg_name name of the command line argument
//! @param val value of the command line argument
////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrGetCmdLineArgumentstr( const int argc, const char** argv,
const char* arg_name, char** val);
////////////////////////////////////////////////////////////////////////////
//! Get the value of a command line argument list whose elements are strings
//! @return shrTRUE if command line argument \a arg_name has been given and
//! is of the requested type, otherwise shrFALSE
//! @param argc argc as passed to main()
//! @param argv argv as passed to main()
//! @param arg_name name of the command line argument
//! @param val command line argument list
//! @param len length of the list / number of elements
////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrGetCmdLineArgumentListstr( const int argc, const char** argv,
const char* arg_name, char** val,
unsigned int* len);
////////////////////////////////////////////////////////////////////////////
//! Compare two float arrays
//! @return shrTRUE if \a reference and \a data are identical, otherwise shrFALSE
//! @param reference handle to the reference data / gold image
//! @param data handle to the computed data
//! @param len number of elements in reference and data
////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrComparef( const float* reference, const float* data,
const unsigned int len);
////////////////////////////////////////////////////////////////////////////
//! Compare two integer arrays
//! @return shrTRUE if \a reference and \a data are identical, otherwise shrFALSE
//! @param reference handle to the reference data / gold image
//! @param data handle to the computed data
//! @param len number of elements in reference and data
////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrComparei( const int* reference, const int* data,
const unsigned int len );
////////////////////////////////////////////////////////////////////////////////
//! Compare two unsigned integer arrays, with epsilon and threshold
//! @return shrTRUE if \a reference and \a data are identical, otherwise shrFALSE
//! @param reference handle to the reference data / gold image
//! @param data handle to the computed data
//! @param len number of elements in reference and data
//! @param epsilon epsilon to use for the comparison
//! @param threshold tolerance % # of comparison errors (0.15f = 15%)
////////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrCompareuit( const unsigned int* reference, const unsigned int* data,
const unsigned int len, const float epsilon, const float threshold );
////////////////////////////////////////////////////////////////////////////
//! Compare two unsigned char arrays
//! @return shrTRUE if \a reference and \a data are identical, otherwise shrFALSE
//! @param reference handle to the reference data / gold image
//! @param data handle to the computed data
//! @param len number of elements in reference and data
////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrCompareub( const unsigned char* reference, const unsigned char* data,
const unsigned int len );
////////////////////////////////////////////////////////////////////////////////
//! Compare two unsigned char arrays with a tolerance for # of byte errors
//! @return shrTRUE if \a reference and \a data are identical, otherwise shrFALSE
//! @param reference handle to the reference data / gold image
//! @param data handle to the computed data
//! @param len number of elements in reference and data
//! @param epsilon epsilon to use for the comparison
//! @param threshold tolerance % # of comparison errors (0.15f = 15%)
////////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrCompareubt( const unsigned char* reference, const unsigned char* data,
const unsigned int len, const float epsilon, const float threshold );
////////////////////////////////////////////////////////////////////////////////
//! Compare two unsigned char arrays with an epsilon tolerance for equality
//! @return shrTRUE if \a reference and \a data are identical, otherwise shrFALSE
//! @param reference handle to the reference data / gold image
//! @param data handle to the computed data
//! @param len number of elements in reference and data
//! @param epsilon epsilon to use for the comparison
////////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrCompareube( const unsigned char* reference, const unsigned char* data,
const unsigned int len, const float epsilon );
////////////////////////////////////////////////////////////////////////////
//! Compare two float arrays with an epsilon tolerance for equality
//! @return shrTRUE if \a reference and \a data are identical, otherwise shrFALSE
//! @param reference handle to the reference data / gold image
//! @param data handle to the computed data
//! @param len number of elements in reference and data
//! @param epsilon epsilon to use for the comparison
////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrComparefe( const float* reference, const float* data,
const unsigned int len, const float epsilon );
////////////////////////////////////////////////////////////////////////////////
//! Compare two float arrays with an epsilon tolerance for equality and a
//! threshold for # pixel errors
//! @return shrTRUE if \a reference and \a data are identical, otherwise shrFALSE
//! @param reference handle to the reference data / gold image
//! @param data handle to the computed data
//! @param len number of elements in reference and data
//! @param epsilon epsilon to use for the comparison
//! @param threshold threshold for # pixel errors (presumably a fraction, e.g. 0.15f = 15% -- confirm against the implementation)
////////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrComparefet( const float* reference, const float* data,
const unsigned int len, const float epsilon, const float threshold );
////////////////////////////////////////////////////////////////////////////
//! Compare two float arrays using L2-norm with an epsilon tolerance for
//! equality
//! @return shrTRUE if \a reference and \a data are identical, otherwise shrFALSE
//! @param reference handle to the reference data / gold image
//! @param data handle to the computed data
//! @param len number of elements in reference and data
//! @param epsilon epsilon to use for the comparison
////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrCompareL2fe( const float* reference, const float* data,
const unsigned int len, const float epsilon );
////////////////////////////////////////////////////////////////////////////////
//! Compare two PPM image files with an epsilon tolerance for equality
//! @return shrTRUE if \a src_file and \a ref_file are identical, otherwise shrFALSE
//! @param src_file filename for the image to be compared
//! @param ref_file filename for the reference data / gold image
//! @param epsilon epsilon to use for the comparison
//! @param threshold threshold of pixels that can still mismatch to pass (i.e. 0.15f = 15% must pass)
//! @note verboseErrors (output details of image mismatch to std::err) is not a parameter of this declaration
////////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrComparePPM( const char *src_file, const char *ref_file, const float epsilon, const float threshold);
////////////////////////////////////////////////////////////////////////////////
//! Compare two PGM image files with an epsilon tolerance for equality
//! @return shrTRUE if \a src_file and \a ref_file are identical, otherwise shrFALSE
//! @param src_file filename for the image to be compared
//! @param ref_file filename for the reference data / gold image
//! @param epsilon epsilon to use for the comparison
//! @param threshold threshold of pixels that can still mismatch to pass (i.e. 0.15f = 15% must pass)
//! @note verboseErrors (output details of image mismatch to std::err) is not a parameter of this declaration
////////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrComparePGM( const char *src_file, const char *ref_file, const float epsilon, const float threshold);
extern "C" unsigned char* shrLoadRawFile(const char* filename, size_t size);
extern "C" size_t shrRoundUp(int group_size, int global_size);
// Companion inline function for error checking WITH Cleanup Callback (if supplied).
// Does nothing when iSample equals iReference. On a mismatch it logs the
// location and then either hands control to the callback or exits.
//   iSample    - value under test; also passed to shrLogEx as the error code
//   iReference - value iSample is expected to equal
//   pCleanup   - optional callback, invoked with EXIT_FAILURE; when NULL the
//                log is closed and exit(EXIT_FAILURE) is called here
//   cFile      - file name written to the log message
//   iLine      - line number written to the log message
// *********************************************************************
inline void __shrCheckErrorEX(int iSample, int iReference, void (*pCleanup)(int), const char* cFile, const int iLine)
{
    // Matching values: nothing to report
    if (iReference == iSample)
    {
        return;
    }

    shrLogEx(LOGBOTH | ERRORMSG, iSample, "line %i , in file %s !!!\n\n" , iLine, cFile);

    // No callback supplied: close the log and terminate from here
    if (pCleanup == NULL)
    {
        shrLogEx(LOGBOTH | CLOSELOG, 0, "Exiting...\n");
        exit(EXIT_FAILURE);
    }

    pCleanup(EXIT_FAILURE);
}
// Standardized Exit
// Optionally waits for <Enter>, then flushes stderr and exits the process.
//   argc, argv - command line, searched for the "prompt"/"noprompt"/"qatest" flags
//   iExitCode  - value passed to exit()
// *********************************************************************
inline void __shrExitEX(int argc, const char** argv, int iExitCode)
{
    // With WIN32 defined the wait is on unless "noprompt" is given; otherwise
    // it is off unless "prompt" is given. "qatest" disables it in both cases.
#ifdef WIN32
    const bool bWaitForEnter = !shrCheckCmdLineFlag(argc, argv, "noprompt") && !shrCheckCmdLineFlag(argc, argv, "qatest");
#else
    const bool bWaitForEnter = shrCheckCmdLineFlag(argc, argv, "prompt") && !shrCheckCmdLineFlag(argc, argv, "qatest");
#endif

    if (!bWaitForEnter)
    {
        shrLogEx(LOGBOTH | CLOSELOG, 0, "%s Exiting...\n", argv[0]);
    }
    else
    {
        shrLogEx(LOGBOTH | CLOSELOG, 0, "\nPress <Enter> to Quit...\n");
        getchar();
    }

    fflush(stderr);
    exit(iExitCode);
}
#endif

View File

@@ -0,0 +1,29 @@
/*
* Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*
*/
// One work-item computes one 4-component dot product:
//   c[i] = a[4i]*b[4i] + a[4i+1]*b[4i+1] + a[4i+2]*b[4i+2] + a[4i+3]*b[4i+3]
// a, b          - input arrays, read at indices 4*i .. 4*i+3
// c             - output array, written at index i
// iNumElements  - number of outputs; work-items with i >= iNumElements do nothing
__kernel void DotProduct (__global float* a, __global float* b, __global float* c, int iNumElements)
{
    // position of this work-item in the 1D global range
    const int iGID = get_global_id(0);

    // bound check (plays the role of the loop limit in serial C code)
    if (iGID < iNumElements)
    {
        // index of the first of the four consecutive inputs for this output
        const int iFirst = iGID << 2;
        __global float* pA = a + iFirst;
        __global float* pB = b + iFirst;

        // products are summed left to right, component 0 first
        c[iGID] = pA[0] * pB[0]
                + pA[1] * pB[1]
                + pA[2] * pB[2]
                + pA[3] * pB[3];
    }
}

View File

@@ -0,0 +1,66 @@
# Build rules for the DotProduct OpenCL sample, cross-compiled with the
# riscv32-unknown-elf toolchain and linked with vortex_link.ld.
#
# Tool and library locations; each can be overridden from the command line
# or the environment.
RISCV_TOOL_PATH ?= $(wildcard ../../../../riscv-gnu-toolchain/drops)
POCL_CC_PATH ?= $(wildcard ../../../../pocl/drops_riscv_cc)
POCL_INC_PATH ?= $(wildcard ../include)
POCL_LIB_PATH ?= $(wildcard ../lib)
VX_RT_PATH ?= $(wildcard ../../../runtime)
VX_SIMX_PATH ?= $(wildcard ../../../simX/obj_dir)

# Base name of every artefact produced here.
PROJECT = DotProduct

CC = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-gcc
CXX = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-g++
DMP = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-objdump
HEX = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-objcopy
GDB = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-gdb

# Runtime sources compiled into the .elf image.
VX_SRCS = $(VX_RT_PATH)/newlib/newlib.c
VX_SRCS += $(VX_RT_PATH)/startup/vx_start.S
VX_SRCS += $(VX_RT_PATH)/intrinsics/vx_intrinsics.s
VX_SRCS += $(VX_RT_PATH)/io/vx_io.s $(VX_RT_PATH)/io/vx_io.c
VX_SRCS += $(VX_RT_PATH)/fileio/fileio.s
VX_SRCS += $(VX_RT_PATH)/tests/tests.c
VX_SRCS += $(VX_RT_PATH)/vx_api/vx_api.c
# None of the variables below is set in this file, so they add nothing unless
# supplied from the command line or the environment.
VX_SRCS += $(VX_STR) $(VX_FIO) $(VX_NEWLIB) $(VX_INT) $(VX_IO) $(VX_API) $(VX_TEST)

VX_CFLAGS = -nostartfiles -Wl,-Bstatic,-T,$(VX_RT_PATH)/mains/vortex_link.ld

CXXFLAGS = -g -O0 -march=rv32im -mabi=ilp32
# program may not begin at main()
CXXFLAGS += -ffreestanding
# enable garbage collection of unused input sections
CXXFLAGS += -Wl,--gc-sections
# disable RTTI and exceptions
CXXFLAGS += -fno-rtti -fno-non-call-exceptions
CXXFLAGS += -I$(POCL_INC_PATH) -I.

VX_LIBS = -Wl,--whole-archive lib$(PROJECT).a -Wl,--no-whole-archive $(POCL_LIB_PATH)/libOpenCL.a
QEMU_LIBS = $(VX_RT_PATH)/qemu/vx_api.c -Wl,--whole-archive lib$(PROJECT).a -Wl,--no-whole-archive $(POCL_LIB_PATH)/libOpenCL.a

# Remove a target whose recipe failed, so a truncated output (for example a
# half-written .dump) is never mistaken for an up-to-date file.
.DELETE_ON_ERROR:

# These names are commands, not files.
.PHONY: all run qemu gdb-s gdb-c clean

all: $(PROJECT).dump $(PROJECT).hex

# Compile the kernel named as the prerequisite; the recipe used to compile
# kernel.cl, which this rule does not depend on.
lib$(PROJECT).a: DotProduct.cl
	POCL_DEBUG=all POCL_DEBUG_LLVM_PASSES=1 LD_LIBRARY_PATH=$(RISCV_TOOL_PATH)/lib:$(POCL_CC_PATH)/lib $(POCL_CC_PATH)/bin/poclcc -o $@ $<

$(PROJECT).elf: main.cc lib$(PROJECT).a
	$(CXX) $(CXXFLAGS) $(VX_CFLAGS) $(VX_SRCS) $< $(VX_LIBS) -o $@

$(PROJECT).qemu: main.cc lib$(PROJECT).a
	$(CXX) $(CXXFLAGS) $< $(QEMU_LIBS) -o $@

$(PROJECT).hex: $(PROJECT).elf
	$(HEX) -O ihex $< $@

$(PROJECT).dump: $(PROJECT).elf
	$(DMP) -D $< > $@

run: $(PROJECT).hex
	POCL_DEBUG=all $(VX_SIMX_PATH)/Vcache_simX -E -a rv32i --core $< -s -b 1> emulator.debug

qemu: $(PROJECT).qemu
	POCL_DEBUG=all $(RISCV_TOOL_PATH)/bin/qemu-riscv32 -d in_asm -D debug.log $<

gdb-s: $(PROJECT).qemu
	POCL_DEBUG=all $(RISCV_TOOL_PATH)/bin/qemu-riscv32 -g 1234 -d in_asm -D debug.log $<

gdb-c: $(PROJECT).qemu
	$(GDB) $<

clean:
	rm -rf *.elf *.dump *.hex

View File

@@ -0,0 +1,270 @@
/*
* Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*
*/
// *********************************************************************
// oclDotProduct Notes:
//
// A simple OpenCL API demo application that implements a
// vector dot product computation between 2 float arrays.
//
// Runs computations with OpenCL on the GPU device and then checks results
// against basic host CPU/C++ computation.
//
// Uses 'shr' and 'ocl' functions from oclUtils and shrUtils libraries for compactness.
// But these are NOT required libs for OpenCL development in general.
// *********************************************************************
// standard utilities and systems includes
#include <oclUtils.h>
#include <shrQATest.h>
// Name of the file with the source code for the computation kernel
// (looked up with shrFindFilePath and loaded with oclLoadProgSource in main)
// *********************************************************************
const char* cSourceFile = "DotProduct.cl";
// Host buffers for demo (allocated in main, freed in Cleanup)
// *********************************************************************
void *srcA, *srcB, *dst; // Host buffers for OpenCL test: srcA/srcB are written to the device, dst receives the device result
void* Golden; // Host buffer for host golden processing cross check
// OpenCL Vars
cl_platform_id cpPlatform; // OpenCL platform
cl_device_id *cdDevices; // Array of OpenCL device IDs, allocated in main
cl_context cxGPUContext; // OpenCL context
cl_command_queue cqCommandQueue;// OpenCL command queue
cl_program program; // OpenCL program
cl_kernel ckKernel; // OpenCL kernel
cl_mem cmDevSrcA; // OpenCL device source buffer A
cl_mem cmDevSrcB; // OpenCL device source buffer B
cl_mem cmDevDst; // OpenCL device destination buffer
size_t szGlobalWorkSize; // Total # of work items in the 1D range
size_t szLocalWorkSize; // # of work items in the 1D work group
size_t szParmDataBytes; // Byte size of context information
size_t szKernelLength; // Byte size of kernel code
cl_int ciErrNum; // Error code var
char* cPathAndName = NULL; // var for full paths to data, src, etc.
char* cSourceCL = NULL; // Buffer to hold source for compilation
const char* cExecutableName = NULL; // Set to argv[0] in main
// demo config vars
// NOTE(review): the comment below says "odd #", but 1277944 is even -- confirm the intended value
int iNumElements= 1277944; // Length of float arrays to process (odd # for illustration)
shrBOOL bNoPrompt = shrFALSE; // Set in main from the "noprompt" command line flag
// Forward Declarations
// *********************************************************************
void DotProductHost(const float* pfData1, const float* pfData2, float* pfResult, int iNumElements);
void Cleanup (int iExitCode);
void (*pCleanup)(int) = &Cleanup; // Cleanup callback in the form expected by oclCheckErrorEX
// Addresses of main's argc/argv; stored at the start of main and dereferenced in Cleanup
int *gp_argc = NULL;
char ***gp_argv = NULL;
// Main function
// Selects an OpenCL device, creates the context, queue and buffers, obtains
// the DotProduct kernel, runs it over srcA/srcB, reads the result back,
// compares it with the host reference (DotProductHost) and leaves through
// Cleanup(), which reports pass/fail and exits the process.
//
// Command line flags read here: "device" (target device index) and "noprompt".
// *********************************************************************
int main(int argc, char **argv)
{
    // Stored so that Cleanup() can pass the command line to shrQAFinishExit()
    gp_argc = &argc;
    gp_argv = &argv;

    shrQAStart(argc, argv);

    // Get the platform (queried once; the duplicated query and log line were removed)
    ciErrNum = oclGetPlatformID(&cpPlatform);
    //oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL);
    shrLog("clGetPlatformID...\n");

    //Get all the devices
    cl_uint uiNumDevices = 0;           // Number of devices available
    cl_uint uiTargetDevice = 0;         // Default Device to compute on
    cl_uint uiNumComputeUnits;          // Number of compute units (SM's on NV GPU)
    shrLog("Get the Device info and select Device...\n");
    ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_DEFAULT, 0, NULL, &uiNumDevices);
    //oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL);
    cdDevices = (cl_device_id *)malloc(uiNumDevices * sizeof(cl_device_id) );
    ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_DEFAULT, uiNumDevices, cdDevices, NULL);
    //oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL);

    // Get command line device options and config accordingly
    shrLog(" # of Devices Available = %u\n", uiNumDevices);
    if(shrGetCmdLineArgumentu(argc, (const char**)argv, "device", &uiTargetDevice)== shrTRUE)
    {
        uiTargetDevice = CLAMP(uiTargetDevice, 0, (uiNumDevices - 1));
    }
    shrLog(" Using Device %u: ", uiTargetDevice);
    oclPrintDevName(LOGBOTH, cdDevices[uiTargetDevice]);
    ciErrNum = clGetDeviceInfo(cdDevices[uiTargetDevice], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(uiNumComputeUnits), &uiNumComputeUnits, NULL);
    //oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL);
    shrLog("\n # of Compute Units = %u\n", uiNumComputeUnits);

    // get command line arg for quick test, if provided
    bNoPrompt = shrCheckCmdLineFlag(argc, (const char**)argv, "noprompt");

    // start logs
    cExecutableName = argv[0];
    shrSetLogFileName ("oclDotProduct.txt");
    shrLog("%s Starting...\n\n# of float elements per Array \t= %u\n", argv[0], iNumElements);

    // set and log Global and Local work size dimensions
    szLocalWorkSize = 256;
    szGlobalWorkSize = shrRoundUp((int)szLocalWorkSize, iNumElements); // rounded up to the nearest multiple of the LocalWorkSize
    // size_t values are cast because the format string uses %u
    shrLog("Global Work Size \t\t= %u\nLocal Work Size \t\t= %u\n# of Work Groups \t\t= %u\n\n",
           (unsigned int)szGlobalWorkSize, (unsigned int)szLocalWorkSize,
           (unsigned int)(szGlobalWorkSize % szLocalWorkSize + szGlobalWorkSize/szLocalWorkSize));

    // Allocate and initialize host arrays
    shrLog( "Allocate and Init Host Mem...\n");
    srcA = (void *)malloc(sizeof(cl_float4) * szGlobalWorkSize);
    srcB = (void *)malloc(sizeof(cl_float4) * szGlobalWorkSize);
    dst = (void *)malloc(sizeof(cl_float) * szGlobalWorkSize);
    Golden = (void *)malloc(sizeof(cl_float) * iNumElements);
    shrFillArray((float*)srcA, 4 * iNumElements);
    shrFillArray((float*)srcB, 4 * iNumElements);

    // Get the platform
    ciErrNum = oclGetPlatformID(&cpPlatform);
    //oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);

    // Get a GPU device
    ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_DEFAULT, 1, &cdDevices[uiTargetDevice], NULL);
    //oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);

    // Create the context
    cxGPUContext = clCreateContext(0, 1, &cdDevices[uiTargetDevice], NULL, NULL, &ciErrNum);
    //oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);

    // Create a command-queue
    shrLog("clCreateCommandQueue...\n");
    cqCommandQueue = clCreateCommandQueue(cxGPUContext, cdDevices[uiTargetDevice], 0, &ciErrNum);
    //oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);

    // Allocate the OpenCL buffer memory objects for source and result on the device GMEM
    shrLog("clCreateBuffer (SrcA, SrcB and Dst in Device GMEM)...\n");
    cmDevSrcA = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY, sizeof(cl_float) * szGlobalWorkSize * 4, NULL, &ciErrNum);
    //oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
    cmDevSrcB = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY, sizeof(cl_float) * szGlobalWorkSize * 4, NULL, &ciErrNum);
    //oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
    cmDevDst = clCreateBuffer(cxGPUContext, CL_MEM_WRITE_ONLY, sizeof(cl_float) * szGlobalWorkSize, NULL, &ciErrNum);
    //oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);

    // Read the OpenCL kernel in from source file
    shrLog("oclLoadProgSource (%s)...\n", cSourceFile);
    cPathAndName = shrFindFilePath(cSourceFile, argv[0]);
    //oclCheckErrorEX(cPathAndName != NULL, shrTRUE, pCleanup);
    cSourceCL = oclLoadProgSource(cPathAndName, "", &szKernelLength);
    //oclCheckErrorEX(cSourceCL != NULL, shrTRUE, pCleanup);

    // Create the program from the built-in kernel.
    // Assigns the file-scope 'program' (a local declaration here would hide it
    // from Cleanup), uses the context/device selected above, and requests the
    // same kernel name that clCreateKernel asks for below.
    shrLog("clCreateProgramWithBuiltInKernels...\n");
    program = clCreateProgramWithBuiltInKernels(cxGPUContext, 1, &cdDevices[uiTargetDevice], "DotProduct", &ciErrNum);

    // Build the program (no build options are passed)
    shrLog("clBuildProgram...\n");
    ciErrNum = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
    if (ciErrNum != CL_SUCCESS)
    {
        // write out standard error, Build Log and PTX, then cleanup and exit
        shrLogEx(LOGBOTH | ERRORMSG, ciErrNum, STDERROR);
        oclLogBuildInfo(program, oclGetFirstDev(cxGPUContext));
        oclLogPtx(program, oclGetFirstDev(cxGPUContext), "oclDotProduct.ptx");
        Cleanup(EXIT_FAILURE);
    }

    // Create the kernel
    shrLog("clCreateKernel (DotProduct)...\n");
    ckKernel = clCreateKernel(program, "DotProduct", &ciErrNum);

    // Set the Argument values
    shrLog("clSetKernelArg 0 - 3...\n\n");
    ciErrNum = clSetKernelArg(ckKernel, 0, sizeof(cl_mem), (void*)&cmDevSrcA);
    ciErrNum |= clSetKernelArg(ckKernel, 1, sizeof(cl_mem), (void*)&cmDevSrcB);
    ciErrNum |= clSetKernelArg(ckKernel, 2, sizeof(cl_mem), (void*)&cmDevDst);
    ciErrNum |= clSetKernelArg(ckKernel, 3, sizeof(cl_int), (void*)&iNumElements);
    //oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);

    // --------------------------------------------------------
    // Core sequence... copy input data to GPU, compute, copy results back

    // Asynchronous write of data to GPU device
    shrLog("clEnqueueWriteBuffer (SrcA and SrcB)...\n");
    ciErrNum = clEnqueueWriteBuffer(cqCommandQueue, cmDevSrcA, CL_FALSE, 0, sizeof(cl_float) * szGlobalWorkSize * 4, srcA, 0, NULL, NULL);
    ciErrNum |= clEnqueueWriteBuffer(cqCommandQueue, cmDevSrcB, CL_FALSE, 0, sizeof(cl_float) * szGlobalWorkSize * 4, srcB, 0, NULL, NULL);
    //oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);

    // Launch kernel
    shrLog("clEnqueueNDRangeKernel (DotProduct)...\n");
    ciErrNum = clEnqueueNDRangeKernel(cqCommandQueue, ckKernel, 1, NULL, &szGlobalWorkSize, &szLocalWorkSize, 0, NULL, NULL);
    //oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);

    // Read back results (blocking read)
    shrLog("clEnqueueReadBuffer (Dst)...\n\n");
    ciErrNum = clEnqueueReadBuffer(cqCommandQueue, cmDevDst, CL_TRUE, 0, sizeof(cl_float) * szGlobalWorkSize, dst, 0, NULL, NULL);
    //oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);

    // Compute and compare results for golden-host and report errors and pass/fail
    shrLog("Comparing against Host/C++ computation...\n\n");
    DotProductHost ((const float*)srcA, (const float*)srcB, (float*)Golden, iNumElements);
    shrBOOL bMatch = shrComparefet((const float*)Golden, (const float*)dst, (unsigned int)iNumElements, 0.0f, 0);

    // Cleanup and leave; the exit status now reflects the comparison result
    Cleanup ((bMatch == shrTRUE) ? EXIT_SUCCESS : EXIT_FAILURE);
}
// "Golden" Host processing dot product function for comparison purposes.
// For every i in [0, iNumElements) writes
//   pfResult[i] = sum over k = 0..3 of pfData1[4*i + k] * pfData2[4*i + k]
// accumulating directly in pfResult[i], starting from 0.0f, with k ascending.
//   pfData1, pfData2 - inputs, read at indices 0 .. 4*iNumElements - 1
//   pfResult         - output, written at indices 0 .. iNumElements - 1
//   iNumElements     - number of dot products to compute
// *********************************************************************
void DotProductHost(const float* pfData1, const float* pfData2, float* pfResult, int iNumElements)
{
    const int iComponents = 4;

    for (int iOut = 0; iOut < iNumElements; ++iOut)
    {
        // first input index belonging to this output element
        const int iFirst = iOut * iComponents;

        float* pfSum = &pfResult[iOut];
        *pfSum = 0.0f;
        for (int iIn = iFirst; iIn < iFirst + iComponents; ++iIn)
        {
            *pfSum += pfData1[iIn] * pfData2[iIn];
        }
    }
}
// Cleanup and exit code
// Frees the host strings, releases every OpenCL object that is non-NULL,
// frees the host buffers and finishes through shrQAFinishExit().
//   iExitCode - EXIT_SUCCESS is reported as QA_PASSED, anything else as QA_FAILED
// *********************************************************************
void Cleanup(int iExitCode)
{
    shrLog("Starting Cleanup...\n\n");

    // Strings filled in while loading the kernel source
    if (cPathAndName != NULL)
    {
        free(cPathAndName);
    }
    if (cSourceCL != NULL)
    {
        free(cSourceCL);
    }

    // OpenCL objects, released in the same order as before
    if (ckKernel != NULL)
    {
        clReleaseKernel(ckKernel);
    }
    if (program != NULL)
    {
        clReleaseProgram(program);
    }
    if (cqCommandQueue != NULL)
    {
        clReleaseCommandQueue(cqCommandQueue);
    }
    if (cxGPUContext != NULL)
    {
        clReleaseContext(cxGPUContext);
    }
    if (cmDevSrcA != NULL)
    {
        clReleaseMemObject(cmDevSrcA);
    }
    if (cmDevSrcB != NULL)
    {
        clReleaseMemObject(cmDevSrcB);
    }
    if (cmDevDst != NULL)
    {
        clReleaseMemObject(cmDevDst);
    }

    // Host buffers
    free(srcA);
    free(srcB);
    free(dst);
    free(Golden);
    if (cdDevices != NULL)
    {
        free(cdDevices);
    }

    // Report the result and exit
    const int iQAStatus = (iExitCode == EXIT_SUCCESS) ? QA_PASSED : QA_FAILED;
    shrQAFinishExit(*gp_argc, (const char **)*gp_argv, iQAStatus);
}

View File

@@ -0,0 +1,198 @@
/*
* Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*
*/
#ifndef OCL_UTILS_H
#define OCL_UTILS_H
// *********************************************************************
// Utilities specific to OpenCL samples in NVIDIA GPU Computing SDK
// *********************************************************************
// Common headers: Cross-API utilities and OpenCL header
#include <shrUtils.h>
// All OpenCL headers
#if defined (__APPLE__) || defined(MACOSX)
#include <OpenCL/opencl.h>
#else
#include <CL/opencl.h>
#endif
// Includes
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
// For systems with CL_EXT that are not updated with these extensions, we copied these
// extensions from <CL/cl_ext.h>
#ifndef CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV
/* cl_nv_device_attribute_query extension - no extension #define since it has no functions */
#define CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV 0x4000
#define CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV 0x4001
#define CL_DEVICE_REGISTERS_PER_BLOCK_NV 0x4002
#define CL_DEVICE_WARP_SIZE_NV 0x4003
#define CL_DEVICE_GPU_OVERLAP_NV 0x4004
#define CL_DEVICE_KERNEL_EXEC_TIMEOUT_NV 0x4005
#define CL_DEVICE_INTEGRATED_MEMORY_NV 0x4006
#endif
// reminders for build output window and log
#ifdef _WIN32
#pragma message ("Note: including shrUtils.h")
#pragma message ("Note: including opencl.h")
#endif
// SDK Revision #
#define OCL_SDKREVISION "7027912"
// Error and Exit Handling Macros...
// *********************************************************************
// Full error handling macro with Cleanup() callback (if supplied)...
// (Companion Inline Function lower on page)
#define oclCheckErrorEX(a, b, c) __oclCheckErrorEX(a, b, c, __FILE__ , __LINE__)
// Short version without Cleanup() callback pointer
// Both Input (a) and Reference (b) are specified as args
#define oclCheckError(a, b) oclCheckErrorEX(a, b, 0)
//////////////////////////////////////////////////////////////////////////////
//! Gets the platform ID for NVIDIA if available, otherwise default to platform 0
//!
//! @return the id
//! @param clSelectedPlatformID OpenCL platform ID
//////////////////////////////////////////////////////////////////////////////
extern "C" cl_int oclGetPlatformID(cl_platform_id* clSelectedPlatformID);
//////////////////////////////////////////////////////////////////////////////
//! Print info about the device
//!
//! @param iLogMode enum LOGBOTH, LOGCONSOLE, LOGFILE
//! @param device OpenCL id of the device
//////////////////////////////////////////////////////////////////////////////
extern "C" void oclPrintDevInfo(int iLogMode, cl_device_id device);
//////////////////////////////////////////////////////////////////////////////
//! Get and return device capability
//!
//! @return the 2 digit integer representation of device Cap (major minor). return -1 if NA
//! @param device OpenCL id of the device
//////////////////////////////////////////////////////////////////////////////
extern "C" int oclGetDevCap(cl_device_id device);
//////////////////////////////////////////////////////////////////////////////
//! Print the device name
//!
//! @param iLogMode enum LOGBOTH, LOGCONSOLE, LOGFILE
//! @param device OpenCL id of the device
//////////////////////////////////////////////////////////////////////////////
extern "C" void oclPrintDevName(int iLogMode, cl_device_id device);
//////////////////////////////////////////////////////////////////////////////
//! Gets the id of the first device from the context
//!
//! @return the id
//! @param cxGPUContext OpenCL context
//////////////////////////////////////////////////////////////////////////////
extern "C" cl_device_id oclGetFirstDev(cl_context cxGPUContext);
//////////////////////////////////////////////////////////////////////////////
//! Gets the id of the nth device from the context
//!
//! @return the id or -1 when out of range
//! @param cxGPUContext OpenCL context
//! @param device_idx index of the device of interest
//////////////////////////////////////////////////////////////////////////////
extern "C" cl_device_id oclGetDev(cl_context cxGPUContext, unsigned int device_idx);
//////////////////////////////////////////////////////////////////////////////
//! Gets the id of device with maximal FLOPS from the context
//!
//! @return the id
//! @param cxGPUContext OpenCL context
//////////////////////////////////////////////////////////////////////////////
extern "C" cl_device_id oclGetMaxFlopsDev(cl_context cxGPUContext);
//////////////////////////////////////////////////////////////////////////////
//! Loads a Program file and prepends the cPreamble to the code.
//!
//! @return the source string if succeeded, 0 otherwise
//! @param cFilename program filename
//! @param cPreamble code that is prepended to the loaded file, typically a set of #defines or a header
//! @param szFinalLength returned length of the code string
//////////////////////////////////////////////////////////////////////////////
extern "C" char* oclLoadProgSource(const char* cFilename, const char* cPreamble, size_t* szFinalLength);
//////////////////////////////////////////////////////////////////////////////
//! Get the binary (PTX) of the program associated with the device
//!
//! @param cpProgram OpenCL program
//! @param cdDevice device of interest
//! @param binary returned code
//! @param length length of returned code
//////////////////////////////////////////////////////////////////////////////
extern "C" void oclGetProgBinary( cl_program cpProgram, cl_device_id cdDevice, char** binary, size_t* length);
//////////////////////////////////////////////////////////////////////////////
//! Get and log the binary (PTX) from the OpenCL compiler for the requested program & device
//!
//! @param cpProgram OpenCL program
//! @param cdDevice device of interest
//! @param const char* cPtxFileName optional PTX file name
//////////////////////////////////////////////////////////////////////////////
extern "C" void oclLogPtx(cl_program cpProgram, cl_device_id cdDevice, const char* cPtxFileName);
//////////////////////////////////////////////////////////////////////////////
//! Get and log the Build Log from the OpenCL compiler for the requested program & device
//!
//! @param cpProgram OpenCL program
//! @param cdDevice device of interest
//////////////////////////////////////////////////////////////////////////////
extern "C" void oclLogBuildInfo(cl_program cpProgram, cl_device_id cdDevice);
// Helper function for De-allocating cl objects
// *********************************************************************
extern "C" void oclDeleteMemObjs(cl_mem* cmMemObjs, int iNumObjs);
// Helper function to get OpenCL error string from constant
// *********************************************************************
extern "C" const char* oclErrorString(cl_int error);
// Helper function to get OpenCL image format string (channel order and type) from constant
// *********************************************************************
extern "C" const char* oclImageFormatString(cl_uint uiImageFormat);
// Companion inline function for the oclCheckErrorEX / oclCheckError macros:
// error checking and exit on error WITH Cleanup Callback (if supplied).
//   iSample    - value under test
//   iReference - value iSample is expected to equal
//   pCleanup   - optional callback, invoked with the error code; when NULL the
//                log is closed and exit() is called with the error code
//   cFile      - file name written to the log message
//   iLine      - line number written to the log message
// *********************************************************************
inline void __oclCheckErrorEX(cl_int iSample, cl_int iReference, void (*pCleanup)(int), const char* cFile, const int iLine)
{
    // Equal values are not an error
    if (iReference == iSample)
    {
        return;
    }

    // A mismatch is an error by definition, so a sample value of 0 is replaced
    // by -9999 before it is reported and used as the exit code
    if (iSample == 0)
    {
        iSample = -9999;
    }

    // Log the error info
    shrLog("\n !!! Error # %i (%s) at line %i , in file %s !!!\n\n", iSample, oclErrorString(iSample), iLine, cFile);

    // Without a cleanup callback, close the log and exit from here
    if (pCleanup == NULL)
    {
        shrLogEx(LOGBOTH | CLOSELOG, 0, "Exiting...\n");
        exit(iSample);
    }

    pCleanup(iSample);
}
#endif

View File

@@ -0,0 +1,238 @@
/*
* Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*
*/
#ifndef SHR_QATEST_H
#define SHR_QATEST_H
// *********************************************************************
// Generic utilities for NVIDIA GPU Computing SDK
// *********************************************************************
// OS dependent includes
#ifdef _WIN32
#pragma message ("Note: including windows.h")
#pragma message ("Note: including math.h")
#pragma message ("Note: including assert.h")
#pragma message ("Note: including time.h")
// Headers needed for Windows
#include <windows.h>
#include <time.h>
#else
// Headers needed for Linux
#include <sys/stat.h>
#include <sys/types.h>
#include <sys/time.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdarg.h>
#include <unistd.h>
#include <time.h>
#endif
#ifndef STRCASECMP
#ifdef _WIN32
#define STRCASECMP _stricmp
#else
#define STRCASECMP strcasecmp
#endif
#endif
#ifndef STRNCASECMP
#ifdef _WIN32
#define STRNCASECMP _strnicmp
#else
#define STRNCASECMP strncasecmp
#endif
#endif
// Standardized QA Start/Finish for CUDA SDK tests
#define shrQAStart(a, b) __shrQAStart(a, b)
#define shrQAFinish(a, b, c) __shrQAFinish(a, b, c)
#define shrQAFinish2(a, b, c, d) __shrQAFinish2(a, b, c, d)
// Returns the index in exec_name of the character that follows the last '\\'
// or '/', or 0 when the string contains neither separator.
inline int findExeNameStart(const char *exec_name)
{
    // Walk backwards from the terminating NUL until a separator or index 0
    int pos = (int)strlen(exec_name);
    for (; pos > 0; --pos)
    {
        const char ch = exec_name[pos];
        if (ch == '\\' || ch == '/')
        {
            break;
        }
    }

    // Index 0 can itself hold a separator, so test the stopping position
    const char stopped_on = exec_name[pos];
    const bool on_separator = (stopped_on == '\\') || (stopped_on == '/');
    return on_separator ? pos + 1 : pos;
}
// Prints the start-of-test banner to stdout.
// With a "qatest" argument (any number of leading '-', case-insensitive) the
// banner is "&&&& RUNNING <exe> <args...>"; otherwise it is "[<exe>] starting...".
// Returns the index in argv[0] at which the executable name starts.
inline int __shrQAStart(int argc, char **argv)
{
    bool bQATest = false;

    // First clear the output buffer
    fflush(stdout);
    fflush(stdout);

    for (int i = 1; i < argc; i++)
    {
        // skip the leading dashes of the argument
        char *string_argv = argv[i];
        while (*string_argv == '-')
        {
            string_argv++;
        }

        if (!STRCASECMP(string_argv, "qatest"))
        {
            bQATest = true;
        }
    }

    // We don't want to print the entire path, so locate the executable name
    const int exename_start = findExeNameStart(argv[0]);
    const char *exe_name = &(argv[0][exename_start]);

    if (!bQATest)
    {
        fprintf(stdout, "[%s] starting...\n", exe_name);
    }
    else
    {
        fprintf(stdout, "&&&& RUNNING %s", exe_name);
        for (int i = 1; i < argc; i++)
        {
            fprintf(stdout, " %s", argv[i]);
        }
        fprintf(stdout, "\n");
    }

    fflush(stdout);
    printf("\n");
    fflush(stdout);
    return exename_start;
}
// Test outcome codes. The numeric values are used as indices into the
// sStatus[] string tables in __shrQAFinish and __shrQAFinish2, so they must
// stay in step with the order of those tables.
enum eQAstatus {
QA_FAILED = 0, // printed as "FAILED"
QA_PASSED = 1, // printed as "PASSED"
QA_WAIVED = 2 // printed as "WAIVED"
};
// Prints a countdown to stdout and returns once `seconds` seconds have passed,
// sleeping one second per iteration.
inline void __ExitInTime(int seconds)
{
    fprintf(stdout, "> exiting in %d seconds: ", seconds);
    fflush(stdout);

    const time_t deadline = time(0) + seconds;
    int remaining = seconds;
    while (time(0) < deadline)
    {
        fprintf(stdout, "%d...", remaining);
#ifdef WIN32
        Sleep(1000);
#else
        sleep(1);
#endif
        remaining--;
    }

    fprintf(stdout,"done!\n\n");
    fflush(stdout);
}
// Prints the end-of-test result to stdout and then waits according to the
// command line flags (leading '-' ignored, case-insensitive):
//   qatest          - print "&&&& <STATUS> <exe> <args...>" instead of the
//                     "[<exe>] test results..." form
//   noprompt / help - return immediately after printing
//   prompt          - wait for <Enter> before returning
//   (none of the last three) - count down 3 seconds via __ExitInTime
// iStatus indexes the table { "FAILED", "PASSED", "WAIVED" }.
inline void __shrQAFinish(int argc, const char **argv, int iStatus)
{
    // By default QATest is disabled and NoPrompt is Enabled (times out at seconds passed into __ExitInTime() )
    bool bQATest = false;
    bool bNoPrompt = true;
    bool bQuitInTime = true;
    const char *sStatus[] = { "FAILED", "PASSED", "WAIVED", NULL };

    for (int i = 1; i < argc; i++)
    {
        // skip the leading dashes of the argument
        const char *string_argv = argv[i];
        while (*string_argv == '-')
        {
            string_argv++;
        }

        if (!STRCASECMP(string_argv, "qatest"))
        {
            bQATest = true;
        }

        // For SDK individual samples that don't specify -noprompt or -prompt,
        // a 3 second delay will happen before exiting, giving a user time to view results
        if (!STRCASECMP(string_argv, "noprompt") || !STRCASECMP(string_argv, "help"))
        {
            bNoPrompt = true;
            bQuitInTime = false;
        }

        if (!STRCASECMP(string_argv, "prompt"))
        {
            bNoPrompt = false;
            bQuitInTime = false;
        }
    }

    const int exename_start = findExeNameStart(argv[0]);
    const char *exe_name = &(argv[0][exename_start]);

    if (!bQATest)
    {
        fprintf(stdout, "[%s] test results...\n%s\n", exe_name, sStatus[iStatus]);
    }
    else
    {
        fprintf(stdout, "&&&& %s %s", sStatus[iStatus], exe_name);
        for (int i = 1; i < argc; i++)
        {
            fprintf(stdout, " %s", argv[i]);
        }
        fprintf(stdout, "\n");
    }

    fflush(stdout);
    printf("\n");
    fflush(stdout);

    // No prompt-related flag was given: timed exit
    if (bQuitInTime)
    {
        __ExitInTime(3);
        return;
    }

    // "noprompt"/"help": nothing more to do
    if (bNoPrompt)
    {
        return;
    }

    fprintf(stdout, "\nPress <Enter> to exit...\n");
    fflush(stdout);
    getchar();
}
// Variant of the QA finish routine in which the caller states explicitly
// whether QA output is wanted, instead of having "qatest" looked up in argv.
// It never waits for <Enter>; it only decides whether to run the 3 second
// countdown.
//
// Arguments noprompt, help and prompt (case-insensitive, leading '-'
// characters ignored) each suppress the countdown.
//
// @param bQATest  true: print "&&&& <STATUS> <exe> <args...>";
//                 false: print "[<exe>] test results..." followed by the status
// @param argc     argument count as passed to main()
// @param argv     argument vector as passed to main()
// @param iStatus  0, 1 or 2, selecting "FAILED", "PASSED" or "WAIVED"; any
//                 other value is reported as "FAILED" instead of indexing
//                 outside the status table
inline void __shrQAFinish2(bool bQATest, int argc, const char **argv, int iStatus)
{
    bool bQuitInTime = true;
    const char *sStatus[] = { "FAILED", "PASSED", "WAIVED", NULL };

    // Only indices 0..2 hold printable strings (index 3 is the NULL
    // terminator). Never let an unexpected status read past the table or hand
    // NULL to "%s"; an unknown status is reported as a failure.
    const int kNumStatus = 3;
    const char *sResult = (iStatus >= 0 && iStatus < kNumStatus) ? sStatus[iStatus] : sStatus[0];

    for (int i = 1; i < argc; i++) {
        // Skip leading dashes so "-flag" and "--flag" are both accepted.
        int string_start = 0;
        while (argv[i][string_start] == '-')
            string_start++;
        const char *string_argv = &argv[i][string_start];

        // For SDK individual samples that don't specify -noprompt or -prompt,
        // a 3 second delay will happen before exiting, giving a user time to view results
        if (!STRCASECMP(string_argv, "noprompt") ||
            !STRCASECMP(string_argv, "help") ||
            !STRCASECMP(string_argv, "prompt")) {
            bQuitInTime = false;
        }
    }

    int exename_start = findExeNameStart(argv[0]);
    if (bQATest) {
        fprintf(stdout, "&&&& %s %s", sResult, &(argv[0][exename_start]));
        for (int i = 1; i < argc; i++) fprintf(stdout, " %s", argv[i]);
        fprintf(stdout, "\n");
    } else {
        fprintf(stdout, "[%s] test results...\n%s\n", &(argv[0][exename_start]), sResult);
    }
    fflush(stdout);

    if (bQuitInTime) {
        __ExitInTime(3);
    }
}
// Reports the QA status through __shrQAFinish() and then terminates the
// process. This function does not return.
//
// @param argc     argument count as passed to main()
// @param argv     argument vector as passed to main()
// @param iStatus  status to report; zero exits with EXIT_FAILURE, every
//                 non-zero value exits with EXIT_SUCCESS
inline void shrQAFinishExit(int argc, const char **argv, int iStatus)
{
    __shrQAFinish(argc, argv, iStatus);

    if (iStatus == 0) {
        exit(EXIT_FAILURE);
    }
    exit(EXIT_SUCCESS);
}
// Reports the QA status through __shrQAFinish2() and then terminates the
// process. This function does not return.
//
// @param bQAtest  forwarded to __shrQAFinish2() to select the output format
// @param argc     argument count as passed to main()
// @param argv     argument vector as passed to main()
// @param iStatus  status to report; zero exits with EXIT_FAILURE, every
//                 non-zero value exits with EXIT_SUCCESS
inline void shrQAFinishExit2(bool bQAtest, int argc, const char **argv, int iStatus)
{
    __shrQAFinish2(bQAtest, argc, argv, iStatus);

    if (iStatus == 0) {
        exit(EXIT_FAILURE);
    }
    exit(EXIT_SUCCESS);
}
#endif

View File

@@ -0,0 +1,642 @@
/*
* Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*
*/
#ifndef SHR_UTILS_H
#define SHR_UTILS_H
// *********************************************************************
// Generic utilities for NVIDIA GPU Computing SDK
// *********************************************************************
// reminders for output window and build log
#ifdef _WIN32
#pragma message ("Note: including windows.h")
#pragma message ("Note: including math.h")
#pragma message ("Note: including assert.h")
#endif
// OS dependent includes
#ifdef _WIN32
// Headers needed for Windows
#include <windows.h>
#else
// Headers needed for Linux
#include <sys/stat.h>
#include <sys/types.h>
#include <sys/time.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdarg.h>
#endif
// Other headers needed for both Windows and Linux
#include <math.h>
#include <assert.h>
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
// Un-comment the following #define to enable profiling code in SDK apps
//#define GPU_PROFILING
// Beginning of GPU Architecture definitions
// Looks up the number of cores per SM for a given SM version.
//
// The version is encoded as 0xMm (M = major, m = minor) and matched against a
// fixed table. Versions that are not in the table, or whose minor part does
// not fit the single hex digit of the encoding, are reported on stdout.
//
// @param major  SM major version
// @param minor  SM minor version (0..15)
// @return       cores per SM for the version, or -1 if it is not in the table
inline int ConvertSMVer2Cores(int major, int minor)
{
    // Defines for GPU Architecture types (using the SM version to determine the # of cores per SM)
    typedef struct {
        int SM;    // 0xMm (hexadecimal notation), M = SM Major version, and m = SM minor version
        int Cores;
    } sSMtoCores;

    // Constant data: build it once instead of on every call.
    static const sSMtoCores nGpuArchCoresPerSM[] =
    { { 0x10,   8 }, // Tesla Generation (SM 1.0) G80 class
      { 0x11,   8 }, // Tesla Generation (SM 1.1) G8x class
      { 0x12,   8 }, // Tesla Generation (SM 1.2) G9x class
      { 0x13,   8 }, // Tesla Generation (SM 1.3) GT200 class
      { 0x20,  32 }, // Fermi Generation (SM 2.0) GF100 class
      { 0x21,  48 }, // Fermi Generation (SM 2.1) GF10x class
      { 0x30, 192 }, // Kepler Generation (SM 3.0) GK10x class
      {   -1,  -1 }  // end-of-table sentinel
    };

    // A minor version above 0xF would spill into the major nibble (1.16 would
    // encode as 0x20 and match SM 2.0), and negative values must not be
    // shifted; such inputs go straight to the "undefined" path.
    if (major >= 0 && minor >= 0 && minor <= 0xF) {
        const int smCode = (major << 4) + minor;
        for (int index = 0; nGpuArchCoresPerSM[index].SM != -1; index++) {
            if (nGpuArchCoresPerSM[index].SM == smCode) {
                return nGpuArchCoresPerSM[index].Cores;
            }
        }
    }

    printf("MapSMtoCores SM %d.%d is undefined (please update to the latest SDK)!\n", major, minor);
    return -1;
}
// end of GPU Architecture definitions
// Defines and enum for use with logging functions
// *********************************************************************
#define DEFAULTLOGFILE "SdkConsoleLog.txt"
#define MASTERLOGFILE "SdkMasterLog.csv"
// Bit flags for the iLogMode argument of shrLogEx(); combine them with
// bitwise OR. LOGBOTH (3) is simply LOGCONSOLE | LOGFILE.
enum LOGMODES
{
LOGCONSOLE = 1, // bit to signal "log to console"
LOGFILE = 2, // bit to signal "log to file"
LOGBOTH = 3, // convenience union of first 2 bits to signal "log to both"
APPENDMODE = 4, // bit to set "file append" mode instead of "replace mode" on open
MASTER = 8, // bit to signal master .csv log output
ERRORMSG = 16, // bit to signal "pre-pend Error"
CLOSELOG = 32 // bit to close log file, if open, after any requested file write
};
#define HDASHLINE "-----------------------------------------------------------\n"
// Standardized boolean
// Boolean result type returned by the shr* helper functions declared in this
// header (shrTRUE on success / condition met, shrFALSE otherwise).
enum shrBOOL
{
shrFALSE = 0,
shrTRUE = 1
};
// Standardized MAX, MIN and CLAMP
// Every macro argument is parenthesized so that expression arguments
// (e.g. MAX(x & mask, y) or MIN(c ? a : b, d)) are compared as a whole.
// NOTE: arguments are still evaluated more than once; do not pass
// expressions with side effects such as i++.
#define MAX(a, b) (((a) > (b)) ? (a) : (b))
#define MIN(a, b) (((a) < (b)) ? (a) : (b))
#define CLAMP(a, b, c) MIN(MAX(a, b), c) // double sided clip of input a
#define TOPCLAMP(a, b) (((a) < (b)) ? (a) : (b)) // single top side clip of input a
// Error and Exit Handling Macros...
// *********************************************************************
// Full error handling macro with Cleanup() callback (if supplied)...
// (Companion Inline Function lower on page)
#define shrCheckErrorEX(a, b, c) __shrCheckErrorEX(a, b, c, __FILE__ , __LINE__)
// Short version without Cleanup() callback pointer
// Both Input (a) and Reference (b) are specified as args
#define shrCheckError(a, b) shrCheckErrorEX(a, b, 0)
// Standardized Exit Macro for leaving main()... extended version
// (Companion Inline Function lower on page)
#define shrExitEX(a, b, c) __shrExitEX(a, b, c)
// Standardized Exit Macro for leaving main()... short version
// (Companion Inline Function lower on page)
#define shrEXIT(a, b) __shrExitEX(a, b, EXIT_SUCCESS)
// Simple argument checker macro
#define ARGCHECK(a) if((a) != shrTRUE)return shrFALSE
// Define for user-customized error handling
#define STDERROR "file %s, line %i\n\n" , __FILE__ , __LINE__
// Function to deallocate memory allocated within shrUtils
// *********************************************************************
extern "C" void shrFree(void* ptr);
// *********************************************************************
// Helper function to log standardized information to Console, to File or to both
//! Examples: shrLogEx(LOGBOTH, 0, "Function A\n");
//! : shrLogEx(LOGBOTH | ERRORMSG, ciErrNum, STDERROR);
//!
//! Automatically opens file and stores handle if needed and not done yet
//! Closes file and nulls handle on request
//!
//! @param 0 iLogMode: LOGCONSOLE, LOGFILE, LOGBOTH, APPENDMODE, MASTER, ERRORMSG, CLOSELOG.
//! LOGFILE and LOGBOTH may be | 'd with APPENDMODE to select file append mode instead of overwrite mode
//! LOGFILE and LOGBOTH may be | 'd with CLOSELOG to "write and close"
//! First 3 options may be | 'd with MASTER to enable independent write to master data log file
//! First 3 options may be | 'd with ERRORMSG to start line with standard error message
//! @param 2 dValue:
//! Positive val = double value for time in secs to be formatted to 6 decimals.
//! Negative val is an error code and this give error preformatting.
//! @param 3 cFormatString: String with formatting specifiers like printf or fprintf.
//! ALL printf flags, width, precision and type specifiers are supported with this exception:
//! Wide char type specifiers intended for wprintf (%S and %C) are NOT supported
//! Single byte char type specifiers (%s and %c) ARE supported
//! @param 4... variable args: like printf or fprintf. Must match format specifer type above.
//! @return 0 if OK, negative value on error or if error occurs or was passed in.
// *********************************************************************
extern "C" int shrLogEx(int iLogMode, int iErrNum, const char* cFormatString, ...);
// Short version of shrLogEx, defaulting to shrLogEx(LOGBOTH, 0, ...)
// *********************************************************************
extern "C" int shrLog(const char* cFormatString, ...);
// *********************************************************************
// Delta timer function for up to 3 independent timers using host high performance counters
// Maintains state for 3 independent counters
//! Example: double dElapsedTime = shrDeltaT(0);
//!
//! @param 0 iCounterID: Which timer to check/reset. (0, 1, 2)
//! @return delta time of specified counter since last call in seconds. Otherwise -9999.0 if error
// *********************************************************************
extern "C" double shrDeltaT(int iCounterID);
// Optional LogFileNameOverride function
// *********************************************************************
extern "C" void shrSetLogFileName (const char* cOverRideName);
// Helper function to init data arrays
// *********************************************************************
extern "C" void shrFillArray(float* pfData, int iSize);
// Helper function to print data arrays
// *********************************************************************
extern "C" void shrPrintArray(float* pfData, int iSize);
////////////////////////////////////////////////////////////////////////////
//! Find the path for a filename
//! @return the path if succeeded, otherwise 0
//! @param filename name of the file
//! @param executablePath optional absolute path of the executable
////////////////////////////////////////////////////////////////////////////
extern "C" char* shrFindFilePath(const char* filename, const char* executablePath);
////////////////////////////////////////////////////////////////////////////
//! Read file \filename containing single precision floating point data
//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE
//! @param filename name of the source file
//! @param data uninitialized pointer, returned initialized and pointing to
//! the data read
//! @param len number of data elements in data, -1 on error
//! @note If a NULL pointer is passed to this function and it is initialized
//! within shrUtils, then free() has to be used to deallocate the memory
////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrReadFilef( const char* filename, float** data, unsigned int* len,
bool verbose = false);
////////////////////////////////////////////////////////////////////////////
//! Read file \filename containing double precision floating point data
//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE
//! @param filename name of the source file
//! @param data uninitialized pointer, returned initialized and pointing to
//! the data read
//! @param len number of data elements in data, -1 on error
//! @note If a NULL pointer is passed to this function and it is initialized
//! within shrUtils, then free() has to be used to deallocate the memory
////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrReadFiled( const char* filename, double** data, unsigned int* len,
bool verbose = false);
////////////////////////////////////////////////////////////////////////////
//! Read file \filename containing integer data
//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE
//! @param filename name of the source file
//! @param data uninitialized pointer, returned initialized and pointing to
//! the data read
//! @param len number of data elements in data, -1 on error
//! @note If a NULL pointer is passed to this function and it is initialized
//! within shrUtils, then free() has to be used to deallocate the memory
////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrReadFilei( const char* filename, int** data, unsigned int* len, bool verbose = false);
////////////////////////////////////////////////////////////////////////////
//! Read file \filename containing unsigned integer data
//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE
//! @param filename name of the source file
//! @param data uninitialized pointer, returned initialized and pointing to
//! the data read
//! @param len number of data elements in data, -1 on error
//! @note If a NULL pointer is passed to this function and it is initialized
//! within shrUtils, then free() has to be used to deallocate the memory
////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrReadFileui( const char* filename, unsigned int** data,
unsigned int* len, bool verbose = false);
////////////////////////////////////////////////////////////////////////////
//! Read file \filename containing char / byte data
//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE
//! @param filename name of the source file
//! @param data uninitialized pointer, returned initialized and pointing to
//! the data read
//! @param len number of data elements in data, -1 on error
//! @note If a NULL pointer is passed to this function and it is initialized
//! within shrUtils, then free() has to be used to deallocate the memory
////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrReadFileb( const char* filename, char** data, unsigned int* len,
bool verbose = false);
////////////////////////////////////////////////////////////////////////////
//! Read file \filename containing unsigned char / byte data
//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE
//! @param filename name of the source file
//! @param data uninitialized pointer, returned initialized and pointing to
//! the data read
//! @param len number of data elements in data, -1 on error
//! @note If a NULL pointer is passed to this function and it is initialized
//! within shrUtils, then free() has to be used to deallocate the memory
////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrReadFileub( const char* filename, unsigned char** data,
unsigned int* len, bool verbose = false);
////////////////////////////////////////////////////////////////////////////
//! Write a data file \filename containing single precision floating point
//! data
//! @return shrTRUE if writing the file succeeded, otherwise shrFALSE
//! @param filename name of the file to write
//! @param data pointer to data to write
//! @param len number of data elements in data, -1 on error
//! @param epsilon epsilon for comparison
////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrWriteFilef( const char* filename, const float* data, unsigned int len,
const float epsilon, bool verbose = false);
////////////////////////////////////////////////////////////////////////////
//! Write a data file \filename containing double precision floating point
//! data
//! @return shrTRUE if writing the file succeeded, otherwise shrFALSE
//! @param filename name of the file to write
//! @param data pointer to data to write
//! @param len number of data elements in data, -1 on error
//! @param epsilon epsilon for comparison
////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrWriteFiled( const char* filename, const float* data, unsigned int len,
const double epsilon, bool verbose = false);
////////////////////////////////////////////////////////////////////////////
//! Write a data file \filename containing integer data
//! @return shrTRUE if writing the file succeeded, otherwise shrFALSE
//! @param filename name of the file to write
//! @param data pointer to data to write
//! @param len number of data elements in data, -1 on error
////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrWriteFilei( const char* filename, const int* data, unsigned int len,
bool verbose = false);
////////////////////////////////////////////////////////////////////////////
//! Write a data file \filename containing unsigned integer data
//! @return shrTRUE if writing the file succeeded, otherwise shrFALSE
//! @param filename name of the file to write
//! @param data pointer to data to write
//! @param len number of data elements in data, -1 on error
////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrWriteFileui( const char* filename, const unsigned int* data,
unsigned int len, bool verbose = false);
////////////////////////////////////////////////////////////////////////////
//! Write a data file \filename containing char / byte data
//! @return shrTRUE if writing the file succeeded, otherwise shrFALSE
//! @param filename name of the file to write
//! @param data pointer to data to write
//! @param len number of data elements in data, -1 on error
////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrWriteFileb( const char* filename, const char* data, unsigned int len,
bool verbose = false);
////////////////////////////////////////////////////////////////////////////
//! Write a data file \filename containing unsigned char / byte data
//! @return shrTRUE if writing the file succeeded, otherwise shrFALSE
//! @param filename name of the file to write
//! @param data pointer to data to write
//! @param len number of data elements in data, -1 on error
////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrWriteFileub( const char* filename, const unsigned char* data,
unsigned int len, bool verbose = false);
////////////////////////////////////////////////////////////////////////////
//! Load PPM image file (with unsigned char as data element type), padding
//! 4th component
//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE
//! @param file name of the image file
//! @param OutData handle to the data read
//! @param w width of the image
//! @param h height of the image
//!
//! Note: If *OutData is NULL this function allocates buffer that must be freed by caller
////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrLoadPPM4ub(const char* file, unsigned char** OutData,
unsigned int *w, unsigned int *h);
////////////////////////////////////////////////////////////////////////////
//! Save PPM image file (with unsigned char as data element type, padded to
//! 4 bytes)
//! @return shrTRUE if saving the file succeeded, otherwise shrFALSE
//! @param file name of the image file
//! @param data handle to the data read
//! @param w width of the image
//! @param h height of the image
////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrSavePPM4ub( const char* file, unsigned char *data,
unsigned int w, unsigned int h);
////////////////////////////////////////////////////////////////////////////////
//! Save PGM image file (with unsigned char as data element type)
//! @return shrTRUE if saving the file succeeded, otherwise shrFALSE
//! @param file name of the image file
//! @param data handle to the data read
//! @param w width of the image
//! @param h height of the image
////////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrSavePGMub( const char* file, unsigned char *data,
unsigned int w, unsigned int h);
////////////////////////////////////////////////////////////////////////////
//! Load PGM image file (with unsigned char as data element type)
//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE
//! @param file name of the image file
//! @param data handle to the data read
//! @param w width of the image
//! @param h height of the image
//! @note If a NULL pointer is passed to this function and it is initialized
//! within shrUtils, then free() has to be used to deallocate the memory
////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrLoadPGMub( const char* file, unsigned char** data,
unsigned int *w,unsigned int *h);
////////////////////////////////////////////////////////////////////////////
// Command line arguments: General notes
// * All command line arguments begin with '--' followed by the token;
// token and value are separated by '='; example --samples=50
// * Arrays have the form --model=[one.obj,two.obj,three.obj]
// (without whitespaces)
////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////
//! Check if command line argument \a flag-name is given
//! @return shrTRUE if command line argument \a flag_name has been given,
//! otherwise shrFALSE
//! @param argc argc as passed to main()
//! @param argv argv as passed to main()
//! @param flag_name name of command line flag
////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrCheckCmdLineFlag( const int argc, const char** argv,
const char* flag_name);
////////////////////////////////////////////////////////////////////////////
//! Get the value of a command line argument of type int
//! @return shrTRUE if command line argument \a arg_name has been given and
//! is of the requested type, otherwise shrFALSE
//! @param argc argc as passed to main()
//! @param argv argv as passed to main()
//! @param arg_name name of the command line argument
//! @param val value of the command line argument
////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrGetCmdLineArgumenti( const int argc, const char** argv,
const char* arg_name, int* val);
////////////////////////////////////////////////////////////////////////////
//! Get the value of a command line argument of type unsigned int
//! @return shrTRUE if command line argument \a arg_name has been given and
//! is of the requested type, otherwise shrFALSE
//! @param argc argc as passed to main()
//! @param argv argv as passed to main()
//! @param arg_name name of the command line argument
//! @param val value of the command line argument
////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrGetCmdLineArgumentu( const int argc, const char** argv,
const char* arg_name, unsigned int* val);
////////////////////////////////////////////////////////////////////////////
//! Get the value of a command line argument of type float
//! @return shrTRUE if command line argument \a arg_name has been given and
//! is of the requested type, otherwise shrFALSE
//! @param argc argc as passed to main()
//! @param argv argv as passed to main()
//! @param arg_name name of the command line argument
//! @param val value of the command line argument
////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrGetCmdLineArgumentf( const int argc, const char** argv,
const char* arg_name, float* val);
////////////////////////////////////////////////////////////////////////////
//! Get the value of a command line argument of type string
//! @return shrTRUE if command line argument \a arg_name has been given and
//! is of the requested type, otherwise shrFALSE
//! @param argc argc as passed to main()
//! @param argv argv as passed to main()
//! @param arg_name name of the command line argument
//! @param val value of the command line argument
////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrGetCmdLineArgumentstr( const int argc, const char** argv,
const char* arg_name, char** val);
////////////////////////////////////////////////////////////////////////////
//! Get the value of a command line argument list whose elements are strings
//! @return shrTRUE if command line argument \a arg_name has been given and
//! is of the requested type, otherwise shrFALSE
//! @param argc argc as passed to main()
//! @param argv argv as passed to main()
//! @param arg_name name of the command line argument
//! @param val command line argument list
//! @param len length of the list / number of elements
////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrGetCmdLineArgumentListstr( const int argc, const char** argv,
const char* arg_name, char** val,
unsigned int* len);
////////////////////////////////////////////////////////////////////////////
//! Compare two float arrays
//! @return shrTRUE if \a reference and \a data are identical, otherwise shrFALSE
//! @param reference handle to the reference data / gold image
//! @param data handle to the computed data
//! @param len number of elements in reference and data
////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrComparef( const float* reference, const float* data,
const unsigned int len);
////////////////////////////////////////////////////////////////////////////
//! Compare two integer arrays
//! @return shrTRUE if \a reference and \a data are identical, otherwise shrFALSE
//! @param reference handle to the reference data / gold image
//! @param data handle to the computed data
//! @param len number of elements in reference and data
////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrComparei( const int* reference, const int* data,
const unsigned int len );
////////////////////////////////////////////////////////////////////////////////
//! Compare two unsigned integer arrays, with epsilon and threshold
//! @return shrTRUE if \a reference and \a data are identical, otherwise shrFALSE
//! @param reference handle to the reference data / gold image
//! @param data handle to the computed data
//! @param len number of elements in reference and data
//! @param threshold tolerance % # of comparison errors (0.15f = 15%)
////////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrCompareuit( const unsigned int* reference, const unsigned int* data,
const unsigned int len, const float epsilon, const float threshold );
////////////////////////////////////////////////////////////////////////////
//! Compare two unsigned char arrays
//! @return shrTRUE if \a reference and \a data are identical, otherwise shrFALSE
//! @param reference handle to the reference data / gold image
//! @param data handle to the computed data
//! @param len number of elements in reference and data
////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrCompareub( const unsigned char* reference, const unsigned char* data,
const unsigned int len );
////////////////////////////////////////////////////////////////////////////////
//! Compare two integers with a tolerance for # of byte errors
//! @return shrTRUE if \a reference and \a data are identical, otherwise shrFALSE
//! @param reference handle to the reference data / gold image
//! @param data handle to the computed data
//! @param len number of elements in reference and data
//! @param epsilon epsilon to use for the comparison
//! @param threshold tolerance % # of comparison errors (0.15f = 15%)
////////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrCompareubt( const unsigned char* reference, const unsigned char* data,
const unsigned int len, const float epsilon, const float threshold );
////////////////////////////////////////////////////////////////////////////////
//! Compare two integer arrays with an epsilon tolerance for equality
//! @return shrTRUE if \a reference and \a data are identical, otherwise shrFALSE
//! @param reference handle to the reference data / gold image
//! @param data handle to the computed data
//! @param len number of elements in reference and data
//! @param epsilon epsilon to use for the comparison
////////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrCompareube( const unsigned char* reference, const unsigned char* data,
const unsigned int len, const float epsilon );
////////////////////////////////////////////////////////////////////////////
//! Compare two float arrays with an epsilon tolerance for equality
//! @return shrTRUE if \a reference and \a data are identical, otherwise shrFALSE
//! @param reference handle to the reference data / gold image
//! @param data handle to the computed data
//! @param len number of elements in reference and data
//! @param epsilon epsilon to use for the comparison
////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrComparefe( const float* reference, const float* data,
const unsigned int len, const float epsilon );
////////////////////////////////////////////////////////////////////////////////
//! Compare two float arrays with an epsilon tolerance for equality and a
//! threshold for # pixel errors
//! @return shrTRUE if \a reference and \a data are identical, otherwise shrFALSE
//! @param reference handle to the reference data / gold image
//! @param data handle to the computed data
//! @param len number of elements in reference and data
//! @param epsilon epsilon to use for the comparison
////////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrComparefet( const float* reference, const float* data,
const unsigned int len, const float epsilon, const float threshold );
////////////////////////////////////////////////////////////////////////////
//! Compare two float arrays using L2-norm with an epsilon tolerance for
//! equality
//! @return shrTRUE if \a reference and \a data are identical, otherwise shrFALSE
//! @param reference handle to the reference data / gold image
//! @param data handle to the computed data
//! @param len number of elements in reference and data
//! @param epsilon epsilon to use for the comparison
////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrCompareL2fe( const float* reference, const float* data,
const unsigned int len, const float epsilon );
////////////////////////////////////////////////////////////////////////////////
//! Compare two PPM image files with an epsilon tolerance for equality
//! @return shrTRUE if \a reference and \a data are identical, otherwise shrFALSE
//! @param src_file filename for the image to be compared
//! @param ref_file filename for the reference data / gold image
//! @param epsilon epsilon to use for the comparison
//! @param threshold threshold of pixels that can still mismatch to pass (i.e. 0.15f = 15% must pass)
//! $param verboseErrors output details of image mismatch to std::err
////////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrComparePPM( const char *src_file, const char *ref_file, const float epsilon, const float threshold);
////////////////////////////////////////////////////////////////////////////////
//! Compare two PGM image files with an epsilon tolerance for equality
//! @return shrTRUE if \a reference and \a data are identical, otherwise shrFALSE
//! @param src_file filename for the image to be compared
//! @param ref_file filename for the reference data / gold image
//! @param epsilon epsilon to use for the comparison
//! @param threshold threshold of pixels that can still mismatch to pass (i.e. 0.15f = 15% must pass)
//! $param verboseErrors output details of image mismatch to std::err
////////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrComparePGM( const char *src_file, const char *ref_file, const float epsilon, const float threshold);
//! Load a raw file into memory.
//! NOTE(review): presumably reads \a size bytes from \a filename and returns a newly
//! allocated buffer owned by the caller -- confirm against the implementation.
extern "C" unsigned char* shrLoadRawFile(const char* filename, size_t size);
//! Helper used by callers to derive an ND-range global work size from a work-group size.
//! NOTE(review): presumably returns \a global_size rounded up to the next multiple of
//! \a group_size -- confirm against the implementation.
extern "C" size_t shrRoundUp(int group_size, int global_size);
// Companion inline function for error checking WITH an optional Cleanup Callback.
// When iSample differs from iReference the mismatch is logged together with its
// file/line; the callback is then invoked with EXIT_FAILURE if one was supplied,
// otherwise the log is closed and the process exits with EXIT_FAILURE.
// *********************************************************************
inline void __shrCheckErrorEX(int iSample, int iReference, void (*pCleanup)(int), const char* cFile, const int iLine)
{
    // Values agree: nothing to report
    if (iReference == iSample)
    {
        return;
    }

    shrLogEx(LOGBOTH | ERRORMSG, iSample, "line %i , in file %s !!!\n\n" , iLine, cFile);

    // No callback supplied: close the log and terminate right here
    if (pCleanup == NULL)
    {
        shrLogEx(LOGBOTH | CLOSELOG, 0, "Exiting...\n");
        exit(EXIT_FAILURE);
    }

    // Hand control to the caller-supplied cleanup routine
    pCleanup(EXIT_FAILURE);
}
// Standardized Exit: optionally waits for <Enter>, closes the log, flushes stderr
// and terminates the process with iExitCode.
// *********************************************************************
inline void __shrExitEX(int argc, const char** argv, int iExitCode)
{
    // WIN32 builds prompt unless "noprompt" is given; other builds prompt only when
    // "prompt" is given. "qatest" suppresses the prompt in both cases.
#ifdef WIN32
    const bool bWaitForKey = !shrCheckCmdLineFlag(argc, argv, "noprompt") && !shrCheckCmdLineFlag(argc, argv, "qatest");
#else
    const bool bWaitForKey = shrCheckCmdLineFlag(argc, argv, "prompt") && !shrCheckCmdLineFlag(argc, argv, "qatest");
#endif

    if (!bWaitForKey)
    {
        shrLogEx(LOGBOTH | CLOSELOG, 0, "%s Exiting...\n", argv[0]);
    }
    else
    {
        shrLogEx(LOGBOTH | CLOSELOG, 0, "\nPress <Enter> to Quit...\n");
        getchar();
    }

    fflush(stderr);
    exit(iExitCode);
}
#endif

View File

@@ -0,0 +1,66 @@
# Builds the VectorHypot OpenCL sample for the Vortex RISC-V target
# (.elf/.hex/.dump) plus a user-mode QEMU variant (.qemu).
PROJECT = VectorHypot

# Remove a half-written target (e.g. the redirected .dump) when its recipe fails,
# so a truncated file is never mistaken for an up-to-date one.
.DELETE_ON_ERROR:

# Toolchain, POCL and Vortex runtime locations; override from the command line
# or the environment when the default relative layout does not apply.
RISCV_TOOL_PATH ?= $(wildcard ../../../../riscv-gnu-toolchain/drops)
POCL_CC_PATH ?= $(wildcard ../../../../pocl/drops_riscv_cc)
POCL_INC_PATH ?= $(wildcard ../include)
POCL_LIB_PATH ?= $(wildcard ../lib)
VX_RT_PATH ?= $(wildcard ../../../runtime)
VX_SIMX_PATH ?= $(wildcard ../../../simX/obj_dir)

# Cross tools
CC = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-gcc
CXX = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-g++
DMP = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-objdump
HEX = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-objcopy
GDB = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-gdb

# Vortex runtime sources compiled directly into the .elf image
VX_SRCS = $(VX_RT_PATH)/newlib/newlib.c
VX_SRCS += $(VX_RT_PATH)/startup/vx_start.S
VX_SRCS += $(VX_RT_PATH)/intrinsics/vx_intrinsics.s
VX_SRCS += $(VX_RT_PATH)/io/vx_io.s $(VX_RT_PATH)/io/vx_io.c
VX_SRCS += $(VX_RT_PATH)/fileio/fileio.s
VX_SRCS += $(VX_RT_PATH)/tests/tests.c
VX_SRCS += $(VX_RT_PATH)/vx_api/vx_api.c

VX_CFLAGS = -nostartfiles -Wl,-Bstatic,-T,$(VX_RT_PATH)/mains/vortex_link.ld

CXXFLAGS = -g -O0 -march=rv32im -mabi=ilp32
CXXFLAGS += -ffreestanding # program may not begin at main()
CXXFLAGS += -Wl,--gc-sections # enable garbage collection of unused input sections
CXXFLAGS += -fno-rtti -fno-non-call-exceptions # disable RTTI and exceptions
CXXFLAGS += -I$(POCL_INC_PATH) -I.

# --whole-archive keeps every object of the kernel library in the link
VX_LIBS = -Wl,--whole-archive lib$(PROJECT).a -Wl,--no-whole-archive $(POCL_LIB_PATH)/libOpenCL.a
QEMU_LIBS = $(VX_RT_PATH)/qemu/vx_api.c -Wl,--whole-archive lib$(PROJECT).a -Wl,--no-whole-archive $(POCL_LIB_PATH)/libOpenCL.a

# Command targets that do not name files
.PHONY: all run qemu gdb-s gdb-c clean

all: $(PROJECT).dump $(PROJECT).hex

# Compile the OpenCL kernel into a static library with poclcc. The recipe compiles
# the declared prerequisite ($<) so the rule and the compiled file cannot drift apart.
lib$(PROJECT).a: VectorHypot.cl
	POCL_DEBUG=all POCL_DEBUG_LLVM_PASSES=1 LD_LIBRARY_PATH=$(RISCV_TOOL_PATH)/lib:$(POCL_CC_PATH)/lib $(POCL_CC_PATH)/bin/poclcc -o $@ $<

# Bare-metal image for the Vortex simulator
$(PROJECT).elf: main.cc lib$(PROJECT).a
	$(CXX) $(CXXFLAGS) $(VX_CFLAGS) $(VX_SRCS) main.cc $(VX_LIBS) -o $@

# User-mode image for qemu-riscv32
$(PROJECT).qemu: main.cc lib$(PROJECT).a
	$(CXX) $(CXXFLAGS) main.cc $(QEMU_LIBS) -o $@

$(PROJECT).hex: $(PROJECT).elf
	$(HEX) -O ihex $< $@

$(PROJECT).dump: $(PROJECT).elf
	$(DMP) -D $< > $@

# Run on the simX emulator; its stdout goes to emulator.debug
run: $(PROJECT).hex
	POCL_DEBUG=all $(VX_SIMX_PATH)/Vcache_simX -E -a rv32i --core $(PROJECT).hex -s -b 1> emulator.debug

qemu: $(PROJECT).qemu
	POCL_DEBUG=all $(RISCV_TOOL_PATH)/bin/qemu-riscv32 -d in_asm -D debug.log $(PROJECT).qemu

# gdb-s starts QEMU waiting for a debugger on port 1234; gdb-c starts the gdb client
gdb-s: $(PROJECT).qemu
	POCL_DEBUG=all $(RISCV_TOOL_PATH)/bin/qemu-riscv32 -g 1234 -d in_asm -D debug.log $(PROJECT).qemu

gdb-c: $(PROJECT).qemu
	$(GDB) $(PROJECT).qemu

clean:
	rm -rf *.elf *.dump *.hex

View File

@@ -0,0 +1,41 @@
/*
* Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*
*/
// OpenCL Kernel Function Naive Implementation for hypotenuse
//
// Each work item processes one float4 (4 hypotenuses).
//   fg4A, fg4B      : input 'leg' arrays, indexed as float4
//   fg4Hypot        : output array, indexed as float4
//   uiOffset        : offset, in float4 units, added to the global id
//   iInnerLoopCount : number of times the computation is repeated to lengthen the kernel;
//                     a value <= 0 leaves the result at 0.0f
//   uiNumElements   : total number of float elements per array (4 per float4)
__kernel void VectorHypot(__global float4* fg4A, __global float4* fg4B, __global float4* fg4Hypot, unsigned int uiOffset, int iInnerLoopCount, unsigned int uiNumElements)
{
    // get index into global data array
    size_t szGlobalOffset = get_global_id(0) + uiOffset;

    // bound check: the arrays are indexed as float4, so they hold uiNumElements / 4 entries.
    // Comparing against uiNumElements itself would let work items created by rounding up
    // the global work size access memory past the end of the buffers.
    if (szGlobalOffset >= uiNumElements / 4)
    {
        return;
    }

    // Processing 4 elements per work item, so read fgA and fgB source values from GMEM
    float4 f4A = fg4A[szGlobalOffset];
    float4 f4B = fg4B[szGlobalOffset];
    float4 f4H = (float4)0.0f;

    // Get the hypotenuses of the vectors of 'legs', but exaggerate the time needed with loop
    for (int i = 0; i < iInnerLoopCount; i++)
    {
        // compute the 4 hypotenuses using built-in function
        f4H.x = hypot (f4A.x, f4B.x);
        f4H.y = hypot (f4A.y, f4B.y);
        f4H.z = hypot (f4A.z, f4B.z);
        f4H.w = hypot (f4A.w, f4B.w);
    }

    // Write 4 result values back out to GMEM
    fg4Hypot[szGlobalOffset] = f4H;
}

View File

@@ -0,0 +1,686 @@
/*
* Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*
*/
// *********************************************************************
// oclCopyComputeOverlap Notes:
//
// OpenCL API demo application for NVIDIA CUDA GPU's that implements a
// element by element vector hypotenuse computation using 2 input float arrays
// and 1 output float array.
//
// Demonstrates host->GPU and GPU->host copies that are asynchronous/overlapped
// with respect to GPU computation (and with respect to host thread).
//
// Because the overlap achievable for this computation and data set on a given system depends upon the GPU being used and the
// GPU/Host bandwidth, the sample adjusts the computation duration to test the most ideal case and test against a consistent standard.
// This sample should be able to achieve up to 30% overlap on GPU's arch 1.2 and 1.3, and up to 50% on arch 2.0+ (Fermi) GPU's.
//
// After setup, warmup and calibration to the system, the sample runs 2 scenarios:
// A) Computations with 2 command queues on GPU
// A multiple-cycle sequence is executed, timed and compared against the host
// B) Computations with 1 command queue on GPU
// A multiple-cycle sequence is executed, timed and compared against the host
//
// The 2-command queue approach ought to be substantially faster
//
// For developmental purposes, the "iInnerLoopCount" variable passes into kernel and independently
// increases compute time without increasing data size (via a loop inside the kernel)
//
// At some value of iInnerLoopCount, # of elements, workgroup size, etc the Overlap percentage should reach 30%:
// (This ~naively assumes time H2D bandwidth is the same as D2H bandwidth, but this is close on most systems)
//
// If we name the time to copy a single input vector H2D (or output vector D2H) as "T", then the optimum comparison case is:
//
// Single Queue with all the data and all the work
// Ttot (serial) = 4T + 4T + 2T = 10T
//
// Dual Queue, where each queue has 1/2 the data and 1/2 the work
// Tq0 (overlap) = 2T + 2T + T ....
// Tq1 (overlap) = .... 2T + 2T + T
//
// Ttot (elapsed, wall) = 2T + 2T + 2T + T = 7T
//
// Best Overlap % = 100.0 * (10T - 7T)/10T = 30.0 % (Tesla arch 1.2 or 1.3, single copy engine)
//
// For multiple independent cycles using arch >= 2.0 with 2 copy engines, input and output copies can also be overlapped.
// This doesn't help for the first cycle, but theoretically can lead to 50% overlap over many independent cycles.
// *********************************************************************
// common SDK header for standard utilities and system libs
#include <oclUtils.h>
#include <shrQATest.h>
// Best possible and Min ratio of compute/copy overlap timing benefit to pass the test
// values greater than 0.0f represent a speed-up relative to non-overlapped
// EXPECTED_OVERLAP / EXPECTED_OVERLAP_FERMI are overlap percentages; main() multiplies
// them by PASS_FACTOR to obtain the minimum acceptable measured overlap.
#define EXPECTED_OVERLAP 30.0f
#define EXPECTED_OVERLAP_FERMI 45.0f
#define PASS_FACTOR 0.60f
// Number of additional timing runs main() performs when the measured overlap is not acceptable
#define RETRIES_ON_FAILURE 1
// Base sizes for parameters manipulated dynamically or on the command line
// (main() scales these by the "workgroupmult" and "sizemult" command line values)
#define BASE_WORK_ITEMS 64
#define BASE_ARRAY_LENGTH 40000
#define BASE_LOOP_COUNT 32
// Vars
// *********************************************************************
cl_platform_id cpPlatform; // OpenCL platform
cl_context cxGPUContext; // OpenCL context
cl_command_queue cqCommandQueue[2]; // OpenCL command queues
cl_device_id* cdDevices; // OpenCL device list
cl_program cpProgram; // OpenCL program
cl_kernel ckKernel[2]; // OpenCL kernel, 1 per queue
cl_mem cmPinnedSrcA; // OpenCL pinned host source buffer A
cl_mem cmPinnedSrcB; // OpenCL pinned host source buffer B
cl_mem cmPinnedResult; // OpenCL pinned host result buffer
float* fSourceA = NULL; // Mapped pointer for pinned Host source A buffer
float* fSourceB = NULL; // Mapped pointer for pinned Host source B buffer
float* fResult = NULL; // Mapped pointer for pinned Host result buffer
cl_mem cmDevSrcA; // OpenCL device source buffer A
cl_mem cmDevSrcB; // OpenCL device source buffer B
cl_mem cmDevResult; // OpenCL device result buffer
size_t szBuffBytes; // Size of main buffers
size_t szGlobalWorkSize; // 1D var for Total # of work items in the launched ND range
size_t szLocalWorkSize = BASE_WORK_ITEMS; // initial # of work items in the work group
cl_int ciErrNum; // Error code var
char* cPathAndName = NULL; // Var for full paths to data, src, etc.
char* cSourceCL = NULL; // Buffer to hold source for compilation
const char* cExecutableName = NULL; // Set to argv[0] at the start of main()
// demo config vars
const char* cSourceFile = "VectorHypot.cl"; // OpenCL computation kernel source code
float* Golden = NULL; // temp buffer to hold golden results for cross check
bool bNoPrompt = false; // Command line switch to skip exit prompt
bool bQATest = false; // Command line switch to test
// Forward Declarations
// *********************************************************************
double DualQueueSequence(int iCycles, unsigned int uiNumElements, bool bShowConfig);
double OneQueueSequence(int iCycles, unsigned int uiNumElements, bool bShowConfig);
int AdjustCompute(cl_device_id cdTargetDevice, unsigned int uiNumElements, int iInitialLoopCount, int iCycles);
void VectorHypotHost(const float* pfData1, const float* pfData2, float* pfResult, unsigned int uiNumElements, int iInnerLoopCount);
void Cleanup (int iExitCode);
// Cleanup callback handed to the oclCheckErrorEX error-check macro
void (*pCleanup)(int) = &Cleanup;
// Pointers to main()'s argc/argv, set at the start of main()
int *gp_argc = 0;
const char *** gp_argv = NULL;
// Main function
// *********************************************************************
int main(int argc, const char **argv)
{
//Locals
size_t szKernelLength; // Byte size of kernel code
double dBuildTime; // Compile time
cl_uint uiTargetDevice = 0; // Default Device to compute on
cl_uint uiNumDevsUsed = 1; // Number of devices used in this sample
cl_uint uiNumDevices; // Number of devices available
int iDevCap = -1; // Capability of device
int iInnerLoopCount = BASE_LOOP_COUNT; // Varies "compute intensity" per data within the kernel
const int iTestCycles = 10; // How many times to run the external test loop
const int iWarmupCycles = 8; // How many times to run the warmup sequence
cl_uint uiWorkGroupMultiple = 4; // Command line var (using "workgroupmult=<n>") to optionally increase workgroup size
cl_uint uiNumElements = BASE_ARRAY_LENGTH; // initial # of elements per array to process (note: procesing 4 per work item)
cl_uint uiSizeMultiple = 4; // Command line var (using "sizemult=<n>") to optionally increase vector sizes
bool bPassFlag = false; // Var to accumulate test pass/fail
shrBOOL bMatch = shrFALSE; // Cross check result
shrBOOL bTestOverlap = shrFALSE;
double dAvgGPUTime[2] = {0.0, 0.0}; // Average time of iTestCycles calls for 2-Queue and 1-Queue test
double dHostTime[2] = {0.0, 0.0}; // Host computation time (2nd test is redundant but a good stability indicator)
float fMinPassCriteria[2] = {0.0f, 0.0f}; // Test pass cireria, adjusted dependant on GPU arch
gp_argc = &argc;
gp_argv = &argv;
shrQAStart(argc, (char **)argv);
// start logs
cExecutableName = argv[0];
shrSetLogFileName ("oclCopyComputeOverlap.txt");
shrLog("%s Starting...\n\n", argv[0]);
// get basic command line args
bNoPrompt = (shrTRUE == shrCheckCmdLineFlag(argc, argv, "noprompt"));
bQATest = (shrTRUE == shrCheckCmdLineFlag(argc, argv, "qatest"));
shrGetCmdLineArgumentu(argc, argv, "device", &uiTargetDevice);
// Optional Command-line multiplier for vector size
// Default val of 4 gives 10.24 million float elements per vector
// Range of 3 - 16 (7.68 to 40.96 million floats) is reasonable range (if system and GPU have enough memory)
shrGetCmdLineArgumentu(argc, argv, "sizemult", &uiSizeMultiple);
uiSizeMultiple = CLAMP(uiSizeMultiple, 1, 50);
uiNumElements = uiSizeMultiple * BASE_ARRAY_LENGTH * BASE_WORK_ITEMS;
shrLog("Array sizes = %u float elements\n", uiNumElements);
// Optional Command-line multiplier for workgroup size (x 64 work items)
// Default val of 4 gives szLocalWorkSize of 256.
// Range of 1 - 8 (resulting in workgroup sizes of 64 to 512) is reasonable range
shrGetCmdLineArgumentu(argc, argv, "workgroupmult", &uiWorkGroupMultiple);
uiWorkGroupMultiple = CLAMP(uiWorkGroupMultiple, 1, 10);
szLocalWorkSize = uiWorkGroupMultiple * BASE_WORK_ITEMS;
shrLog("Workgroup Size = %u\n\n", szLocalWorkSize);
// Get the NVIDIA platform if available, otherwise use default
shrLog("Get the Platform ID...\n\n");
ciErrNum = oclGetPlatformID(&cpPlatform);
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
// Get OpenCL platform name and version
char cBuffer[256];
ciErrNum = clGetPlatformInfo (cpPlatform, CL_PLATFORM_NAME, sizeof(cBuffer), cBuffer, NULL);
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
shrLog("Platform Name = %s\n\n", cBuffer);
// Get all the devices
shrLog("Get the Device info and select Device...\n");
ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_DEFAULT, 0, NULL, &uiNumDevices);
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
cdDevices = (cl_device_id*)malloc(uiNumDevices * sizeof(cl_device_id));
// Ethans changes
CL_CHECK(clGetPlatformIDs(1, &platform_id, NULL));
CL_CHECK(clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_DEFAULT, 1, &device_id, NULL));
//ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, uiNumDevices, cdDevices, NULL);
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
// Set target device and check capabilities
shrLog(" # of Devices Available = %u\n", uiNumDevices);
uiTargetDevice = CLAMP(uiTargetDevice, 0, (uiNumDevices - 1));
shrLog(" Using Device %u, ", uiTargetDevice);
oclPrintDevName(LOGBOTH, cdDevices[uiTargetDevice]);
iDevCap = oclGetDevCap(cdDevices[uiTargetDevice]);
if (iDevCap > 0) {
shrLog(", Capability = %d.%d\n\n", iDevCap/10, iDevCap%10);
} else {
shrLog("\n\n", iDevCap);
}
if (strstr(cBuffer, "NVIDIA") != NULL)
{
if (iDevCap < 12)
{
shrLog("Device doesn't have overlap capability. Skipping test...\n");
Cleanup (EXIT_SUCCESS);
}
// Device and Platform eligible for overlap testing
bTestOverlap = shrTRUE;
// If device has overlap capability, proceed
fMinPassCriteria[0] = PASS_FACTOR * EXPECTED_OVERLAP; // 1st cycle overlap is same for 1 or 2 copy engines
if (iDevCap != 20)
{
// Single copy engine
fMinPassCriteria[1] = PASS_FACTOR * EXPECTED_OVERLAP; // avg of many cycles
}
else
{
char cDevName[1024];
clGetDeviceInfo(cdDevices[uiTargetDevice], CL_DEVICE_NAME, sizeof(cDevName), &cDevName, NULL);
if(strstr(cDevName, "Quadro")!=0 || strstr(cDevName, "Tesla")!=0)
{
// Tesla or Quadro (arch = 2.0) ... Dual copy engine
fMinPassCriteria[1] = PASS_FACTOR * EXPECTED_OVERLAP_FERMI; // average of many cycles
}
else
{
// Geforce ... Single copy engine
fMinPassCriteria[1] = PASS_FACTOR * EXPECTED_OVERLAP; // average of many cycles
}
}
}
// Create the context
shrLog("clCreateContext...\n");
cxGPUContext = clCreateContext(0, uiNumDevsUsed, &cdDevices[uiTargetDevice], NULL, NULL, &ciErrNum);
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
// Create 2 command-queues
cqCommandQueue[0] = clCreateCommandQueue(cxGPUContext, cdDevices[uiTargetDevice], 0, &ciErrNum);
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
shrLog("clCreateCommandQueue [0]...\n");
cqCommandQueue[1] = clCreateCommandQueue(cxGPUContext, cdDevices[uiTargetDevice], 0, &ciErrNum);
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
shrLog("clCreateCommandQueue [1]...\n");
// Allocate the OpenCL source and result buffer memory objects on GPU device GMEM
szBuffBytes = sizeof(cl_float) * uiNumElements;
cmDevSrcA = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY, szBuffBytes, NULL, &ciErrNum);
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
cmDevSrcB = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY, szBuffBytes, NULL, &ciErrNum);
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
cmDevResult = clCreateBuffer(cxGPUContext, CL_MEM_WRITE_ONLY, szBuffBytes, NULL, &ciErrNum);
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
shrLog("clCreateBuffer (Src A, Src B and Result GPU Device GMEM, 3 x %u floats) ...\n", uiNumElements);
// Allocate pinned source and result host buffers:
// Note: Pinned (Page Locked) memory is needed for async host<->GPU memory copy operations ***
cmPinnedSrcA = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, szBuffBytes, NULL, &ciErrNum);
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
cmPinnedSrcB = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, szBuffBytes, NULL, &ciErrNum);
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
cmPinnedResult = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, szBuffBytes, NULL, &ciErrNum);
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
shrLog("clCreateBuffer (Src A, Src B and Result Pinned Host buffers, 3 x %u floats)...\n\n", uiNumElements);
// Get mapped pointers to pinned input host buffers
// Note: This allows general (non-OpenCL) host functions to access pinned buffers using standard pointers
fSourceA = (cl_float*)clEnqueueMapBuffer(cqCommandQueue[0], cmPinnedSrcA, CL_TRUE, CL_MAP_WRITE, 0, szBuffBytes, 0, NULL, NULL, &ciErrNum);
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
fSourceB = (cl_float*)clEnqueueMapBuffer(cqCommandQueue[0], cmPinnedSrcB, CL_TRUE, CL_MAP_WRITE, 0, szBuffBytes, 0, NULL, NULL, &ciErrNum);
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
fResult = (cl_float*)clEnqueueMapBuffer(cqCommandQueue[0], cmPinnedResult, CL_TRUE, CL_MAP_READ, 0, szBuffBytes, 0, NULL, NULL, &ciErrNum);
//oclCheckErrorEX (ciErrNum, CL_SUCCESS, pCleanup);
shrLog("clEnqueueMapBuffer (Pointers to 3 pinned host buffers)...\n");
// Alloc temp golden buffer for cross checks
Golden = (float*)malloc(szBuffBytes);
//oclCheckErrorEX(Golden != NULL, shrTRUE, pCleanup);
// Read the OpenCL kernel in from source file
cPathAndName = shrFindFilePath(cSourceFile, argv[0]);
//oclCheckError(cPathAndName != NULL, shrTRUE);
cSourceCL = oclLoadProgSource(cPathAndName, "", &szKernelLength);
// oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
shrLog("oclLoadProgSource (%s)...\n", cSourceFile);
// Create the program object
//cpProgram = clCreateProgramWithSource(cxGPUContext, 1, (const char **)&cSourceCL, &szKernelLength, &ciErrNum);
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
shrLog("clCreateProgramWithSource...\n");
cl_program program =
clCreateProgramWithBuiltInKernels(context, 1, &device_id, "VectorHypot", NULL);
// Build the program for the target device
clFinish(cqCommandQueue[0]);
shrDeltaT(0);
ciErrNum = clBuildProgram(program, uiNumDevsUsed, &cdDevices[uiTargetDevice], "-cl-fast-relaxed-math", NULL, NULL);
shrLog("clBuildProgram...");
if (ciErrNum != CL_SUCCESS)
{
// write out standard error, Build Log and PTX, then cleanup and exit
shrLogEx(LOGBOTH | ERRORMSG, (double)ciErrNum, STDERROR);
oclLogBuildInfo(program, oclGetFirstDev(cxGPUContext));
oclLogPtx(program, oclGetFirstDev(cxGPUContext), "VectorHypot.ptx");
Cleanup(EXIT_FAILURE);
}
dBuildTime = shrDeltaT(0);
// Ethan - Kernel Addition
if (program == NULL) {
std::cerr << "Failed to write program binary" << std::endl;
Cleanup(context, queue, program, kernel, memObjects);
return 1;
} else {
std::cout << "Read program from binary." << std::endl;
}
// Create the kernel
ckKernel[0] = clCreateKernel(program, "VectorHypot", &ciErrNum);
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
ckKernel[1] = clCreateKernel(program, "VectorHypot", &ciErrNum);
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
shrLog("clCreateKernel (ckKernel[2])...\n");
// Offsets for 2 queues
cl_uint uiOffset[2] = {0, uiNumElements / (2 * 4)};
// Set the Argument values for the 1st kernel instance (queue 0)
ciErrNum = clSetKernelArg(ckKernel[0], 0, sizeof(cl_mem), (void*)&cmDevSrcA);
ciErrNum |= clSetKernelArg(ckKernel[0], 1, sizeof(cl_mem), (void*)&cmDevSrcB);
ciErrNum |= clSetKernelArg(ckKernel[0], 2, sizeof(cl_mem), (void*)&cmDevResult);
ciErrNum |= clSetKernelArg(ckKernel[0], 3, sizeof(cl_uint), (void*)&uiOffset[0]);
ciErrNum |= clSetKernelArg(ckKernel[0], 4, sizeof(cl_int), (void*)&iInnerLoopCount);
ciErrNum |= clSetKernelArg(ckKernel[0], 5, sizeof(cl_uint), (void*)&uiNumElements);
shrLog("clSetKernelArg ckKernel[0] args 0 - 5...\n");
// Set the Argument values for the 2d kernel instance (queue 1)
ciErrNum |= clSetKernelArg(ckKernel[1], 0, sizeof(cl_mem), (void*)&cmDevSrcA);
ciErrNum |= clSetKernelArg(ckKernel[1], 1, sizeof(cl_mem), (void*)&cmDevSrcB);
ciErrNum |= clSetKernelArg(ckKernel[1], 2, sizeof(cl_mem), (void*)&cmDevResult);
ciErrNum |= clSetKernelArg(ckKernel[1], 3, sizeof(cl_uint), (void*)&uiOffset[1]);
ciErrNum |= clSetKernelArg(ckKernel[1], 4, sizeof(cl_int), (void*)&iInnerLoopCount);
ciErrNum |= clSetKernelArg(ckKernel[1], 5, sizeof(cl_uint), (void*)&uiNumElements);
oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
shrLog("clSetKernelArg ckKernel[1] args 0 - 5...\n\n");
//*******************************************
// Warmup the driver with dual queue sequence
//*******************************************
// Warmup with dual queue sequence for iTestCycles
shrLog("Warmup with 2-Queue sequence, %d cycles...\n", iWarmupCycles);
DualQueueSequence(iWarmupCycles, uiNumElements, false);
// Use single queue config to adjust compute intensity
shrLog("Adjust compute for GPU / system...\n");
iInnerLoopCount = AdjustCompute(cdDevices[uiTargetDevice], uiNumElements, iInnerLoopCount, iTestCycles);
shrLog(" Kernel inner loop count = %d\n", iInnerLoopCount);
//*******************************************
// Run and time with 2 command-queues
//*******************************************
for( int iRun =0; iRun <= RETRIES_ON_FAILURE; ++iRun ) {
// Run the sequence iTestCycles times
dAvgGPUTime[0] = DualQueueSequence(iTestCycles, uiNumElements, false);
// Warmup then Compute on host iTestCycles times (using mapped standard pointer to pinned host cl_mem buffer)
shrLog(" Device vs Host Result Comparison\t: ");
VectorHypotHost(fSourceA, fSourceB, Golden, uiNumElements, iInnerLoopCount);
shrDeltaT(0);
for (int i = 0; i < iTestCycles; i++)
{
VectorHypotHost (fSourceA, fSourceB, Golden, uiNumElements, iInnerLoopCount);
}
dHostTime[0] = shrDeltaT(0)/iTestCycles;
// Compare host and GPU results (using mapped standard pointer to pinned host cl_mem buffer)
bMatch = shrComparefet(Golden, fResult, uiNumElements, 0.0f, 0);
shrLog("gpu %s cpu\n", (bMatch == shrTRUE) ? "MATCHES" : "DOESN'T MATCH");
bPassFlag = (bMatch == shrTRUE);
//*******************************************
// Run and time with 1 command queue
//*******************************************
// Run the sequence iTestCycles times
dAvgGPUTime[1] = OneQueueSequence(iTestCycles, uiNumElements, false);
// Compute on host iTestCycles times (using mapped standard pointer to pinned host cl_mem buffer)
shrLog(" Device vs Host Result Comparison\t: ");
shrDeltaT(0);
for (int i = 0; i < iTestCycles; i++)
{
VectorHypotHost(fSourceA, fSourceB, Golden, (int)uiNumElements, iInnerLoopCount);
}
dHostTime[1] = shrDeltaT(0)/iTestCycles;
// Compare host and GPU results (using mapped standard pointer to pinned host cl_mem buffer)
bMatch = shrComparefet(Golden, fResult, uiNumElements, 0.0f, 0);
shrLog("gpu %s cpu\n", (bMatch == shrTRUE) ? "MATCHES" : "DOESN'T MATCH");
bPassFlag &= (bMatch == shrTRUE);
//*******************************************
// Compare Single and Dual queue timing
shrLog("\nResult Summary:\n");
// Log GPU and CPU Time for 2-queue scenario
shrLog(" Avg GPU Elapsed Time for 2-Queues\t= %.5f s\n", dAvgGPUTime[0]);
shrLog(" Avg Host Elapsed Time\t\t\t= %.5f s\n\n", dHostTime[0]);
// Log GPU and CPU Time for 1-queue scenario
shrLog(" Avg GPU Elapsed Time for 1-Queue\t= %.5f s\n", dAvgGPUTime[1]);
shrLog(" Avg Host Elapsed Time\t\t\t= %.5f s\n\n", dHostTime[1]);
// Log overlap % for GPU (comparison of 2-queue and 1 queue scenarios) and status
double dAvgOverlap = 100.0 * (1.0 - dAvgGPUTime[0]/dAvgGPUTime[1]);
if( bTestOverlap ) {
bool bAvgOverlapOK = (dAvgOverlap >= fMinPassCriteria[1]);
if( iRun == RETRIES_ON_FAILURE || bAvgOverlapOK ) {
shrLog(" Measured and (Acceptable) Avg Overlap\t= %.1f %% (%.1f %%) -> Measured Overlap is %s\n\n", dAvgOverlap, fMinPassCriteria[1], bAvgOverlapOK ? "Acceptable" : "NOT Acceptable");
// Log info to master log in standard format
shrLogEx(LOGBOTH | MASTER, 0, "oclCopyComputeOverlap-Avg, Throughput = %.4f OverlapPercent, Time = %.5f s, Size = %u Elements, NumDevsUsed = %u, Workgroup = %u\n",
dAvgOverlap, dAvgGPUTime[0], uiNumElements, uiNumDevsUsed, szLocalWorkSize);
bPassFlag &= bAvgOverlapOK;
break;
}
}
shrLog(" Measured and (Acceptable) Avg Overlap\t= %.1f %% (%.1f %%) -> Retry %d more time(s)...\n\n", dAvgOverlap, fMinPassCriteria[1], RETRIES_ON_FAILURE - iRun);
}
//*******************************************
// Report pass/fail, cleanup and exit
Cleanup (bPassFlag ? EXIT_SUCCESS : EXIT_FAILURE);
}
// Run 1 queue sequence for n cycles
//
// Refills the pinned source buffers, then iCycles times enqueues on command-queue 0:
// non-blocking writes of both inputs, the kernel launch, and a non-blocking read of the result.
//   iCycles       : number of write/compute/read sequences to enqueue
//   uiNumElements : number of float elements per array (4 are processed per work item)
//   bShowConfig   : when true, logs the work sizes used
// Returns the average elapsed time per cycle as measured with shrDeltaT(0).
// *********************************************************************
double OneQueueSequence(int iCycles, unsigned int uiNumElements, bool bShowConfig)
{
    // Use fresh source Data: (re)initialize pinned host array buffers (using mapped standard pointer to pinned host cl_mem buffer)
    shrFillArray(fSourceA, (int)uiNumElements);
    shrFillArray(fSourceB, (int)uiNumElements);

    // Reset Global work size for 1 command-queue, and log work sizes & dimensions
    szGlobalWorkSize = shrRoundUp((int)szLocalWorkSize, (int)(uiNumElements/4));

    // *** Make sure queues are empty and then start timer
    double dAvgTime = 0.0;
    clFinish(cqCommandQueue[0]);
    clFinish(cqCommandQueue[1]);
    shrDeltaT(0);

    // Run the sequence iCycles times
    for (int i = 0; i < iCycles; i++)
    {
        // Nonblocking Write of all of input data from host to device in command-queue 0
        ciErrNum = clEnqueueWriteBuffer(cqCommandQueue[0], cmDevSrcA, CL_FALSE, 0, szBuffBytes, (void*)&fSourceA[0], 0, NULL, NULL);
        ciErrNum |= clEnqueueWriteBuffer(cqCommandQueue[0], cmDevSrcB, CL_FALSE, 0, szBuffBytes, (void*)&fSourceB[0], 0, NULL, NULL);
        shrCheckError(ciErrNum, CL_SUCCESS);

        // Launch kernel computation, command-queue 0
        ciErrNum = clEnqueueNDRangeKernel(cqCommandQueue[0], ckKernel[0], 1, NULL, &szGlobalWorkSize, &szLocalWorkSize, 0, NULL, NULL);
        //oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);

        // Non Blocking Read of output data from device to host, command-queue 0
        ciErrNum = clEnqueueReadBuffer(cqCommandQueue[0], cmDevResult, CL_FALSE, 0, szBuffBytes, (void*)&fResult[0], 0, NULL, NULL);
        shrCheckError(ciErrNum, CL_SUCCESS);

        // Flush sequence to device (may not be necessary on Linux or WinXP or when using the NVIDIA Tesla Computing Cluster driver)
        clFlush(cqCommandQueue[0]);
    }

    // *** Assure sync to host and return average sequence time
    clFinish(cqCommandQueue[0]);
    dAvgTime = shrDeltaT(0)/(double)iCycles;

    // Log config if asked for
    if (bShowConfig)
    {
        shrLog("\n1-Queue sequence Configuration:\n");
        // size_t values are cast explicitly because the format specifiers are %u
        shrLog(" Global Work Size (per command-queue)\t= %u\n Local Work Size \t\t\t= %u\n # of Work Groups (per command-queue)\t= %u\n # of command-queues\t\t\t= 1\n",
               (unsigned int)szGlobalWorkSize, (unsigned int)szLocalWorkSize, (unsigned int)(szGlobalWorkSize/szLocalWorkSize));
    }
    return dAvgTime;
}
// Run 2 queue sequence for n cycles
//
// Refills the pinned source buffers, then iCycles times splits the data in halves:
// command-queue 0 writes, computes and reads the 1st half while command-queue 1 does the
// same for the 2nd half, with the enqueue/flush order arranged in phases as noted below.
//   iCycles       : number of sequences to enqueue
//   uiNumElements : number of float elements per array (4 are processed per work item)
//   bShowConfig   : when true, logs the work sizes used
// Returns the average elapsed time per cycle as measured with shrDeltaT(0).
// *********************************************************************
double DualQueueSequence(int iCycles, unsigned int uiNumElements, bool bShowConfig)
{
    // Locals
    size_t szHalfBuffer = szBuffBytes / 2;                  // bytes in half of a buffer
    size_t szHalfOffset = szHalfBuffer / sizeof(float);     // float index where the 2nd half starts
    double dAvgTime = 0.0;

    // Use fresh source Data: (re)initialize pinned host array buffers (using mapped standard pointer to pinned host cl_mem buffer)
    shrFillArray(fSourceA, (int)uiNumElements);
    shrFillArray(fSourceB, (int)uiNumElements);

    // Set Global work size for 2 command-queues, and log work sizes & dimensions
    szGlobalWorkSize = shrRoundUp((int)szLocalWorkSize, (int)(uiNumElements/(2 * 4)));

    // Make sure queues are empty and then start timer
    clFinish(cqCommandQueue[0]);
    clFinish(cqCommandQueue[1]);
    shrDeltaT(0);

    for (int i = 0; i < iCycles; i++)
    {
        // Mid Phase 0
        // Nonblocking Write of 1st half of input data from host to device in command-queue 0
        ciErrNum = clEnqueueWriteBuffer(cqCommandQueue[0], cmDevSrcA, CL_FALSE, 0, szHalfBuffer, (void*)&fSourceA[0], 0, NULL, NULL);
        ciErrNum |= clEnqueueWriteBuffer(cqCommandQueue[0], cmDevSrcB, CL_FALSE, 0, szHalfBuffer, (void*)&fSourceB[0], 0, NULL, NULL);
        shrCheckError(ciErrNum, CL_SUCCESS);

        // Push out the write for queue 0 (and prior read from queue 1 at end of loop) to the driver
        // (not necessary on Linux, Mac OSX or WinXP)
        clFlush(cqCommandQueue[0]);
        clFlush(cqCommandQueue[1]);

        // Start Phase 1 ***********************************
        // Launch kernel computation, command-queue 0
        // (Note: The order MATTERS here on Fermi ! THE KERNEL IN THIS PHASE SHOULD BE LAUNCHED BEFORE THE WRITE)
        ciErrNum = clEnqueueNDRangeKernel(cqCommandQueue[0], ckKernel[0], 1, NULL, &szGlobalWorkSize, &szLocalWorkSize, 0, NULL, NULL);
        oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);

        // Nonblocking Write of 2nd half of input data from host to device in command-queue 1
        // (Note: The order MATTERS here on Fermi ! THE KERNEL IN THIS PHASE SHOULD BE LAUNCHED BEFORE THE WRITE)
        ciErrNum = clEnqueueWriteBuffer(cqCommandQueue[1], cmDevSrcA, CL_FALSE, szHalfBuffer, szHalfBuffer, (void*)&fSourceA[szHalfOffset], 0, NULL, NULL);
        ciErrNum |= clEnqueueWriteBuffer(cqCommandQueue[1], cmDevSrcB, CL_FALSE, szHalfBuffer, szHalfBuffer, (void*)&fSourceB[szHalfOffset], 0, NULL, NULL);
        shrCheckError(ciErrNum, CL_SUCCESS);

        // Push out the compute for queue 0 and write for queue 1 to the driver
        // (not necessary on Linux, Mac OSX or WinXP)
        clFlush(cqCommandQueue[0]);
        clFlush(cqCommandQueue[1]);

        // Start Phase 2 ***********************************
        // Launch kernel computation, command-queue 1
        ciErrNum = clEnqueueNDRangeKernel(cqCommandQueue[1], ckKernel[1], 1, NULL, &szGlobalWorkSize, &szLocalWorkSize, 0, NULL, NULL);
        //oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);

        // Non Blocking Read of 1st half of output data from device to host, command-queue 0
        ciErrNum = clEnqueueReadBuffer(cqCommandQueue[0], cmDevResult, CL_FALSE, 0, szHalfBuffer, (void*)&fResult[0], 0, NULL, NULL);
        shrCheckError(ciErrNum, CL_SUCCESS);

        // Push out the compute for queue 1 and the read for queue 0 to the driver
        // (not necessary on Linux, Mac OSX or WinXP)
        clFlush(cqCommandQueue[0]);
        clFlush(cqCommandQueue[1]);

        // Start Phase 0 (Rolls over) ***********************************
        // Non Blocking Read of 2nd half of output data from device to host, command-queue 1
        ciErrNum = clEnqueueReadBuffer(cqCommandQueue[1], cmDevResult, CL_FALSE, szHalfBuffer, szHalfBuffer, (void*)&fResult[szHalfOffset], 0, NULL, NULL);
        shrCheckError(ciErrNum, CL_SUCCESS);
    }

    // *** Sync to host and get average sequence time
    clFinish(cqCommandQueue[0]);
    clFinish(cqCommandQueue[1]);
    dAvgTime = shrDeltaT(0)/(double)iCycles;

    // Log config if asked for
    if (bShowConfig)
    {
        shrLog("\n2-Queue sequence Configuration:\n");
        // size_t values are cast explicitly because the format specifiers are %u
        shrLog(" Global Work Size (per command-queue)\t= %u\n Local Work Size \t\t\t= %u\n # of Work Groups (per command-queue)\t= %u\n # of command-queues\t\t\t= 2\n",
               (unsigned int)szGlobalWorkSize, (unsigned int)szLocalWorkSize, (unsigned int)(szGlobalWorkSize/szLocalWorkSize));
    }
    return dAvgTime;
}
// Function to adjust compute task according to device capability
// This allows a consistent overlap % across a wide variety of GPU's for test purposes
// It also implicitly illustrates the relationship between compute capability and overlap at fixed work size
//
// Times iCycles full-buffer host->device copies on command-queue 0, then iCycles launches of
// ckKernel[0], and scales iInitLoopCount by the measured copy/compute time ratio. The scaled value
// is raised to at least 2, then capped at iInitLoopCount * 4, and set as kernel argument 4 on both kernels.
//
// cdTargetDevice : not referenced by this function
// uiNumElements  : element count used to refill fSourceA / fSourceB; the global work size is derived from uiNumElements/4
// iInitLoopCount : starting loop count that gets scaled
// iCycles        : number of repetitions used for each of the two timing measurements
// returns        : the computed loop count
//
// Side effects: refills fSourceA/fSourceB, overwrites szGlobalWorkSize and ciErrNum, restarts timer 0.
// Every checked OpenCL call is verified on its own (no OR-ing of error codes, which would blend two
// failures into a meaningless number) and failures are routed through oclCheckErrorEX with pCleanup.
// *********************************************************************
int AdjustCompute(cl_device_id cdTargetDevice, unsigned int uiNumElements, int iInitLoopCount, int iCycles)
{
    // Locals
    double dCopyTime, dComputeTime;
    int iComputedLoopCount;

    // Change Source Data
    shrFillArray(fSourceA, (int)uiNumElements);
    shrFillArray(fSourceB, (int)uiNumElements);

    // Reset Global work size for 1 command-queue
    szGlobalWorkSize = shrRoundUp((int)szLocalWorkSize, (int)(uiNumElements/4));

    // *** Make sure queues are empty and then start timer
    clFinish(cqCommandQueue[0]);
    clFinish(cqCommandQueue[1]);
    shrDeltaT(0);

    // Run the copy iCycles times and measure copy time on this system
    for (int i = 0; i < iCycles; i++)
    {
        // Nonblocking Write of all of input data from host to device in command-queue 0
        ciErrNum = clEnqueueWriteBuffer(cqCommandQueue[0], cmDevSrcA, CL_FALSE, 0, szBuffBytes, (void*)&fSourceA[0], 0, NULL, NULL);
        oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
        ciErrNum = clEnqueueWriteBuffer(cqCommandQueue[0], cmDevSrcB, CL_FALSE, 0, szBuffBytes, (void*)&fSourceB[0], 0, NULL, NULL);
        oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
        ciErrNum = clFlush(cqCommandQueue[0]);
        oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
    }
    clFinish(cqCommandQueue[0]);
    dCopyTime = shrDeltaT(0);

    // Run the compute iCycles times and measure compute time on this system
    for (int i = 0; i < iCycles; i++)
    {
        // Launch kernel computation, command-queue 0
        ciErrNum = clEnqueueNDRangeKernel(cqCommandQueue[0], ckKernel[0], 1, NULL, &szGlobalWorkSize, &szLocalWorkSize, 0, NULL, NULL);
        oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
        ciErrNum = clFlush(cqCommandQueue[0]);
        oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
    }
    clFinish(cqCommandQueue[0]);
    dComputeTime = shrDeltaT(0);

    // Determine number of core loop cycles proportional to copy/compute time ratio
    // (the compute time is floored at 1 microsecond so the division below is always defined)
    dComputeTime = MAX(dComputeTime, 1.0e-6);
    iComputedLoopCount = CLAMP(2, (int)((dCopyTime/dComputeTime) * (double)iInitLoopCount), (iInitLoopCount * 4));

    // Assign (not OR) so a stale ciErrNum from earlier code cannot leak into these checks,
    // e.g. when iCycles <= 0 and neither timing loop executed
    ciErrNum = clSetKernelArg(ckKernel[0], 4, sizeof(cl_int), (void*)&iComputedLoopCount);
    oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
    ciErrNum = clSetKernelArg(ckKernel[1], 4, sizeof(cl_int), (void*)&iComputedLoopCount);
    oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);

    return (iComputedLoopCount);
}
// Cleanup/Exit function
// *********************************************************************
void Cleanup (int iExitCode)
{
// Cleanup allocated objects
shrLog("Starting Cleanup...\n\n");
if(cPathAndName)free(cPathAndName);
if(cSourceCL)free(cSourceCL);
if(Golden)free(Golden);
if(ckKernel[0])clReleaseKernel(ckKernel[0]);
if(ckKernel[1])clReleaseKernel(ckKernel[1]);
if(program)clReleaseProgram(program);
if(fSourceA)clEnqueueUnmapMemObject(cqCommandQueue[0], cmPinnedSrcA, (void*)fSourceA, 0, NULL, NULL);
if(fSourceB)clEnqueueUnmapMemObject(cqCommandQueue[0], cmPinnedSrcB, (void*)fSourceB, 0, NULL, NULL);
if(fResult)clEnqueueUnmapMemObject(cqCommandQueue[0], cmPinnedResult, (void*)fResult, 0, NULL, NULL);
if(cmDevSrcA)clReleaseMemObject(cmDevSrcA);
if(cmDevSrcB)clReleaseMemObject(cmDevSrcB);
if(cmDevResult)clReleaseMemObject(cmDevResult);
if(cmPinnedSrcA)clReleaseMemObject(cmPinnedSrcA);
if(cmPinnedSrcB)clReleaseMemObject(cmPinnedSrcB);
if(cmPinnedResult)clReleaseMemObject(cmPinnedResult);
if(cqCommandQueue[0])clReleaseCommandQueue(cqCommandQueue[0]);
if(cqCommandQueue[1])clReleaseCommandQueue(cqCommandQueue[1]);
if(cxGPUContext)clReleaseContext(cxGPUContext);
if(cdDevices)free(cdDevices);
// Master status Pass/Fail (all tests)
shrQAFinishExit( *gp_argc, (const char **)*gp_argv, (iExitCode == EXIT_SUCCESS) ? QA_PASSED : QA_FAILED );
}
// "Golden" Host processing vector hypotenuse function for comparison purposes
// Writes sqrtf(a*a + b*b) for each of the first uiNumElements pairs taken from
// pfData1 / pfData2 into pfResult. iInnerLoopCount is accepted but not used here.
// *********************************************************************
void VectorHypotHost(const float* pfData1, const float* pfData2, float* pfResult, unsigned int uiNumElements, int iInnerLoopCount)
{
    // Walk the three arrays in lock-step, front to back
    const float* const pfEnd = pfData1 + uiNumElements;
    while (pfData1 != pfEnd)
    {
        const float fLegA = *pfData1++;
        const float fLegB = *pfData2++;
        const float fHypot = sqrtf(fLegA * fLegA + fLegB * fLegB);
        *pfResult++ = fHypot;
    }
}

View File

@@ -0,0 +1,198 @@
/*
* Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*
*/
#ifndef OCL_UTILS_H
#define OCL_UTILS_H
// *********************************************************************
// Utilities specific to OpenCL samples in NVIDIA GPU Computing SDK
// *********************************************************************
// Common headers: Cross-API utilities and OpenCL header
#include <shrUtils.h>
// All OpenCL headers
#if defined (__APPLE__) || defined(MACOSX)
#include <OpenCL/opencl.h>
#else
#include <CL/opencl.h>
#endif
// Includes
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
// For systems with CL_EXT that are not updated with these extensions, we copied these
// extensions from <CL/cl_ext.h>
#ifndef CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV
/* cl_nv_device_attribute_query extension - no extension #define since it has no functions */
#define CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV 0x4000
#define CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV 0x4001
#define CL_DEVICE_REGISTERS_PER_BLOCK_NV 0x4002
#define CL_DEVICE_WARP_SIZE_NV 0x4003
#define CL_DEVICE_GPU_OVERLAP_NV 0x4004
#define CL_DEVICE_KERNEL_EXEC_TIMEOUT_NV 0x4005
#define CL_DEVICE_INTEGRATED_MEMORY_NV 0x4006
#endif
// reminders for build output window and log
#ifdef _WIN32
#pragma message ("Note: including shrUtils.h")
#pragma message ("Note: including opencl.h")
#endif
// SDK Revision #
#define OCL_SDKREVISION "7027912"
// Error and Exit Handling Macros...
// *********************************************************************
// Full error handling macro with Cleanup() callback (if supplied)...
// (Companion Inline Function lower on page)
#define oclCheckErrorEX(a, b, c) __oclCheckErrorEX(a, b, c, __FILE__ , __LINE__)
// Short version without Cleanup() callback pointer
// Both Input (a) and Reference (b) are specified as args
#define oclCheckError(a, b) oclCheckErrorEX(a, b, 0)
//////////////////////////////////////////////////////////////////////////////
//! Gets the platform ID for NVIDIA if available, otherwise default to platform 0
//!
//! @return the id
//! @param clSelectedPlatformID OpenCL platform ID
//////////////////////////////////////////////////////////////////////////////
extern "C" cl_int oclGetPlatformID(cl_platform_id* clSelectedPlatformID);
//////////////////////////////////////////////////////////////////////////////
//! Print info about the device
//!
//! @param iLogMode enum LOGBOTH, LOGCONSOLE, LOGFILE
//! @param device OpenCL id of the device
//////////////////////////////////////////////////////////////////////////////
extern "C" void oclPrintDevInfo(int iLogMode, cl_device_id device);
//////////////////////////////////////////////////////////////////////////////
//! Get and return device capability
//!
//! @return the 2 digit integer representation of device Cap (major minor). return -1 if NA
//! @param device OpenCL id of the device
//////////////////////////////////////////////////////////////////////////////
extern "C" int oclGetDevCap(cl_device_id device);
//////////////////////////////////////////////////////////////////////////////
//! Print the device name
//!
//! @param iLogMode enum LOGBOTH, LOGCONSOLE, LOGFILE
//! @param device OpenCL id of the device
//////////////////////////////////////////////////////////////////////////////
extern "C" void oclPrintDevName(int iLogMode, cl_device_id device);
//////////////////////////////////////////////////////////////////////////////
//! Gets the id of the first device from the context
//!
//! @return the id
//! @param cxGPUContext OpenCL context
//////////////////////////////////////////////////////////////////////////////
extern "C" cl_device_id oclGetFirstDev(cl_context cxGPUContext);
//////////////////////////////////////////////////////////////////////////////
//! Gets the id of the nth device from the context
//!
//! @return the id or -1 when out of range
//! @param cxGPUContext OpenCL context
//! @param device_idx index of the device of interest
//////////////////////////////////////////////////////////////////////////////
extern "C" cl_device_id oclGetDev(cl_context cxGPUContext, unsigned int device_idx);
//////////////////////////////////////////////////////////////////////////////
//! Gets the id of device with maximal FLOPS from the context
//!
//! @return the id
//! @param cxGPUContext OpenCL context
//////////////////////////////////////////////////////////////////////////////
extern "C" cl_device_id oclGetMaxFlopsDev(cl_context cxGPUContext);
//////////////////////////////////////////////////////////////////////////////
//! Loads a Program file and prepends the cPreamble to the code.
//!
//! @return the source string if succeeded, 0 otherwise
//! @param cFilename program filename
//! @param cPreamble code that is prepended to the loaded file, typically a set of #defines or a header
//! @param szFinalLength returned length of the code string
//////////////////////////////////////////////////////////////////////////////
extern "C" char* oclLoadProgSource(const char* cFilename, const char* cPreamble, size_t* szFinalLength);
//////////////////////////////////////////////////////////////////////////////
//! Get the binary (PTX) of the program associated with the device
//!
//! @param cpProgram OpenCL program
//! @param cdDevice device of interest
//! @param binary returned code
//! @param length length of returned code
//////////////////////////////////////////////////////////////////////////////
extern "C" void oclGetProgBinary( cl_program cpProgram, cl_device_id cdDevice, char** binary, size_t* length);
//////////////////////////////////////////////////////////////////////////////
//! Get and log the binary (PTX) from the OpenCL compiler for the requested program & device
//!
//! @param cpProgram OpenCL program
//! @param cdDevice device of interest
//! @param const char* cPtxFileName optional PTX file name
//////////////////////////////////////////////////////////////////////////////
extern "C" void oclLogPtx(cl_program cpProgram, cl_device_id cdDevice, const char* cPtxFileName);
//////////////////////////////////////////////////////////////////////////////
//! Get and log the Build Log from the OpenCL compiler for the requested program & device
//!
//! @param cpProgram OpenCL program
//! @param cdDevice device of interest
//////////////////////////////////////////////////////////////////////////////
extern "C" void oclLogBuildInfo(cl_program cpProgram, cl_device_id cdDevice);
// Helper function for De-allocating cl objects
// *********************************************************************
extern "C" void oclDeleteMemObjs(cl_mem* cmMemObjs, int iNumObjs);
// Helper function to get OpenCL error string from constant
// *********************************************************************
extern "C" const char* oclErrorString(cl_int error);
// Helper function to get OpenCL image format string (channel order and type) from constant
// *********************************************************************
extern "C" const char* oclImageFormatString(cl_uint uiImageFormat);
// Companion inline function for the oclCheckErrorEX / oclCheckError macros.
// Compares iSample against iReference; on a mismatch it logs the error together with the
// file/line supplied by the macro and then either calls pCleanup(iSample) or, when no
// callback was given, closes the log and exits the process with iSample as the exit code.
// A mismatching sample of 0 is reported as -9999.
// *********************************************************************
inline void __oclCheckErrorEX(cl_int iSample, cl_int iReference, void (*pCleanup)(int), const char* cFile, const int iLine)
{
    // Matching values mean there is no error to report
    if (iReference == iSample)
    {
        return;
    }

    // A mismatch is an error by definition, so a sample of 0 is replaced with a recognizable error value
    if (iSample == 0)
    {
        iSample = -9999;
    }

    // Log the error info
    shrLog("\n !!! Error # %i (%s) at line %i , in file %s !!!\n\n", iSample, oclErrorString(iSample), iLine, cFile);

    // No cleanup callback: close the log and leave, using the error code as the process exit code
    if (pCleanup == NULL)
    {
        shrLogEx(LOGBOTH | CLOSELOG, 0, "Exiting...\n");
        exit(iSample);
    }

    // Otherwise hand the error code to the caller-supplied cleanup routine
    pCleanup(iSample);
}
#endif

View File

@@ -0,0 +1,238 @@
/*
* Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*
*/
#ifndef SHR_QATEST_H
#define SHR_QATEST_H
// *********************************************************************
// Generic utilities for NVIDIA GPU Computing SDK
// *********************************************************************
// OS dependent includes
#ifdef _WIN32
#pragma message ("Note: including windows.h")
#pragma message ("Note: including math.h")
#pragma message ("Note: including assert.h")
#pragma message ("Note: including time.h")
// Headers needed for Windows
#include <windows.h>
#include <time.h>
#else
// Headers needed for Linux
#include <sys/stat.h>
#include <sys/types.h>
#include <sys/time.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdarg.h>
#include <unistd.h>
#include <time.h>
#endif
#ifndef STRCASECMP
#ifdef _WIN32
#define STRCASECMP _stricmp
#else
#define STRCASECMP strcasecmp
#endif
#endif
#ifndef STRNCASECMP
#ifdef _WIN32
#define STRNCASECMP _strnicmp
#else
#define STRNCASECMP strncasecmp
#endif
#endif
// Standardized QA Start/Finish for CUDA SDK tests
#define shrQAStart(a, b) __shrQAStart(a, b)
#define shrQAFinish(a, b, c) __shrQAFinish(a, b, c)
#define shrQAFinish2(a, b, c, d) __shrQAFinish2(a, b, c, d)
// Returns the index of the first character after the last '\\' or '/' in exec_name,
// or 0 when the string contains no path separator.
inline int findExeNameStart(const char *exec_name)
{
    // Scan backwards from the terminating NUL; the first separator met is the last one in the string
    for (int pos = (int)strlen(exec_name); pos >= 0; --pos)
    {
        const char ch = exec_name[pos];
        if (ch == '\\' || ch == '/')
        {
            return pos + 1;
        }
    }
    return 0;
}
// Prints the standardized start banner for a test run.
// When any argument equals "qatest" (case-insensitive, leading dashes ignored) the banner is
// "&&&& RUNNING <exe> <args...>", otherwise "[<exe>] starting...".
//
// argc, argv : the program's command line
// returns    : index into argv[0] where the executable name starts (after the last '/' or '\\'),
//              or 0 when argv[0] is not available
inline int __shrQAStart(int argc, char **argv)
{
    bool bQATest = false;

    // First clear the output buffer
    fflush(stdout);

    // Look for the "qatest" switch among the arguments (program name excluded)
    for (int i=1; i < argc; i++) {
        int string_start = 0;
        while (argv[i][string_start] == '-')
            string_start++;
        char *string_argv = &argv[i][string_start];
        if (!STRCASECMP(string_argv, "qatest")) {
            bQATest = true;
        }
    }

    // An empty argument vector (argc == 0 or argv[0] == NULL) would otherwise crash inside strlen()
    const char *exe_path = (argc > 0 && argv != NULL && argv[0] != NULL) ? argv[0] : "";

    // We don't want to print the entire path, so we search for the start of the
    // executable name (the character after the last path separator)
    int exename_start = findExeNameStart(exe_path);
    const char *exe_name = &exe_path[exename_start];

    if (bQATest) {
        fprintf(stdout, "&&&& RUNNING %s", exe_name);
        for (int i=1; i < argc; i++) fprintf(stdout, " %s", argv[i]);
        fprintf(stdout, "\n");
    } else {
        fprintf(stdout, "[%s] starting...\n", exe_name);
    }
    fflush(stdout);
    printf("\n"); fflush(stdout);
    return exename_start;
}
// Test outcome codes passed to the shrQAFinish* helpers.
// The numeric values are used directly as indices into the sStatus[] string tables
// { "FAILED", "PASSED", "WAIVED" } in __shrQAFinish() and __shrQAFinish2(),
// so they must remain 0, 1 and 2 in this order.
enum eQAstatus {
QA_FAILED = 0,
QA_PASSED = 1,
QA_WAIVED = 2
};
// Prints a visible countdown and blocks for roughly `seconds` seconds (one tick per second).
// Returns immediately after printing "done!" when seconds <= 0.
inline void __ExitInTime(int seconds)
{
    fprintf(stdout, "> exiting in %d seconds: ", seconds);
    fflush(stdout);
    time_t t;
    int count;
    for (t=time(0)+seconds, count=seconds; time(0) < t; count--) {
        fprintf(stdout, "%d...", count);
        // No newline is written, so flush explicitly or the ticks only show up all at once at the end
        fflush(stdout);
        // Same platform test (_WIN32) as the one that selects <windows.h> / <unistd.h> at the top of this header
#ifdef _WIN32
        Sleep(1000);
#else
        sleep(1);
#endif
    }
    fprintf(stdout,"done!\n\n");
    fflush(stdout);
}
// Prints the standardized result banner for a test run and then optionally waits.
//
// argc, argv : the program's command line; recognized switches (case-insensitive, leading dashes ignored):
//                qatest          -> print "&&&& <STATUS> <exe> <args...>" instead of the human-readable form
//                noprompt / help -> return right after printing
//                prompt          -> wait for <Enter> before returning
//              with none of noprompt/help/prompt given, a 3 second countdown runs before returning
// iStatus    : 0 = FAILED, 1 = PASSED, 2 = WAIVED; any other value is reported as FAILED
inline void __shrQAFinish(int argc, const char **argv, int iStatus)
{
    // By default QATest is disabled and NoPrompt is Enabled (times out at seconds passed into __ExitInTime() )
    bool bQATest = false, bNoPrompt = true, bQuitInTime = true;
    const char *sStatus[] = { "FAILED", "PASSED", "WAIVED", NULL };

    // Never index past the table (or hand the NULL sentinel to %s): unknown codes count as FAILED
    const int iNumStatus = (int)(sizeof(sStatus) / sizeof(sStatus[0])) - 1;
    if (iStatus < 0 || iStatus >= iNumStatus) {
        iStatus = 0;
    }

    for (int i=1; i < argc; i++) {
        int string_start = 0;
        while (argv[i][string_start] == '-')
            string_start++;
        const char *string_argv = &argv[i][string_start];
        if (!STRCASECMP(string_argv, "qatest")) {
            bQATest = true;
        }
        // For SDK individual samples that don't specify -noprompt or -prompt,
        // a 3 second delay will happen before exiting, giving a user time to view results
        if (!STRCASECMP(string_argv, "noprompt") || !STRCASECMP(string_argv, "help")) {
            bNoPrompt = true;
            bQuitInTime = false;
        }
        if (!STRCASECMP(string_argv, "prompt")) {
            bNoPrompt = false;
            bQuitInTime = false;
        }
    }

    // An empty argument vector (argc == 0 or argv[0] == NULL) would otherwise crash inside strlen()
    const char *exe_path = (argc > 0 && argv != NULL && argv[0] != NULL) ? argv[0] : "";
    int exename_start = findExeNameStart(exe_path);
    const char *exe_name = &exe_path[exename_start];

    if (bQATest) {
        fprintf(stdout, "&&&& %s %s", sStatus[iStatus], exe_name);
        for (int i=1; i < argc; i++) fprintf(stdout, " %s", argv[i]);
        fprintf(stdout, "\n");
    } else {
        fprintf(stdout, "[%s] test results...\n%s\n", exe_name, sStatus[iStatus]);
    }
    fflush(stdout);
    printf("\n"); fflush(stdout);
    if (bQuitInTime) {
        __ExitInTime(3);
    } else {
        if (!bNoPrompt) {
            fprintf(stdout, "\nPress <Enter> to exit...\n");
            fflush(stdout);
            getchar();
        }
    }
}
// Variant of __shrQAFinish() where the caller states QA mode explicitly instead of it being
// detected from the command line, and which never waits for <Enter>.
//
// bQATest    : true  -> print "&&&& <STATUS> <exe> <args...>"
//              false -> print "[<exe>] test results..." followed by the status
// argc, argv : the program's command line; any of noprompt / help / prompt (case-insensitive,
//              leading dashes ignored) suppresses the 3 second countdown before returning
// iStatus    : 0 = FAILED, 1 = PASSED, 2 = WAIVED; any other value is reported as FAILED
inline void __shrQAFinish2(bool bQATest, int argc, const char **argv, int iStatus)
{
    bool bQuitInTime = true;
    const char *sStatus[] = { "FAILED", "PASSED", "WAIVED", NULL };

    // Never index past the table (or hand the NULL sentinel to %s): unknown codes count as FAILED
    const int iNumStatus = (int)(sizeof(sStatus) / sizeof(sStatus[0])) - 1;
    if (iStatus < 0 || iStatus >= iNumStatus) {
        iStatus = 0;
    }

    for (int i=1; i < argc; i++) {
        int string_start = 0;
        while (argv[i][string_start] == '-')
            string_start++;
        const char *string_argv = &argv[i][string_start];
        // For SDK individual samples that don't specify -noprompt or -prompt,
        // a 3 second delay will happen before exiting, giving a user time to view results
        if (!STRCASECMP(string_argv, "noprompt") || !STRCASECMP(string_argv, "help")) {
            bQuitInTime = false;
        }
        if (!STRCASECMP(string_argv, "prompt")) {
            bQuitInTime = false;
        }
    }

    // An empty argument vector (argc == 0 or argv[0] == NULL) would otherwise crash inside strlen()
    const char *exe_path = (argc > 0 && argv != NULL && argv[0] != NULL) ? argv[0] : "";
    int exename_start = findExeNameStart(exe_path);
    const char *exe_name = &exe_path[exename_start];

    if (bQATest) {
        fprintf(stdout, "&&&& %s %s", sStatus[iStatus], exe_name);
        for (int i=1; i < argc; i++) fprintf(stdout, " %s", argv[i]);
        fprintf(stdout, "\n");
    } else {
        fprintf(stdout, "[%s] test results...\n%s\n", exe_name, sStatus[iStatus]);
    }
    fflush(stdout);
    if (bQuitInTime) {
        __ExitInTime(3);
    }
}
// Reports the test result via __shrQAFinish() and then terminates the process.
// A status of 0 exits with EXIT_FAILURE; every non-zero status exits with EXIT_SUCCESS.
inline void shrQAFinishExit(int argc, const char **argv, int iStatus)
{
    __shrQAFinish(argc, argv, iStatus);
    if (iStatus != 0)
    {
        exit(EXIT_SUCCESS);
    }
    exit(EXIT_FAILURE);
}
// Reports the test result via __shrQAFinish2() and then terminates the process.
// A status of 0 exits with EXIT_FAILURE; every non-zero status exits with EXIT_SUCCESS.
inline void shrQAFinishExit2(bool bQAtest, int argc, const char **argv, int iStatus)
{
    __shrQAFinish2(bQAtest, argc, argv, iStatus);
    if (iStatus != 0)
    {
        exit(EXIT_SUCCESS);
    }
    exit(EXIT_FAILURE);
}
#endif

View File

@@ -0,0 +1,642 @@
/*
* Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*
*/
#ifndef SHR_UTILS_H
#define SHR_UTILS_H
// *********************************************************************
// Generic utilities for NVIDIA GPU Computing SDK
// *********************************************************************
// reminders for output window and build log
#ifdef _WIN32
#pragma message ("Note: including windows.h")
#pragma message ("Note: including math.h")
#pragma message ("Note: including assert.h")
#endif
// OS dependent includes
#ifdef _WIN32
// Headers needed for Windows
#include <windows.h>
#else
// Headers needed for Linux
#include <sys/stat.h>
#include <sys/types.h>
#include <sys/time.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdarg.h>
#endif
// Other headers needed for both Windows and Linux
#include <math.h>
#include <assert.h>
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
// Un-comment the following #define to enable profiling code in SDK apps
//#define GPU_PROFILING
// Beginning of GPU Architecture definitions
// Maps an SM (compute capability) version to the number of cores per SM.
//
// major, minor : SM version, combined into the lookup key (major << 4) + minor
// returns      : cores per SM for a known version; otherwise prints a notice and returns -1
inline int ConvertSMVer2Cores(int major, int minor)
{
    // One row per known SM version
    typedef struct {
        int SM;    // 0xMm (hexadecimal notation), M = SM Major version, and m = SM minor version
        int Cores; // cores per SM for that version
    } sSMtoCores;

    static const sSMtoCores kCoresPerSM[] =
    {
        { 0x10,   8 }, // Tesla Generation (SM 1.0) G80 class
        { 0x11,   8 }, // Tesla Generation (SM 1.1) G8x class
        { 0x12,   8 }, // Tesla Generation (SM 1.2) G9x class
        { 0x13,   8 }, // Tesla Generation (SM 1.3) GT200 class
        { 0x20,  32 }, // Fermi Generation (SM 2.0) GF100 class
        { 0x21,  48 }, // Fermi Generation (SM 2.1) GF10x class
        { 0x30, 192 }  // Kepler Generation (SM 3.0) GK10x class
    };
    const int iNumEntries = (int)(sizeof(kCoresPerSM) / sizeof(kCoresPerSM[0]));

    // Linear search for the packed version key
    const int iWantedSM = (major << 4) + minor;
    for (int i = 0; i < iNumEntries; ++i)
    {
        if (kCoresPerSM[i].SM == iWantedSM)
        {
            return kCoresPerSM[i].Cores;
        }
    }

    printf("MapSMtoCores SM %d.%d is undefined (please update to the latest SDK)!\n", major, minor);
    return -1;
}
// end of GPU Architecture definitions
// Defines and enum for use with logging functions
// *********************************************************************
#define DEFAULTLOGFILE "SdkConsoleLog.txt"
#define MASTERLOGFILE "SdkMasterLog.csv"
// Bit flags for the iLogMode argument of shrLogEx().
// The values are distinct bits (LOGBOTH being LOGCONSOLE | LOGFILE) so they can be
// combined with bitwise OR, e.g. LOGBOTH | CLOSELOG.
enum LOGMODES
{
LOGCONSOLE = 1, // bit to signal "log to console"
LOGFILE = 2, // bit to signal "log to file"
LOGBOTH = 3, // convenience union of first 2 bits to signal "log to both"
APPENDMODE = 4, // bit to set "file append" mode instead of "replace mode" on open
MASTER = 8, // bit to signal master .csv log output
ERRORMSG = 16, // bit to signal "pre-pend Error"
CLOSELOG = 32 // bit to close log file, if open, after any requested file write
};
#define HDASHLINE "-----------------------------------------------------------\n"
// Standardized boolean
// Return type of the shrReadFile* / shrWriteFile* helpers declared in this header;
// the ARGCHECK macro compares its argument against shrTRUE and returns shrFALSE otherwise.
enum shrBOOL
{
shrFALSE = 0,
shrTRUE = 1
};
// Standardized MAX, MIN and CLAMP
// Every argument is parenthesized in the expansion, so an argument containing an operator that
// binds looser than the comparison (e.g. MIN(x | 1, y)) is compared as a whole.
// NOTE: these are still macros - an argument may be evaluated more than once, so avoid side effects.
#define MAX(a, b) (((a) > (b)) ? (a) : (b))
#define MIN(a, b) (((a) < (b)) ? (a) : (b))
#define CLAMP(a, b, c) MIN(MAX(a, b), c) // double sided clip: the larger of a and b, capped at c
#define TOPCLAMP(a, b) (((a) < (b)) ? (a) : (b)) // single top side clip of input a
// Error and Exit Handling Macros...
// *********************************************************************
// Full error handling macro with Cleanup() callback (if supplied)...
// (Companion Inline Function lower on page)
#define shrCheckErrorEX(a, b, c) __shrCheckErrorEX(a, b, c, __FILE__ , __LINE__)
// Short version without Cleanup() callback pointer
// Both Input (a) and Reference (b) are specified as args
#define shrCheckError(a, b) shrCheckErrorEX(a, b, 0)
// Standardized Exit Macro for leaving main()... extended version
// (Companion Inline Function lower on page)
#define shrExitEX(a, b, c) __shrExitEX(a, b, c)
// Standardized Exit Macro for leaving main()... short version
// (Companion Inline Function lower on page)
#define shrEXIT(a, b) __shrExitEX(a, b, EXIT_SUCCESS)
// Simple argument checker macro
#define ARGCHECK(a) if((a) != shrTRUE)return shrFALSE
// Define for user-customized error handling
#define STDERROR "file %s, line %i\n\n" , __FILE__ , __LINE__
// Function to deallocate memory allocated within shrUtils
// *********************************************************************
extern "C" void shrFree(void* ptr);
// *********************************************************************
// Helper function to log standardized information to Console, to File or to both
//! Examples: shrLogEx(LOGBOTH, 0, "Function A\n");
//! : shrLogEx(LOGBOTH | ERRORMSG, ciErrNum, STDERROR);
//!
//! Automatically opens file and stores handle if needed and not done yet
//! Closes file and nulls handle on request
//!
//! @param 0 iLogMode: LOGCONSOLE, LOGFILE, LOGBOTH, APPENDMODE, MASTER, ERRORMSG, CLOSELOG.
//! LOGFILE and LOGBOTH may be | 'd with APPENDMODE to select file append mode instead of overwrite mode
//! LOGFILE and LOGBOTH may be | 'd with CLOSELOG to "write and close"
//! First 3 options may be | 'd with MASTER to enable independent write to master data log file
//! First 3 options may be | 'd with ERRORMSG to start line with standard error message
//! @param 2 dValue:
//! Positive val = double value for time in secs to be formatted to 6 decimals.
//! Negative val is an error code and this give error preformatting.
//! @param 3 cFormatString: String with formatting specifiers like printf or fprintf.
//! ALL printf flags, width, precision and type specifiers are supported with this exception:
//! Wide char type specifiers intended for wprintf (%S and %C) are NOT supported
//! Single byte char type specifiers (%s and %c) ARE supported
//! @param 4... variable args: like printf or fprintf. Must match format specifier type above.
//! @return 0 if OK, negative value on error or if error occurs or was passed in.
// *********************************************************************
extern "C" int shrLogEx(int iLogMode, int iErrNum, const char* cFormatString, ...);
// Short version of shrLogEx defaulting to shrLogEx(LOGBOTH, 0,
// *********************************************************************
extern "C" int shrLog(const char* cFormatString, ...);
// *********************************************************************
// Delta timer function for up to 3 independent timers using host high performance counters
// Maintains state for 3 independent counters
//! Example: double dElapsedTime = shrDeltaT(0);
//!
//! @param 0 iCounterID: Which timer to check/reset. (0, 1, 2)
//! @return delta time of specified counter since last call in seconds. Otherwise -9999.0 if error
// *********************************************************************
extern "C" double shrDeltaT(int iCounterID);
// Optional LogFileNameOverride function
// *********************************************************************
extern "C" void shrSetLogFileName (const char* cOverRideName);
// Helper function to init data arrays
// *********************************************************************
extern "C" void shrFillArray(float* pfData, int iSize);
// Helper function to print data arrays
// *********************************************************************
extern "C" void shrPrintArray(float* pfData, int iSize);
////////////////////////////////////////////////////////////////////////////
//! Find the path for a filename
//! @return the path if succeeded, otherwise 0
//! @param filename name of the file
//! @param executablePath optional absolute path of the executable
////////////////////////////////////////////////////////////////////////////
extern "C" char* shrFindFilePath(const char* filename, const char* executablePath);
////////////////////////////////////////////////////////////////////////////
//! Read file \filename containing single precision floating point data
//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE
//! @param filename name of the source file
//! @param data uninitialized pointer, returned initialized and pointing to
//! the data read
//! @param len number of data elements in data, -1 on error
//! @note If a NULL pointer is passed to this function and it is initialized
//! within shrUtils, then free() has to be used to deallocate the memory
////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrReadFilef( const char* filename, float** data, unsigned int* len,
bool verbose = false);
////////////////////////////////////////////////////////////////////////////
//! Read file \filename containing double precision floating point data
//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE
//! @param filename name of the source file
//! @param data uninitialized pointer, returned initialized and pointing to
//! the data read
//! @param len number of data elements in data, -1 on error
//! @note If a NULL pointer is passed to this function and it is initialized
//! within shrUtils, then free() has to be used to deallocate the memory
////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrReadFiled( const char* filename, double** data, unsigned int* len,
bool verbose = false);
////////////////////////////////////////////////////////////////////////////
//! Read file \filename containing integer data
//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE
//! @param filename name of the source file
//! @param data uninitialized pointer, returned initialized and pointing to
//! the data read
//! @param len number of data elements in data, -1 on error
//! @note If a NULL pointer is passed to this function and it is initialized
//! within shrUtils, then free() has to be used to deallocate the memory
////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrReadFilei( const char* filename, int** data, unsigned int* len, bool verbose = false);
////////////////////////////////////////////////////////////////////////////
//! Read file \filename containing unsigned integer data
//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE
//! @param filename name of the source file
//! @param data uninitialized pointer, returned initialized and pointing to
//! the data read
//! @param len number of data elements in data, -1 on error
//! @note If a NULL pointer is passed to this function and it is initialized
//! within shrUtils, then free() has to be used to deallocate the memory
////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrReadFileui( const char* filename, unsigned int** data,
unsigned int* len, bool verbose = false);
////////////////////////////////////////////////////////////////////////////
//! Read file \filename containing char / byte data
//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE
//! @param filename name of the source file
//! @param data uninitialized pointer, returned initialized and pointing to
//! the data read
//! @param len number of data elements in data, -1 on error
//! @note If a NULL pointer is passed to this function and it is initialized
//! within shrUtils, then free() has to be used to deallocate the memory
////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrReadFileb( const char* filename, char** data, unsigned int* len,
bool verbose = false);
////////////////////////////////////////////////////////////////////////////
//! Read file \filename containing unsigned char / byte data
//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE
//! @param filename name of the source file
//! @param data uninitialized pointer, returned initialized and pointing to
//! the data read
//! @param len number of data elements in data, -1 on error
//! @note If a NULL pointer is passed to this function and it is initialized
//! within shrUtils, then free() has to be used to deallocate the memory
////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrReadFileub( const char* filename, unsigned char** data,
unsigned int* len, bool verbose = false);
////////////////////////////////////////////////////////////////////////////
//! Write a data file \filename containing single precision floating point
//! data
//! @return shrTRUE if writing the file succeeded, otherwise shrFALSE
//! @param filename name of the file to write
//! @param data pointer to data to write
//! @param len number of data elements in data, -1 on error
//! @param epsilon epsilon for comparison
////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrWriteFilef( const char* filename, const float* data, unsigned int len,
const float epsilon, bool verbose = false);
////////////////////////////////////////////////////////////////////////////
//! Write a data file \filename containing double precision floating point
//! data
//! @return shrTRUE if writing the file succeeded, otherwise shrFALSE
//! @param filename name of the file to write
//! @param data pointer to data to write
//! @param len number of data elements in data, -1 on error
//! @param epsilon epsilon for comparison
////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrWriteFiled( const char* filename, const float* data, unsigned int len,
const double epsilon, bool verbose = false);
////////////////////////////////////////////////////////////////////////////
//! Write a data file \filename containing integer data
//! @return shrTRUE if writing the file succeeded, otherwise shrFALSE
//! @param filename name of the file to write
//! @param data pointer to data to write
//! @param len number of data elements in data, -1 on error
////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrWriteFilei( const char* filename, const int* data, unsigned int len,
bool verbose = false);
////////////////////////////////////////////////////////////////////////////
//! Write a data file \filename containing unsigned integer data
//! @return shrTRUE if writing the file succeeded, otherwise shrFALSE
//! @param filename name of the file to write
//! @param data pointer to data to write
//! @param len number of data elements in data, -1 on error
////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrWriteFileui( const char* filename, const unsigned int* data,
unsigned int len, bool verbose = false);
////////////////////////////////////////////////////////////////////////////
//! Write a data file \filename containing char / byte data
//! @return shrTRUE if writing the file succeeded, otherwise shrFALSE
//! @param filename name of the file to write
//! @param data pointer to data to write
//! @param len number of data elements in data, -1 on error
////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrWriteFileb( const char* filename, const char* data, unsigned int len,
bool verbose = false);
////////////////////////////////////////////////////////////////////////////
//! Write a data file \filename containing unsigned char / byte data
//! @return shrTRUE if writing the file succeeded, otherwise shrFALSE
//! @param filename name of the file to write
//! @param data pointer to data to write
//! @param len number of data elements in data, -1 on error
////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrWriteFileub( const char* filename, const unsigned char* data,
unsigned int len, bool verbose = false);
////////////////////////////////////////////////////////////////////////////
//! Load PPM image file (with unsigned char as data element type), padding
//! 4th component
//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE
//! @param file name of the image file
//! @param OutData handle to the data read
//! @param w width of the image
//! @param h height of the image
//!
//! Note: If *OutData is NULL this function allocates buffer that must be freed by caller
////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrLoadPPM4ub(const char* file, unsigned char** OutData,
unsigned int *w, unsigned int *h);
////////////////////////////////////////////////////////////////////////////
//! Save PPM image file (with unsigned char as data element type, padded to
//! 4 bytes)
//! @return shrTRUE if saving the file succeeded, otherwise shrFALSE
//! @param file name of the image file
//! @param data handle to the data read
//! @param w width of the image
//! @param h height of the image
////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrSavePPM4ub( const char* file, unsigned char *data,
unsigned int w, unsigned int h);
////////////////////////////////////////////////////////////////////////////////
//! Save PGM image file (with unsigned char as data element type)
//! @return shrTRUE if saving the file succeeded, otherwise shrFALSE
//! @param file name of the image file
//! @param data handle to the data read
//! @param w width of the image
//! @param h height of the image
////////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrSavePGMub( const char* file, unsigned char *data,
unsigned int w, unsigned int h);
////////////////////////////////////////////////////////////////////////////
//! Load PGM image file (with unsigned char as data element type)
//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE
//! @param file name of the image file
//! @param data handle to the data read
//! @param w width of the image
//! @param h height of the image
//! @note If a NULL pointer is passed to this function and it is initialized
//! within shrUtils, then free() has to be used to deallocate the memory
////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrLoadPGMub( const char* file, unsigned char** data,
unsigned int *w,unsigned int *h);
////////////////////////////////////////////////////////////////////////////
// Command line arguments: General notes
// * All command line arguments begin with '--' followed by the token;
// token and value are separated by '='; example --samples=50
// * Arrays have the form --model=[one.obj,two.obj,three.obj]
// (without whitespaces)
////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////
//! Check if command line argument \a flag-name is given
//! @return shrTRUE if command line argument \a flag_name has been given,
//! otherwise shrFALSE
//! @param argc argc as passed to main()
//! @param argv argv as passed to main()
//! @param flag_name name of command line flag
////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrCheckCmdLineFlag( const int argc, const char** argv,
const char* flag_name);
////////////////////////////////////////////////////////////////////////////
//! Get the value of a command line argument of type int
//! @return shrTRUE if command line argument \a arg_name has been given and
//! is of the requested type, otherwise shrFALSE
//! @param argc argc as passed to main()
//! @param argv argv as passed to main()
//! @param arg_name name of the command line argument
//! @param val value of the command line argument
////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrGetCmdLineArgumenti( const int argc, const char** argv,
const char* arg_name, int* val);
////////////////////////////////////////////////////////////////////////////
//! Get the value of a command line argument of type unsigned int
//! @return shrTRUE if command line argument \a arg_name has been given and
//! is of the requested type, otherwise shrFALSE
//! @param argc argc as passed to main()
//! @param argv argv as passed to main()
//! @param arg_name name of the command line argument
//! @param val value of the command line argument
////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrGetCmdLineArgumentu( const int argc, const char** argv,
const char* arg_name, unsigned int* val);
////////////////////////////////////////////////////////////////////////////
//! Get the value of a command line argument of type float
//! @return shrTRUE if command line argument \a arg_name has been given and
//! is of the requested type, otherwise shrFALSE
//! @param argc argc as passed to main()
//! @param argv argv as passed to main()
//! @param arg_name name of the command line argument
//! @param val value of the command line argument
////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrGetCmdLineArgumentf( const int argc, const char** argv,
const char* arg_name, float* val);
////////////////////////////////////////////////////////////////////////////
//! Get the value of a command line argument of type string
//! @return shrTRUE if command line argument \a arg_name has been given and
//! is of the requested type, otherwise shrFALSE
//! @param argc argc as passed to main()
//! @param argv argv as passed to main()
//! @param arg_name name of the command line argument
//! @param val value of the command line argument
////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrGetCmdLineArgumentstr( const int argc, const char** argv,
const char* arg_name, char** val);
////////////////////////////////////////////////////////////////////////////
//! Get the value of a command line argument list whose elements are strings
//! @return shrTRUE if command line argument \a arg_name has been given and
//! is of the requested type, otherwise shrFALSE
//! @param argc argc as passed to main()
//! @param argv argv as passed to main()
//! @param arg_name name of the command line argument
//! @param val command line argument list
//! @param len length of the list / number of elements
////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrGetCmdLineArgumentListstr( const int argc, const char** argv,
const char* arg_name, char** val,
unsigned int* len);
////////////////////////////////////////////////////////////////////////////
//! Compare two float arrays
//! @return shrTRUE if \a reference and \a data are identical, otherwise shrFALSE
//! @param reference handle to the reference data / gold image
//! @param data handle to the computed data
//! @param len number of elements in reference and data
////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrComparef( const float* reference, const float* data,
const unsigned int len);
////////////////////////////////////////////////////////////////////////////
//! Compare two integer arrays
//! @return shrTRUE if \a reference and \a data are identical, otherwise shrFALSE
//! @param reference handle to the reference data / gold image
//! @param data handle to the computed data
//! @param len number of elements in reference and data
////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrComparei( const int* reference, const int* data,
const unsigned int len );
////////////////////////////////////////////////////////////////////////////////
//! Compare two unsigned integer arrays, with epsilon and threshold
//! @return shrTRUE if \a reference and \a data are identical, otherwise shrFALSE
//! @param reference handle to the reference data / gold image
//! @param data handle to the computed data
//! @param len number of elements in reference and data
//! @param threshold tolerance % # of comparison errors (0.15f = 15%)
////////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrCompareuit( const unsigned int* reference, const unsigned int* data,
const unsigned int len, const float epsilon, const float threshold );
////////////////////////////////////////////////////////////////////////////
//! Compare two unsigned char arrays
//! @return shrTRUE if \a reference and \a data are identical, otherwise shrFALSE
//! @param reference handle to the reference data / gold image
//! @param data handle to the computed data
//! @param len number of elements in reference and data
////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrCompareub( const unsigned char* reference, const unsigned char* data,
const unsigned int len );
////////////////////////////////////////////////////////////////////////////////
//! Compare two integers with a tolerance for # of byte errors
//! @return shrTRUE if \a reference and \a data are identical, otherwise shrFALSE
//! @param reference handle to the reference data / gold image
//! @param data handle to the computed data
//! @param len number of elements in reference and data
//! @param epsilon epsilon to use for the comparison
//! @param threshold tolerance % # of comparison errors (0.15f = 15%)
////////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrCompareubt( const unsigned char* reference, const unsigned char* data,
const unsigned int len, const float epsilon, const float threshold );
////////////////////////////////////////////////////////////////////////////////
//! Compare two integer arrays with an epsilon tolerance for equality
//! @return shrTRUE if \a reference and \a data are identical, otherwise shrFALSE
//! @param reference handle to the reference data / gold image
//! @param data handle to the computed data
//! @param len number of elements in reference and data
//! @param epsilon epsilon to use for the comparison
////////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrCompareube( const unsigned char* reference, const unsigned char* data,
const unsigned int len, const float epsilon );
////////////////////////////////////////////////////////////////////////////
//! Compare two float arrays with an epsilon tolerance for equality
//! @return shrTRUE if \a reference and \a data are identical, otherwise shrFALSE
//! @param reference handle to the reference data / gold image
//! @param data handle to the computed data
//! @param len number of elements in reference and data
//! @param epsilon epsilon to use for the comparison
////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrComparefe( const float* reference, const float* data,
const unsigned int len, const float epsilon );
////////////////////////////////////////////////////////////////////////////////
//! Compare two float arrays with an epsilon tolerance for equality and a
//! threshold for # pixel errors
//! @return shrTRUE if \a reference and \a data are identical, otherwise shrFALSE
//! @param reference handle to the reference data / gold image
//! @param data handle to the computed data
//! @param len number of elements in reference and data
//! @param epsilon epsilon to use for the comparison
////////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrComparefet( const float* reference, const float* data,
const unsigned int len, const float epsilon, const float threshold );
////////////////////////////////////////////////////////////////////////////
//! Compare two float arrays using L2-norm with an epsilon tolerance for
//! equality
//! @return shrTRUE if \a reference and \a data are identical, otherwise shrFALSE
//! @param reference handle to the reference data / gold image
//! @param data handle to the computed data
//! @param len number of elements in reference and data
//! @param epsilon epsilon to use for the comparison
////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrCompareL2fe( const float* reference, const float* data,
const unsigned int len, const float epsilon );
////////////////////////////////////////////////////////////////////////////////
//! Compare two PPM image files with an epsilon tolerance for equality
//! @return shrTRUE if \a src_file and \a ref_file match, otherwise shrFALSE
//! @param src_file filename for the image to be compared
//! @param ref_file filename for the reference data / gold image
//! @param epsilon epsilon to use for the comparison
//! @param threshold threshold of pixels that can still mismatch to pass (i.e. 0.15f = 15% must pass)
//! @note verboseErrors (output details of image mismatch to std::err) is not a parameter of this declaration
////////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrComparePPM( const char *src_file, const char *ref_file, const float epsilon, const float threshold);
////////////////////////////////////////////////////////////////////////////////
//! Compare two PGM image files with an epsilon tolerance for equality
//! @return shrTRUE if \a src_file and \a ref_file match, otherwise shrFALSE
//! @param src_file filename for the image to be compared
//! @param ref_file filename for the reference data / gold image
//! @param epsilon epsilon to use for the comparison
//! @param threshold threshold of pixels that can still mismatch to pass (i.e. 0.15f = 15% must pass)
//! @note verboseErrors (output details of image mismatch to std::err) is not a parameter of this declaration
////////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrComparePGM( const char *src_file, const char *ref_file, const float epsilon, const float threshold);
extern "C" unsigned char* shrLoadRawFile(const char* filename, size_t size);
extern "C" size_t shrRoundUp(int group_size, int global_size);
// Companion inline function for error checking: compares iSample against
// iReference and, on mismatch, logs the source location and then either calls
// the supplied cleanup callback or exits the process.
// *********************************************************************
inline void __shrCheckErrorEX(int iSample, int iReference, void (*pCleanup)(int), const char* cFile, const int iLine)
{
    // Matching values mean there is nothing to report.
    if (iReference == iSample)
    {
        return;
    }

    shrLogEx(LOGBOTH | ERRORMSG, iSample, "line %i , in file %s !!!\n\n" , iLine, cFile);

    // Without a callback, log with the CLOSELOG flag and terminate right here.
    if (pCleanup == NULL)
    {
        shrLogEx(LOGBOTH | CLOSELOG, 0, "Exiting...\n");
        exit(EXIT_FAILURE);
    }

    // Otherwise the callback receives EXIT_FAILURE and decides what happens.
    pCleanup(EXIT_FAILURE);
}
// Standardized exit: optionally waits for <Enter>, then flushes stderr and
// exits with iExitCode.
// *********************************************************************
inline void __shrExitEX(int argc, const char** argv, int iExitCode)
{
    // WIN32 builds prompt unless "noprompt" is given; all other builds prompt
    // only when "prompt" is given.
#ifdef WIN32
    const bool bPromptSelected = !shrCheckCmdLineFlag(argc, argv, "noprompt");
#else
    const bool bPromptSelected = shrCheckCmdLineFlag(argc, argv, "prompt");
#endif
    // "qatest" always suppresses the prompt (only queried when a prompt was selected).
    const bool bWaitForEnter = bPromptSelected && !shrCheckCmdLineFlag(argc, argv, "qatest");

    if (!bWaitForEnter)
    {
        shrLogEx(LOGBOTH | CLOSELOG, 0, "%s Exiting...\n", argv[0]);
    }
    else
    {
        shrLogEx(LOGBOTH | CLOSELOG, 0, "\nPress <Enter> to Quit...\n");
        getchar();
    }

    fflush(stderr);
    exit(iExitCode);
}
#endif

View File

@@ -0,0 +1,816 @@
//------------------------------------------
//--cambine:helper function for OpenCL
//--programmer: Jianbin Fang
//--date: 27/12/2010
//------------------------------------------
#ifndef _CL_HELPER_
#define _CL_HELPER_
#include <CL/cl.h>
#include <fstream>
#include <iostream>
#include <string>
#include <vector>
using std::string;
using std::ifstream;
using std::cerr;
using std::endl;
using std::cout;
//#pragma OPENCL EXTENSION cl_nv_compiler_options:enable
#define WORK_DIM 2 // work-items dimensions
// Bundle of the OpenCL handles and status fields shared by the helper
// functions in this file through the global instance `oclHandles`.
struct oclHandleStruct {
// Context created by _clInit() with clCreateContext; released in _clRelease().
cl_context context;
// malloc'ed device list filled by clGetDeviceIDs in _clInit(); freed with
// free() in _clRelease().
cl_device_id *devices;
// Command queue created by _clInit() for the selected device.
cl_command_queue queue;
// Program object created and built by _clInit().
cl_program program;
// Status code written by the helpers after their OpenCL calls.
cl_int cl_status;
// NOTE(review): not referenced in the visible part of this file.
std::string error_str;
// Kernel handles appended by _clInit(), in the order of kernel_names.
std::vector<cl_kernel> kernel;
};
// Global handle bundle used by every helper below.
struct oclHandleStruct oclHandles;
// Kernel source file name; only referenced by the commented-out
// source-loading code in _clInit().
char kernel_file[100] = "Kernels.cl";
// Number of entries of kernel_names for which _clInit() creates a kernel.
int total_kernels = 2;
// Kernel names passed to clCreateKernel in _clInit().
string kernel_names[2] = {"BFS_1", "BFS_2"};
// Work-group size; overwritten by the 'g' option in _clCmdParams().
int work_group_size = 512;
// Index into oclHandles.devices of the device to use (default: 0);
// overwritten by the 'd' option in _clCmdParams().
int device_id_inused = 0; // device id used (default : 0)
/*
 * Converts the contents of a file into a string.
 *
 * The file is opened in binary mode and read in one piece.
 *
 * fileName: path of the file to read.
 * Returns:  all bytes of the file (an empty string for an empty file).
 * Throws:   std::string "FileToString()::Error: Unable to open file <name>"
 *           when the file cannot be opened, its size cannot be determined,
 *           it cannot be read completely, or the buffer cannot be allocated.
 */
std::string FileToString(const std::string fileName) {
  std::ifstream f(fileName.c_str(), std::ifstream::in | std::ifstream::binary);
  if (f.is_open()) {
    try {
      // Determine the size by seeking to the end; tellg() yields -1 on failure.
      f.seekg(0, std::ifstream::end);
      const std::streamoff fileSize = f.tellg();
      f.seekg(0, std::ifstream::beg);

      if (f.good() && fileSize >= 0) {
        // The string owns the buffer, so nothing leaks if an exception is thrown.
        std::string contents(static_cast<std::string::size_type>(fileSize), '\0');
        if (!contents.empty())
          f.read(&contents[0], static_cast<std::streamsize>(contents.size()));
        // read() sets failbit on a short read, so good() means every byte arrived.
        if (f.good()) {
          f.close();
          return contents;
        }
      }
    } catch (...) {
      // e.g. std::bad_alloc / std::length_error while sizing the buffer
      std::cerr << "Exception caught in FileToString()" << std::endl;
    }
    f.close();
  }
  throw(std::string("FileToString()::Error: Unable to open file ") + fileName);
}
//---------------------------------------
// Read command line parameters
//
// Recognised options (the value is taken from the following argument):
//   -g <n>  size of work group, stored in work_group_size
//   -d <n>  device id used in computation, stored in device_id_inused
// Arguments that do not start with '-' or name another option are ignored.
// A value that cannot be parsed leaves the variable unchanged and is reported
// on stderr.
// Throws std::string when an option is the last argument (no value follows).
void _clCmdParams(int argc, char *argv[]) {
  for (int i = 0; i < argc; ++i) {
    const char *arg = argv[i];
    // Only "-x" style arguments are options. This also keeps arg[1] inside
    // the string for empty arguments and stops file names or the program
    // name from being mistaken for an option.
    if (arg == NULL || arg[0] != '-' || arg[1] == '\0')
      continue;

    int *target = NULL;
    switch (arg[1]) {
    case 'g': //--g stands for size of work group
      target = &work_group_size;
      break;
    case 'd': //--d stands for device id used in computation
      target = &device_id_inused;
      break;
    default:
      break;
    }
    if (target == NULL)
      continue;

    if (++i >= argc) {
      std::cerr << "Could not read argument after option " << argv[i - 1]
                << std::endl;
      // A bare `throw;` here would call std::terminate (no active exception);
      // throw a string like the other helpers in this file.
      throw(std::string("_clCmdParams()::Error: missing value after option ") +
            argv[i - 1]);
    }
    // Both targets are int, so the matching conversion is %d.
    if (sscanf(argv[i], "%d", target) != 1) {
      std::cerr << "Could not parse value \"" << argv[i] << "\" for option "
                << argv[i - 1] << std::endl;
    }
  }
}
//---------------------------------------
// Initlize CL objects
//--description: there are 5 steps to initialize all the OpenCL objects needed
//--revised on 04/01/2011: get the number of devices and
// devices have no relationship with context
void _clInit() {
printf("_clInit()\n");
int DEVICE_ID_INUSED = device_id_inused;
cl_int resultCL;
oclHandles.context = NULL;
oclHandles.devices = NULL;
oclHandles.queue = NULL;
oclHandles.program = NULL;
cl_uint deviceListSize;
//-----------------------------------------------
//--cambine-1: find the available platforms and select one
cl_uint numPlatforms = 1;
cl_platform_id targetPlatform = NULL;
cl_platform_id *allPlatforms =
(cl_platform_id *)malloc(numPlatforms * sizeof(cl_platform_id));
resultCL = clGetPlatformIDs(numPlatforms, allPlatforms, NULL);
if (resultCL != CL_SUCCESS)
throw(string("InitCL()::Error: Getting platform ids (clGetPlatformIDs)"));
// Select the target platform. Default: first platform
targetPlatform = allPlatforms[0];
/*for (int i = 0; i < numPlatforms; i++)
{
char pbuff[128];
resultCL = clGetPlatformInfo( allPlatforms[i],
CL_PLATFORM_VENDOR,
sizeof(pbuff),
pbuff,
NULL);
if (resultCL != CL_SUCCESS)
throw (string("InitCL()::Error: Getting platform info (clGetPlatformInfo)"));
//printf("vedor is %s\n",pbuff);
}
free(allPlatforms);*/
//-----------------------------------------------
//--cambine-2: create an OpenCL context
/*cl_context_properties cprops[3] = { CL_CONTEXT_PLATFORM,
(cl_context_properties)targetPlatform, 0 };
oclHandles.context = clCreateContextFromType(cprops,
CL_DEVICE_TYPE_GPU,
NULL,
NULL,
&resultCL);
if ((resultCL != CL_SUCCESS) || (oclHandles.context == NULL))
throw (string("InitCL()::Error: Creating Context
(clCreateContextFromType)"));
//-----------------------------------------------
//--cambine-3: detect OpenCL devices
// First, get the size of device list
oclHandles.cl_status = clGetDeviceIDs(targetPlatform, CL_DEVICE_TYPE_GPU, 0,
NULL, &deviceListSize);
if(oclHandles.cl_status!=CL_SUCCESS){
throw(string("exception in _clInit -> clGetDeviceIDs"));
}
if (deviceListSize == 0)
throw(string("InitCL()::Error: No devices found."));
printf("OK1()\n");
//std::cout<<"device number:"<<deviceListSize<<std::endl;*/
// Now, allocate the device list
deviceListSize = 1;
oclHandles.devices =
(cl_device_id *)malloc(deviceListSize * sizeof(cl_device_id));
if (oclHandles.devices == 0)
throw(string("InitCL()::Error: Could not allocate memory."));
//* Next, get the device list data
oclHandles.cl_status =
clGetDeviceIDs(targetPlatform, CL_DEVICE_TYPE_DEFAULT, deviceListSize,
oclHandles.devices, NULL);
if (oclHandles.cl_status != CL_SUCCESS) {
throw(string("exception in _clInit -> clGetDeviceIDs-2"));
}
oclHandles.context = clCreateContext(NULL, deviceListSize, oclHandles.devices,
NULL, NULL, &resultCL);
if ((resultCL != CL_SUCCESS) || (oclHandles.context == NULL))
throw(string("InitCL()::Error: Creating Context (clCreateContext)"));
//-----------------------------------------------
//--cambine-4: Create an OpenCL command queue
oclHandles.queue = clCreateCommandQueue(
oclHandles.context, oclHandles.devices[DEVICE_ID_INUSED], 0, &resultCL);
printf("resultCL=%d, queue=0x%x\n", resultCL, oclHandles.queue);
if ((resultCL != CL_SUCCESS) || (oclHandles.queue == NULL))
throw(string("InitCL()::Creating Command Queue. (clCreateCommandQueue)"));
//-----------------------------------------------
//--cambine-5: Load CL file, build CL program object, create CL kernel object
/*std::string source_str = FileToString(kernel_file);
const char * source = source_str.c_str();
size_t sourceSize[] = { source_str.length() };*/
oclHandles.program = clCreateProgramWithBuiltInKernels(
oclHandles.context, 1, &oclHandles.devices[DEVICE_ID_INUSED],
"BFS_1;BFS_2", &resultCL);
/*oclHandles.program = clCreateProgramWithSource(oclHandles.context,
1,
&source,
sourceSize,
&resultCL);*/
if ((resultCL != CL_SUCCESS) || (oclHandles.program == NULL))
throw(string("InitCL()::Error: Loading Binary into cl_program. "
"(clCreateProgramWithBinary)"));
// insert debug information
// std::string options= "-cl-nv-verbose"; //Doesn't work on AMD machines
// options += " -cl-nv-opt-level=3";
resultCL = clBuildProgram(oclHandles.program, deviceListSize,
oclHandles.devices, NULL, NULL, NULL);
if ((resultCL != CL_SUCCESS) || (oclHandles.program == NULL)) {
cerr << "InitCL()::Error: In clBuildProgram" << endl;
size_t length;
resultCL = clGetProgramBuildInfo(oclHandles.program,
oclHandles.devices[DEVICE_ID_INUSED],
CL_PROGRAM_BUILD_LOG, 0, NULL, &length);
if (resultCL != CL_SUCCESS)
throw(string("InitCL()::Error: Getting Program build "
"info(clGetProgramBuildInfo)"));
char *buffer = (char *)malloc(length);
resultCL = clGetProgramBuildInfo(
oclHandles.program, oclHandles.devices[DEVICE_ID_INUSED],
CL_PROGRAM_BUILD_LOG, length, buffer, NULL);
if (resultCL != CL_SUCCESS)
throw(string("InitCL()::Error: Getting Program build "
"info(clGetProgramBuildInfo)"));
cerr << buffer << endl;
free(buffer);
throw(string("InitCL()::Error: Building Program (clBuildProgram)"));
}
// get program information in intermediate representation
#ifdef PTX_MSG
size_t binary_sizes[deviceListSize];
char *binaries[deviceListSize];
// figure out number of devices and the sizes of the binary for each device.
oclHandles.cl_status =
clGetProgramInfo(oclHandles.program, CL_PROGRAM_BINARY_SIZES,
sizeof(size_t) * deviceListSize, &binary_sizes, NULL);
if (oclHandles.cl_status != CL_SUCCESS) {
throw(string("--cambine:exception in _InitCL -> clGetProgramInfo-2"));
}
std::cout << "--cambine:" << binary_sizes << std::endl;
// copy over all of the generated binaries.
for (int i = 0; i < deviceListSize; i++)
binaries[i] = (char *)malloc(sizeof(char) * (binary_sizes[i] + 1));
oclHandles.cl_status =
clGetProgramInfo(oclHandles.program, CL_PROGRAM_BINARIES,
sizeof(char *) * deviceListSize, binaries, NULL);
if (oclHandles.cl_status != CL_SUCCESS) {
throw(string("--cambine:exception in _InitCL -> clGetProgramInfo-3"));
}
for (int i = 0; i < deviceListSize; i++)
binaries[i][binary_sizes[i]] = '\0';
std::cout << "--cambine:writing ptd information..." << std::endl;
FILE *ptx_file = fopen("cl.ptx", "w");
if (ptx_file == NULL) {
throw(string("exceptions in allocate ptx file."));
}
fprintf(ptx_file, "%s", binaries[DEVICE_ID_INUSED]);
fclose(ptx_file);
std::cout << "--cambine:writing ptd information done." << std::endl;
for (int i = 0; i < deviceListSize; i++)
free(binaries[i]);
#endif
for (int nKernel = 0; nKernel < total_kernels; nKernel++) {
/* get a kernel object handle for a kernel with the given name */
cl_kernel kernel = clCreateKernel(
oclHandles.program, (kernel_names[nKernel]).c_str(), &resultCL);
if ((resultCL != CL_SUCCESS) || (kernel == NULL)) {
string errorMsg = "InitCL()::Error: Creating Kernel (clCreateKernel) \"" +
kernel_names[nKernel] + "\"";
throw(errorMsg);
}
oclHandles.kernel.push_back(kernel);
}
// get resource alocation information
#ifdef RES_MSG
char *build_log;
size_t ret_val_size;
oclHandles.cl_status = clGetProgramBuildInfo(
oclHandles.program, oclHandles.devices[DEVICE_ID_INUSED],
CL_PROGRAM_BUILD_LOG, 0, NULL, &ret_val_size);
if (oclHandles.cl_status != CL_SUCCESS) {
throw(string("exceptions in _InitCL -> getting resource information"));
}
build_log = (char *)malloc(ret_val_size + 1);
oclHandles.cl_status = clGetProgramBuildInfo(
oclHandles.program, oclHandles.devices[DEVICE_ID_INUSED],
CL_PROGRAM_BUILD_LOG, ret_val_size, build_log, NULL);
if (oclHandles.cl_status != CL_SUCCESS) {
throw(string(
"exceptions in _InitCL -> getting resources allocation information-2"));
}
build_log[ret_val_size] = '\0';
std::cout << "--cambine:" << build_log << std::endl;
free(build_log);
#endif
}
//---------------------------------------
// release CL objects
void _clRelease() {
char errorFlag = false;
for (int nKernel = 0; nKernel < oclHandles.kernel.size(); nKernel++) {
if (oclHandles.kernel[nKernel] != NULL) {
cl_int resultCL = clReleaseKernel(oclHandles.kernel[nKernel]);
if (resultCL != CL_SUCCESS) {
cerr << "ReleaseCL()::Error: In clReleaseKernel" << endl;
errorFlag = true;
}
oclHandles.kernel[nKernel] = NULL;
}
oclHandles.kernel.clear();
}
if (oclHandles.program != NULL) {
cl_int resultCL = clReleaseProgram(oclHandles.program);
if (resultCL != CL_SUCCESS) {
cerr << "ReleaseCL()::Error: In clReleaseProgram" << endl;
errorFlag = true;
}
oclHandles.program = NULL;
}
if (oclHandles.queue != NULL) {
cl_int resultCL = clReleaseCommandQueue(oclHandles.queue);
if (resultCL != CL_SUCCESS) {
cerr << "ReleaseCL()::Error: In clReleaseCommandQueue" << endl;
errorFlag = true;
}
oclHandles.queue = NULL;
}
free(oclHandles.devices);
if (oclHandles.context != NULL) {
cl_int resultCL = clReleaseContext(oclHandles.context);
if (resultCL != CL_SUCCESS) {
cerr << "ReleaseCL()::Error: In clReleaseContext" << endl;
errorFlag = true;
}
oclHandles.context = NULL;
}
if (errorFlag)
throw(string("ReleaseCL()::Error encountered."));
}
//--------------------------------------------------------
//--cambine: create a buffer initialised with a copy of host data
// Creates a `size`-byte buffer in oclHandles.context with
// CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, passing h_mem_source as the host
// pointer. The creation status is stored in oclHandles.cl_status and is only
// checked (throwing a string) when ERRMSG is defined.
cl_mem _clCreateAndCpyMem(int size, void *h_mem_source) throw(string) {
  const cl_mem_flags bufferFlags = CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR;
  cl_mem buffer = clCreateBuffer(oclHandles.context, bufferFlags, size,
                                 h_mem_source, &oclHandles.cl_status);
#ifdef ERRMSG
  if (CL_SUCCESS != oclHandles.cl_status)
    throw(string("excpetion in _clCreateAndCpyMem()"));
#endif
  return buffer;
}
//-------------------------------------------------------
//--cambine: create a read-write buffer initialised with a copy of host data
//--date: 17/01/2011
// Creates a `size`-byte buffer in oclHandles.context with
// CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, passing h_mem_ptr as the host
// pointer. The creation status is stored in oclHandles.cl_status and is only
// checked (throwing a string) when ERRMSG is defined.
cl_mem _clMallocRW(int size, void *h_mem_ptr) throw(string) {
  const cl_mem_flags bufferFlags = CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR;
  cl_mem buffer = clCreateBuffer(oclHandles.context, bufferFlags, size,
                                 h_mem_ptr, &oclHandles.cl_status);
#ifdef ERRMSG
  if (CL_SUCCESS != oclHandles.cl_status)
    throw(string("excpetion in _clMallocRW"));
#endif
  return buffer;
}
//-------------------------------------------------------
//--cambine: create a write-only buffer initialised with a copy of host data
//--date: 17/01/2011
// Creates a `size`-byte buffer in oclHandles.context with
// CL_MEM_WRITE_ONLY | CL_MEM_COPY_HOST_PTR, passing h_mem_ptr as the host
// pointer. The creation status is stored in oclHandles.cl_status and is only
// checked (throwing a string) when ERRMSG is defined.
cl_mem _clMalloc(int size, void *h_mem_ptr) throw(string) {
  const cl_mem_flags bufferFlags = CL_MEM_WRITE_ONLY | CL_MEM_COPY_HOST_PTR;
  cl_mem buffer = clCreateBuffer(oclHandles.context, bufferFlags, size,
                                 h_mem_ptr, &oclHandles.cl_status);
#ifdef ERRMSG
  if (CL_SUCCESS != oclHandles.cl_status)
    throw(string("excpetion in _clMalloc"));
#endif
  return buffer;
}
//-------------------------------------------------------
//--cambine: copy data from host memory into a device buffer
//--date: 17/01/2011
// Enqueues a blocking write (CL_TRUE) of `size` bytes from h_mem_ptr to the
// start (offset 0) of d_mem on oclHandles.queue. The status is stored in
// oclHandles.cl_status and is only checked (throwing a string) when ERRMSG is
// defined.
void _clMemcpyH2D(cl_mem d_mem, int size, const void *h_mem_ptr) throw(string) {
  const cl_bool blockingWrite = CL_TRUE;
  const size_t dstOffset = 0;
  oclHandles.cl_status =
      clEnqueueWriteBuffer(oclHandles.queue, d_mem, blockingWrite, dstOffset,
                           size, h_mem_ptr, 0, NULL, NULL);
#ifdef ERRMSG
  if (CL_SUCCESS != oclHandles.cl_status)
    throw(string("excpetion in _clMemcpyH2D"));
#endif
}
//--------------------------------------------------------
//--cambine:create buffer and then copy data from host to device with pinned
// memory
// Creates a CL_MEM_READ_ONLY device buffer of `size` bytes and fills it from
// h_mem_source by staging the data through a host-allocated
// (CL_MEM_ALLOC_HOST_PTR) buffer.
//
// Steps: allocate staging buffer -> allocate device buffer -> map staging
// buffer -> copy floats into the mapping -> blocking write into the device
// buffer -> unmap and release the staging buffer.
//
// Returns the device buffer; the caller owns it. On exit oclHandles.cl_status
// holds the status of the final clEnqueueWriteBuffer. Errors are only checked
// (and thrown as string) when ERRMSG is defined; in that case every buffer
// created so far is released before throwing.
cl_mem _clCreateAndCpyPinnedMem(int size, float *h_mem_source) throw(string) {
  cl_mem d_mem = NULL, d_mem_pinned = NULL;
  float *h_mem_pinned = NULL;
  d_mem_pinned = clCreateBuffer(oclHandles.context,
                                CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR, size,
                                NULL, &oclHandles.cl_status);
#ifdef ERRMSG
  if (oclHandles.cl_status != CL_SUCCESS)
    throw(string("excpetion in _clCreateAndCpyMem()->d_mem_pinned"));
#endif
  //------------
  d_mem = clCreateBuffer(oclHandles.context, CL_MEM_READ_ONLY, size, NULL,
                         &oclHandles.cl_status);
#ifdef ERRMSG
  if (oclHandles.cl_status != CL_SUCCESS) {
    clReleaseMemObject(d_mem_pinned);
    throw(string("excpetion in _clCreateAndCpyMem() -> d_mem "));
  }
#endif
  //----------
  h_mem_pinned = (cl_float *)clEnqueueMapBuffer(
      oclHandles.queue, d_mem_pinned, CL_TRUE, CL_MAP_WRITE, 0, size, 0, NULL,
      NULL, &oclHandles.cl_status);
#ifdef ERRMSG
  if (oclHandles.cl_status != CL_SUCCESS) {
    clReleaseMemObject(d_mem);
    clReleaseMemObject(d_mem_pinned);
    throw(string("excpetion in _clCreateAndCpyMem() -> clEnqueueMapBuffer"));
  }
#endif
  int element_number = size / sizeof(float);
#pragma omp parallel for
  for (int i = 0; i < element_number; i++) {
    h_mem_pinned[i] = h_mem_source[i];
  }
  //----------
  const cl_int write_status = clEnqueueWriteBuffer(
      oclHandles.queue, d_mem, CL_TRUE, 0, size, h_mem_pinned, 0, NULL, NULL);
  // The write above is blocking, so the staging buffer is no longer needed.
  // Previously it was left mapped and never released, leaking on every call.
  clEnqueueUnmapMemObject(oclHandles.queue, d_mem_pinned, h_mem_pinned, 0, NULL,
                          NULL);
  clReleaseMemObject(d_mem_pinned);
  // Report the status of the write, as the original did.
  oclHandles.cl_status = write_status;
#ifdef ERRMSG
  if (oclHandles.cl_status != CL_SUCCESS) {
    clReleaseMemObject(d_mem);
    throw(string("excpetion in _clCreateAndCpyMem() -> clEnqueueWriteBuffer"));
  }
#endif
  return d_mem;
}
//--------------------------------------------------------
//--cambine:create write only buffer on device
// Creates an uninitialised CL_MEM_WRITE_ONLY device buffer of `size` bytes
// (no host pointer is supplied). The status is stored in oclHandles.cl_status
// and only checked when ERRMSG is defined.
cl_mem _clMallocWO(int size) throw(string) {
  cl_mem buffer = clCreateBuffer(oclHandles.context, CL_MEM_WRITE_ONLY, size, 0,
                                 &oclHandles.cl_status);
#ifdef ERRMSG
  if (CL_SUCCESS != oclHandles.cl_status) {
    throw(string("excpetion in _clCreateMem()"));
  }
#endif
  return buffer;
}
//--------------------------------------------------------
// transfer data from device to host
// Copies `size` bytes from device buffer d_mem into host memory h_mem using a
// blocking read (CL_TRUE) at offset 0 with no event dependencies.
// When ERRMSG is defined, oclHandles.error_str is always rewritten with the
// textual form of the status (including on success, where it ends in
// "Unknown reason") and thrown if the status is not CL_SUCCESS.
void _clMemcpyD2H(cl_mem d_mem, int size, void *h_mem) throw(string) {
  oclHandles.cl_status = clEnqueueReadBuffer(oclHandles.queue, d_mem, CL_TRUE,
                                             0, size, h_mem, 0, 0, 0);
#ifdef ERRMSG
  const char *reason = "Unknown reason";
  switch (oclHandles.cl_status) {
  case CL_INVALID_COMMAND_QUEUE:
    reason = "CL_INVALID_COMMAND_QUEUE";
    break;
  case CL_INVALID_CONTEXT:
    reason = "CL_INVALID_CONTEXT";
    break;
  case CL_INVALID_MEM_OBJECT:
    reason = "CL_INVALID_MEM_OBJECT";
    break;
  case CL_INVALID_VALUE:
    reason = "CL_INVALID_VALUE";
    break;
  case CL_INVALID_EVENT_WAIT_LIST:
    reason = "CL_INVALID_EVENT_WAIT_LIST";
    break;
  case CL_MEM_OBJECT_ALLOCATION_FAILURE:
    reason = "CL_MEM_OBJECT_ALLOCATION_FAILURE";
    break;
  case CL_OUT_OF_HOST_MEMORY:
    reason = "CL_OUT_OF_HOST_MEMORY";
    break;
  default:
    break;
  }
  oclHandles.error_str = "excpetion in _clCpyMemD2H -> ";
  oclHandles.error_str += reason;
  if (CL_SUCCESS != oclHandles.cl_status) {
    throw(oclHandles.error_str);
  }
#endif
}
//--------------------------------------------------------
// set kernel arguments
// Binds argument number arg_idx of kernel oclHandles.kernel[kernel_id].
//   size == 0 : d_mem is treated as a handle; the handle value itself is
//               passed (address of the local copy, sizeof(d_mem) bytes).
//   size != 0 : d_mem points at `size` bytes of argument data, passed as-is.
// When ERRMSG is defined, oclHandles.error_str is always rewritten with the
// textual form of the status and thrown if the status is not CL_SUCCESS.
void _clSetArgs(int kernel_id, int arg_idx, void *d_mem,
                int size = 0) throw(string) {
  if (size == 0) {
    oclHandles.cl_status = clSetKernelArg(oclHandles.kernel[kernel_id], arg_idx,
                                          sizeof(d_mem), &d_mem);
  } else {
    oclHandles.cl_status =
        clSetKernelArg(oclHandles.kernel[kernel_id], arg_idx, size, d_mem);
  }
#ifdef ERRMSG
  // Both call forms share the same status-to-text mapping.
  const char *reason = "Unknown reason";
  switch (oclHandles.cl_status) {
  case CL_INVALID_KERNEL:
    reason = "CL_INVALID_KERNEL";
    break;
  case CL_INVALID_ARG_INDEX:
    reason = "CL_INVALID_ARG_INDEX";
    break;
  case CL_INVALID_ARG_VALUE:
    reason = "CL_INVALID_ARG_VALUE";
    break;
  case CL_INVALID_MEM_OBJECT:
    reason = "CL_INVALID_MEM_OBJECT";
    break;
  case CL_INVALID_SAMPLER:
    reason = "CL_INVALID_SAMPLER";
    break;
  case CL_INVALID_ARG_SIZE:
    reason = "CL_INVALID_ARG_SIZE";
    break;
  case CL_OUT_OF_RESOURCES:
    reason = "CL_OUT_OF_RESOURCES";
    break;
  case CL_OUT_OF_HOST_MEMORY:
    reason = "CL_OUT_OF_HOST_MEMORY";
    break;
  default:
    break;
  }
  oclHandles.error_str = "excpetion in _clSetKernelArg() ";
  oclHandles.error_str += reason;
  if (CL_SUCCESS != oclHandles.cl_status) {
    throw(oclHandles.error_str);
  }
#endif
}
// Blocks until every command queued on oclHandles.queue has completed.
// When ERRMSG is defined, oclHandles.error_str is always rewritten with the
// textual form of the status and thrown if the status is not CL_SUCCESS.
void _clFinish() throw(string) {
  oclHandles.cl_status = clFinish(oclHandles.queue);
#ifdef ERRMSG
  const char *reason = "Unknown reasons";
  switch (oclHandles.cl_status) {
  case CL_INVALID_COMMAND_QUEUE:
    reason = "CL_INVALID_COMMAND_QUEUE";
    break;
  case CL_OUT_OF_RESOURCES:
    reason = "CL_OUT_OF_RESOURCES";
    break;
  case CL_OUT_OF_HOST_MEMORY:
    reason = "CL_OUT_OF_HOST_MEMORY";
    break;
  default:
    break;
  }
  oclHandles.error_str = "excpetion in _clFinish";
  oclHandles.error_str += reason;
  if (CL_SUCCESS != oclHandles.cl_status) {
    throw(oclHandles.error_str);
  }
#endif
}
//--------------------------------------------------------
//--cambine:enqueue kernel
void _clInvokeKernel(int kernel_id, int work_items,
int work_group_size) throw(string) {
cl_uint work_dim = WORK_DIM;
cl_event e[1];
if (work_items % work_group_size != 0) // process situations that work_items
// cannot be divided by work_group_size
work_items =
work_items + (work_group_size - (work_items % work_group_size));
size_t local_work_size[] = {work_group_size, 1};
size_t global_work_size[] = {work_items, 1};
oclHandles.cl_status = clEnqueueNDRangeKernel(
oclHandles.queue, oclHandles.kernel[kernel_id], work_dim, 0,
global_work_size, local_work_size, 0, 0, &(e[0]));
#ifdef ERRMSG
oclHandles.error_str = "excpetion in _clInvokeKernel() -> ";
switch (oclHandles.cl_status) {
case CL_INVALID_PROGRAM_EXECUTABLE:
oclHandles.error_str += "CL_INVALID_PROGRAM_EXECUTABLE";
break;
case CL_INVALID_COMMAND_QUEUE:
oclHandles.error_str += "CL_INVALID_COMMAND_QUEUE";
break;
case CL_INVALID_KERNEL:
oclHandles.error_str += "CL_INVALID_KERNEL";
break;
case CL_INVALID_CONTEXT:
oclHandles.error_str += "CL_INVALID_CONTEXT";
break;
case CL_INVALID_KERNEL_ARGS:
oclHandles.error_str += "CL_INVALID_KERNEL_ARGS";
break;
case CL_INVALID_WORK_DIMENSION:
oclHandles.error_str += "CL_INVALID_WORK_DIMENSION";
break;
case CL_INVALID_GLOBAL_WORK_SIZE:
oclHandles.error_str += "CL_INVALID_GLOBAL_WORK_SIZE";
break;
case CL_INVALID_WORK_GROUP_SIZE:
oclHandles.error_str += "CL_INVALID_WORK_GROUP_SIZE";
break;
case CL_INVALID_WORK_ITEM_SIZE:
oclHandles.error_str += "CL_INVALID_WORK_ITEM_SIZE";
break;
case CL_INVALID_GLOBAL_OFFSET:
oclHandles.error_str += "CL_INVALID_GLOBAL_OFFSET";
break;
case CL_OUT_OF_RESOURCES:
oclHandles.error_str += "CL_OUT_OF_RESOURCES";
break;
case CL_MEM_OBJECT_ALLOCATION_FAILURE:
oclHandles.error_str += "CL_MEM_OBJECT_ALLOCATION_FAILURE";
break;
case CL_INVALID_EVENT_WAIT_LIST:
oclHandles.error_str += "CL_INVALID_EVENT_WAIT_LIST";
break;
case CL_OUT_OF_HOST_MEMORY:
oclHandles.error_str += "CL_OUT_OF_HOST_MEMORY";
break;
default:
oclHandles.error_str += "Unkown reseason";
break;
}
if (oclHandles.cl_status != CL_SUCCESS)
throw(oclHandles.error_str);
#endif
//_clFinish();
// oclHandles.cl_status = clWaitForEvents(1, &e[0]);
// #ifdef ERRMSG
// if (oclHandles.cl_status!= CL_SUCCESS)
// throw(string("excpetion in _clEnqueueNDRange() -> clWaitForEvents"));
// #endif
}
// Enqueues kernel oclHandles.kernel[kernel_id] over a range_x by range_y
// global range with group_x by group_y work-groups. The number of dimensions
// comes from WORK_DIM. The sizes are passed through unchanged (no rounding).
//
// The enqueue is asynchronous: this function does not wait for the kernel.
// Errors are only checked when ERRMSG is defined, in which case
// oclHandles.error_str is rewritten and thrown if the status is not
// CL_SUCCESS.
void _clInvokeKernel2D(int kernel_id, int range_x, int range_y, int group_x,
                       int group_y) throw(string) {
  cl_uint work_dim = WORK_DIM;
  size_t local_work_size[] = {static_cast<size_t>(group_x),
                              static_cast<size_t>(group_y)};
  size_t global_work_size[] = {static_cast<size_t>(range_x),
                               static_cast<size_t>(range_y)};
  // No completion event is requested: the previous code asked for one and
  // never released it, leaking an event object per launch.
  oclHandles.cl_status = clEnqueueNDRangeKernel(
      oclHandles.queue, oclHandles.kernel[kernel_id], work_dim, 0,
      global_work_size, local_work_size, 0, 0, NULL);
#ifdef ERRMSG
  // The message used to name _clInvokeKernel(), which hid the real call site.
  oclHandles.error_str = "excpetion in _clInvokeKernel2D() -> ";
  switch (oclHandles.cl_status) {
  case CL_INVALID_PROGRAM_EXECUTABLE:
    oclHandles.error_str += "CL_INVALID_PROGRAM_EXECUTABLE";
    break;
  case CL_INVALID_COMMAND_QUEUE:
    oclHandles.error_str += "CL_INVALID_COMMAND_QUEUE";
    break;
  case CL_INVALID_KERNEL:
    oclHandles.error_str += "CL_INVALID_KERNEL";
    break;
  case CL_INVALID_CONTEXT:
    oclHandles.error_str += "CL_INVALID_CONTEXT";
    break;
  case CL_INVALID_KERNEL_ARGS:
    oclHandles.error_str += "CL_INVALID_KERNEL_ARGS";
    break;
  case CL_INVALID_WORK_DIMENSION:
    oclHandles.error_str += "CL_INVALID_WORK_DIMENSION";
    break;
  case CL_INVALID_GLOBAL_WORK_SIZE:
    oclHandles.error_str += "CL_INVALID_GLOBAL_WORK_SIZE";
    break;
  case CL_INVALID_WORK_GROUP_SIZE:
    oclHandles.error_str += "CL_INVALID_WORK_GROUP_SIZE";
    break;
  case CL_INVALID_WORK_ITEM_SIZE:
    oclHandles.error_str += "CL_INVALID_WORK_ITEM_SIZE";
    break;
  case CL_INVALID_GLOBAL_OFFSET:
    oclHandles.error_str += "CL_INVALID_GLOBAL_OFFSET";
    break;
  case CL_OUT_OF_RESOURCES:
    oclHandles.error_str += "CL_OUT_OF_RESOURCES";
    break;
  case CL_MEM_OBJECT_ALLOCATION_FAILURE:
    oclHandles.error_str += "CL_MEM_OBJECT_ALLOCATION_FAILURE";
    break;
  case CL_INVALID_EVENT_WAIT_LIST:
    oclHandles.error_str += "CL_INVALID_EVENT_WAIT_LIST";
    break;
  case CL_OUT_OF_HOST_MEMORY:
    oclHandles.error_str += "CL_OUT_OF_HOST_MEMORY";
    break;
  default:
    oclHandles.error_str += "Unkown reseason";
    break;
  }
  if (oclHandles.cl_status != CL_SUCCESS)
    throw(oclHandles.error_str);
#endif
}
//--------------------------------------------------------
// release OpenCL objects
// Releases one reference to memory object `ob`.
// A NULL handle is a no-op: nothing is released and neither
// oclHandles.cl_status nor oclHandles.error_str is touched. Previously a NULL
// handle fell through to the status check below, which then inspected (and
// could throw on) the stale status of an unrelated earlier call.
// When ERRMSG is defined and a release was attempted, oclHandles.error_str is
// rewritten and thrown if the status is not CL_SUCCESS.
void _clFree(cl_mem ob) throw(string) {
  if (ob == NULL)
    return;
  oclHandles.cl_status = clReleaseMemObject(ob);
#ifdef ERRMSG
  oclHandles.error_str = "excpetion in _clFree() ->";
  switch (oclHandles.cl_status) {
  case CL_INVALID_MEM_OBJECT:
    oclHandles.error_str += "CL_INVALID_MEM_OBJECT";
    break;
  case CL_OUT_OF_RESOURCES:
    oclHandles.error_str += "CL_OUT_OF_RESOURCES";
    break;
  case CL_OUT_OF_HOST_MEMORY:
    oclHandles.error_str += "CL_OUT_OF_HOST_MEMORY";
    break;
  default:
    oclHandles.error_str += "Unkown reseason";
    break;
  }
  if (oclHandles.cl_status != CL_SUCCESS)
    throw(oclHandles.error_str);
#endif
}
#endif //_CL_HELPER_

View File

@@ -0,0 +1,68 @@
# Build pipeline for the bfs OpenCL benchmark:
#   kernel.cl --poclcc--> lib$(PROJECT).a
#   $(SRCS) + runtime sources + lib$(PROJECT).a --g++--> $(PROJECT).elf
#   $(PROJECT).elf --objcopy/objdump--> $(PROJECT).hex / $(PROJECT).dump
# Every path below can be overridden from the command line or environment.

RISCV_TOOL_PATH ?= $(wildcard ../../../../riscv-gnu-toolchain/drops)
POCL_CC_PATH ?= $(wildcard ../../../../pocl/drops_riscv_cc)
POCL_INC_PATH ?= $(wildcard ../include)
POCL_LIB_PATH ?= $(wildcard ../lib)
VX_RT_PATH ?= $(wildcard ../../../runtime)
VX_SIMX_PATH ?= $(wildcard ../../../simX/obj_dir)

CC = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-gcc
CXX = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-g++
DMP = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-objdump
HEX = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-objcopy
GDB = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-gdb

VX_SRCS = $(VX_RT_PATH)/newlib/newlib.c
VX_SRCS += $(VX_RT_PATH)/startup/vx_start.S
VX_SRCS += $(VX_RT_PATH)/intrinsics/vx_intrinsics.s
VX_SRCS += $(VX_RT_PATH)/io/vx_io.s $(VX_RT_PATH)/io/vx_io.c
VX_SRCS += $(VX_RT_PATH)/fileio/fileio.s
VX_SRCS += $(VX_RT_PATH)/tests/tests.c
VX_SRCS += $(VX_RT_PATH)/vx_api/vx_api.c
# None of these are set in this file; they expand to nothing unless the caller
# provides them.
VX_SRCS += $(VX_STR) $(VX_FIO) $(VX_NEWLIB) $(VX_INT) $(VX_IO) $(VX_API) $(VX_TEST)

VX_CFLAGS = -nostartfiles -Wl,-Bstatic,-T,$(VX_RT_PATH)/mains/vortex_link.ld

CXXFLAGS = -g -O0 -march=rv32im -mabi=ilp32
# program may not begin at main()
CXXFLAGS += -ffreestanding
# enable garbage collection of unused input sections
CXXFLAGS += -Wl,--gc-sections
# disable RTTI and non-call exceptions (ordinary throw/catch stays enabled)
CXXFLAGS += -fno-rtti -fno-non-call-exceptions
CXXFLAGS += -I$(POCL_INC_PATH)

VX_LIBS = -Wl,--whole-archive lib$(PROJECT).a -Wl,--no-whole-archive $(POCL_LIB_PATH)/libOpenCL.a
QEMU_LIBS = $(VX_RT_PATH)/qemu/vx_api.c -Wl,--whole-archive lib$(PROJECT).a -Wl,--no-whole-archive $(POCL_LIB_PATH)/libOpenCL.a

PROJECT = bfs
SRCS = main.cc

# Remove a target whose recipe failed (e.g. a truncated .dump written through
# a shell redirect) so it is not mistaken for an up-to-date file.
.DELETE_ON_ERROR:

# These targets are commands, not files.
.PHONY: all run qemu gdb-s gdb-c clean

all: $(PROJECT).dump $(PROJECT).hex

lib$(PROJECT).a: kernel.cl
	POCL_DEBUG=all POCL_DEBUG_LLVM_PASSES=1 LD_LIBRARY_PATH=$(RISCV_TOOL_PATH)/lib:$(POCL_CC_PATH)/lib $(POCL_CC_PATH)/bin/poclcc -o $@ $<

$(PROJECT).elf: $(SRCS) lib$(PROJECT).a
	$(CXX) $(CXXFLAGS) $(VX_CFLAGS) $(VX_SRCS) $(SRCS) $(VX_LIBS) -o $@

$(PROJECT).qemu: $(SRCS) lib$(PROJECT).a
	$(CXX) $(CXXFLAGS) $(SRCS) $(QEMU_LIBS) -o $@

$(PROJECT).hex: $(PROJECT).elf
	$(HEX) -O ihex $< $@

$(PROJECT).dump: $(PROJECT).elf
	$(DMP) -D $< > $@

run: $(PROJECT).hex
	POCL_DEBUG=all $(VX_SIMX_PATH)/Vcache_simX -E -a rv32i --core $(PROJECT).hex -s -b 1> emulator.debug

qemu: $(PROJECT).qemu
	POCL_DEBUG=all $(RISCV_TOOL_PATH)/bin/qemu-riscv32 -strace -d in_asm -D debug.log $(PROJECT).qemu

gdb-s: $(PROJECT).qemu
	POCL_DEBUG=all $(RISCV_TOOL_PATH)/bin/qemu-riscv32 -g 1234 -d in_asm -D debug.log $(PROJECT).qemu

gdb-c: $(PROJECT).qemu
	$(GDB) $(PROJECT).qemu

clean:
	rm -rf *.o *.elf *.dump *.hex *.qemu *.log *.debug

View File

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,53 @@
/* ============================================================
//--cambine: kernel function of Breadth-First-Search
//--author: created by Jianbin Fang
//--date: 06/12/2010
============================================================ */
//#pragma OPENCL EXTENSION cl_khr_byte_addressable_store: enable
//Structure to hold a node information
// One graph node: its outgoing edges are the no_of_edges consecutive entries
// of the edge array beginning at index `starting`.
typedef struct{
// index of this node's first entry in the edge array
int starting;
// number of consecutive edge entries belonging to this node
int no_of_edges;
} Node;
//--7 parameters
// Frontier expansion step, one work-item per node.
// A work-item whose node is flagged in g_graph_mask clears that flag and, for
// every neighbour not yet marked in g_graph_visited, writes the neighbour's
// cost as (own cost + 1) and flags it in g_updating_graph_mask.
// Work-items with tid >= no_of_nodes do nothing.
__kernel void BFS_1(const __global Node* g_graph_nodes,
                    const __global int* g_graph_edges,
                    __global char* g_graph_mask,
                    __global char* g_updating_graph_mask,
                    __global char* g_graph_visited,
                    __global int* g_cost,
                    const int no_of_nodes) {
    const int tid = get_global_id(0);
    // Nothing to do for padding work-items or nodes outside the frontier.
    if (tid >= no_of_nodes || !g_graph_mask[tid]) {
        return;
    }
    g_graph_mask[tid] = false;

    const int first_edge = g_graph_nodes[tid].starting;
    const int edge_end = g_graph_nodes[tid].no_of_edges + first_edge;
    for (int e = first_edge; e < edge_end; e++) {
        const int neighbour = g_graph_edges[e];
        if (g_graph_visited[neighbour]) {
            continue;
        }
        g_cost[neighbour] = g_cost[tid] + 1;
        g_updating_graph_mask[neighbour] = true;
    }
}
//--5 parameters
// Frontier commit step, one work-item per node.
// A node flagged in g_updating_graph_mask becomes part of the next frontier
// (g_graph_mask), is marked visited, has its updating flag cleared, and sets
// *g_over so the host knows another iteration is required.
// Work-items with tid >= no_of_nodes do nothing.
__kernel void BFS_2(__global char* g_graph_mask,
                    __global char* g_updating_graph_mask,
                    __global char* g_graph_visited,
                    __global char* g_over,
                    const int no_of_nodes) {
    const int tid = get_global_id(0);
    if (tid >= no_of_nodes || !g_updating_graph_mask[tid]) {
        return;
    }
    g_graph_mask[tid] = true;
    g_graph_visited[tid] = true;
    *g_over = true;
    g_updating_graph_mask[tid] = false;
}

Binary file not shown.

297
benchmarks/old_opencl/bfs/main.cc Executable file
View File

@@ -0,0 +1,297 @@
//--by Jianbin Fang
#include <cstdlib>
#include <cstring>
#include <iostream>
#include <string>
#include <stdio.h>
#include <stdlib.h>
#include <sys/stat.h>
#ifdef PROFILING
#include "timer.h"
#endif
#include "CLHelper.h"
#include "util.h"
#define MAX_THREADS_PER_BLOCK 256
// Structure to hold a node information
struct Node {
// index of this node's first entry in the edge array
int starting;
// number of consecutive edge entries belonging to this node
int no_of_edges;
};
//----------------------------------------------------------
//--bfs on cpu
//--programmer: jianbin
//--date: 26/01/2011
//--note: width is changed to the new_width
//----------------------------------------------------------
void run_bfs_cpu(int no_of_nodes, Node *h_graph_nodes, int edge_list_size,
int *h_graph_edges, char *h_graph_mask,
char *h_updating_graph_mask, char *h_graph_visited,
int *h_cost_ref) {
char stop;
int k = 0;
do {
// if no thread changes this value then the loop stops
stop = false;
for (int tid = 0; tid < no_of_nodes; tid++) {
if (h_graph_mask[tid] == true) {
h_graph_mask[tid] = false;
for (int i = h_graph_nodes[tid].starting;
i < (h_graph_nodes[tid].no_of_edges + h_graph_nodes[tid].starting);
i++) {
int id =
h_graph_edges[i]; //--cambine: node id is connected with node tid
if (!h_graph_visited[id]) { //--cambine: if node id has not been
//visited, enter the body below
h_cost_ref[id] = h_cost_ref[tid] + 1;
h_updating_graph_mask[id] = true;
}
}
}
}
for (int tid = 0; tid < no_of_nodes; tid++) {
if (h_updating_graph_mask[tid] == true) {
h_graph_mask[tid] = true;
h_graph_visited[tid] = true;
stop = true;
h_updating_graph_mask[tid] = false;
}
}
k++;
} while (stop);
}
//----------------------------------------------------------
//--breadth first search on GPUs
//----------------------------------------------------------
// Runs breadth-first search on the OpenCL device.
//
// Uploads the graph and the three mask arrays, then alternates kernel 0
// (expand frontier) and kernel 1 (commit frontier) until kernel 1 leaves the
// one-byte "over" flag cleared. The resulting costs are read back into h_cost.
//
// All device buffers and the OpenCL state are released on both the normal and
// the error path. Any string thrown by a helper is rethrown prefixed with
// "in run_bfs_gpu -> ".
void run_bfs_gpu(int no_of_nodes, Node *h_graph_nodes, int edge_list_size,
                 int *h_graph_edges, char *h_graph_mask,
                 char *h_updating_graph_mask, char *h_graph_visited,
                 int *h_cost) throw(std::string) {
  // Initialised because d_over is created with a copy of this byte.
  char h_over = false;
  // NULL-initialised so the catch block never releases an indeterminate
  // handle when an early call (e.g. _clInit) throws.
  cl_mem d_graph_nodes = NULL, d_graph_edges = NULL, d_graph_mask = NULL,
         d_updating_graph_mask = NULL, d_graph_visited = NULL, d_cost = NULL,
         d_over = NULL;
  try {
    //--1 transfer data from host to device
    _clInit();
    d_graph_nodes = _clMalloc(no_of_nodes * sizeof(Node), h_graph_nodes);
    d_graph_edges = _clMalloc(edge_list_size * sizeof(int), h_graph_edges);
    d_graph_mask = _clMallocRW(no_of_nodes * sizeof(char), h_graph_mask);
    d_updating_graph_mask =
        _clMallocRW(no_of_nodes * sizeof(char), h_updating_graph_mask);
    d_graph_visited = _clMallocRW(no_of_nodes * sizeof(char), h_graph_visited);
    d_cost = _clMallocRW(no_of_nodes * sizeof(int), h_cost);
    d_over = _clMallocRW(sizeof(char), &h_over);
    _clMemcpyH2D(d_graph_nodes, no_of_nodes * sizeof(Node), h_graph_nodes);
    _clMemcpyH2D(d_graph_edges, edge_list_size * sizeof(int), h_graph_edges);
    _clMemcpyH2D(d_graph_mask, no_of_nodes * sizeof(char), h_graph_mask);
    _clMemcpyH2D(d_updating_graph_mask, no_of_nodes * sizeof(char),
                 h_updating_graph_mask);
    _clMemcpyH2D(d_graph_visited, no_of_nodes * sizeof(char), h_graph_visited);
    _clMemcpyH2D(d_cost, no_of_nodes * sizeof(int), h_cost);
    //--2 invoke kernel
#ifdef PROFILING
    timer kernel_timer;
    double kernel_time = 0.0;
    kernel_timer.reset();
    kernel_timer.start();
#endif
    do {
      // Cleared before each round; kernel 1 sets it when it promotes a node.
      h_over = false;
      _clMemcpyH2D(d_over, sizeof(char), &h_over);
      //--kernel 0
      int kernel_id = 0;
      int kernel_idx = 0;
      _clSetArgs(kernel_id, kernel_idx++, d_graph_nodes);
      _clSetArgs(kernel_id, kernel_idx++, d_graph_edges);
      _clSetArgs(kernel_id, kernel_idx++, d_graph_mask);
      _clSetArgs(kernel_id, kernel_idx++, d_updating_graph_mask);
      _clSetArgs(kernel_id, kernel_idx++, d_graph_visited);
      _clSetArgs(kernel_id, kernel_idx++, d_cost);
      _clSetArgs(kernel_id, kernel_idx++, &no_of_nodes, sizeof(int));
      _clInvokeKernel(kernel_id, no_of_nodes, work_group_size);
      //--kernel 1
      kernel_id = 1;
      kernel_idx = 0;
      _clSetArgs(kernel_id, kernel_idx++, d_graph_mask);
      _clSetArgs(kernel_id, kernel_idx++, d_updating_graph_mask);
      _clSetArgs(kernel_id, kernel_idx++, d_graph_visited);
      _clSetArgs(kernel_id, kernel_idx++, d_over);
      _clSetArgs(kernel_id, kernel_idx++, &no_of_nodes, sizeof(int));
      _clInvokeKernel(kernel_id, no_of_nodes, work_group_size);
      _clMemcpyD2H(d_over, sizeof(char), &h_over);
    } while (h_over);
    _clFinish();
#ifdef PROFILING
    kernel_timer.stop();
    kernel_time = kernel_timer.getTimeInSeconds();
#endif
    //--3 transfer data from device to host
    _clMemcpyD2H(d_cost, no_of_nodes * sizeof(int), h_cost);
    //--statistics
#ifdef PROFILING
    std::cout << "kernel time(s):" << kernel_time << std::endl;
#endif
    //--4 release cl resources.
    _clFree(d_graph_nodes);
    _clFree(d_graph_edges);
    _clFree(d_graph_mask);
    _clFree(d_updating_graph_mask);
    _clFree(d_graph_visited);
    _clFree(d_cost);
    _clFree(d_over);
    _clRelease();
  } catch (const std::string &msg) {
    _clFree(d_graph_nodes);
    _clFree(d_graph_edges);
    _clFree(d_graph_mask);
    _clFree(d_updating_graph_mask);
    _clFree(d_graph_visited);
    _clFree(d_cost);
    _clFree(d_over);
    _clRelease();
    // The prefix used to read "run_transpose_gpu", pointing at the wrong
    // function.
    std::string e_str = "in run_bfs_gpu -> ";
    e_str += msg;
    throw(e_str);
  }
  return;
}
//----------------------------------------------------------
//--cambine: main function
//--author: created by Jianbin Fang
//--date: 25/01/2011
//----------------------------------------------------------
// Loads the graph from "graph4096.txt", runs BFS from node 0 on the device
// and on the host, and compares the two cost arrays.
//
// File format as read here: node count; one "start edge_count" pair per node;
// a source node id (read and then ignored -- node 0 is always used); edge
// count; one "destination cost" pair per edge (the cost is discarded).
//
// Always returns 0. Malformed input, allocation failure and device errors are
// reported on stdout. All host allocations and the input file are released on
// every path after the file has been opened.
int main(int argc, char *argv[]) {
  printf("enter demo main\n");
  int no_of_nodes = 0;
  int edge_list_size = 0;
  FILE *fp = NULL;
  // Everything is NULL-initialised so the single cleanup section at the end
  // is safe no matter where an exception was raised.
  Node *h_graph_nodes = NULL;
  char *h_graph_mask = NULL, *h_updating_graph_mask = NULL,
       *h_graph_visited = NULL;
  int *h_graph_edges = NULL, *h_cost = NULL, *h_cost_ref = NULL;
  try {
    const char *input_f = "graph4096.txt";
    printf("Reading File\n");
    // Read in Graph from a file
    fp = fopen(input_f, "r");
    if (!fp) {
      printf("Error Reading graph file\n");
      return 0;
    }
    printf("Reading File completed!\n");
    int source = 0;
    if (fscanf(fp, "%d", &no_of_nodes) != 1 || no_of_nodes <= 0)
      throw std::string("invalid node count in graph file");
    // Use one work-group of no_of_nodes items for small graphs, otherwise
    // cap the work-group size at MAX_THREADS_PER_BLOCK.
    int num_of_threads_per_block = no_of_nodes;
    if (no_of_nodes > MAX_THREADS_PER_BLOCK)
      num_of_threads_per_block = MAX_THREADS_PER_BLOCK;
    work_group_size = num_of_threads_per_block;
    // allocate host memory
    h_graph_nodes = (Node *)malloc(sizeof(Node) * no_of_nodes);
    h_graph_mask = (char *)malloc(sizeof(char) * no_of_nodes);
    h_updating_graph_mask = (char *)malloc(sizeof(char) * no_of_nodes);
    h_graph_visited = (char *)malloc(sizeof(char) * no_of_nodes);
    if (!h_graph_nodes || !h_graph_mask || !h_updating_graph_mask ||
        !h_graph_visited)
      throw std::string("out of host memory");
    int start, edgeno;
    // initialize the memory
    for (int i = 0; i < no_of_nodes; i++) {
      if (fscanf(fp, "%d %d", &start, &edgeno) != 2)
        throw std::string("truncated node list in graph file");
      h_graph_nodes[i].starting = start;
      h_graph_nodes[i].no_of_edges = edgeno;
      h_graph_mask[i] = false;
      h_updating_graph_mask[i] = false;
      h_graph_visited[i] = false;
    }
    // The source node stored in the file is consumed but deliberately
    // overridden: the search always starts at node 0.
    if (fscanf(fp, "%d", &source) != 1)
      throw std::string("missing source node in graph file");
    source = 0;
    // set the source node as true in the mask
    h_graph_mask[source] = true;
    h_graph_visited[source] = true;
    if (fscanf(fp, "%d", &edge_list_size) != 1 || edge_list_size < 0)
      throw std::string("invalid edge count in graph file");
    int id, cost;
    h_graph_edges = (int *)malloc(sizeof(int) * edge_list_size);
    // malloc(0) may legitimately return NULL, so only a non-empty request
    // counts as a failure.
    if (!h_graph_edges && edge_list_size > 0)
      throw std::string("out of host memory");
    for (int i = 0; i < edge_list_size; i++) {
      if (fscanf(fp, "%d %d", &id, &cost) != 2)
        throw std::string("truncated edge list in graph file");
      h_graph_edges[i] = id;
    }
    fclose(fp);
    fp = NULL;
    // allocate mem for the result on host side
    h_cost = (int *)malloc(sizeof(int) * no_of_nodes);
    h_cost_ref = (int *)malloc(sizeof(int) * no_of_nodes);
    if (!h_cost || !h_cost_ref)
      throw std::string("out of host memory");
    for (int i = 0; i < no_of_nodes; i++) {
      h_cost[i] = -1;
      h_cost_ref[i] = -1;
    }
    h_cost[source] = 0;
    h_cost_ref[source] = 0;
    //---------------------------------------------------------
    //--gpu entry
    run_bfs_gpu(no_of_nodes, h_graph_nodes, edge_list_size, h_graph_edges,
                h_graph_mask, h_updating_graph_mask, h_graph_visited, h_cost);
    //---------------------------------------------------------
    //--cpu entry
    // initialize the memory again
    for (int i = 0; i < no_of_nodes; i++) {
      h_graph_mask[i] = false;
      h_updating_graph_mask[i] = false;
      h_graph_visited[i] = false;
    }
    // set the source node as true in the mask
    source = 0;
    h_graph_mask[source] = true;
    h_graph_visited[source] = true;
    run_bfs_cpu(no_of_nodes, h_graph_nodes, edge_list_size, h_graph_edges,
                h_graph_mask, h_updating_graph_mask, h_graph_visited,
                h_cost_ref);
    //---------------------------------------------------------
    //--result verification
    compare_results<int>(h_cost_ref, h_cost, no_of_nodes);
  } catch (const std::string &msg) {
    std::cout << "--cambine: exception in main ->" << msg << std::endl;
  }
  // Single cleanup path shared by success and failure. The edge and cost
  // arrays used to be leaked on both.
  if (fp)
    fclose(fp);
  free(h_graph_nodes);
  free(h_graph_mask);
  free(h_updating_graph_mask);
  free(h_graph_visited);
  free(h_graph_edges);
  free(h_cost);
  free(h_cost_ref);
  return 0;
}

1
benchmarks/old_opencl/bfs/run Executable file
View File

@@ -0,0 +1 @@
./bfs ../../data/bfs/graph1MW_6.txt

View File

@@ -0,0 +1,78 @@
#include <cstdlib>
#include <cstring>
#include <fstream>
#include <iomanip>
#include "timer.h"
using namespace std;
double timer::CPU_speed_in_MHz = timer::get_CPU_speed_in_MHz();
double timer::get_CPU_speed_in_MHz()
{
#if defined __linux__
ifstream infile("/proc/cpuinfo");
char buffer[256], *colon;
while (infile.good()) {
infile.getline(buffer, 256);
if (strncmp("cpu MHz", buffer, 7) == 0 && (colon = strchr(buffer, ':')) != 0)
return atof(colon + 2);
}
#endif
return 0.0;
}
// Writes "<which> = <value><unit>" to str. `time` is divided by
// CPU_speed_in_MHz and scaled by 1000, then repeatedly divided by 1000 while
// it is at least 999.5 and a larger unit remains, stepping through
// ns, us, ms, s, ks.
void timer::print_time(ostream &str, const char *which, double time) const
{
    static const char *units[] = { " ns", " us", " ms", " s", " ks", 0 };
    int unit_index = 0;

    time = 1000.0 * time / CPU_speed_in_MHz;
    for (; time >= 999.5 && units[unit_index + 1] != 0; ++unit_index) {
        time /= 1000.0;
    }

    str << which << " = " << setprecision(3) << setw(4) << time << units[unit_index];
}
// Writes a one-line report to str: the timer name (or "timer" when unnamed)
// padded to 25 columns, followed by either a note that the CPU speed is
// unknown, the average/total/count figures, or "not used" when the timer has
// never been stopped. Returns str.
ostream &timer::print(ostream &str)
{
    const char *label = (name != 0 ? name : "timer");
    str << left << setw(25) << label << ": " << right;

    if (CPU_speed_in_MHz == 0) {
        str << "could not determine CPU speed\n";
        return str;
    }
    if (!(count > 0)) {
        str << "not used\n";
        return str;
    }

    double total = static_cast<double>(total_time);
    print_time(str, "avg", total / static_cast<double>(count));
    print_time(str, ", total", total);
    str << ", count = " << setw(9) << count << '\n';
    return str;
}
// Stream insertion for timers: forwards to timer::print and returns the
// stream it returns.
ostream &operator << (ostream &str, class timer &timer)
{
    ostream &result = timer.print(str);
    return result;
}
// Converts the accumulated total_time to seconds by dividing by one million
// and then by CPU_speed_in_MHz.
double timer::getTimeInSeconds()
{
    const double millions = static_cast<double>(total_time) / 1000000.0;
    return millions / CPU_speed_in_MHz;
}

101
benchmarks/old_opencl/bfs/timer.h Executable file
View File

@@ -0,0 +1,101 @@
#ifndef timer_h
#define timer_h
#include <iostream>
// Accumulating stopwatch. start() subtracts a counter reading from
// total_time and stop() adds one, so total_time holds the sum of the
// start-to-stop intervals; count is the number of stop() calls.
class timer {
public:
// name is used as the label in print(); 0 means unnamed.
timer(const char *name = 0);
// As above, and the destructor prints the report to write_on_exit.
timer(const char *name, std::ostream &write_on_exit);
~timer();
void start(), stop();
// Zeroes the accumulated total and the count.
void reset();
// Writes a one-line report and returns the stream.
std::ostream &print(std::ostream &);
double getTimeInSeconds();
private:
void print_time(std::ostream &, const char *which, double time) const;
// total_time overlaid with its two int halves so the inline assembly in
// start()/stop() can operate on each half separately.
union {
long long total_time;
struct {
// The order of the halves is swapped when __PPC__ is defined.
#if defined __PPC__
int high, low;
#else
int low, high;
#endif
};
};
unsigned long long count;
const char *const name;
// Stream to report to on destruction, or 0 for none.
std::ostream *const write_on_exit;
// CPU_speed_in_MHz is shared by all timers; 0 is treated by print() as
// "could not determine CPU speed".
static double CPU_speed_in_MHz, get_CPU_speed_in_MHz();
};
std::ostream &operator<<(std::ostream &, class timer &);
// Clears both the number of recorded intervals and the accumulated total.
inline void timer::reset() {
  count = 0;
  total_time = 0;
}
// Builds a timer that does not report on destruction, starting from a zero
// total and zero count.
inline timer::timer(const char *name)
    : name(name), write_on_exit(0) {
  total_time = 0;
  count = 0;
}
// Builds a timer that prints its report to write_on_exit when destroyed,
// starting from a zero total and zero count.
inline timer::timer(const char *name, std::ostream &write_on_exit)
    : name(name), write_on_exit(&write_on_exit) {
  total_time = 0;
  count = 0;
}
// Prints the report to the stream given at construction, if there was one.
inline timer::~timer() {
  if (write_on_exit == 0)
    return;
  print(*write_on_exit);
}
// Opens a timing interval by subtracting the current rdtsc reading from
// total_time; the matching stop() adds the later reading back, leaving the
// elapsed count accumulated. Only x86 builds with the listed compilers are
// supported; anything else is a compile-time error.
inline void timer::start() {
#if (defined __PATHSCALE__) && (defined __i386 || defined __x86_64)
// PathScale: read the two 32-bit halves and combine them in C++.
unsigned eax, edx;
asm volatile("rdtsc" : "=a"(eax), "=d"(edx));
total_time -= ((unsigned long long)edx << 32) + eax;
#elif (defined __GNUC__ || defined __INTEL_COMPILER) && \
(defined __i386 || defined __x86_64)
// GCC/ICC: 64-bit subtraction done in place on the low/high halves of
// total_time (subl on low, then sbbl on high to carry the borrow).
asm volatile("rdtsc\n\t"
"subl %%eax, %0\n\t"
"sbbl %%edx, %1"
: "+m"(low), "+m"(high)
:
: "eax", "edx");
#else
#error Compiler/Architecture not recognized
#endif
}
// Closes a timing interval by adding the current rdtsc reading to total_time
// and incrementing count. Unlike start(), there is no #error branch here: on
// an unrecognised compiler/architecture only count is incremented.
inline void timer::stop() {
#if (defined __PATHSCALE__) && (defined __i386 || defined __x86_64)
// PathScale: read the two 32-bit halves and combine them in C++.
unsigned eax, edx;
asm volatile("rdtsc" : "=a"(eax), "=d"(edx));
total_time += ((unsigned long long)edx << 32) + eax;
#elif (defined __GNUC__ || defined __INTEL_COMPILER) && \
(defined __i386 || defined __x86_64)
// GCC/ICC: 64-bit addition done in place on the low/high halves of
// total_time (addl on low, then adcl on high to propagate the carry).
asm volatile("rdtsc\n\t"
"addl %%eax, %0\n\t"
"adcl %%edx, %1"
: "+m"(low), "+m"(high)
:
: "eax", "edx");
#endif
++count;
}
#endif

View File

@@ -0,0 +1,72 @@
#ifndef _C_UTIL_
#define _C_UTIL_
#include <math.h>
#include <iostream>
//-------------------------------------------------------------------
//--initialize array with maximum limit
//-------------------------------------------------------------------
// Fills A[0..n-1] with pseudo-random values: each element is maxi multiplied
// by rand() / (RAND_MAX + 1.0f), converted to datatype on assignment.
template<typename datatype>
void fill(datatype *A, const int n, const datatype maxi){
    int j = 0;
    while (j < n) {
        const float fraction = rand() / (RAND_MAX + 1.0f);
        A[j] = ((datatype) maxi * fraction);
        ++j;
    }
}
//--print matrix
// Prints a row-major height x width matrix to std::cout: every element is
// followed by a single space and every row is terminated with std::endl.
template<typename datatype>
void print_matrix(datatype *A, int height, int width){
    int cursor = 0;  // running row-major index, equal to row*width + col
    for (int row = 0; row < height; ++row) {
        for (int col = 0; col < width; ++col) {
            std::cout << A[cursor] << " ";
            ++cursor;
        }
        std::cout << std::endl;
    }
}
//-------------------------------------------------------------------
//--verify results
//-------------------------------------------------------------------
// Largest accepted |cpu - gpu| relative to |cpu|.
#define MAX_RELATIVE_ERROR .002
// Compares gpuResults against cpuResults element by element using a relative
// tolerance and prints a pass/fail line to std::cout.
//
// An element mismatches when |cpu - gpu| > MAX_RELATIVE_ERROR * |cpu|.
// Comparing against the magnitude of the reference fixes the previous form,
// |cpu - gpu| / cpu, which was negative for every negative reference value
// and therefore never reported a mismatch. A reference of exactly zero
// requires an exact match, as before.
template<typename datatype>
void verify_array(const datatype *cpuResults, const datatype *gpuResults, const int size){
    int mismatches = 0;
    // A reduction replaces the unsynchronised writes to a shared flag.
    #pragma omp parallel for reduction(+:mismatches)
    for (int i = 0; i < size; i++) {
        const double reference = static_cast<double>(cpuResults[i]);
        const double difference = fabs(reference - static_cast<double>(gpuResults[i]));
        if (difference > MAX_RELATIVE_ERROR * fabs(reference)) {
            mismatches += 1;
        }
    }
    if (mismatches == 0) {
        std::cout << "--cambine:passed:-)" << std::endl;
    }
    else {
        std::cout << "--cambine: failed:-(" << std::endl;
    }
    return ;
}
// Compares two arrays for exact element-wise equality and prints a pass/fail
// line to std::cout. An empty range (size <= 0) passes.
// Stream manipulators are fully qualified so this header no longer depends on
// a using-directive being in effect where it is included.
template<typename datatype>
void compare_results(const datatype *cpu_results, const datatype *gpu_results, const int size){
    bool passed = true;
    for (int i = 0; i < size; i++) {
        if (cpu_results[i] != gpu_results[i]) {
            passed = false;
            break;  // one mismatch decides the outcome
        }
    }
    if (passed) {
        std::cout << "--cambine:passed:-)" << std::endl;
    }
    else {
        std::cout << "--cambine: failed:-(" << std::endl;
    }
    return ;
}
#endif

View File

@@ -0,0 +1,68 @@
# Build pipeline for the convolution OpenCL benchmark:
#   kernel.cl --poclcc--> lib$(PROJECT).a
#   $(SRCS) + runtime sources + lib$(PROJECT).a --g++--> $(PROJECT).elf
#   $(PROJECT).elf --objcopy/objdump--> $(PROJECT).hex / $(PROJECT).dump
# Every path below can be overridden from the command line or environment.

RISCV_TOOL_PATH ?= $(wildcard ../../../../riscv-gnu-toolchain/drops)
POCL_CC_PATH ?= $(wildcard ../../../../pocl/drops_riscv_cc)
POCL_INC_PATH ?= $(wildcard ../include)
POCL_LIB_PATH ?= $(wildcard ../lib)
VX_RT_PATH ?= $(wildcard ../../../runtime)
VX_SIMX_PATH ?= $(wildcard ../../../simX/obj_dir)

CC = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-gcc
CXX = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-g++
DMP = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-objdump
HEX = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-objcopy
GDB = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-gdb

VX_SRCS = $(VX_RT_PATH)/newlib/newlib.c
VX_SRCS += $(VX_RT_PATH)/startup/vx_start.S
VX_SRCS += $(VX_RT_PATH)/intrinsics/vx_intrinsics.s
VX_SRCS += $(VX_RT_PATH)/io/vx_io.s $(VX_RT_PATH)/io/vx_io.c
VX_SRCS += $(VX_RT_PATH)/fileio/fileio.s
VX_SRCS += $(VX_RT_PATH)/tests/tests.c
VX_SRCS += $(VX_RT_PATH)/vx_api/vx_api.c
# None of these are set in this file; they expand to nothing unless the caller
# provides them.
VX_SRCS += $(VX_STR) $(VX_FIO) $(VX_NEWLIB) $(VX_INT) $(VX_IO) $(VX_API) $(VX_TEST)

VX_CFLAGS = -nostartfiles -Wl,-Bstatic,-T,$(VX_RT_PATH)/mains/vortex_link.ld

CXXFLAGS = -g -O0 -march=rv32im -mabi=ilp32
# program may not begin at main()
CXXFLAGS += -ffreestanding
# enable garbage collection of unused input sections
CXXFLAGS += -Wl,--gc-sections
# disable RTTI and non-call exceptions (ordinary throw/catch stays enabled)
CXXFLAGS += -fno-rtti -fno-non-call-exceptions
CXXFLAGS += -I$(POCL_INC_PATH)

VX_LIBS = -Wl,--whole-archive lib$(PROJECT).a -Wl,--no-whole-archive $(POCL_LIB_PATH)/libOpenCL.a
QEMU_LIBS = $(VX_RT_PATH)/qemu/vx_api.c -Wl,--whole-archive lib$(PROJECT).a -Wl,--no-whole-archive $(POCL_LIB_PATH)/libOpenCL.a

PROJECT = convolution
SRCS = main.cpp utils.cpp

# Remove a target whose recipe failed (e.g. a truncated .dump written through
# a shell redirect) so it is not mistaken for an up-to-date file.
.DELETE_ON_ERROR:

# These targets are commands, not files.
.PHONY: all run qemu gdb-s gdb-c clean

all: $(PROJECT).dump $(PROJECT).hex

lib$(PROJECT).a: kernel.cl
	POCL_DEBUG=all POCL_DEBUG_LLVM_PASSES=1 LD_LIBRARY_PATH=$(RISCV_TOOL_PATH)/lib:$(POCL_CC_PATH)/lib $(POCL_CC_PATH)/bin/poclcc -o $@ $<

$(PROJECT).elf: $(SRCS) lib$(PROJECT).a
	$(CXX) $(CXXFLAGS) $(VX_CFLAGS) $(VX_SRCS) $(SRCS) $(VX_LIBS) -o $@

$(PROJECT).qemu: $(SRCS) lib$(PROJECT).a
	$(CXX) $(CXXFLAGS) $(SRCS) $(QEMU_LIBS) -o $@

$(PROJECT).hex: $(PROJECT).elf
	$(HEX) -O ihex $< $@

$(PROJECT).dump: $(PROJECT).elf
	$(DMP) -D $< > $@

run: $(PROJECT).hex
	POCL_DEBUG=all $(VX_SIMX_PATH)/Vcache_simX -E -a rv32i --core $(PROJECT).hex -s -b 1> emulator.debug

qemu: $(PROJECT).qemu
	POCL_DEBUG=all $(RISCV_TOOL_PATH)/bin/qemu-riscv32 -d in_asm -D debug.log $(PROJECT).qemu

gdb-s: $(PROJECT).qemu
	POCL_DEBUG=all $(RISCV_TOOL_PATH)/bin/qemu-riscv32 -g 1234 -d in_asm -D debug.log $(PROJECT).qemu

gdb-c: $(PROJECT).qemu
	$(GDB) $(PROJECT).qemu

clean:
	rm -rf *.o *.elf *.dump *.hex *.qemu *.log *.debug

Binary file not shown.

After

Width:  |  Height:  |  Size: 44 KiB

View File

@@ -0,0 +1,54 @@
/*
 * 2D convolution of a single-channel image with a square filter.
 *
 * Each work-item produces one output pixel: global id 0 selects the column,
 * global id 1 the row. The filter is read row by row from `filter`
 * (filterWidth x filterWidth taps) and only the 'x' channel of each pixel is
 * accumulated. Work-items outside rows x cols compute a sum but write nothing.
 * Reads outside the image are resolved by `sampler`.
 */
__kernel
void convolution(
    __read_only image2d_t sourceImage,
    __write_only image2d_t outputImage,
    int rows,
    int cols,
    __constant float* filter,
    int filterWidth,
    sampler_t sampler)
{
    // Pixel handled by this work-item
    const int column = get_global_id(0);
    const int row = get_global_id(1);

    // Number of taps on either side of the centre tap
    const int halfWidth = filterWidth / 2;

    // Image reads return float4; only .x carries data here
    float4 sum = (float4)(0.0f, 0.0f, 0.0f, 0.0f);

    // Walk the filter in row-major order, tapIdx tracking the current tap
    int tapIdx = 0;
    for (int dy = -halfWidth; dy <= halfWidth; dy++) {
        for (int dx = -halfWidth; dx <= halfWidth; dx++) {
            int2 pos;
            pos.x = column + dx;
            pos.y = row + dy;
            const float4 pixel = read_imagef(sourceImage, sampler, pos);
            sum.x += pixel.x * filter[tapIdx];
            tapIdx++;
        }
    }

    // Work-items that fall outside the image have nothing to store
    if (row >= rows || column >= cols) {
        return;
    }

    int2 outPos;
    outPos.x = column;
    outPos.y = row;
    write_imagef(outputImage, outPos, sum);
}

View File

@@ -0,0 +1,261 @@
#include <stdio.h>
#include <stdlib.h>
#include <CL/cl.h>
#include "utils.h"
// Round `value` up to the nearest multiple of `multiple`.
// A value that already is a multiple is returned unchanged. A `multiple` of
// zero has no multiples, so `value` is returned as-is instead of dividing by
// zero.
unsigned int roundUp(unsigned int value, unsigned int multiple) {
    if(multiple == 0) {
        return value;
    }
    // Distance past the previous multiple
    unsigned int remainder = value % multiple;
    // Add the difference to reach the next multiple
    if(remainder != 0) {
        value += (multiple-remainder);
    }
    return value;
}
// Read the whole file at `kernelPath` into a newly allocated, NUL-terminated
// buffer and return it. The caller owns the buffer and releases it with free().
// Any failure (open, seek, allocation, short read) prints a message and
// terminates the process with exit(-1).
char* readSource(char* kernelPath) {
    FILE *fp;
    char *source;
    long int size;
    size_t bytesRead;

    printf("Program file is: %s\n", kernelPath);
    fp = fopen(kernelPath, "rb");
    if(!fp) {
        printf("Could not open kernel file\n");
        exit(-1);
    }
    // Determine the file size by seeking to its end
    if(fseek(fp, 0, SEEK_END) != 0) {
        printf("Error seeking to end of file\n");
        exit(-1);
    }
    size = ftell(fp);
    if(size < 0) {
        printf("Error getting file position\n");
        exit(-1);
    }
    rewind(fp);

    // One extra byte for the terminating NUL. The allocation is checked
    // before the buffer is touched.
    source = (char *)malloc((size_t)size + 1);
    if(source == NULL) {
        printf("Error allocating space for the kernel source\n");
        exit(-1);
    }

    bytesRead = fread(source, 1, (size_t)size, fp);
    if(bytesRead != (size_t)size) {
        printf("Error reading the kernel source\n");
        exit(-1);
    }
    fclose(fp);

    source[size] = '\0';
    return source;
}
// Abort the program when an OpenCL call did not succeed.
// `cmd` is the name printed in the failure message together with the status.
void chk(cl_int status, const char* cmd) {
    if(status == CL_SUCCESS) {
        return;
    }
    printf("%s failed (%d)\n", cmd, status);
    exit(-1);
}
// Host driver: loads input.bmp, runs the `convolution` kernel from kernel.cl
// on the first OpenCL device, stores the result as output.bmp and compares it
// with a reference computed on the host. Any OpenCL failure aborts via chk().
int main() {
    int i, j, k, l;
    // Rows and columns in the input image
    int imageHeight;
    int imageWidth;
    const char* inputFile = "input.bmp";
    const char* outputFile = "output.bmp";
    // Mutable copy of the path: readSource() takes a non-const char*
    char kernelFile[] = "kernel.cl";

    // Homegrown function to read a BMP from file
    float* inputImage = readImage(inputFile, &imageWidth, &imageHeight);

    // Size in bytes of the input and output images on the host
    size_t dataSize = (size_t)imageHeight*imageWidth*sizeof(float);

    // Device result and host reference
    float* outputImage = (float*)malloc(dataSize);
    float* refImage = (float*)malloc(dataSize);
    if(outputImage == NULL || refImage == NULL) {
        printf("Error allocating host image buffers\n");
        exit(-1);
    }

    // 3x3 horizontal-gradient taps (-1 0 1 / -2 0 2 / -1 0 1) centred in a
    // 7x7 stencil; all other taps are zero
    float filter[49] =
        {0, 0,  0, 0, 0, 0, 0,
         0, 0,  0, 0, 0, 0, 0,
         0, 0, -1, 0, 1, 0, 0,
         0, 0, -2, 0, 2, 0, 0,
         0, 0, -1, 0, 1, 0, 0,
         0, 0,  0, 0, 0, 0, 0,
         0, 0,  0, 0, 0, 0, 0};
    // The convolution filter is 7x7
    int filterWidth = 7;
    int filterSize = filterWidth*filterWidth; // Assume a square kernel

    // Set up the OpenCL environment
    cl_int status;

    // Discover platform
    cl_platform_id platform;
    status = clGetPlatformIDs(1, &platform, NULL);
    chk(status, "clGetPlatformIDs");

    // Discover device (the status must be captured, otherwise chk() would
    // re-test the result of the previous call)
    cl_device_id device;
    status = clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 1, &device, NULL);
    chk(status, "clGetDeviceIDs");

    // Create context
    cl_context_properties props[3] = {CL_CONTEXT_PLATFORM,
        (cl_context_properties)(platform), 0};
    cl_context context;
    context = clCreateContext(props, 1, &device, NULL, NULL, &status);
    chk(status, "clCreateContext");

    // Create command queue
    cl_command_queue queue;
    queue = clCreateCommandQueue(context, device, 0, &status);
    chk(status, "clCreateCommandQueue");

    // The image format describes how the data will be stored in memory
    cl_image_format format;
    format.image_channel_order = CL_R; // single channel
    format.image_channel_data_type = CL_FLOAT; // float data type

    // Create space for the source image on the device
    cl_mem d_inputImage = clCreateImage2D(context, 0, &format, imageWidth,
        imageHeight, 0, NULL, &status);
    chk(status, "clCreateImage2D");

    // Create space for the output image on the device
    cl_mem d_outputImage = clCreateImage2D(context, 0, &format, imageWidth,
        imageHeight, 0, NULL, &status);
    chk(status, "clCreateImage2D");

    // Create space for the 7x7 filter on the device
    cl_mem d_filter = clCreateBuffer(context, 0, filterSize*sizeof(float),
        NULL, &status);
    chk(status, "clCreateBuffer");

    // Copy the source image to the device
    size_t origin[3] = {0, 0, 0}; // Offset within the image to copy from
    size_t region[3] = {(size_t)imageWidth, (size_t)imageHeight, 1}; // Elements per dimension
    status = clEnqueueWriteImage(queue, d_inputImage, CL_FALSE, origin, region,
        0, 0, inputImage, 0, NULL, NULL);
    chk(status, "clEnqueueWriteImage");

    // Copy the 7x7 filter to the device
    status = clEnqueueWriteBuffer(queue, d_filter, CL_FALSE, 0,
        filterSize*sizeof(float), filter, 0, NULL, NULL);
    chk(status, "clEnqueueWriteBuffer");

    // Create the image sampler: unnormalized coordinates, out-of-range reads
    // clamped to the nearest edge pixel, no interpolation
    cl_sampler sampler = clCreateSampler(context, CL_FALSE,
        CL_ADDRESS_CLAMP_TO_EDGE, CL_FILTER_NEAREST, &status);
    chk(status, "clCreateSampler");

    const char* source = readSource(kernelFile);

    // Create a program object with source and build it
    cl_program program;
    program = clCreateProgramWithSource(context, 1, &source, NULL, &status);
    chk(status, "clCreateProgramWithSource");
    status = clBuildProgram(program, 1, &device, NULL, NULL, NULL);
    chk(status, "clBuildProgram");

    // Create the kernel object
    cl_kernel kernel;
    kernel = clCreateKernel(program, "convolution", &status);
    chk(status, "clCreateKernel");

    // Set the kernel arguments
    status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &d_inputImage);
    status |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &d_outputImage);
    status |= clSetKernelArg(kernel, 2, sizeof(int), &imageHeight);
    status |= clSetKernelArg(kernel, 3, sizeof(int), &imageWidth);
    status |= clSetKernelArg(kernel, 4, sizeof(cl_mem), &d_filter);
    status |= clSetKernelArg(kernel, 5, sizeof(int), &filterWidth);
    status |= clSetKernelArg(kernel, 6, sizeof(cl_sampler), &sampler);
    chk(status, "clSetKernelArg");

    // One work-item per pixel
    size_t globalSize[2] = {(size_t)imageWidth, (size_t)imageHeight};
    status = clEnqueueNDRangeKernel(queue, kernel, 2, NULL, globalSize, NULL, 0,
        NULL, NULL);
    chk(status, "clEnqueueNDRange");

    // Read the image back to the host (blocking)
    status = clEnqueueReadImage(queue, d_outputImage, CL_TRUE, origin,
        region, 0, 0, outputImage, 0, NULL, NULL);
    chk(status, "clEnqueueReadImage");

    // Write the output image to file
    storeImage(outputImage, outputFile, imageHeight, imageWidth, inputFile);

    // Compute the reference image on the host. Neighbours outside the image
    // are clamped to the nearest edge pixel so the reference follows the same
    // rule as the device sampler (CL_ADDRESS_CLAMP_TO_EDGE).
    int halfFilterWidth = filterWidth/2;
    float sum;
    for(i = 0; i < imageHeight; i++) {
        for(j = 0; j < imageWidth; j++) {
            sum = 0; // Reset sum for new source pixel
            // Apply the filter to the neighborhood
            for(k = - halfFilterWidth; k <= halfFilterWidth; k++) {
                int y = i + k;
                if(y < 0) y = 0;
                if(y >= imageHeight) y = imageHeight - 1;
                for(l = - halfFilterWidth; l <= halfFilterWidth; l++) {
                    int x = j + l;
                    if(x < 0) x = 0;
                    if(x >= imageWidth) x = imageWidth - 1;
                    sum += inputImage[y*imageWidth + x] *
                        filter[(k+halfFilterWidth)*filterWidth +
                        l+halfFilterWidth];
                }
            }
            refImage[i*imageWidth+j] = sum;
        }
    }

    // Compare device and host results; stop at the first mismatch
    int failed = 0;
    for(i = 0; i < imageHeight && !failed; i++) {
        for(j = 0; j < imageWidth && !failed; j++) {
            // Floating-point absolute difference (integer abs() would
            // truncate the difference before the comparison)
            float diff = outputImage[i*imageWidth+j] - refImage[i*imageWidth+j];
            if(diff < 0) {
                diff = -diff;
            }
            if(diff > 0.01) {
                printf("Results are INCORRECT\n");
                printf("Pixel mismatch at <%d,%d> (%f vs. %f)\n", i, j,
                    outputImage[i*imageWidth+j], refImage[i*imageWidth+j]);
                failed = 1;
            }
        }
    }
    if(!failed) {
        printf("Results are correct\n");
    }

    // Release OpenCL objects and host buffers
    clReleaseKernel(kernel);
    clReleaseProgram(program);
    clReleaseSampler(sampler);
    clReleaseMemObject(d_filter);
    clReleaseMemObject(d_outputImage);
    clReleaseMemObject(d_inputImage);
    clReleaseCommandQueue(queue);
    clReleaseContext(context);
    free((void*)source);
    free(refImage);
    free(outputImage);
    free(inputImage);

    return 0;
}

View File

@@ -0,0 +1,180 @@
#include <stdio.h>
#include <stdlib.h>
#include "utils.h"
/*
 * Write `imageOut` (rows x cols floats, top row first) as an 8-bit BMP to
 * `filename`.
 *
 * The header (everything before the pixel data) is copied verbatim from the
 * existing BMP `refFilename`, whose width and height must equal cols and rows.
 * Pixel values are clamped to [0, 255] before the conversion to bytes, rows
 * are written bottom-up and padded with zero bytes to a multiple of four.
 * Any failure prints a message and terminates the process with exit(-1).
 */
void storeImage(float *imageOut,
                const char *filename,
                int rows,
                int cols,
                const char* refFilename) {
    FILE *ifp, *ofp;
    unsigned char *buffer;
    unsigned char *rowBytes;
    int offset = 0;
    int height = 0, width = 0;
    int i, j;
    int mod, stride;

    ifp = fopen(refFilename, "rb");
    if(ifp == NULL) {
        perror(refFilename);
        exit(-1);
    }

    // Offset of the pixel data is stored at byte 10, width and height at
    // bytes 18 and 22 of the header
    if(fseek(ifp, 10, SEEK_SET) != 0 || fread(&offset, 4, 1, ifp) != 1 ||
       fseek(ifp, 18, SEEK_SET) != 0 || fread(&width, 4, 1, ifp) != 1 ||
       fread(&height, 4, 1, ifp) != 1) {
        printf("error reading header of %s\n", refFilename);
        exit(-1);
    }
    // The copied header describes width x height pixels, so the data written
    // below must have exactly that shape
    if(offset <= 0 || width <= 0 || height <= 0 ||
       width != cols || height != rows) {
        printf("reference image %s does not match a %dx%d output\n",
               refFilename, cols, rows);
        exit(-1);
    }

    buffer = (unsigned char *)malloc(offset);
    if(buffer == NULL) {
        perror("malloc");
        exit(-1);
    }
    if(fseek(ifp, 0, SEEK_SET) != 0 ||
       fread(buffer, 1, offset, ifp) != (size_t)offset) {
        printf("error reading header of %s\n", refFilename);
        exit(-1);
    }

    printf("Writing output image to %s\n", filename);
    ofp = fopen(filename, "wb");
    if(ofp == NULL) {
        perror("opening output file");
        exit(-1);
    }
    if(fwrite(buffer, 1, offset, ofp) != (size_t)offset) {
        printf("error writing header!\n");
        exit(-1);
    }

    // In bmp format, rows must be a multiple of 4 bytes
    mod = width % 4;
    if(mod != 0) {
        mod = 4 - mod;
    }
    stride = width + mod;

    // calloc keeps the padding bytes at the end of the row zero
    rowBytes = (unsigned char *)calloc(stride, 1);
    if(rowBytes == NULL) {
        perror("malloc");
        exit(-1);
    }

    // NOTE bmp formats store data in reverse raster order, so the image is
    // written bottom row first.
    for(i = height-1; i >= 0; i--) {
        for(j = 0; j < width; j++) {
            float v = imageOut[i*cols+j];
            // Converting an out-of-range float to unsigned char is
            // undefined; clamp first (the negated test also maps NaN to 0)
            if(!(v > 0.0f)) {
                rowBytes[j] = 0;
            } else if(v >= 255.0f) {
                rowBytes[j] = 255;
            } else {
                rowBytes[j] = (unsigned char)v;
            }
        }
        if(fwrite(rowBytes, 1, stride, ofp) != (size_t)stride) {
            printf("error writing pixel data!\n");
            exit(-1);
        }
    }

    fclose(ofp);
    fclose(ifp);
    free(rowBytes);
    free(buffer);
}
/*
 * Read an 8-bit BMP image and return its pixels as floats.
 *
 * The width and height found in the header are stored in *widthOut and
 * *heightOut. The returned array holds width*height values, top row first
 * (the file stores rows bottom-up), and must be released with free().
 * Any failure prints a message and terminates the process with exit(-1).
 */
float* readImage(const char *filename, int* widthOut, int* heightOut) {
    unsigned char* rowBytes;
    float* floatImage;
    int height = 0, width = 0;
    int offset = 0;
    int i, j;
    int mod;

    printf("Reading input image from %s\n", filename);
    FILE *fp = fopen(filename, "rb");
    if(fp == NULL) {
        perror(filename);
        exit(-1);
    }

    // Offset of the pixel data is stored at byte 10, width and height at
    // bytes 18 and 22 of the header
    if(fseek(fp, 10, SEEK_SET) != 0 || fread(&offset, 4, 1, fp) != 1 ||
       fseek(fp, 18, SEEK_SET) != 0 || fread(&width, 4, 1, fp) != 1 ||
       fread(&height, 4, 1, fp) != 1) {
        printf("error reading header of %s\n", filename);
        exit(-1);
    }
    printf("width = %d\n", width);
    printf("height = %d\n", height);

    // Reject headers the code below cannot handle, including sizes whose
    // byte count would not fit in size_t
    if(offset < 0 || width <= 0 || height <= 0 ||
       (size_t)width > ((size_t)-1) / sizeof(float) / (size_t)height) {
        printf("unsupported image dimensions in %s\n", filename);
        exit(-1);
    }
    *widthOut = width;
    *heightOut = height;

    floatImage = (float*)malloc(sizeof(float)*width*height);
    rowBytes = (unsigned char*)malloc(width);
    if(floatImage == NULL || rowBytes == NULL) {
        perror("malloc");
        exit(-1);
    }

    if(fseek(fp, offset, SEEK_SET) != 0) {
        printf("error seeking to pixel data of %s\n", filename);
        exit(-1);
    }
    // Kept from the original code: flush pending output before the bulk read
    fflush(NULL);

    // For the bmp format, each row is padded to a multiple of 4 bytes
    mod = width % 4;
    if(mod != 0) {
        mod = 4 - mod;
    }

    // NOTE bitmaps are stored in upside-down raster order: the first row in
    // the file is the bottom row of the image. Each file row is therefore
    // stored directly at its flipped position in the output.
    for(i = 0; i < height; i++) {
        float* dst = floatImage + (size_t)(height-1-i)*width;
        if(fread(rowBytes, 1, width, fp) != (size_t)width) {
            printf("error reading pixel data of %s\n", filename);
            exit(-1);
        }
        // Convert the bytes of this row to float
        for(j = 0; j < width; j++) {
            dst[j] = (float)rowBytes[j];
        }
        // Skip the padding; a file that ends without it is still accepted
        fseek(fp, mod, SEEK_CUR);
    }

    fclose(fp);
    free(rowBytes);
    return floatImage;
}

View File

@@ -0,0 +1,11 @@
/* BMP helpers shared by the convolution sample. */
#ifndef __UTILS__
#define __UTILS__
/* Raw 8-bit value. */
typedef unsigned char uchar;
/* Read the BMP at `filename`; stores its width and height in the two output
 * parameters and returns the pixels converted to float in a malloc'd array. */
float* readImage(const char *filename, int* widthOut, int* heightOut);
/* Write `imageOut` to `filename` as a BMP, copying the file header from the
 * existing BMP `refFilename`. */
void storeImage(float *imageOut, const char *filename, int rows, int cols,
const char* refFilename);
#endif

View File

@@ -0,0 +1,68 @@
# Tool and library locations. Each can be overridden from the environment or
# the command line; the defaults are relative paths that are kept only if they
# exist ($(wildcard) yields an empty string otherwise).
RISCV_TOOL_PATH ?= $(wildcard ../../../../riscv-gnu-toolchain/drops)
POCL_CC_PATH ?= $(wildcard ../../../../pocl/drops_riscv_cc)
POCL_INC_PATH ?= $(wildcard ../include)
POCL_LIB_PATH ?= $(wildcard ../lib)
VX_RT_PATH ?= $(wildcard ../../../runtime)
VX_SIMX_PATH ?= $(wildcard ../../../simX/obj_dir)
# Cross tools taken from $(RISCV_TOOL_PATH)/bin.
CC = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-gcc
CXX = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-g++
DMP = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-objdump
HEX = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-objcopy
GDB = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-gdb
# Runtime sources that are compiled together with the application (see the
# $(PROJECT).elf rule, which passes $(VX_SRCS) to the compiler).
VX_SRCS = $(VX_RT_PATH)/newlib/newlib.c
VX_SRCS += $(VX_RT_PATH)/startup/vx_start.S
VX_SRCS += $(VX_RT_PATH)/intrinsics/vx_intrinsics.s
VX_SRCS += $(VX_RT_PATH)/io/vx_io.s $(VX_RT_PATH)/io/vx_io.c
VX_SRCS += $(VX_RT_PATH)/fileio/fileio.s
VX_SRCS += $(VX_RT_PATH)/tests/tests.c
VX_SRCS += $(VX_RT_PATH)/vx_api/vx_api.c
# NOTE(review): none of the variables below is assigned in this makefile;
# unless they come from the environment or command line they expand to nothing.
VX_SRCS += $(VX_STR) $(VX_FIO) $(VX_NEWLIB) $(VX_INT) $(VX_IO) $(VX_API) $(VX_TEST)
# Extra flags for the .elf link only: no default startup files, static link,
# and the linker script found under $(VX_RT_PATH)/mains.
VX_CFLAGS = -nostartfiles -Wl,-Bstatic,-T,$(VX_RT_PATH)/mains/vortex_link.ld
# Flags shared by the .elf and .qemu builds.
CXXFLAGS = -g -O0 -march=rv32im -mabi=ilp32
CXXFLAGS += -ffreestanding # program may not begin at main()
CXXFLAGS += -Wl,--gc-sections # enable garbage collection of unused input sections
CXXFLAGS += -fno-rtti -fno-non-call-exceptions # disable RTTI and exceptions
CXXFLAGS += -I$(POCL_INC_PATH) -I.
# Link inputs. These are recursively expanded (=) on purpose: they reference
# $(PROJECT), which is only assigned further down.
VX_LIBS = -Wl,--whole-archive lib$(PROJECT).a -Wl,--no-whole-archive $(POCL_LIB_PATH)/libOpenCL.a
QEMU_LIBS = $(VX_RT_PATH)/qemu/vx_api.c -Wl,--whole-archive lib$(PROJECT).a -Wl,--no-whole-archive $(POCL_LIB_PATH)/libOpenCL.a
# Base name of every generated artifact and the host-side sources of this benchmark.
PROJECT = cutcp
SRCS = main.cc args.c parboil_opencl.c ocl.c gpu_info.c cutoff.c cutcpu.c output.c readatom.c excl.c
# Goals that name actions rather than files. Declaring them phony keeps a stray
# file called e.g. "clean" or "run" from silently disabling the goal.
.PHONY: all run qemu gdb-s gdb-c clean

# Remove a target whose recipe failed part-way (for example the redirected
# .dump file), so a truncated output is never mistaken for an up-to-date one.
.DELETE_ON_ERROR:

# Default goal: disassembly listing plus Intel HEX image of the application.
all: $(PROJECT).dump $(PROJECT).hex

# Compile the OpenCL kernel ahead of time with poclcc into a static library.
lib$(PROJECT).a: kernel.cl
	POCL_DEBUG=all POCL_DEBUG_LLVM_PASSES=1 LD_LIBRARY_PATH=$(RISCV_TOOL_PATH)/lib:$(POCL_CC_PATH)/lib $(POCL_CC_PATH)/bin/poclcc -o lib$(PROJECT).a kernel.cl

# Image for the simulator: runtime sources + host sources + kernel library.
$(PROJECT).elf: $(SRCS) lib$(PROJECT).a
	$(CXX) $(CXXFLAGS) $(VX_CFLAGS) $(VX_SRCS) $(SRCS) $(VX_LIBS) -o $(PROJECT).elf

# Variant for QEMU: same host sources, linked against $(QEMU_LIBS) instead.
$(PROJECT).qemu: $(SRCS) lib$(PROJECT).a
	$(CXX) $(CXXFLAGS) $(SRCS) $(QEMU_LIBS) -o $(PROJECT).qemu

$(PROJECT).hex: $(PROJECT).elf
	$(HEX) -O ihex $(PROJECT).elf $(PROJECT).hex

$(PROJECT).dump: $(PROJECT).elf
	$(DMP) -D $(PROJECT).elf > $(PROJECT).dump

# Run the HEX image on the simulator; its standard output goes to emulator.debug.
# NOTE(review): the simulator is started with "-a rv32i" while CXXFLAGS builds
# for -march=rv32im -- confirm this is intended.
run: $(PROJECT).hex
	POCL_DEBUG=all $(VX_SIMX_PATH)/Vcache_simX -E -a rv32i --core $(PROJECT).hex -s -b 1> emulator.debug

# Run under user-mode QEMU, logging translated instructions to debug.log.
qemu: $(PROJECT).qemu
	POCL_DEBUG=all $(RISCV_TOOL_PATH)/bin/qemu-riscv32 -d in_asm -D debug.log $(PROJECT).qemu

# Debug server: QEMU waits for a debugger on port 1234.
gdb-s: $(PROJECT).qemu
	POCL_DEBUG=all $(RISCV_TOOL_PATH)/bin/qemu-riscv32 -g 1234 -d in_asm -D debug.log $(PROJECT).qemu

# Debug client: start gdb on the QEMU binary.
gdb-c: $(PROJECT).qemu
	$(GDB) $(PROJECT).qemu

clean:
	rm -rf *.o *.elf *.dump *.hex *.qemu *.log *.debug

View File

@@ -0,0 +1,617 @@
#include <parboil.h>
#include <errno.h>
#include <limits.h>
#include <stdlib.h>
#include <string.h>
#include <stdio.h>
/*****************************************************************************/
/* Memory management routines */
/* Release a NULL-terminated array of heap-allocated strings: every element
 * first, then the array itself. A NULL array is ignored. */
void
pb_FreeStringArray(char **string_array)
{
  int idx;

  if (string_array == NULL) return;

  for (idx = 0; string_array[idx] != NULL; idx++) {
    free(string_array[idx]);
  }
  free(string_array);
}
/* Create a platform selector from a name and an optional version.
 *
 * The two pointers are stored as given (not copied); pb_FreePlatformParam()
 * later passes them to free(). 'name' must not be NULL, 'version' may be.
 * Terminates the process on a NULL name or when memory is exhausted. */
struct pb_PlatformParam *
pb_PlatformParam(char *name, char *version)
{
  if (name == NULL) {
    fprintf(stderr, "pb_PlatformParam: Invalid argument\n");
    exit(-1);
  }

  struct pb_PlatformParam *ret =
    (struct pb_PlatformParam *)malloc(sizeof (struct pb_PlatformParam));
  if (ret == NULL) {
    fprintf(stderr, "pb_PlatformParam: Out of memory\n");
    exit(-1);
  }

  ret->name = name;
  ret->version = version;
  return ret;
}
/* Release a platform selector together with the name and version strings it
 * holds. A NULL selector is ignored. */
void
pb_FreePlatformParam(struct pb_PlatformParam *p)
{
  if (p != NULL) {
    free(p->version);
    free(p->name);
    free(p);
  }
}
/* Allocate an uninitialized device selector; terminates the process when
 * memory is exhausted, so the constructors below never dereference NULL. */
static struct pb_DeviceParam *
alloc_device_param(void)
{
  struct pb_DeviceParam *ret =
    (struct pb_DeviceParam *)malloc(sizeof (struct pb_DeviceParam));
  if (ret == NULL) {
    fprintf(stderr, "pb_DeviceParam: Out of memory\n");
    exit(-1);
  }
  return ret;
}

/* Select a device by its index. */
struct pb_DeviceParam *
pb_DeviceParam_index(int index)
{
  struct pb_DeviceParam *ret = alloc_device_param();
  ret->criterion = pb_Device_INDEX;
  ret->index = index;
  return ret;
}

/* Select a CPU device. */
struct pb_DeviceParam *
pb_DeviceParam_cpu(void)
{
  struct pb_DeviceParam *ret = alloc_device_param();
  ret->criterion = pb_Device_CPU;
  return ret;
}

/* Select a GPU device. */
struct pb_DeviceParam *
pb_DeviceParam_gpu(void)
{
  struct pb_DeviceParam *ret = alloc_device_param();
  ret->criterion = pb_Device_GPU;
  return ret;
}

/* Select an accelerator device. */
struct pb_DeviceParam *
pb_DeviceParam_accelerator(void)
{
  struct pb_DeviceParam *ret = alloc_device_param();
  ret->criterion = pb_Device_ACCELERATOR;
  return ret;
}

/* Select a device by name. The pointer is stored as given (not copied);
 * pb_FreeDeviceParam() later passes it to free(). */
struct pb_DeviceParam *
pb_DeviceParam_name(char *name)
{
  struct pb_DeviceParam *ret = alloc_device_param();
  ret->criterion = pb_Device_NAME;
  ret->name = name;
  return ret;
}
void
pb_FreeDeviceParam(struct pb_DeviceParam *p)
{
if (p == NULL) return;
switch(p->criterion) {
case pb_Device_NAME:
free(p->name);
break;
case pb_Device_INDEX:
case pb_Device_CPU:
case pb_Device_ACCELERATOR:
break;
default:
fprintf(stderr, "pb_FreeDeviceParam: Invalid argument\n");
exit(-1);
}
}
/* Release a parameter set and everything it holds: output file name, input
 * file list, platform selector and device selector. A NULL pointer is
 * ignored, like in the other pb_Free* functions. */
void
pb_FreeParameters(struct pb_Parameters *p)
{
  if (p == NULL) return;

  free(p->outFile);
  pb_FreeStringArray(p->inpFiles);
  pb_FreePlatformParam(p->platform);
  pb_FreeDeviceParam(p->device);
  free(p);
}
/*****************************************************************************/
/* Parse a comma-delimited list of strings into a NULL-terminated array of
 * newly allocated strings.
 *
 * The input always yields at least one item; empty items (for example in
 * "a,,b" or "") become empty strings. The result is released with
 * pb_FreeStringArray(). Terminates the process when memory is exhausted. */
static char **
read_string_array(char *in)
{
  char **ret;
  int i;
  int count;          /* Number of items in the input */
  char *substring;    /* Current substring within 'in' */

  /* Count the number of items in the string */
  count = 1;
  for (i = 0; in[i]; i++) if (in[i] == ',') count++;

  /* Allocate storage, including one slot for the sentinel */
  ret = (char **)malloc((count + 1) * sizeof(char *));
  if (ret == NULL) {
    fprintf(stderr, "read_string_array: Out of memory\n");
    exit(-1);
  }

  /* Create copies of the strings from the list */
  substring = in;
  for (i = 0; i < count; i++) {
    char *substring_end;
    int substring_length;

    /* Find length of substring */
    for (substring_end = substring;
         (*substring_end != ',') && (*substring_end != 0);
         substring_end++);

    substring_length = substring_end - substring;

    /* Allocate memory and copy the substring */
    ret[i] = (char *)malloc(substring_length + 1);
    if (ret[i] == NULL) {
      fprintf(stderr, "read_string_array: Out of memory\n");
      exit(-1);
    }
    memcpy(ret[i], substring, substring_length);
    ret[i][substring_length] = 0;

    /* Go to the next substring; stay put at the end of the input */
    if (*substring_end == ',') substring = substring_end + 1;
    else substring = substring_end;
  }
  ret[count] = NULL;  /* Write the sentinel value */

  return ret;
}
/* Print a parse error message to stderr, exactly as given. */
static void
report_parse_error(const char *str)
{
  fprintf(stderr, "%s", str);
}
/* Interpret a string as a 'pb_DeviceParam' value.
 * Return a pointer to a new value, or NULL on failure.
 *
 * A string that consists entirely of a decimal integer is a device index;
 * negative or out-of-range indices are a failure. "CPU", "GPU" and
 * "ACCELERATOR" select a device class. Any other string is taken as a
 * device name.
 */
static struct pb_DeviceParam *
read_device_param(char *str)
{
  char *end;
  long device_int;
  char *name_copy;

  /* strtol() does not set errno for non-numeric input, so 'end' is what
   * tells whether the whole string was consumed as a number */
  errno = 0;
  device_int = strtol(str, &end, 10);
  if (end != str && *end == '\0') {
    /* Overflow (errno set) and negative numbers are not valid */
    if (errno != 0 || device_int < 0 || device_int > INT_MAX) return NULL;
    return pb_DeviceParam_index((int)device_int);
  }

  /* Match against predefined strings */
  if (strcmp(str, "CPU") == 0)
    return pb_DeviceParam_cpu();
  if (strcmp(str, "GPU") == 0)
    return pb_DeviceParam_gpu();
  if (strcmp(str, "ACCELERATOR") == 0)
    return pb_DeviceParam_accelerator();

  /* Assume any other string is a device name; the selector owns the copy */
  name_copy = strdup(str);
  if (name_copy == NULL) return NULL;
  return pb_DeviceParam_name(name_copy);
}
/* Interpret a string as a 'pb_PlatformParam' value.
 * Return a pointer to a new value, or NULL on failure.
 *
 * The string has the form NAME or NAME-VERSION, split at the last '-'.
 * Both parts are copied; without a '-' the version is NULL.
 */
static struct pb_PlatformParam *
read_platform_param(char *str)
{
  /* Last '-' in 'str', or NULL if there is none */
  const char *separator = strrchr(str, '-');

  /* The platform name is either the entire string, or all characters before
   * the separator */
  size_t name_length = separator ? (size_t)(separator - str) : strlen(str);
  char *name_str = (char *)malloc(name_length + 1);
  if (name_str == NULL) return NULL;
  memcpy(name_str, str, name_length);
  name_str[name_length] = 0;

  /* The version is either NULL, or all characters after the separator */
  char *version_str = NULL;
  if (separator != NULL) {
    const char *version_input_str = separator + 1;
    size_t version_length = strlen(version_input_str);

    version_str = (char *)malloc(version_length + 1);
    if (version_str == NULL) {
      free(name_str);
      return NULL;
    }
    memcpy(version_str, version_input_str, version_length);
    version_str[version_length] = 0;
  }

  /* Create output structure; it takes ownership of both strings */
  return pb_PlatformParam(name_str, version_str);
}
/****************************************************************************/
/* Argument parsing state */
/* State of one pass over the command line.
 *
 * Arguments understood by the parser are dropped from argv; 'argc' and 'argn'
 * only count the arguments that are kept.
 *
 * The array is compacted in place while parsing: kept arguments are copied
 * down over the dropped ones. 'argv_get' is the element that is read next,
 * 'argv_put' the element that receives the next kept argument.
 */
struct argparse {
  /* Number of arguments still counted; decreases as arguments are dropped. */
  int argc;

  /* Index of the current argument among the kept ones. */
  int argn;

  /* Read position in the argument array. */
  char **argv_get;

  /* Write position in the argument array; argv_put <= argv_get. */
  char **argv_put;
};
/* Start parsing 'argv': nothing has been read or dropped yet, so the read and
 * write positions both point at the first element. */
static void
initialize_argparse(struct argparse *ap, int argc, char **argv)
{
  ap->argv_put = argv;
  ap->argv_get = argv;
  ap->argn = 0;
  ap->argc = argc;
}
/* End argument parsing without interpreting what is left: the remaining
 * arguments are kept as they are. The number of kept arguments is stored in
 * *_argc and argv is terminated with NULL after the last kept argument. */
static void
finalize_argparse(struct argparse *ap, int *_argc, char **argv)
{
  /* Copy the unread arguments down to their final positions */
  while (ap->argn < ap->argc) {
    *ap->argv_put = *ap->argv_get;
    ap->argv_put++;
    ap->argv_get++;
    ap->argn++;
  }

  /* Report the new argument count */
  *_argc = ap->argc;

  /* Terminate the compacted array */
  argv[ap->argc] = NULL;
}
/* Drop the current argument: it is skipped on the read side and no longer
 * counted, so it is absent from argv once parsing is finished. Calling this
 * with no argument left only prints a diagnostic. */
static void
delete_argument(struct argparse *ap)
{
  if (ap->argn >= ap->argc)
    fprintf(stderr, "delete_argument\n");

  ap->argv_get += 1;
  ap->argc -= 1;
}
/* Keep the current argument and advance: the argument is copied to its final
 * slot in argv and the index moves on. Calling this with no argument left
 * only prints a diagnostic. */
static void
next_argument(struct argparse *ap)
{
  if (ap->argn >= ap->argc)
    fprintf(stderr, "next_argument\n");

  /* Store the argument at the write position, then advance both positions */
  *ap->argv_put = *ap->argv_get;
  ap->argv_put++;
  ap->argv_get++;
  ap->argn += 1;
}
/* Nonzero when every counted argument has been visited. */
static int
is_end_of_arguments(struct argparse *ap)
{
  return ap->argc == ap->argn;
}
/* Return the argument at the read position without consuming it. */
static char *
get_argument(struct argparse *ap)
{
  return ap->argv_get[0];
}
/* Return the argument at the read position and drop it from the list. */
static char *
consume_argument(struct argparse *ap)
{
  char *current = get_argument(ap);

  delete_argument(ap);
  return current;
}
/****************************************************************************/
/* Outcome of handling one command-line option. */
typedef enum {
  ARGPARSE_OK,      /* Handled; carry on with the next argument */
  ARGPARSE_ERROR,   /* Handling failed */
  ARGPARSE_DONE     /* Handled; stop parsing here */
} result;

/* Handler invoked for an option. It may consume further arguments through
 * 'ap' and records what it parsed in 'params'. */
typedef result parse_action(struct argparse *ap, struct pb_Parameters *params);
/* Description of one command-line option. */
struct option {
  /* One-character name of the option, or 0 if it has none. */
  char short_name;

  /* Long name of the option, or NULL if it has none. */
  const char *long_name;

  /* Handler run when the option occurs. A NULL handler marks the end of an
   * option table. */
  parse_action *action;
};
/* Output file
 *
 * -o FILE
 *
 * Consumes the following argument and stores a copy of it as the output file
 * name, replacing any previous one.
 */
static result
parse_output_file(struct argparse *ap, struct pb_Parameters *params)
{
  if (!is_end_of_arguments(ap)) {
    /* Replace the output file name */
    free(params->outFile);
    params->outFile = strdup(consume_argument(ap));
    return ARGPARSE_OK;
  }

  report_parse_error("Expecting file name after '-o'\n");
  return ARGPARSE_ERROR;
}
/* Input files
 *
 * -i FILE,FILE,...
 *
 * Consumes the following argument and stores the comma-separated names it
 * contains as the input file list, replacing any previous list.
 */
static result
parse_input_files(struct argparse *ap, struct pb_Parameters *params)
{
  if (!is_end_of_arguments(ap)) {
    /* Replace the input file list */
    pb_FreeStringArray(params->inpFiles);
    params->inpFiles = read_string_array(consume_argument(ap));
    return ARGPARSE_OK;
  }

  report_parse_error("Expecting file name after '-i'\n");
  return ARGPARSE_ERROR;
}
/* End of options
 *
 * --
 *
 * Stops option parsing; neither argument is used.
 */
static result
parse_end_options(struct argparse *ap, struct pb_Parameters *params)
{
  (void)ap;
  (void)params;
  return ARGPARSE_DONE;
}
/* OpenCL device
 *
 * --device X
 *
 * Consumes the following argument, interprets it as a device specification
 * and stores it, replacing any previous one.
 */
static result
parse_device(struct argparse *ap, struct pb_Parameters *params)
{
  struct pb_DeviceParam *parsed;

  /* The device specification is the next argument */
  if (is_end_of_arguments(ap)) {
    report_parse_error("Expecting device specification after '--device'\n");
    return ARGPARSE_ERROR;
  }

  parsed = read_device_param(consume_argument(ap));
  if (parsed == NULL) {
    report_parse_error("Unrecognized device specification format on command line\n");
    return ARGPARSE_ERROR;
  }

  /* Replace the previous selector with the new one */
  pb_FreeDeviceParam(params->device);
  params->device = parsed;
  return ARGPARSE_OK;
}
/* OpenCL platform
 *
 * --platform X
 *
 * Consumes the following argument, interprets it as a platform specification
 * and stores it, replacing any previous one.
 */
static result
parse_platform(struct argparse *ap, struct pb_Parameters *params)
{
  /* Read the next argument, which specifies a platform */
  if (is_end_of_arguments(ap))
    {
      report_parse_error("Expecting platform specification after '--platform'\n");
      return ARGPARSE_ERROR;
    }

  char *platform_string = consume_argument(ap);
  struct pb_PlatformParam *platform_param = read_platform_param(platform_string);

  if (!platform_param) {
    report_parse_error("Unrecognized platform specification format on command line\n");
    return ARGPARSE_ERROR;
  }

  /* Save the result */
  pb_FreePlatformParam(params->platform);
  params->platform = platform_param;

  return ARGPARSE_OK;
}
/* Options understood by the parser. Entries with a short name are matched
 * against one-character flags, entries with a long name against "--NAME".
 * The '-' entry is what the two-character argument "--" matches. The entry
 * with a NULL action terminates the table. */
static struct option options[] = {
  { 'o',  NULL,       parse_output_file },
  { 'i',  NULL,       parse_input_files },
  { '-',  NULL,       parse_end_options },
  { '\0', "device",   parse_device },
  { '\0', "platform", parse_platform },
  { '\0', NULL,       NULL }
};
/* Nonzero when 'op' is the terminating entry of an option table, i.e. has no
 * action. */
static int
is_last_option(struct option *op)
{
  return !op->action;
}
/****************************************************************************/
/* Parse command-line parameters.
 * Return zero on error, nonzero otherwise.
 * On error, the other outputs may be invalid.
 *
 * The information collected from parameters is used to update
 * 'ret'. 'ret' should be initialized.
 *
 * '_argc' and 'argv' are updated to contain only the unprocessed arguments.
 *
 * Two-character arguments "-X" are looked up as short options and arguments
 * "--NAME" as long options; an unknown one is an error. Everything else,
 * including a lone "-" and "-abc", is left in argv for the application.
 */
static int
pb_ParseParameters (struct pb_Parameters *ret, int *_argc, char **argv)
{
  struct argparse ap;

  initialize_argparse(&ap, *_argc, argv);

  /* Each argument */
  while(!is_end_of_arguments(&ap)) {
    result arg_result;          /* Result of parsing this option */
    char *arg = get_argument(&ap);

    /* "-X": exactly one character after the dash (this includes "--") */
    int is_short = (arg[0] == '-') && (arg[1] != 0) && (arg[2] == 0);
    /* "--NAME": two dashes followed by at least one character */
    int is_long = !is_short && (arg[0] == '-') && (arg[1] == '-');

    if (is_short || is_long) {
      struct option *op;

      delete_argument(&ap);     /* This argument is consumed here */

      /* Find a matching option */
      for (op = options; !is_last_option(op); op++) {
        if (is_short) {
          if (op->short_name == arg[1]) break;
        }
        else {
          if (op->long_name && strcmp(&arg[2], op->long_name) == 0) break;
        }
      }

      if (is_last_option(op)) {
        /* No option matches */
        report_parse_error("Unexpected command-line parameter\n");
        arg_result = ARGPARSE_ERROR;
      }
      else {
        arg_result = (*op->action)(&ap, ret);
      }
    }
    else {
      /* Other arguments are ignored and kept in argv */
      next_argument(&ap);
      arg_result = ARGPARSE_OK;
    }

    /* Decide what to do next based on 'arg_result' */
    if (arg_result == ARGPARSE_ERROR) {
      /* Error exit from the function */
      return 0;
    }
    if (arg_result == ARGPARSE_DONE) {
      /* Normal exit from the argument parsing loop */
      break;
    }
    /* ARGPARSE_OK: continue processing */
  } /* end for each argument */

  /* Keep whatever was not parsed and publish the new argument count */
  finalize_argparse(&ap, _argc, argv);
  return 1;
}
/*****************************************************************************/
/* Other exported functions */
/* Parse the command line into a newly allocated parameter set.
 *
 * '*_argc' and 'argv' are updated to contain only the arguments that were
 * not consumed. Returns NULL on a parse error or when memory is exhausted;
 * otherwise the result is released with pb_FreeParameters(). */
struct pb_Parameters *
pb_ReadParameters(int *_argc, char **argv)
{
  struct pb_Parameters *ret =
    (struct pb_Parameters *)malloc(sizeof(struct pb_Parameters));
  if (ret == NULL) return NULL;

  /* Initialize the parameters structure; the input list starts out empty,
   * i.e. it holds only the NULL sentinel */
  ret->outFile = NULL;
  ret->platform = NULL;
  ret->device = NULL;
  ret->inpFiles = (char **)malloc(sizeof(char *));
  if (ret->inpFiles == NULL) {
    free(ret);
    return NULL;
  }
  ret->inpFiles[0] = NULL;

  /* Read parameters and update _argc, argv */
  if (!pb_ParseParameters(ret, _argc, argv)) {
    /* Parse error */
    pb_FreeParameters(ret);
    return NULL;
  }

  return ret;
}
/* Number of input files in 'p', i.e. the number of entries before the NULL
 * sentinel of the input file list. */
int
pb_Parameters_CountInputs(struct pb_Parameters *p)
{
  int count = 0;

  while (p->inpFiles[count] != NULL) {
    count++;
  }
  return count;
}

View File

@@ -0,0 +1,37 @@
/***************************************************************************
*cr
*cr (C) Copyright 2008-2010 The Board of Trustees of the
*cr University of Illinois
*cr All Rights Reserved
*cr
***************************************************************************/
/* Atom data types and the functions that operate on them. */
#ifndef ATOM_H
#define ATOM_H
#ifdef __cplusplus
extern "C" {
#endif
/* One atom: a position (x, y, z) and a value q. */
/* NOTE(review): q is presumably the charge -- confirm against the file reader. */
typedef struct Atom_t {
float x, y, z, q;
} Atom;
/* A set of atoms: the array 'atoms' and its element count 'size'. */
typedef struct Atoms_t {
Atom *atoms;
int size;
} Atoms;
/* A point or vector in 3D space. */
typedef struct Vec3_t {
float x, y, z;
} Vec3;
/* The definitions of the functions below are not part of this header. */
/* NOTE(review): read_atom_file presumably loads the atoms stored in 'fname'
 * and free_atom releases such a set -- confirm against their definitions. */
Atoms *read_atom_file(const char *fname);
void free_atom(Atoms *atom);
/* Stores the minimum and maximum extent of the atoms in *lo and *hi. */
void get_atom_extent(Vec3 *lo, Vec3 *hi, Atoms *atom);
#ifdef __cplusplus
}
#endif
#endif /* ATOM_H */

View File

@@ -0,0 +1,195 @@
/***************************************************************************
*cr
*cr (C) Copyright 2008-2010 The Board of Trustees of the
*cr University of Illinois
*cr All Rights Reserved
*cr
***************************************************************************/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <parboil.h>
#include "atom.h"
#include "cutoff.h"
#undef DEBUG_PASS_RATE      /* define to count and print cutoff-test passes/failures */
#define CHECK_CYLINDER_CPU  /* enables the per-row (dy, dz) early-out below */
#define CELLEN 4.f          /* edge length of one geometric-hashing cell */
#define INV_CELLEN (1.f/CELLEN)
/*
 * Add the cutoff potential of every charged atom onto the lattice (CPU).
 *
 * Atoms are hashed into cells of edge CELLEN using two index arrays
 * (first[cell] is the head atom of a cell, next[atom] the following atom in
 * the same cell, -1 terminates), then visited cell by cell.  For each atom,
 * every lattice point in a clamped box of grid points around it with
 * r2 < cutoff^2 receives  q * (1/sqrt(r2)) * (1 - r2/cutoff^2)^2.
 * Atoms with q == 0 are skipped.  Values are accumulated (+=) into
 * lattice->lattice, so existing contents are preserved.
 *
 * Returns 0 on success, -1 if the index arrays cannot be allocated.
 */
extern int cpu_compute_cutoff_potential_lattice(
    Lattice *lattice,                  /* the lattice */
    float cutoff,                      /* cutoff distance */
    Atoms *atoms                       /* array of atoms */
    )
{
  int nx = lattice->dim.nx;
  int ny = lattice->dim.ny;
  int nz = lattice->dim.nz;
  float xlo = lattice->dim.lo.x;
  float ylo = lattice->dim.lo.y;
  float zlo = lattice->dim.lo.z;
  float gridspacing = lattice->dim.h;
  int natoms = atoms->size;
  Atom *atom = atoms->atoms;

  const float a2 = cutoff * cutoff;
  const float inv_a2 = 1.f / a2;
  float s;
  const float inv_gridspacing = 1.f / gridspacing;
  const int radius = (int) ceilf(cutoff * inv_gridspacing) - 1;
    /* lattice point radius about each atom */

  int n;
  int i, j, k;
  int ia, ib, ic;
  int ja, jb, jc;
  int ka, kb, kc;
  int index;
  int koff, jkoff;

  float x, y, z, q;
  float dx, dy, dz;
  float dz2, dydz2, r2;
  float e;
  float xstart, ystart;

  float *pg;

  int gindex;
  int ncell, nxcell, nycell, nzcell;
  int *first, *next;
  float inv_cellen = INV_CELLEN;
  Vec3 minext, maxext;		/* Extent of atom bounding box */

  /* Guarded with #ifdef (not #if) so it matches the counter uses below. */
#ifdef DEBUG_PASS_RATE
  unsigned long long pass_count = 0;
  unsigned long long fail_count = 0;
#endif

  /* find min and max extent */
  get_atom_extent(&minext, &maxext, atoms);

  /* number of cells in each dimension */
  nxcell = (int) floorf((maxext.x-minext.x) * inv_cellen) + 1;
  nycell = (int) floorf((maxext.y-minext.y) * inv_cellen) + 1;
  nzcell = (int) floorf((maxext.z-minext.z) * inv_cellen) + 1;
  ncell = nxcell * nycell * nzcell;

  /* allocate for cursor link list implementation */
  first = (int *) malloc(ncell * sizeof(int));
  next = (int *) malloc(natoms * sizeof(int));
  /* malloc(0) may legitimately return NULL, so an empty atom list is not
   * treated as an allocation failure */
  if (first == NULL || (next == NULL && natoms > 0)) {
    fprintf(stderr, "cpu_compute_cutoff_potential_lattice(): out of memory\n");
    free(next);
    free(first);
    return -1;
  }
  for (gindex = 0;  gindex < ncell;  gindex++) {
    first[gindex] = -1;
  }
  for (n = 0;  n < natoms;  n++) {
    next[n] = -1;
  }

  /* geometric hashing */
  for (n = 0;  n < natoms;  n++) {
    if (0==atom[n].q) continue;  /* skip any non-contributing atoms */
    i = (int) floorf((atom[n].x - minext.x) * inv_cellen);
    j = (int) floorf((atom[n].y - minext.y) * inv_cellen);
    k = (int) floorf((atom[n].z - minext.z) * inv_cellen);
    gindex = (k*nycell + j)*nxcell + i;
    /* push atom n onto the front of its cell's list */
    next[n] = first[gindex];
    first[gindex] = n;
  }

  /* traverse the grid cells */
  for (gindex = 0;  gindex < ncell;  gindex++) {
    for (n = first[gindex];  n != -1;  n = next[n]) {
      x = atom[n].x - xlo;
      y = atom[n].y - ylo;
      z = atom[n].z - zlo;
      q = atom[n].q;

      /* find closest grid point with position less than or equal to atom */
      ic = (int) (x * inv_gridspacing);
      jc = (int) (y * inv_gridspacing);
      kc = (int) (z * inv_gridspacing);

      /* find extent of surrounding box of grid points */
      ia = ic - radius;
      ib = ic + radius + 1;
      ja = jc - radius;
      jb = jc + radius + 1;
      ka = kc - radius;
      kb = kc + radius + 1;

      /* trim box edges so that they are within grid point lattice */
      if (ia < 0)   ia = 0;
      if (ib >= nx) ib = nx-1;
      if (ja < 0)   ja = 0;
      if (jb >= ny) jb = ny-1;
      if (ka < 0)   ka = 0;
      if (kb >= nz) kb = nz-1;

      /* loop over surrounding grid points */
      xstart = ia*gridspacing - x;
      ystart = ja*gridspacing - y;
      dz = ka*gridspacing - z;
      for (k = ka;  k <= kb;  k++, dz += gridspacing) {
        koff = k*ny;
        dz2 = dz*dz;
        dy = ystart;
        for (j = ja;  j <= jb;  j++, dy += gridspacing) {
          jkoff = (koff + j)*nx;
          dydz2 = dy*dy + dz2;
#ifdef CHECK_CYLINDER_CPU
          /* the whole x-row is out of range once dy^2 + dz^2 reaches a2 */
          if (dydz2 >= a2) continue;
#endif

          dx = xstart;
          index = jkoff + ia;
          pg = lattice->lattice + index;

#if defined(__INTEL_COMPILER)
          for (i = ia;  i <= ib;  i++, pg++, dx += gridspacing) {
            r2 = dx*dx + dydz2;
            s = (1.f - r2 * inv_a2) * (1.f - r2 * inv_a2);
            e = q * (1/sqrtf(r2)) * s;
            *pg += (r2 < a2 ? e : 0);  /* LOOP VECTORIZED!! */
          }
#else
          for (i = ia;  i <= ib;  i++, pg++, dx += gridspacing) {
            r2 = dx*dx + dydz2;
            if (r2 >= a2)
            {
#ifdef DEBUG_PASS_RATE
              fail_count++;
#endif
              continue;
            }
#ifdef DEBUG_PASS_RATE
            pass_count++;
#endif
            s = (1.f - r2 * inv_a2);
            e = q * (1/sqrtf(r2)) * s * s;
            *pg += e;
          }
#endif
        }
      } /* end loop over surrounding grid points */

    } /* end loop over atoms in a gridcell */
  } /* end loop over gridcells */

  /* free memory */
  free(next);
  free(first);

  /* For debugging: print the number of times that the test passed/failed */
#ifdef DEBUG_PASS_RATE
  printf ("Pass :%llu\n", pass_count);
  printf ("Fail :%llu\n", fail_count);
#endif

  return 0;
}

View File

@@ -0,0 +1,499 @@
/***************************************************************************
*cr
*cr (C) Copyright 2008-2010 The Board of Trustees of the
*cr University of Illinois
*cr All Rights Reserved
*cr
***************************************************************************/
#include <CL/cl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <parboil.h>
#include "atom.h"
#include "cutoff.h"
#include "macros.h"
#include "ocl.h"
// OpenCL 1.1 support for int3 is not uniform on all implementations, so
// we use int4 instead. Only the 'x', 'y', and 'z' fields of xyz are used.
// Host-side element type of the neighbor list; the kernel source declares
// the matching device-side typedef (int4 xyz).
typedef cl_int4 xyz;
//extern "C" int gpu_compute_cutoff_potential_lattice(
/*
 * Compute the cutoff potential lattice with the OpenCL kernel
 * "opencl_cutoff_potential_lattice".
 *
 * Steps:
 *   1. pad the lattice to a multiple of 8 points per dimension and allocate
 *      a host buffer for the region-ordered result;
 *   2. hash the atoms into bins of BIN_DEPTH slots (atoms that land in a
 *      full bin are collected in an "extra" list; neutral or out-of-range
 *      atoms are excluded);
 *   3. build the neighbor-list stencil of bin offsets;
 *   4. run the kernel once per z-plane of 8x8x8 regions;
 *   5. copy the region-ordered result back into lattice->lattice;
 *   6. add the contribution of the extra atoms on the CPU.
 *
 * Returns 0 on success and -1 on a setup failure (bad cutoff/spacing, out
 * of memory, no OpenCL device, CPU pass failed); all host buffers are
 * released before returning -1.  OpenCL API failures go through
 * CHECK_ERROR, which prints a message and exits the process.
 */
int gpu_compute_cutoff_potential_lattice(
    struct pb_TimerSet *timers,
    Lattice *lattice,                  /* the lattice */
    float cutoff,                      /* cutoff distance */
    Atoms *atoms,                      /* array of atoms */
    int verbose,                       /* print info/debug messages */
    struct pb_Parameters *parameters
    )
{
  int nx = lattice->dim.nx;
  int ny = lattice->dim.ny;
  int nz = lattice->dim.nz;
  float xlo = lattice->dim.lo.x;
  float ylo = lattice->dim.lo.y;
  float zlo = lattice->dim.lo.z;
  float h = lattice->dim.h;
  int natoms = atoms->size;
  Atom *atom = atoms->atoms;

  xyz nbrlist[NBRLIST_MAXLEN];
  int nbrlistlen = 0;

  int binHistoFull[BIN_DEPTH+1] = { 0 };   /* clear every array element */
  int binHistoCover[BIN_DEPTH+1] = { 0 };  /* clear every array element */
  int num_excluded = 0;

  int xRegionDim, yRegionDim, zRegionDim;
  int xRegionIndex, yRegionIndex, zRegionIndex;
  int xOffset, yOffset, zOffset;
  int lnx, lny, lnz, lnall;
  float *regionZeroAddr, *thisRegion;
  cl_mem regionZeroCl;
  int index, indexRegion;

  int c;
  xyz binDim;
  int nbins;
  cl_float4 *binBaseAddr, *binZeroAddr;
  cl_mem binBaseCl;
  int *bincntBaseAddr, *bincntZeroAddr;
  Atoms *extra = NULL;

  cl_mem NbrListLen;
  cl_mem NbrList;

  int i, j, k, n;
  int sum, total;

  float avgFillFull, avgFillCover;
  const float cutoff2 = cutoff * cutoff;
  const float inv_cutoff2 = 1.f / cutoff2;

  size_t gridDim[3], blockDim[3];

  // The "compute" timer should be active upon entry to this function

  /* pad lattice to be factor of 8 in each dimension */
  xRegionDim = (int) ceilf(nx/8.f);
  yRegionDim = (int) ceilf(ny/8.f);
  zRegionDim = (int) ceilf(nz/8.f);

  lnx = 8 * xRegionDim;
  lny = 8 * yRegionDim;
  lnz = 8 * zRegionDim;
  lnall = lnx * lny * lnz;

  /* will receive energies from OpenCL */
  regionZeroAddr = (float *) malloc(lnall * sizeof(float));

  /* create bins */
  c = (int) ceil(cutoff * BIN_INVLEN);  /* count extra bins around lattice */
  binDim.x = (int) ceil(lnx * h * BIN_INVLEN) + 2*c;
  binDim.y = (int) ceil(lny * h * BIN_INVLEN) + 2*c;
  binDim.z = (int) ceil(lnz * h * BIN_INVLEN) + 2*c;
  nbins = binDim.x * binDim.y * binDim.z;
  binBaseAddr = (cl_float4 *) calloc(nbins * BIN_DEPTH, sizeof(cl_float4));
  bincntBaseAddr = (int *) calloc(nbins, sizeof(int));

  /* validate the host buffers before deriving any pointer from them */
  if (regionZeroAddr == NULL || binBaseAddr == NULL || bincntBaseAddr == NULL) {
    fprintf(stderr, "gpu_compute_cutoff_potential_lattice(): out of memory\n");
    free(regionZeroAddr);
    free(binBaseAddr);
    free(bincntBaseAddr);
    return -1;
  }

  /* "zero" pointers skip the c layers of padding bins, so bin indices in
   * the range -c .. dim-c-1 are valid relative to them */
  binZeroAddr = binBaseAddr + ((c * binDim.y + c) * binDim.x + c) * BIN_DEPTH;
  bincntZeroAddr = bincntBaseAddr + (c * binDim.y + c) * binDim.x + c;

  /* create neighbor list */
  if (ceilf(BIN_LENGTH / (8*h)) == floorf(BIN_LENGTH / (8*h))) {
    float s = sqrtf(3);
    float r2 = (cutoff + s*BIN_LENGTH) * (cutoff + s*BIN_LENGTH);
    int cnt = 0;
    /* develop neighbor list around 1 cell */
    if (2*c + 1 > NBRLIST_DIM) {
      fprintf(stderr, "must have cutoff <= %f\n",
          (NBRLIST_DIM-1)/2 * BIN_LENGTH);
      free(regionZeroAddr);
      free(binBaseAddr);
      free(bincntBaseAddr);
      return -1;
    }
    for (k = -c;  k <= c;  k++) {
      for (j = -c;  j <= c;  j++) {
        for (i = -c;  i <= c;  i++) {
          if ((i*i + j*j + k*k)*BIN_LENGTH*BIN_LENGTH >= r2) continue;
          nbrlist[cnt].x = i;
          nbrlist[cnt].y = j;
          nbrlist[cnt].z = k;
          cnt++;
        }
      }
    }
    nbrlistlen = cnt;
  }
  else if (8*h <= 2*BIN_LENGTH) {
    float s = 2.f*sqrtf(3);
    float r2 = (cutoff + s*BIN_LENGTH) * (cutoff + s*BIN_LENGTH);
    int cnt = 0;
    /* develop neighbor list around 3-cube of cells */
    if (2*c + 3 > NBRLIST_DIM) {
      fprintf(stderr, "must have cutoff <= %f\n",
          (NBRLIST_DIM-3)/2 * BIN_LENGTH);
      free(regionZeroAddr);
      free(binBaseAddr);
      free(bincntBaseAddr);
      return -1;
    }
    for (k = -c;  k <= c;  k++) {
      for (j = -c;  j <= c;  j++) {
        for (i = -c;  i <= c;  i++) {
          if ((i*i + j*j + k*k)*BIN_LENGTH*BIN_LENGTH >= r2) continue;
          nbrlist[cnt].x = i;
          nbrlist[cnt].y = j;
          nbrlist[cnt].z = k;
          cnt++;
        }
      }
    }
    nbrlistlen = cnt;
  }
  else {
    fprintf(stderr, "must have h <= %f\n", 0.25 * BIN_LENGTH);
    free(regionZeroAddr);
    free(binBaseAddr);
    free(bincntBaseAddr);
    return -1;
  }

  /* perform geometric hashing of atoms into bins */
  {
    /* array of extra atoms, permit average of one extra per bin */
    Atom *extra_atoms = (Atom *) calloc(nbins, sizeof(Atom));
    int extra_len = 0;

    if (extra_atoms == NULL) {
      fprintf(stderr, "gpu_compute_cutoff_potential_lattice(): out of memory\n");
      free(regionZeroAddr);
      free(binBaseAddr);
      free(bincntBaseAddr);
      return -1;
    }

    for (n = 0;  n < natoms;  n++) {
      cl_float4 p;
      /* translate so that the lattice origin is at (0,0,0) */
      p.x = atom[n].x - xlo;
      p.y = atom[n].y - ylo;
      p.z = atom[n].z - zlo;
      p.w = atom[n].q;
      i = (int) floorf(p.x * BIN_INVLEN);
      j = (int) floorf(p.y * BIN_INVLEN);
      k = (int) floorf(p.z * BIN_INVLEN);
      if (i >= -c && i < binDim.x - c &&
          j >= -c && j < binDim.y - c &&
          k >= -c && k < binDim.z - c &&
          atom[n].q != 0) {
        int index = (k * binDim.y + j) * binDim.x + i;
        cl_float4 *bin = binZeroAddr + index * BIN_DEPTH;
        int bindex = bincntZeroAddr[index];
        if (bindex < BIN_DEPTH) {
          /* copy atom into bin and increase counter for this bin */
          bin[bindex] = p;
          bincntZeroAddr[index]++;
        }
        else {
          /* add index to array of extra atoms to be computed with CPU */
          if (extra_len >= nbins) {
            fprintf(stderr, "exceeded space for storing extra atoms\n");
            free(extra_atoms);
            free(regionZeroAddr);
            free(binBaseAddr);
            free(bincntBaseAddr);
            return -1;
          }
          extra_atoms[extra_len] = atom[n];
          extra_len++;
        }
      }
      else {
        /* excluded atoms are either outside bins or neutrally charged */
        num_excluded++;
      }
    }

    /* Save result */
    extra = (Atoms *)malloc(sizeof(Atoms));
    if (extra == NULL) {
      fprintf(stderr, "gpu_compute_cutoff_potential_lattice(): out of memory\n");
      free(extra_atoms);
      free(regionZeroAddr);
      free(binBaseAddr);
      free(bincntBaseAddr);
      return -1;
    }
    extra->atoms = extra_atoms;
    extra->size = extra_len;
  }

  /* bin stats */
  sum = total = 0;
  for (n = 0;  n < nbins;  n++) {
    binHistoFull[ bincntBaseAddr[n] ]++;
    sum += bincntBaseAddr[n];
    total += BIN_DEPTH;
  }
  avgFillFull = sum / (float) total;
  sum = total = 0;
  for (k = 0;  k < binDim.z - 2*c;  k++) {
    for (j = 0;  j < binDim.y - 2*c;  j++) {
      for (i = 0;  i < binDim.x - 2*c;  i++) {
        int index = (k * binDim.y + j) * binDim.x + i;
        binHistoCover[ bincntZeroAddr[index] ]++;
        sum += bincntZeroAddr[index];
        total += BIN_DEPTH;
      }
    }
  }
  avgFillCover = sum / (float) total;

  if (verbose) {
    /* report */
    printf("number of atoms = %d\n", natoms);
    printf("lattice spacing = %g\n", h);
    printf("cutoff distance = %g\n", cutoff);
    printf("\n");
    printf("requested lattice dimensions = %d %d %d\n", nx, ny, nz);
    printf("requested space dimensions = %g %g %g\n", nx*h, ny*h, nz*h);
    printf("expanded lattice dimensions = %d %d %d\n", lnx, lny, lnz);
    printf("expanded space dimensions = %g %g %g\n", lnx*h, lny*h, lnz*h);
    printf("number of bytes for lattice data = %u\n", (unsigned int) (lnall*sizeof(float)));
    printf("\n");
    printf("bin padding thickness = %d\n", c);
    printf("bin cover dimensions = %d %d %d\n",
        binDim.x - 2*c, binDim.y - 2*c, binDim.z - 2*c);
    printf("bin full dimensions = %d %d %d\n", binDim.x, binDim.y, binDim.z);
    printf("number of bins = %d\n", nbins);
    printf("total number of atom slots = %d\n", nbins * BIN_DEPTH);
    printf("%% overhead space = %g\n",
        (natoms / (double) (nbins * BIN_DEPTH)) * 100);
    printf("number of bytes for bin data = %u\n",
        (unsigned int)(nbins * BIN_DEPTH * sizeof(cl_float4)));
    printf("\n");
    printf("bin histogram with padding:\n");
    sum = 0;
    for (n = 0;  n <= BIN_DEPTH;  n++) {
      printf(" number of bins with %d atoms: %d\n", n, binHistoFull[n]);
      sum += binHistoFull[n];
    }
    printf(" total number of bins: %d\n", sum);
    printf(" %% average fill: %g\n", avgFillFull * 100);
    printf("\n");
    printf("bin histogram excluding padding:\n");
    sum = 0;
    for (n = 0;  n <= BIN_DEPTH;  n++) {
      printf(" number of bins with %d atoms: %d\n", n, binHistoCover[n]);
      sum += binHistoCover[n];
    }
    printf(" total number of bins: %d\n", sum);
    printf(" %% average fill: %g\n", avgFillCover * 100);
    printf("\n");
    printf("number of extra atoms = %d\n", extra->size);
    printf("%% atoms that are extra = %g\n", (extra->size / (double) natoms) * 100);
    printf("\n");

    /* sanity check on bins */
    sum = 0;
    for (n = 0;  n <= BIN_DEPTH;  n++) {
      sum += n * binHistoFull[n];
    }
    sum += extra->size + num_excluded;
    printf("sanity check on bin histogram with edges: "
        "sum + others = %d\n", sum);
    sum = 0;
    for (n = 0;  n <= BIN_DEPTH;  n++) {
      sum += n * binHistoCover[n];
    }
    sum += extra->size + num_excluded;
    printf("sanity check on bin histogram excluding edges: "
        "sum + others = %d\n", sum);
    printf("\n");

    /* neighbor list */
    printf("neighbor list length = %d\n", nbrlistlen);
    printf("\n");
  }

  /* The "Ok..." lines in this function are progress trace output; they are
   * kept so that the program's stdout is unchanged. */
  printf("Ok!\n");

  pb_Context* pb_context;
  pb_context = pb_InitOpenCLContext(parameters);
  if (pb_context == NULL) {
    fprintf (stderr, "Error: No OpenCL platform/device can be found.");
    free(regionZeroAddr);
    free(binBaseAddr);
    free(bincntBaseAddr);
    free_atom(extra);
    return -1;
  }

  printf("Ok!\n");

  cl_int clStatus;
  cl_device_id clDevice = (cl_device_id) pb_context->clDeviceId;
  cl_context clContext = (cl_context) pb_context->clContext;
  cl_command_queue clCommandQueue = clCreateCommandQueue(clContext,clDevice,CL_QUEUE_PROFILING_ENABLE,&clStatus);
  CHECK_ERROR("clCreateCommandQueue")
  pb_SetOpenCL(&clContext, &clCommandQueue);

  //const char* clSource[] = {readFile("src/opencl_base/kernel.cl")};
  //cl_program clProgram = clCreateProgramWithSource(clContext,1,clSource,NULL,&clStatus);
  cl_program clProgram = clCreateProgramWithBuiltInKernels(
      clContext, 1, &clDevice, "opencl_cutoff_potential_lattice", &clStatus);
  CHECK_ERROR("clCreateProgramWithBuiltInKernels")

  char clOptions[50];
  sprintf(clOptions,"-I src/opencl_base");  //-cl-nv-verbose

  clStatus = clBuildProgram(clProgram,1,&clDevice,clOptions,NULL,NULL);
  if (clStatus != CL_SUCCESS) {
    /* print the build log (if any) before CHECK_ERROR terminates */
    size_t string_size = 0;
    clGetProgramBuildInfo(clProgram, clDevice, CL_PROGRAM_BUILD_LOG,
        0, NULL, &string_size);
    if (string_size > 0) {
      char* string = (char*)malloc(string_size + 1);
      if (string != NULL) {
        string[0] = '\0';            /* stays empty if the query fails */
        clGetProgramBuildInfo(clProgram, clDevice, CL_PROGRAM_BUILD_LOG,
            string_size, string, NULL);
        string[string_size] = '\0';  /* guarantee termination for puts() */
        puts(string);
        free(string);
      }
    }
  }
  CHECK_ERROR("clBuildProgram")

  cl_kernel clKernel = clCreateKernel(clProgram,"opencl_cutoff_potential_lattice",&clStatus);
  CHECK_ERROR("clCreateKernel")

  /* setup OpenCL kernel parameters */
  /* 8x8x2 work-items per group: 4 groups cover one 8x8x8 region, so the
   * x extent is multiplied by 4 */
  blockDim[0] = 8;
  blockDim[1] = 8;
  blockDim[2] = 2;
  gridDim[0] = 4 * xRegionDim * blockDim[0];
  gridDim[1] = yRegionDim * blockDim[1];
  gridDim[2] = 1 * blockDim[2];

  /* allocate and initialize memory on OpenCL device */
  pb_SwitchToTimer(timers, pb_TimerID_COPY);
  if (verbose) {
    printf("Allocating %.2fMB on OpenCL device for potentials\n",
           lnall * sizeof(float) / (double) (1024*1024));
  }

  regionZeroCl = clCreateBuffer(clContext,CL_MEM_WRITE_ONLY,lnall*sizeof(float),NULL,&clStatus);
  CHECK_ERROR("clCreateBuffer")

  // clMemSet(clCommandQueue,regionZeroCl,0,lnall*sizeof(float));

  if (verbose) {
    printf("Allocating %.2fMB on OpenCL device for atom bins\n",
           nbins * BIN_DEPTH * sizeof(cl_float4) / (double) (1024*1024));
  }

  binBaseCl = clCreateBuffer(clContext,CL_MEM_READ_ONLY,nbins*BIN_DEPTH*sizeof(cl_float4),NULL,&clStatus);
  CHECK_ERROR("clCreateBuffer")

  clStatus = clEnqueueWriteBuffer(clCommandQueue,binBaseCl,CL_TRUE,0,nbins*BIN_DEPTH*sizeof(cl_float4),binBaseAddr,0,NULL,NULL);
  CHECK_ERROR("clEnqueueWriteBuffer")

  //Sub buffers are not supported in OpenCL v1.0
  /* so the kernel receives the base buffer plus this offset (in float4
   * slots) and forms the shifted "zero" pointer itself */
  int offset = ((c * binDim.y + c) * binDim.x + c) * BIN_DEPTH;

  NbrListLen = clCreateBuffer(clContext,CL_MEM_READ_ONLY,sizeof(int),NULL,&clStatus);
  CHECK_ERROR("clCreateBuffer")
  clStatus = clEnqueueWriteBuffer(clCommandQueue,NbrListLen,CL_TRUE,0,sizeof(int),&nbrlistlen,0,NULL,NULL);
  CHECK_ERROR("clEnqueueWriteBuffer")

  NbrList = clCreateBuffer(clContext,CL_MEM_READ_ONLY,NBRLIST_MAXLEN*sizeof(xyz),NULL,&clStatus);
  CHECK_ERROR("clCreateBuffer")
  clStatus = clEnqueueWriteBuffer(clCommandQueue,NbrList,CL_TRUE,0,nbrlistlen*sizeof(xyz),nbrlist,0,NULL,NULL);
  CHECK_ERROR("clEnqueueWriteBuffer")

  if (verbose)
    printf("\n");

  /* every argument is checked individually so that a failure is not
   * masked by a later successful call (argument 8 is set per plane) */
  clStatus = clSetKernelArg(clKernel,0,sizeof(int),&(binDim.x));
  CHECK_ERROR("clSetKernelArg")
  clStatus = clSetKernelArg(clKernel,1,sizeof(int),&(binDim.y));
  CHECK_ERROR("clSetKernelArg")
  clStatus = clSetKernelArg(clKernel,2,sizeof(cl_mem),&binBaseCl);
  CHECK_ERROR("clSetKernelArg")
  clStatus = clSetKernelArg(clKernel,3,sizeof(int),&offset);
  CHECK_ERROR("clSetKernelArg")
  clStatus = clSetKernelArg(clKernel,4,sizeof(float),&h);
  CHECK_ERROR("clSetKernelArg")
  clStatus = clSetKernelArg(clKernel,5,sizeof(float),&cutoff2);
  CHECK_ERROR("clSetKernelArg")
  clStatus = clSetKernelArg(clKernel,6,sizeof(float),&inv_cutoff2);
  CHECK_ERROR("clSetKernelArg")
  clStatus = clSetKernelArg(clKernel,7,sizeof(cl_mem),&regionZeroCl);
  CHECK_ERROR("clSetKernelArg")
  clStatus = clSetKernelArg(clKernel,9,sizeof(cl_mem),&NbrListLen);
  CHECK_ERROR("clSetKernelArg")
  clStatus = clSetKernelArg(clKernel,10,sizeof(cl_mem),&NbrList);
  CHECK_ERROR("clSetKernelArg")

  printf("Ok!!\n");

  /* loop over z-dimension, invoke OpenCL kernel for each x-y plane */
  pb_SwitchToTimer(timers, pb_TimerID_KERNEL);
  printf("Invoking OpenCL kernel on %d region planes...\n", zRegionDim);
  for (zRegionIndex = 0;  zRegionIndex < zRegionDim;  zRegionIndex++) {
    printf(" computing plane %d\r", zRegionIndex);
    fflush(stdout);
    clStatus = clSetKernelArg(clKernel,8,sizeof(int),&zRegionIndex);
    CHECK_ERROR("clSetKernelArg")
    printf("Ok**!2\n");
    clStatus = clEnqueueNDRangeKernel(clCommandQueue,clKernel,3,NULL,gridDim,blockDim,0,NULL,NULL);
    printf("Ok**!2\n");
    CHECK_ERROR("clEnqueueNDRangeKernel")
    printf("Ok**!2\n");
    clStatus = clFinish(clCommandQueue);
    printf("Ok**!2\n");
    CHECK_ERROR("clFinish")
  }

  printf("Ok++!\n");

  printf("Finished OpenCL kernel calls \n");

  /* copy result regions from OpenCL device */
  pb_SwitchToTimer(timers, pb_TimerID_COPY);
  clStatus = clEnqueueReadBuffer(clCommandQueue,regionZeroCl,CL_TRUE,0,lnall*sizeof(float),regionZeroAddr,0,NULL,NULL);
  CHECK_ERROR("clEnqueueReadBuffer")

  /* free OpenCL memory allocations */
  clStatus = clReleaseMemObject(regionZeroCl);
  clStatus = clReleaseMemObject(binBaseCl);
  clStatus = clReleaseMemObject(NbrListLen);
  clStatus = clReleaseMemObject(NbrList);
  CHECK_ERROR("clReleaseMemObject")

  clStatus = clReleaseKernel(clKernel);
  clStatus = clReleaseProgram(clProgram);
  clStatus = clReleaseCommandQueue(clCommandQueue);
  clStatus = clReleaseContext(clContext);

  //free((void*)clSource[0]);

  /* transpose regions back into lattice */
  pb_SwitchToTimer(timers, pb_TimerID_COMPUTE);
  for (k = 0;  k < nz;  k++) {
    zRegionIndex = (k >> 3);
    zOffset = (k & 7);

    for (j = 0;  j < ny;  j++) {
      yRegionIndex = (j >> 3);
      yOffset = (j & 7);

      for (i = 0;  i < nx;  i++) {
        xRegionIndex = (i >> 3);
        xOffset = (i & 7);

        thisRegion = regionZeroAddr
          + ((zRegionIndex * yRegionDim + yRegionIndex) * xRegionDim
              + xRegionIndex) * REGION_SIZE;

        indexRegion = (zOffset * 8 + yOffset) * 8 + xOffset;
        index = (k * ny + j) * nx + i;

        lattice->lattice[index] = thisRegion[indexRegion];
      }
    }
  }

  /* handle extra atoms */
  if (extra->size > 0) {
    printf("computing extra atoms on CPU\n");
    if (cpu_compute_cutoff_potential_lattice(lattice, cutoff, extra)) {
      fprintf(stderr, "cpu_compute_cutoff_potential_lattice() failed "
          "for extra atoms\n");
      free(regionZeroAddr);
      free(binBaseAddr);
      free(bincntBaseAddr);
      free_atom(extra);
      return -1;
    }
    printf("\n");
  }

  /* cleanup memory allocations */
  free(regionZeroAddr);
  free(binBaseAddr);
  free(bincntBaseAddr);
  free_atom(extra);

  return 0;
}

View File

@@ -0,0 +1,72 @@
/***************************************************************************
*cr
*cr (C) Copyright 2008-2010 The Board of Trustees of the
*cr University of Illinois
*cr All Rights Reserved
*cr
***************************************************************************/
#ifndef CUTOFF_H
#define CUTOFF_H
#ifdef __cplusplus
extern "C" {
#endif
#define SHIFTED
/* Describes how array elements correspond to points in 3D space.
   Element (z, y, x), with 0 <= x < nx, 0 <= y < ny, 0 <= z < nz,
   sits at coordinate (xlo, ylo, zlo) + h * (x, y, z).
*/
typedef struct LatticeDim_t {
  /* Number of lattice points along x */
  int nx;
  /* Number of lattice points along y */
  int ny;
  /* Number of lattice points along z */
  int nz;

  /* Lowest corner of lattice */
  Vec3 lo;

  /* Lattice spacing */
  float h;
} LatticeDim;
/* An electric potential field sampled on a regular grid. The
lattice size and grid point positions are specified by 'dim'.
*/
typedef struct Lattice_t {
LatticeDim dim; /* dimensions, origin and spacing of the grid */
float *lattice; /* grid values; point (x, y, z) is at index (z*ny + y)*nx + x */
} Lattice;
/* Build lattice dimensions covering the box [lo, hi] with spacing h. */
LatticeDim lattice_from_bounding_box(Vec3 lo, Vec3 hi, float h);
/* Allocate a zero-filled lattice with the given dimensions. */
Lattice *create_lattice(LatticeDim dim);
/* Release a lattice obtained from create_lattice(). */
void destroy_lattice(Lattice *);
/* Compute the cutoff potential with the OpenCL kernel; returns 0 on
 * success, nonzero on failure. */
int gpu_compute_cutoff_potential_lattice(
struct pb_TimerSet *timers,
Lattice *lattice,
float cutoff, /* cutoff distance */
Atoms *atom, /* array of atoms */
int verbose, /* print info/debug messages */
struct pb_Parameters *parameters
);
/* Accumulate the cutoff potential on the CPU; returns 0 on success. */
int cpu_compute_cutoff_potential_lattice(
Lattice *lattice, /* the lattice */
float cutoff, /* cutoff distance */
Atoms *atoms /* array of atoms */
);
/* Zero the lattice points closer than exclcutoff to a charged atom;
 * returns 0 on success. */
int remove_exclusions(
Lattice *lattice, /* the lattice */
float exclcutoff, /* exclusion cutoff distance */
Atoms *atom /* array of atoms */
);
#ifdef __cplusplus
}
#endif
#endif /* CUTOFF_H */

View File

@@ -0,0 +1,157 @@
/***************************************************************************
*cr
*cr (C) Copyright 2008-2010 The Board of Trustees of the
*cr University of Illinois
*cr All Rights Reserved
*cr
***************************************************************************/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <parboil.h>
#include "atom.h"
#include "cutoff.h"
#define CELLEN 4.f          /* edge length of one geometric-hashing cell */
#define INV_CELLEN (1.f/CELLEN)
/*
 * Zero every lattice point that lies closer than `cutoff` to an atom with
 * nonzero charge.
 *
 * Atoms are hashed into cells of edge CELLEN using two index arrays
 * (first[cell] is the head atom of a cell, next[atom] the following atom in
 * the same cell, -1 terminates) and then visited cell by cell; for each
 * atom the clamped box of surrounding grid points is scanned and points
 * with r2 < cutoff^2 are set to 0.  Atoms with q == 0 are skipped.
 *
 * Returns 0 on success, -1 if the index arrays cannot be allocated.
 */
extern int remove_exclusions(
    Lattice *lattice,                  /* the lattice */
    float cutoff,                      /* exclusion cutoff distance */
    Atoms *atoms                       /* array of atoms */
    )
{
  int nx = lattice->dim.nx;
  int ny = lattice->dim.ny;
  int nz = lattice->dim.nz;
  float xlo = lattice->dim.lo.x;
  float ylo = lattice->dim.lo.y;
  float zlo = lattice->dim.lo.z;
  float gridspacing = lattice->dim.h;
  Atom *atom = atoms->atoms;

  const float a2 = cutoff * cutoff;
  const float inv_gridspacing = 1.f / gridspacing;
  const int radius = (int) ceilf(cutoff * inv_gridspacing) - 1;
    /* lattice point radius about each atom */

  int n;
  int i, j, k;
  int ia, ib, ic;
  int ja, jb, jc;
  int ka, kb, kc;
  int index;
  int koff, jkoff;

  float x, y, z;
  float dx, dy, dz;
  float dz2, dydz2, r2;
  float xstart, ystart;

  float *pg;

  int gindex;
  int ncell, nxcell, nycell, nzcell;
  int *first, *next;
  float inv_cellen = INV_CELLEN;
  Vec3 minext, maxext;

  /* find min and max extent */
  get_atom_extent(&minext, &maxext, atoms);

  /* number of cells in each dimension */
  nxcell = (int) floorf((maxext.x-minext.x) * inv_cellen) + 1;
  nycell = (int) floorf((maxext.y-minext.y) * inv_cellen) + 1;
  nzcell = (int) floorf((maxext.z-minext.z) * inv_cellen) + 1;
  ncell = nxcell * nycell * nzcell;

  /* allocate for cursor link list implementation */
  first = (int *) malloc(ncell * sizeof(int));
  next = (int *) malloc(atoms->size * sizeof(int));
  /* malloc(0) may legitimately return NULL, so an empty atom list is not
   * treated as an allocation failure */
  if (first == NULL || (next == NULL && atoms->size > 0)) {
    fprintf(stderr, "remove_exclusions(): out of memory\n");
    free(next);
    free(first);
    return -1;
  }
  for (gindex = 0;  gindex < ncell;  gindex++) {
    first[gindex] = -1;
  }
  for (n = 0;  n < atoms->size;  n++) {
    next[n] = -1;
  }

  /* geometric hashing */
  for (n = 0;  n < atoms->size;  n++) {
    if (0==atom[n].q) continue;  /* skip any non-contributing atoms */
    i = (int) floorf((atom[n].x - minext.x) * inv_cellen);
    j = (int) floorf((atom[n].y - minext.y) * inv_cellen);
    k = (int) floorf((atom[n].z - minext.z) * inv_cellen);
    gindex = (k*nycell + j)*nxcell + i;
    /* push atom n onto the front of its cell's list */
    next[n] = first[gindex];
    first[gindex] = n;
  }

  /* traverse the grid cells */
  for (gindex = 0;  gindex < ncell;  gindex++) {
    for (n = first[gindex];  n != -1;  n = next[n]) {
      x = atom[n].x - xlo;
      y = atom[n].y - ylo;
      z = atom[n].z - zlo;

      /* find closest grid point with position less than or equal to atom */
      ic = (int) (x * inv_gridspacing);
      jc = (int) (y * inv_gridspacing);
      kc = (int) (z * inv_gridspacing);

      /* find extent of surrounding box of grid points */
      ia = ic - radius;
      ib = ic + radius + 1;
      ja = jc - radius;
      jb = jc + radius + 1;
      ka = kc - radius;
      kb = kc + radius + 1;

      /* trim box edges so that they are within grid point lattice */
      if (ia < 0)   ia = 0;
      if (ib >= nx) ib = nx-1;
      if (ja < 0)   ja = 0;
      if (jb >= ny) jb = ny-1;
      if (ka < 0)   ka = 0;
      if (kb >= nz) kb = nz-1;

      /* loop over surrounding grid points */
      xstart = ia*gridspacing - x;
      ystart = ja*gridspacing - y;
      dz = ka*gridspacing - z;
      for (k = ka;  k <= kb;  k++, dz += gridspacing) {
        koff = k*ny;
        dz2 = dz*dz;

        dy = ystart;
        for (j = ja;  j <= jb;  j++, dy += gridspacing) {
          jkoff = (koff + j)*nx;
          dydz2 = dy*dy + dz2;

          dx = xstart;
          index = jkoff + ia;
          pg = lattice->lattice + index;

          for (i = ia;  i <= ib;  i++, pg++, dx += gridspacing) {
            r2 = dx*dx + dydz2;

            /* If atom and lattice point are too close, set the lattice value
             * to zero */
            if (r2 < a2) *pg = 0;
          }
        }
      } /* end loop over surrounding grid points */

    } /* end loop over atoms in a gridcell */
  } /* end loop over gridcells */

  /* free memory */
  free(next);
  free(first);

  return 0;
}

View File

@@ -0,0 +1,55 @@
/***************************************************************************
*cr
*cr (C) Copyright 2010 The Board of Trustees of the
*cr University of Illinois
*cr All Rights Reserved
*cr
***************************************************************************/
//#include <endian.h>
#include <stdlib.h>
#include <malloc.h>
#include <stdio.h>
#include <inttypes.h>
#include "gpu_info.h"
/*
 * Choose a work-group size and a total work size for `task` items that
 * each occupy `pad` threads.
 *
 * A thread budget is picked from (major, minor) -- presumably the device's
 * compute capability, with `sm` the number of multiprocessors.  When
 * task*pad exceeds sm times that budget, the group size is budget/8 and
 * the total is task*pad rounded up to a multiple of the group size;
 * otherwise the group size is `pad` and the total is exactly task*pad.
 *
 * Results are stored in thread[0] and grid[0].
 */
void compute_active_thread(size_t *thread,
                           size_t *grid,
                           int task,
                           int pad,
                           int major,
                           int minor,
                           int sm)
{
  const int groups_per_sm = 8;
  int thread_budget;
  int group_size;
  int total_size;

  if (major == 1)
    thread_budget = (minor >= 2) ? 1024 : 768;
  else
    thread_budget = 1536;  /* 2.x, and newer GPUs keep using the 2.0 value */

  if (task * pad > sm * thread_budget) {
    group_size = thread_budget / groups_per_sm;
    /* round task*pad up to a whole number of groups */
    total_size = ((task * pad + group_size - 1) / group_size) * group_size;
  } else {
    group_size = pad;
    total_size = task * pad;
  }

  thread[0] = group_size;
  grid[0] = total_size;
}

View File

@@ -0,0 +1,20 @@
/***************************************************************************
*cr
*cr (C) Copyright 2010 The Board of Trustees of the
*cr University of Illinois
*cr All Rights Reserved
*cr
***************************************************************************/
#ifndef __GPUINFOH__
#define __GPUINFOH__
#include <stddef.h>  /* size_t, so that this header is self-contained */
/*
 * Choose a work-group size (stored in thread[0]) and a total work size
 * (stored in grid[0]) for `task` items of `pad` threads each, based on the
 * device generation (major, minor) and multiprocessor count `sm`.
 */
void compute_active_thread(size_t *thread,
                           size_t *grid,
                           int task,
                           int pad,
                           int major,
                           int minor,
                           int sm);
#endif

View File

@@ -0,0 +1,104 @@
/*
* potential lattice is decomposed into size 8^3 lattice point "regions"
*
* THIS IMPLEMENTATION: one thread per lattice point
* thread block size 128 gives 4 thread blocks per region
* kernel is invoked for each x-y plane of regions,
* where gridDim.x is 4*(x region dimension) so that blockIdx.x
* can absorb the z sub-region index in its 2 lowest order bits
*
* Regions are stored contiguously in memory in row-major order
*
* The bins must not only cover the region but also surround its outer
* edges, so that region sides and corners can still use the neighbor list
* stencil. binZeroAddr is a shifted pointer into the bin array
* (binZeroAddr = binBaseAddr + ((c*binDim_y + c)*binDim_x + c) * BIN_DEPTH,
* counted in float4 atom slots) where c = ceil(cutoff / binsize). This
* allows negative neighbor-list offsets to be added to the bin indices.
*
* The (0,0,0) spatial origin corresponds to lower left corner of both
* regionZeroAddr and binZeroAddr. The atom coordinates are translated
* during binning to enforce this assumption.
*/
#include "macros.h"
// OpenCL 1.1 support for int3 is not uniform on all implementations, so
// we use int4 instead. Only the 'x', 'y', and 'z' fields of xyz are used.
// Element type of the neighbor list: one bin offset per entry.
typedef int4 xyz;
/*
 * One work-item per lattice point of the x-y plane of regions selected by
 * zRegionIndex.  Each work-item sums, over every bin in the neighbor list
 * around its region's center bin, the contribution
 *     q * rsqrt(r2) * (1 - r2 * inv_cutoff2)^2
 * of each occupied atom slot (q != 0) with r2 < cutoff2, and stores the
 * total into its slot of the region-ordered output array.
 */
__kernel void opencl_cutoff_potential_lattice(
    int binDim_x,
    int binDim_y,
    __global float4 *binBaseAddr,
    int offset,
    float h,                        /* lattice spacing */
    float cutoff2,                  /* square of cutoff distance */
    float inv_cutoff2,
    __global float *regionZeroAddr, /* address of lattice regions starting at origin */
    int zRegionIndex,
    __constant int *NbrListLen,
    __constant xyz *NbrList
    )
{
  /* bin (0,0,0) sits `offset` float4 slots into the padded bin array */
  __global float4 *binZeroAddr = binBaseAddr + offset;

  /* group id 0 carries the x region index in its upper bits and the
   * z sub-region index in its two lowest bits */
  const size_t xRegion = get_group_id(0) >> 2;
  const size_t zSub = get_group_id(0) & 3;

  /* flattened index of this work-item inside its work-group */
  const int tid = (get_local_id(2)*get_local_size(1)
    + get_local_id(1))*get_local_size(0) + get_local_id(0);

  /* start of the sub-region this work-group writes */
  __global float *myRegionAddr = regionZeroAddr
    + ((zRegionIndex*get_num_groups(1) + get_group_id(1))
        *(get_num_groups(0)>>2) + xRegion)*REGION_SIZE
    + zSub*SUB_REGION_SIZE;

  /* spatial coordinate of this lattice point */
  const float x = (8 * xRegion + get_local_id(0)) * h;
  const float y = (8 * get_group_id(1) + get_local_id(1)) * h;
  const float z = (8 * zRegionIndex + 2*zSub + get_local_id(2)) * h;

  /* bin containing the center of the region */
  const int Bx = (int) floor((8 * xRegion + 4) * h * BIN_INVLEN);
  const int By = (int) floor((8 * get_group_id(1) + 4) * h * BIN_INVLEN);
  const int Bz = (int) floor((8 * zRegionIndex + 4) * h * BIN_INVLEN);

  const int nbrCount = *NbrListLen;
  float energy = 0.f;

  for (int nbr = 0; nbr < nbrCount; nbr++) {
    const int i = Bx + NbrList[nbr].x;
    const int j = By + NbrList[nbr].y;
    const int k = Bz + NbrList[nbr].z;

    __global float4 *bin = binZeroAddr +
      (((k*binDim_y + j)*binDim_x + i) * BIN_DEPTH);

    for (int m = 0; m < BIN_DEPTH; m++) {
      const float4 a = bin[m];

      /* a zero charge marks an empty slot */
      if (0.f == a.w) continue;

      const float dx = a.x - x;
      const float dy = a.y - y;
      const float dz = a.z - z;
      const float r2 = dx*dx + dy*dy + dz*dz;

      /* written as a negated '<' so that a NaN r2 is skipped as before */
      if (!(r2 < cutoff2)) continue;

      const float s = (1.f - r2 * inv_cutoff2);
      energy += a.w * rsqrt(r2) * s * s;
    } /* end loop over atoms in bin */
  } /* end loop over neighbor list */

  /* store into global memory */
  myRegionAddr[tid] = energy;
}

Binary file not shown.

View File

@@ -0,0 +1,69 @@
#ifndef __MACROSH__
#define __MACROSH__
/*
 * Debug helpers.  When __DEVICE_EMULATION__ is defined, EMU(code) runs
 * `code` only for the single block/thread selected by BX, BY, TX, TY, TZ,
 * and INT/FLOAT/INT3/FLOAT4 print an expression together with its source
 * text.  Otherwise all of them expand to nothing.
 * NOTE(review): EMU tests blockIdx/threadIdx, which are CUDA names --
 * presumably a leftover from a CUDA version; confirm before relying on it.
 */
#ifdef __DEVICE_EMULATION__
#define DEBUG
/* define which grid block and which thread to examine */
#define BX 0
#define BY 0
#define TX 0
#define TY 0
#define TZ 0
#define EMU(code) do { \
if (blockIdx.x==BX && blockIdx.y==BY && \
threadIdx.x==TX && threadIdx.y==TY && threadIdx.z==TZ) { \
code; \
} \
} while (0)
#define INT(n) printf("%s = %d\n", #n, n)
#define FLOAT(f) printf("%s = %g\n", #f, (double)(f))
#define INT3(n) printf("%s = %d %d %d\n", #n, (n).x, (n).y, (n).z)
#define FLOAT4(f) printf("%s = %g %g %g %g\n", #f, (double)(f).x, \
(double)(f).y, (double)(f).z, (double)(f).w)
#else
#define EMU(code)
#define INT(n)
#define FLOAT(f)
#define INT3(n)
#define FLOAT4(f)
#endif
/* report error from OpenCL
 *
 * Reads a variable named clStatus from the enclosing scope; if it is not
 * CL_SUCCESS, prints errorMessage and the current line number to stdout
 * and terminates the process with exit(1).
 *
 * The macro expands to a bare `if { ... }` block, and existing call sites
 * invoke it without a trailing semicolon.  Do not place it directly before
 * an `else`, and do not wrap it in do { } while (0) without updating every
 * call site.
 */
#define CHECK_ERROR(errorMessage) \
if(clStatus != CL_SUCCESS) \
{ \
printf("Error: %s!\n",errorMessage); \
printf("Line: %d\n",__LINE__); \
exit(1); \
}
/*
* neighbor list:
* stored in constant memory as table of offsets
* flat index addressing is computed by kernel
*
* reserve enough memory for 11^3 stencil of grid cells
* (11^3 = 1331 entries)
*
* NOTE(review): the original remark "this fits within 16K of memory" is
* stale: entries are now 4-int vectors (see the xyz typedefs), so a full
* table is 1331 * 16 = 21296 bytes, assuming 4-byte ints.
*/
#define NBRLIST_DIM 11
#define NBRLIST_MAXLEN (NBRLIST_DIM * NBRLIST_DIM * NBRLIST_DIM)
/*
* atom bins cached into shared memory for processing
*
* this reserves 4K of shared memory for 32 atom bins each containing 8 atoms,
* should permit scheduling of up to 3 thread blocks per SM
*
* NOTE(review): the OpenCL kernel in this directory reads bins straight
* from global memory; the shared-memory remarks above presumably describe
* another kernel variant.
*/
#define BIN_DEPTH 8 /* max number of atoms per bin */
#define BIN_SIZE 32 /* size of bin in floats (BIN_DEPTH atoms * 4 floats) */
#define BIN_CACHE_MAXLEN 32 /* max number of atom bins to cache */
#define BIN_LENGTH 4.f /* spatial length in Angstroms */
#define BIN_INVLEN (1.f / BIN_LENGTH)
/* assuming density of 1 atom / 10 A^3, expectation is 6.4 atoms per bin
* so that bin fill should be 80% (for non-empty regions of space) */
#define REGION_SIZE 512 /* number of floats in lattice region (8*8*8 points) */
#define SUB_REGION_SIZE 128 /* number of floats in lattice sub-region (8*8*2 points) */
#endif

View File

@@ -0,0 +1,194 @@
/***************************************************************************
*cr
*cr (C) Copyright 2008-2010 The Board of Trustees of the
*cr University of Illinois
*cr All Rights Reserved
*cr
***************************************************************************/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <parboil.h>
#include "atom.h"
#include "cutoff.h"
#include "output.h"
/* NOTE(review): none of the macros below is referenced in the visible part
 * of this file; presumably leftovers (an error tolerance and
 * kernel-selection flags) -- confirm before removing. */
#define ERRTOL 1e-4f
#define NOKERNELS 0
#define CUTOFF1 1
#define CUTOFF6 32
#define CUTOFF6OVERLAP 64
#define CUTOFFCPU 16384
/*
 * Append one "<size> <time>" record (time with three decimals) to the
 * file named filename.  Returns 0 on success, or -1 after printing a
 * message when the file cannot be opened for appending.
 */
int appenddata(const char *filename, int size, double time) {
  FILE *out = fopen(filename, "a");

  if (!out) {
    printf("error appending to file %s..\n", filename);
    return -1;
  }

  fprintf(out, "%d %.3f\n", size, time);
  fclose(out);
  return 0;
}
/*
 * Build the lattice dimensions for the box [lo, hi] sampled with spacing h:
 * each axis gets floor(extent / h) + 1 points, and the lattice origin is lo.
 */
LatticeDim
lattice_from_bounding_box(Vec3 lo, Vec3 hi, float h)
{
  LatticeDim dim;

  dim.h = h;
  dim.lo = lo;
  dim.nx = 1 + (int) floorf((hi.x-lo.x)/h);
  dim.ny = 1 + (int) floorf((hi.y-lo.y)/h);
  dim.nz = 1 + (int) floorf((hi.z-lo.z)/h);

  return dim;
}
/*
 * Allocate a Lattice with the given dimensions and a zero-filled value
 * array.  The array length is nx*ny*nz rounded up to a multiple of 8.
 * Prints "Out of memory" and exits with status 1 if an allocation fails.
 */
Lattice *
create_lattice(LatticeDim dim)
{
  int npoints;
  int padded;
  Lattice *result = (Lattice *)malloc(sizeof(Lattice));

  if (!result) {
    fprintf(stderr, "Out of memory\n");
    exit(1);
  }

  result->dim = dim;

  /* Round up the allocated size to a multiple of 8 */
  npoints = dim.nx * dim.ny * dim.nz;
  padded = (npoints + 7) & ~7;

  result->lattice = (float *)calloc(padded, sizeof(float));
  if (!result->lattice) {
    fprintf(stderr, "Out of memory\n");
    exit(1);
  }

  return result;
}
/* Release a lattice and its value array; a NULL argument is ignored. */
void
destroy_lattice(Lattice *lat)
{
  if (lat == NULL)
    return;

  free(lat->lattice);
  free(lat);
}
/*
 * Driver: read the atoms, build a lattice around their padded bounding
 * box, compute the cutoff potential with the OpenCL kernel, zero lattice
 * points that are too close to an atom, and print the timers.
 *
 * The input file list from the command line is replaced by the fixed name
 * "watbox.sl40.pqr".
 * NOTE(review): whatever pb_ReadParameters() stored in inpFiles is
 * overwritten without being released -- confirm ownership before changing.
 */
int main(int argc, char *argv[]) {
  Atoms *atom;

  LatticeDim lattice_dim;
  Lattice *gpu_lattice;
  Vec3 min_ext, max_ext;	/* Bounding box of atoms */
  Vec3 lo, hi;			/* Bounding box with padding  */

  float h = 0.5f;		/* Lattice spacing */
  float cutoff = 12.f;		/* Cutoff radius */
  float exclcutoff = 1.f;	/* Radius for exclusion */
  float padding = 0.5f;		/* Bounding box padding distance */

  struct pb_Parameters *parameters;
  struct pb_TimerSet timers;

  /* Read input parameters */
  parameters = pb_ReadParameters(&argc, argv);
  if (parameters == NULL) {
    exit(1);
  }

  /* install the fixed, NULL-terminated input list; both allocations are
   * checked before they are written through */
  parameters->inpFiles = (char **)malloc(sizeof(char *) * 2);
  if (parameters->inpFiles == NULL) {
    fprintf(stderr, "Out of memory\n");
    exit(1);
  }
  parameters->inpFiles[0] = (char *)malloc(100);
  if (parameters->inpFiles[0] == NULL) {
    fprintf(stderr, "Out of memory\n");
    exit(1);
  }
  parameters->inpFiles[1] = NULL;
  strncpy(parameters->inpFiles[0], "watbox.sl40.pqr", 100);

  /* Expect one input file */
  if (pb_Parameters_CountInputs(parameters) != 1) {
    fprintf(stderr, "Expecting one input file\n");
    exit(1);
  }

  pb_InitializeTimerSet(&timers);
  pb_SwitchToTimer(&timers, pb_TimerID_IO);

  printf("OK\n");

  {
    const char *pqrfilename = parameters->inpFiles[0];

    if (!(atom = read_atom_file(pqrfilename))) {
      fprintf(stderr, "read_atom_file() failed\n");
      exit(1);
    }
    printf("read %d atoms from file '%s'\n", atom->size, pqrfilename);
  }

  printf("OK\n");

  /* find extent of domain */
  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
  get_atom_extent(&min_ext, &max_ext, atom);
  printf("extent of domain is:\n");
  printf(" minimum %g %g %g\n", min_ext.x, min_ext.y, min_ext.z);
  printf(" maximum %g %g %g\n", max_ext.x, max_ext.y, max_ext.z);

  printf("padding domain by %g Angstroms\n", padding);
  lo = (Vec3) {min_ext.x - padding, min_ext.y - padding, min_ext.z - padding};
  hi = (Vec3) {max_ext.x + padding, max_ext.y + padding, max_ext.z + padding};
  printf("domain lengths are %g by %g by %g\n", hi.x-lo.x, hi.y-lo.y, hi.z-lo.z);

  lattice_dim = lattice_from_bounding_box(lo, hi, h);
  gpu_lattice = create_lattice(lattice_dim);
  printf("\n");

  /*
   * Run OpenCL kernel
   * (Begin and end with COMPUTE timer active)
   */
  if (gpu_compute_cutoff_potential_lattice(&timers, gpu_lattice, cutoff, atom, 0, parameters)) {
    fprintf(stderr, "Computation failed\n");
    exit(1);
  }

  /*
   * Zero the lattice points that are too close to an atom.  This is
   * necessary for numerical stability.
   */
  if (remove_exclusions(gpu_lattice, exclcutoff, atom)) {
    fprintf(stderr, "remove_exclusions() failed for gpu lattice\n");
    exit(1);
  }

  printf("\n");

  pb_SwitchToTimer(&timers, pb_TimerID_IO);

  /* Print output */
  if (parameters->outFile) {
    //write_lattice_summary(parameters->outFile, gpu_lattice);
  }
  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);

  /* Cleanup */
  destroy_lattice(gpu_lattice);
  free_atom(atom);

  pb_SwitchToTimer(&timers, pb_TimerID_NONE);
  pb_PrintTimerSet(&timers);
  pb_FreeParameters(parameters);
  return 0;
}

View File

@@ -0,0 +1,49 @@
#include <CL/cl.h>
#include <stdio.h>
#include <string.h>
#include "ocl.h"
/*
 * Read the whole of `fileName` into a freshly malloc()ed, NUL-terminated
 * buffer and return it; the caller owns the buffer and must free() it.
 *
 * The function never returns NULL.  On failure it prints a short code on
 * stdout and calls exit(1):
 *   "Error 1!" - the file could not be opened
 *   "Error 2!" - the buffer could not be allocated
 *   "Error 3!" - the file size could not be determined or the content
 *                could not be read completely
 */
char* readFile(const char* fileName)
{
  FILE* fp;
  long size;
  char* buffer;
  size_t res;

  fp = fopen(fileName,"r");
  if(fp == NULL)
  {
    printf("Error 1!\n");
    exit(1);
  }

  /* Seek to the end to learn the size; both calls can fail (ftell returns -1) */
  if(fseek(fp,0,SEEK_END) != 0 || (size = ftell(fp)) < 0)
  {
    printf("Error 3!\n");
    fclose(fp);
    exit(1);
  }
  rewind(fp);

  /* One extra byte for the terminating NUL */
  buffer = (char*)malloc(sizeof(char)*((size_t)size+1));
  if(buffer == NULL)
  {
    printf("Error 2!\n");
    fclose(fp);
    exit(1);
  }

  res = fread(buffer,1,(size_t)size,fp);
  if(res != (size_t)size)
  {
    printf("Error 3!\n");
    free(buffer);
    fclose(fp);
    exit(1);
  }

  buffer[size] = 0;
  fclose(fp);
  return buffer;
}
/*
 * Set the first `size` bytes of the OpenCL buffer `buf` to the byte value
 * `val`.  A host staging buffer is filled with memset() and copied to the
 * buffer with a blocking clEnqueueWriteBuffer() on `clCommandQueue`.
 *
 * The process exits if the staging buffer cannot be allocated or if the
 * write does not return CL_SUCCESS (via CHECK_ERROR).
 */
void clMemSet(cl_command_queue clCommandQueue, cl_mem buf, int val, size_t size)
{
  cl_int clStatus;
  char* temp = (char*)malloc(size);

  /* malloc(0) may legitimately return NULL, so only a non-empty request is an error */
  if(temp == NULL && size != 0)
  {
    printf("Error: malloc!\n");
    exit(1);
  }
  if(size != 0)
  {
    memset(temp,val,size);
  }

  /* CL_TRUE makes the write blocking, so temp can be released right after it */
  clStatus = clEnqueueWriteBuffer(clCommandQueue,buf,CL_TRUE,0,size,temp,0,NULL,NULL);
  CHECK_ERROR("clEnqueueWriteBuffer")
  free(temp);
}

View File

@@ -0,0 +1,17 @@
/*
 * ocl.h -- small OpenCL host-side helpers.
 *
 * NOTE(review): this header uses cl_command_queue/cl_mem and (inside
 * CHECK_ERROR) printf without including <CL/cl.h> or <stdio.h> itself, so
 * those must be included before it.
 */
#ifndef __OCLH__
#define __OCLH__
#include <stdlib.h>
/* Fill a device buffer with a byte value; arguments are
 * (command queue, buffer, byte value, size in bytes). */
void clMemSet(cl_command_queue, cl_mem, int, size_t);
/* Read a whole file into a malloc()ed, NUL-terminated string. */
char* readFile(const char*);
/*
 * CHECK_ERROR(msg): if the variable `clStatus` (which must be in scope at the
 * point of use) is not CL_SUCCESS, print msg and the current line number and
 * exit(1).
 * The macro expands to a bare `if` block, not a do { } while (0) statement:
 * it is used without a trailing semicolon, and placing it directly before an
 * `else` would capture that `else`.
 */
#define CHECK_ERROR(errorMessage) \
if(clStatus != CL_SUCCESS) \
{ \
printf("Error: %s!\n",errorMessage); \
printf("Line: %d\n",__LINE__); \
exit(1); \
}
#endif

View File

@@ -0,0 +1,67 @@
/***************************************************************************
*cr
*cr (C) Copyright 2008-2010 The Board of Trustees of the
*cr University of Illinois
*cr All Rights Reserved
*cr
***************************************************************************/
#include <stdio.h>
#include <stdlib.h>
#include <inttypes.h>
#include <math.h>
#include <parboil.h>
#include "atom.h"
#include "cutoff.h"
/*
 * Write a compact binary summary of `lattice` to `filename`.
 *
 * The file receives, in this order:
 *   1. one float: the sum of the absolute values of all nx*ny*nz potentials
 *      (accumulated in double precision, stored as float);
 *   2. one uint32_t: the number of points in a lattice plane (nx*ny);
 *   3. the plane of lattice data at z = 0;
 *   4. the plane of lattice data at z = nz-1.
 *
 * Exits with status 1 if the output file cannot be opened.
 */
void
write_lattice_summary(const char *filename, Lattice *lattice)
{
  float *data = lattice->lattice;
  const int nx = lattice->dim.nx;
  const int ny = lattice->dim.ny;
  const int nz = lattice->dim.nz;
  const int plane_points = nx * ny;
  double abs_sum = 0.0;
  float abs_sum_f;
  uint32_t plane_points_u32;
  int idx;
  FILE *out;

  out = fopen(filename, "w");
  if (!out) {
    fprintf(stderr, "Cannot open output file\n");
    exit(1);
  }

  /* 1. Sum of the absolute values of all lattice potentials */
  for (idx = 0; idx < nx * ny * nz; idx++) {
    abs_sum += fabs((double) data[idx]);
  }
  abs_sum_f = (float) abs_sum;
  fwrite(&abs_sum_f, 1, sizeof(float), out);

  /* 2. Size of a lattice plane */
  plane_points_u32 = (uint32_t) (lattice->dim.nx * lattice->dim.ny);
  fwrite(&plane_points_u32, 1, sizeof(uint32_t), out);

  /* 3./4. First and last z planes */
  fwrite(data, plane_points, sizeof(float), out);
  fwrite(data + (nz - 1) * plane_points, plane_points, sizeof(float), out);

  fclose(out);
}

View File

@@ -0,0 +1,25 @@
/***************************************************************************
*cr
*cr (C) Copyright 2008-2010 The Board of Trustees of the
*cr University of Illinois
*cr All Rights Reserved
*cr
***************************************************************************/
#ifndef OUTPUT_H
#define OUTPUT_H
#include "cutoff.h"
#ifdef __cplusplus
extern "C" {
#endif
/* Write a summary of `lattice` to the file named `filename`. */
void
write_lattice_summary(const char *filename, Lattice *lattice);
#ifdef __cplusplus
}
#endif
#endif

View File

@@ -0,0 +1,348 @@
/*
* (c) 2010 The Board of Trustees of the University of Illinois.
*/
#ifndef PARBOIL_HEADER
#define PARBOIL_HEADER
#include <stdio.h>
#include <string.h>
#ifdef __cplusplus
extern "C" {
#endif
#include <unistd.h>
/* A platform as specified by the user on the command line */
struct pb_PlatformParam {
char *name; /* The platform name. This string is owned. */
char *version; /* The platform version; may be NULL.
* This string is owned. */
};
/* Create a PlatformParam from the given strings.
* 'name' must not be NULL. 'version' may be NULL.
* If not NULL, the strings should have been allocated by malloc(),
* and they will be owned by the returned object.
*/
struct pb_PlatformParam *
pb_PlatformParam(char *name, char *version);
void
pb_FreePlatformParam(struct pb_PlatformParam *);
/* A criterion for how to select a device */
enum pb_DeviceSelectionCriterion {
pb_Device_INDEX, /* Enumerate the devices and select one
* by its number */
pb_Device_CPU, /* Select a CPU device */
pb_Device_GPU, /* Select a GPU device */
pb_Device_ACCELERATOR, /* Select an accelerator device */
pb_Device_NAME /* Select a device by name */
};
/* A device as specified by the user on the command line */
struct pb_DeviceParam {
enum pb_DeviceSelectionCriterion criterion;
union {
int index; /* If criterion == pb_Device_INDEX,
* the index of the device */
char *name; /* If criterion == pb_Device_NAME,
* the name of the device.
* This string is owned. */
};
};
struct pb_DeviceParam *
pb_DeviceParam_index(int index);
struct pb_DeviceParam *
pb_DeviceParam_cpu(void);
struct pb_DeviceParam *
pb_DeviceParam_gpu(void);
struct pb_DeviceParam *
pb_DeviceParam_accelerator(void);
/* Create a by-name device selection criterion.
* The string should have been allocated by malloc(), and it will be
* owned by the returned object.
*/
struct pb_DeviceParam *
pb_DeviceParam_name(char *name);
void
pb_FreeDeviceParam(struct pb_DeviceParam *);
/* Command line parameters for benchmarks */
struct pb_Parameters {
char *outFile; /* If not NULL, the raw output of the
* computation should be saved to this
* file. The string is owned. */
char **inpFiles; /* A NULL-terminated array of strings
* holding the input file(s) for the
* computation. The array and strings
* are owned. */
struct pb_PlatformParam *platform; /* If not NULL, the platform
* specified on the command line. */
struct pb_DeviceParam *device; /* If not NULL, the device
* specified on the command line. */
};
/* Read command-line parameters.
*
* The argc and argv parameters to main are read, and any parameters
* interpreted by this function are removed from the argument list.
*
* A new instance of struct pb_Parameters is returned.
* If there is an error, then an error message is printed on stderr
* and NULL is returned.
*/
struct pb_Parameters *
pb_ReadParameters(int *_argc, char **argv);
/* Free an instance of struct pb_Parameters.
*/
void
pb_FreeParameters(struct pb_Parameters *p);
void
pb_FreeStringArray(char **);
/* Count the number of input files in a pb_Parameters instance.
*/
int
pb_Parameters_CountInputs(struct pb_Parameters *p);
/* A time or duration. */
//#if _POSIX_VERSION >= 200112L
typedef unsigned long long pb_Timestamp; /* time in microseconds */
//#else
//# error "Timestamps not implemented"
//#endif
/* Whether a pb_Timer is stopped or running (see struct pb_Timer). */
enum pb_TimerState {
pb_Timer_STOPPED,
pb_Timer_RUNNING,
};
/* A single timer: its state, the time accumulated so far, and the
 * timestamp of the most recent state change. */
struct pb_Timer {
enum pb_TimerState state;
pb_Timestamp elapsed; /* Amount of time elapsed so far */
pb_Timestamp init; /* Beginning of the current time interval,
* if state is RUNNING. End of the last
* recorded time interval otherwise. */
};
/* Reset a timer.
* Use this to initialize a timer or to clear
* its elapsed time. The reset timer is stopped.
*/
void
pb_ResetTimer(struct pb_Timer *timer);
/* Start a timer. The timer is set to RUNNING mode and
* time elapsed while the timer is running is added to
* the timer.
* The timer should not already be running.
*/
void
pb_StartTimer(struct pb_Timer *timer);
/* Stop a timer.
* This stops adding elapsed time to the timer.
* The timer should not already be stopped.
*/
void
pb_StopTimer(struct pb_Timer *timer);
/* Get the elapsed time in seconds. */
double
pb_GetElapsedTime(struct pb_Timer *timer);
/* Execution time is assigned to one of these categories. */
enum pb_TimerID {
pb_TimerID_NONE = 0,
pb_TimerID_IO, /* Time spent in input/output */
pb_TimerID_KERNEL, /* Time spent computing on the device,
* recorded asynchronously */
pb_TimerID_COPY, /* Time spent synchronously moving data
* to/from device and allocating/freeing
* memory on the device */
pb_TimerID_DRIVER, /* Time spent in the host interacting with the
* driver, primarily for recording the time
* spent queueing asynchronous operations */
pb_TimerID_COPY_ASYNC, /* Time spent in asynchronous transfers */
pb_TimerID_COMPUTE, /* Time for all program execution other
* than parsing command line arguments,
* I/O, kernel, and copy */
pb_TimerID_OVERLAP, /* Time double-counted in asynchronous and
* host activity: automatically filled in,
* not intended for direct usage */
pb_TimerID_LAST /* Number of timer IDs */
};
/* Dynamic list of asynchronously tracked times between events */
struct pb_async_time_marker_list {
char *label; // actually just a pointer to a string
enum pb_TimerID timerID; /* The ID to which the interval beginning
* with this marker should be attributed */
void * marker;
//cudaEvent_t marker; /* The driver event for this marker */
struct pb_async_time_marker_list *next;
};
/* A labelled timer that is also a node of a singly linked list. */
struct pb_SubTimer {
char *label; /* Label of this sub-timer */
struct pb_Timer timer; /* The timer itself */
struct pb_SubTimer *next; /* Next sub-timer in the list */
};
/* A list of sub-timers; pb_TimerSet keeps one such pointer per timer ID.
 * NOTE(review): presumably `subtimer_list` is the head of the list and
 * `current` the sub-timer currently selected -- confirm against the
 * implementation. */
struct pb_SubTimerList {
struct pb_SubTimer *current;
struct pb_SubTimer *subtimer_list;
};
/* A set of timers for recording execution times. */
struct pb_TimerSet {
enum pb_TimerID current;
struct pb_async_time_marker_list* async_markers;
pb_Timestamp async_begin;
pb_Timestamp wall_begin;
struct pb_Timer timers[pb_TimerID_LAST];
struct pb_SubTimerList *sub_timer_list[pb_TimerID_LAST];
};
/* Reset all timers in the set. */
void
pb_InitializeTimerSet(struct pb_TimerSet *timers);
void
pb_AddSubTimer(struct pb_TimerSet *timers, char *label, enum pb_TimerID pb_Category);
/* Select which timer the next interval of time should be accounted
* to. The selected timer is started and other timers are stopped.
* Using pb_TimerID_NONE stops all timers. */
void
pb_SwitchToTimer(struct pb_TimerSet *timers, enum pb_TimerID timer);
void
pb_SwitchToSubTimer(struct pb_TimerSet *timers, char *label, enum pb_TimerID category);
/* Print timer values to standard output. */
void
pb_PrintTimerSet(struct pb_TimerSet *timers);
/* Release timer resources */
void
pb_DestroyTimerSet(struct pb_TimerSet * timers);
void
pb_SetOpenCL(void *clContextPtr, void *clCommandQueuePtr);
/* Description of one device as tracked by the Parboil platform layer.
 * NOTE(review): field meanings are not visible in this header; `clDevice`
 * is presumably the OpenCL device handle stored as an opaque pointer, and
 * `in_use`/`available` look like flags -- confirm against the
 * implementation. */
typedef struct pb_Device_tag {
char* name;
void* clDevice;
int id;
unsigned int in_use;
unsigned int available;
} pb_Device;
struct pb_Context_tag;
typedef struct pb_Context_tag pb_Context;
/* Description of one platform together with its contexts and devices.
 * NOTE(review): `clPlatform` is presumably the OpenCL platform handle stored
 * as an opaque pointer; how the `contexts` and `devices` arrays are
 * terminated is not visible here -- confirm against the implementation. */
typedef struct pb_Platform_tag {
char* name;
char* version;
void* clPlatform;
unsigned int in_use;
pb_Context** contexts;
pb_Device** devices;
} pb_Platform;
/* A context (also available as the typedef pb_Context): opaque handles plus
 * pointers back to the pb_Platform and pb_Device descriptions.
 * NOTE(review): the void* members presumably hold the corresponding OpenCL
 * objects -- confirm against the implementation. */
struct pb_Context_tag {
void* clPlatformId;
void* clContext;
void* clDeviceId;
pb_Platform* pb_platform;
pb_Device* pb_device;
};
// verbosely print out list of platforms and their devices to the console.
pb_Platform**
pb_GetPlatforms();
// Choose a platform according to the given platform specification
pb_Platform*
pb_GetPlatform(struct pb_PlatformParam *platform);
// choose a platform: by name, name & version
pb_Platform*
pb_GetPlatformByName(const char* name);
pb_Platform*
pb_GetPlatformByNameAndVersion(const char* name, const char* version);
// Choose a device according to the given device specification
pb_Device*
pb_GetDevice(pb_Platform* pb_platform, struct pb_DeviceParam *device);
pb_Device**
pb_GetDevices(pb_Platform* pb_platform);
// choose a device by name.
pb_Device*
pb_GetDeviceByName(pb_Platform* pb_platform, const char* name);
pb_Platform*
pb_GetPlatformByEnvVars();
pb_Context*
pb_InitOpenCLContext(struct pb_Parameters* parameters);
void
pb_ReleasePlatforms();
void
pb_ReleaseContext(pb_Context* c);
void
pb_PrintPlatformInfo(pb_Context* c);
void
perf_init();
//#define MEASURE_KERNEL_TIME
#include <CL/cl.h>
#ifdef MEASURE_KERNEL_TIME
#define clEnqueueNDRangeKernel(q,k,d,o,dg,db,a,b,c) pb_clEnqueueNDRangeKernel((q), (k), (d), (o), (dg), (db), (a), (b), (c))
cl_int
pb_clEnqueueNDRangeKernel(cl_command_queue /* command_queue */,
cl_kernel /* kernel */,
cl_uint /* work_dim */,
const size_t * /* global_work_offset */,
const size_t * /* global_work_size */,
const size_t * /* local_work_size */,
cl_uint /* num_events_in_wait_list */,
const cl_event * /* event_wait_list */,
cl_event * /* event */);
#endif
enum { T_FLOAT, T_DOUBLE, T_SHORT, T_INT, T_UCHAR };
void pb_sig_float(char*, float*, int);
void pb_sig_double(char*, double*, int);
void pb_sig_short(char*, short*, int);
void pb_sig_int(char*, int*, int);
void pb_sig_uchar(char*, unsigned char*, unsigned int);
void pb_sig_clmem(char*, cl_command_queue, cl_mem, int);
#ifdef __cplusplus
}
#endif
#endif //PARBOIL_HEADER

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,139 @@
/***************************************************************************
*cr
*cr (C) Copyright 2008-2010 The Board of Trustees of the
*cr University of Illinois
*cr All Rights Reserved
*cr
***************************************************************************/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include "atom.h"
#define LINELEN 96
#define INITLEN 20
/*
 * Build the Atoms structure for the benchmark.
 *
 * The code that parses the "pqr" file `fname` is currently compiled out
 * (#if 0).  Instead, INITLEN placeholder atoms are written into the array:
 * atom i gets position (i, i+1, i+2) and charge 1.
 *
 * Returns a malloc()ed Atoms object that owns the atom array (release it
 * with free_atom()), or NULL after printing a message to stderr when memory
 * cannot be allocated.
 *
 * NOTE(review): `cnt` is only incremented by the disabled parsing loop, so
 * the returned `size` is 0 even though INITLEN placeholder atoms were
 * written -- confirm whether callers are meant to see those atoms.
 */
Atoms *read_atom_file(const char *fname)
{
  FILE *file;
  char line[LINELEN];

  Atom *atom;                   /* Atom array */
  int len = INITLEN;            /* Size of atom array */
  int cnt = 0;                  /* Number of atoms read */

  /* allocate initial atom array */
  atom = (Atom *) malloc(len * sizeof(Atom));
  if (NULL==atom) {
    fprintf(stderr, "can't allocate memory\n");
    return NULL;
  }

  /* fill the array with placeholder atoms */
  int i;
  for (i = 0; i < len; ++i) {
    atom[i].x = i+0;
    atom[i].y = i+1;
    atom[i].z = i+2;
    atom[i].q = 1;
  }

#if 0
  /* open atom "pqr" file */
  file = fopen(fname, "r");
  if (NULL==file) {
    fprintf(stderr, "can't open file \"%s\" for reading\n", fname);
    return NULL;
  }

  /* loop to read pqr file line by line */
  while (fgets(line, LINELEN, file) != NULL) {

    if (strncmp(line, "ATOM  ", 6) != 0 && strncmp(line, "HETATM", 6) != 0) {
      continue;  /* skip anything that isn't an atom record */
    }

    if (cnt==len) {  /* extend atom array */
      void *tmp = realloc(atom, 2*len*sizeof(Atom));
      if (NULL==tmp) {
        fprintf(stderr, "can't allocate more memory\n");
        return NULL;
      }
      atom = (Atom *) tmp;
      len *= 2;
    }

    /* read position coordinates and charge from atom record */
    if (sscanf(line, "%*s %*d %*s %*s %*d %f %f %f %f", &(atom[cnt].x),
          &(atom[cnt].y), &(atom[cnt].z), &(atom[cnt].q)) != 4) {
      fprintf(stderr, "atom record %d does not have expected format\n", cnt+1);
      return NULL;
    }

    cnt++;  /* count atoms as we store them */
  }

  /* verify EOF and close file */
  if ( !feof(file) ) {
    fprintf(stderr, "did not find EOF\n");
    return NULL;
  }
  if (fclose(file)) {
    fprintf(stderr, "can't close file\n");
    return NULL;
  }
#endif

  /* Build the output data structure */
  {
    Atoms *out = (Atoms *)malloc(sizeof(Atoms));

    if (NULL == out) {
      fprintf(stderr, "can't allocate memory\n");
      free(atom);  /* do not leak the atom array on this failure path */
      return NULL;
    }

    out->size = cnt;
    out->atoms = atom;

    return out;
  }
}
/*
 * Release an Atoms object: first the atom array it owns, then the object
 * itself.  Passing NULL is a no-op.
 */
void free_atom(Atoms *atom)
{
  if (atom == NULL)
    return;

  free(atom->atoms);
  free(atom);
}
/*
 * Compute the axis-aligned bounding box of the atom positions.
 *
 * The box starts as the position of the first array element and is widened
 * by elements 1 .. size-1; the per-axis minima go to *out_lo and the maxima
 * to *out_hi.  The first array element is read unconditionally, so the atom
 * array must hold at least one element.
 */
void
get_atom_extent(Vec3 *out_lo, Vec3 *out_hi, Atoms *atom)
{
  const Atom *list = atom->atoms;
  const int count = atom->size;
  Vec3 minv;
  Vec3 maxv;
  int k;

  /* seed both corners with the first element */
  minv.x = maxv.x = list[0].x;
  minv.y = maxv.y = list[0].y;
  minv.z = maxv.z = list[0].z;

  for (k = 1; k < count; k++) {
    const Atom *cur = &list[k];

    minv.x = fminf(minv.x, cur->x);
    maxv.x = fmaxf(maxv.x, cur->x);
    minv.y = fminf(minv.y, cur->y);
    maxv.y = fmaxf(maxv.y, cur->y);
    minv.z = fminf(minv.z, cur->z);
    maxv.z = fmaxf(maxv.z, cur->z);
  }

  *out_lo = minv;
  *out_hi = maxv;
}

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,68 @@
# Tool, library and runtime locations. Each one can be overridden from the
# environment or the make command line (?=). $(wildcard ...) expands to an
# empty string when the path does not exist.
RISCV_TOOL_PATH ?= $(wildcard ../../../../riscv-gnu-toolchain/drops)
POCL_CC_PATH ?= $(wildcard ../../../../pocl/drops_riscv_cc)
POCL_INC_PATH ?= $(wildcard ../include)
POCL_LIB_PATH ?= $(wildcard ../lib)
VX_RT_PATH ?= $(wildcard ../../../runtime)
VX_SIMX_PATH ?= $(wildcard ../../../simX/obj_dir)
# RISC-V cross tools taken from $(RISCV_TOOL_PATH)/bin
CC = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-gcc
CXX = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-g++
DMP = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-objdump
HEX = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-objcopy
GDB = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-gdb
# Runtime sources from $(VX_RT_PATH) that are compiled into the .elf image
VX_SRCS = $(VX_RT_PATH)/newlib/newlib.c
VX_SRCS += $(VX_RT_PATH)/startup/vx_start.S
VX_SRCS += $(VX_RT_PATH)/intrinsics/vx_intrinsics.s
VX_SRCS += $(VX_RT_PATH)/io/vx_io.s $(VX_RT_PATH)/io/vx_io.c
VX_SRCS += $(VX_RT_PATH)/fileio/fileio.s
VX_SRCS += $(VX_RT_PATH)/tests/tests.c
VX_SRCS += $(VX_RT_PATH)/vx_api/vx_api.c
# NOTE(review): none of the variables on the next line is assigned in the
# visible part of this Makefile, so they expand to nothing unless supplied
# from outside -- confirm whether the line is still needed.
VX_SRCS += $(VX_STR) $(VX_FIO) $(VX_NEWLIB) $(VX_INT) $(VX_IO) $(VX_API) $(VX_TEST)
# Link flags used only for the .elf image: no default startup files, static
# link, and the linker script from $(VX_RT_PATH)/mains
VX_CFLAGS = -nostartfiles -Wl,-Bstatic,-T,$(VX_RT_PATH)/mains/vortex_link.ld
# Compiler flags shared by the .elf and .qemu builds
CXXFLAGS = -g -O0 -march=rv32im -mabi=ilp32
CXXFLAGS += -ffreestanding # program may not begin at main()
CXXFLAGS += -Wl,--gc-sections # enable garbage collection of unused input sections
CXXFLAGS += -fno-rtti -fno-non-call-exceptions # disable RTTI and exceptions
CXXFLAGS += -I$(POCL_INC_PATH)
# Libraries for the two link variants. PROJECT is assigned further down; that
# works because these are recursively expanded (=) variables, evaluated only
# when a recipe uses them.
VX_LIBS = -Wl,--whole-archive lib$(PROJECT).a -Wl,--no-whole-archive $(POCL_LIB_PATH)/libOpenCL.a
QEMU_LIBS = $(VX_RT_PATH)/qemu/vx_api.c -Wl,--whole-archive lib$(PROJECT).a -Wl,--no-whole-archive $(POCL_LIB_PATH)/libOpenCL.a
# Name of the benchmark and the host sources that make it up
PROJECT = gaussian
SRCS = main.cc clutils.cpp utils.cpp
# Remove a target whose recipe failed, so that a partially written file
# (for example the redirected .dump) is never mistaken for an up-to-date one.
.DELETE_ON_ERROR:

# These targets are commands, not files; without this declaration a file
# named e.g. "clean" or "run" would stop them from running.
.PHONY: all run qemu gdb-s gdb-c clean

# Default goal: disassembly listing and Intel-HEX image of the program
all: $(PROJECT).dump $(PROJECT).hex

# Compile the OpenCL kernel into a static library with poclcc
lib$(PROJECT).a: kernel.cl
	POCL_DEBUG=all POCL_DEBUG_LLVM_PASSES=1 LD_LIBRARY_PATH=$(RISCV_TOOL_PATH)/lib:$(POCL_CC_PATH)/lib $(POCL_CC_PATH)/bin/poclcc -o $@ $<

# Bare-metal image: host sources + runtime sources + kernel library
$(PROJECT).elf: $(SRCS) lib$(PROJECT).a
	$(CXX) $(CXXFLAGS) $(VX_CFLAGS) $(VX_SRCS) $(SRCS) $(VX_LIBS) -o $@

# User-mode binary for qemu-riscv32
$(PROJECT).qemu: $(SRCS) lib$(PROJECT).a
	$(CXX) $(CXXFLAGS) $(SRCS) $(QEMU_LIBS) -o $@

# Intel-HEX image of the .elf
$(PROJECT).hex: $(PROJECT).elf
	$(HEX) -O ihex $< $@

# Full disassembly of the .elf
$(PROJECT).dump: $(PROJECT).elf
	$(DMP) -D $< > $@

# Run the HEX image on the simX simulator; stdout goes to emulator.debug
run: $(PROJECT).hex
	POCL_DEBUG=all $(VX_SIMX_PATH)/Vcache_simX -E -a rv32i --core $(PROJECT).hex -s -b 1> emulator.debug

# Run under qemu, logging the translated input assembly to debug.log
qemu: $(PROJECT).qemu
	POCL_DEBUG=all $(RISCV_TOOL_PATH)/bin/qemu-riscv32 -d in_asm -D debug.log $(PROJECT).qemu

# Debug server side: qemu waits for a gdb connection on port 1234
gdb-s: $(PROJECT).qemu
	POCL_DEBUG=all $(RISCV_TOOL_PATH)/bin/qemu-riscv32 -g 1234 -d in_asm -D debug.log $(PROJECT).qemu

# Debug client side: start gdb on the qemu binary
gdb-c: $(PROJECT).qemu
	$(GDB) $(PROJECT).qemu

clean:
	rm -rf *.o *.elf *.dump *.hex *.qemu *.log *.debug

View File

@@ -0,0 +1,241 @@
/*-----------------------------------------------------------
** ge_p.c -- The program is to solve a linear system Ax = b
** by using Gaussian Elimination. The algorithm on page 101
** ("Foundations of Parallel Programming") is used.
** The sequential version is ge_s.c. This parallel
** implementation converts three independent for() loops
** into three Fans. Use the data file ge_3.dat to verify
** the correction of the output.
**
** Written by Andreas Kura, 02/15/95
** Modified by Chong-wei Xu, /04/20/95
**-----------------------------------------------------------
*/
#include <us.h>
#include <stdio.h>
#include <stdlib.h>
int Size, t;
float **a, *b;
BEGIN_SHARED_DECL
float **m;
END_SHARED_DECL;
FILE *fp;
void InitProblemOnce();
void InitPerRun();
void ForwardSub();
void Fan1();
void Fan2();
void Fan3();
void InitMat();
void InitAry();
void PrintMat();
void PrintAry();
/*------------------------------------------------------
** main() -- Program entry point. Initializes the runtime
** declared in <us.h>, loads the problem, clears the
** multiplier matrix, runs the forward substitution and
** prints the matrices m and a and the array b.
**
** NOTE(review): defined without a return type (implicit
** int) and without a return statement.
**------------------------------------------------------
*/
main ()
{
InitializeUs();
MakeSharedVariables; /* to make SHARED m */
InitProblemOnce();
InitPerRun();
ForwardSub();
printf("The result of matrix m is: \n");
PrintMat(SHARED m, Size, Size);
printf("The result of matrix a is: \n");
PrintMat(a, Size, Size);
printf("The result of array b is: \n");
PrintAry(b, Size);
}
/*------------------------------------------------------
** InitProblemOnce -- Initialize all of matrices and
** vectors by opening a data file specified by the user.
**
** The file name is read from standard input (at most 29
** characters). The file must start with the integer
** problem size, followed by the Size x Size entries of
** matrix a and the Size entries of array b.
**
** We used dynamic array **a, *b, and **m to allocate
** the memory storages.
**
** The program exits with status 1 when the name cannot
** be read, the file cannot be opened, or the size is
** missing or not positive.
**------------------------------------------------------
*/
void InitProblemOnce()
{
	char filename[30];

	printf("Enter the data file name: ");
	/* The field width keeps the input inside filename[] (29 chars + NUL) */
	if (scanf("%29s", filename) != 1) {
		fprintf(stderr, "Cannot read the data file name\n");
		exit(1);
	}
	printf("The file name is: %s\n", filename);

	fp = fopen(filename, "r");
	if (fp == NULL) {
		fprintf(stderr, "Cannot open data file %s\n", filename);
		exit(1);
	}

	if (fscanf(fp, "%d", &Size) != 1 || Size <= 0) {
		fprintf(stderr, "Cannot read a valid matrix size from %s\n", filename);
		exit(1);
	}

	a = (float **) UsAllocScatterMatrix(Size, Size, sizeof(float));
	/*
	a = (float **) malloc(Size * sizeof(float *));
	for (i=0; i<Size; i++) {
		a[i] = (float *) malloc(Size * sizeof(float));
	}
	*/
	InitMat(a, Size, Size);
	printf("The input matrix a is:\n");
	PrintMat(a, Size, Size);

	b = (float *) UsAlloc(Size * sizeof(float));
	/*
	b = (float *) malloc(Size * sizeof(float));
	*/
	InitAry(b, Size);
	printf("The input array b is:\n");
	PrintAry(b, Size);

	SHARED m = (float **) UsAllocScatterMatrix(Size, Size, sizeof(float));
	/*
	m = (float **) malloc(Size * sizeof(float *));
	for (i=0; i<Size; i++) {
		m[i] = (float *) malloc(Size * sizeof(float));
	}
	*/

	Share(&Size);
	Share(&a);
	Share(&b);
}
/*------------------------------------------------------
** InitPerRun() -- Initialize the contents of the
** multipier matrix **m
**------------------------------------------------------
*/
void InitPerRun()
{
int i, j;
for (i=0; i<Size; i++)
for (j=0; j<Size; j++)
SHARED m[i][j] = 0.0;
}
/*------------------------------------------------------
** ForwardSub() -- Forward substitution of Gaussian
** elimination.
**
** For every step t = 0 .. Size-2 the global t is shared
** and the three task functions are dispatched in order:
** Fan1 over Size-1-t indices, Fan2 over a
** (Size-1-t) x (Size-t) index range, and Fan3 over
** Size-1-t indices.
** NOTE(review): this assumes each GenOnI/GenOnA call has
** finished all its tasks before the next statement runs
** -- confirm against the <us.h> documentation.
**------------------------------------------------------
*/
void ForwardSub()
{
for (t=0; t<(Size-1); t++) {
Share(&t);
GenOnI(Fan1, Size-1-t); /* t=0 to (Size-2), the range is
** Size-2-t+1 = Size-1-t
*/
GenOnA(Fan2, Size-1-t, Size-t);
GenOnI(Fan3, Size-1-t);
}
}
/*-------------------------------------------------------
** Fan1() -- Calculate one entry of the multiplier matrix:
** m[i+t+1][t] = a[i+t+1][t] / a[t][t]
** Pay attention to the index. The index i runs from 0 to
** range-1; the real row index is obtained by adding the
** value of t, which is set in ForwardSub(), plus one.
** The first parameter (dummy) is not used.
** NOTE(review): a[t][t] is not checked for zero before
** the division.
**-------------------------------------------------------
*/
void Fan1(dummy, i)
int dummy, i;
{
/* Use these printf() to display the nodes and index */
printf("from node #%d\n", PhysProcToUsProc(Proc_Node));
SHARED m[i+t+1][t] = a[i+t+1][t] / a[t][t];
printf("i=%d, a[%d][%d]=%.2f, a[%d][%d]=%.2f, m[%d][%d]=%.2f\n",
(i+t+1),t,t,a[t][t],(i+t+1),t,a[i+t+1][t],(i+t+1),t,
SHARED m[i+t+1][t]);
}
/*-------------------------------------------------------
** Fan2() -- Modify the matrix A into LUD. Updates one
** entry of a:
** a[i+1+t][j+t] -= m[i+1+t][t] * a[t][j+t]
** and then calls Share(&a). The first parameter (dummy)
** is not used; t is the global step set in ForwardSub().
**-------------------------------------------------------
*/
void Fan2(dummy, i, j)
int dummy, i, j;
{
a[i+1+t][j+t] -= SHARED m[i+1+t][t] * a[t][j+t];
Share (&a);
}
/*-------------------------------------------------------
** Fan3() -- Modify the array b. Updates one entry:
** b[i+1+t] -= m[i+1+t][t] * b[t]
** The first parameter (dummy) is not used; t is the
** global step set in ForwardSub().
**-------------------------------------------------------
*/
void Fan3(dummy, i)
int dummy, i;
{
b[i+1+t] -= SHARED m[i+1+t][t] * b[t];
}
/*------------------------------------------------------
** InitMat() -- Fill an nrow x ncol matrix, row by row,
** with float values read from the data file fp.
**------------------------------------------------------
*/
void InitMat(ary, nrow, ncol)
float **ary;
int nrow, ncol;
{
	int r, c;

	for (r = 0; r < nrow; ++r)
		for (c = 0; c < ncol; ++c)
			fscanf(fp, "%f", ary[r] + c);
}
/*------------------------------------------------------
** PrintMat() -- Print an nrow x ncol matrix, one row per
** line with two decimals per entry, followed by an
** empty line.
**------------------------------------------------------
*/
void PrintMat(ary, nrow, ncol)
float **ary;
int nrow, ncol;
{
	int r, c;

	for (r = 0; r < nrow; ++r) {
		for (c = 0; c < ncol; ++c)
			printf("%8.2f ", ary[r][c]);
		printf("\n");
	}
	printf("\n");
}
/*------------------------------------------------------
** InitAry() -- Fill an array (vector) of ary_size
** entries with float values read from the data file fp.
**------------------------------------------------------
*/
void InitAry(ary, ary_size)
float *ary;
int ary_size;
{
	int k;

	for (k = 0; k < ary_size; ++k)
		fscanf(fp, "%f", ary + k);
}
/*------------------------------------------------------
** PrintAry() -- Print the ary_size entries of the array
** (vector) on one line with two decimals per entry,
** followed by a newline.
**------------------------------------------------------
*/
void PrintAry(ary, ary_size)
float *ary;
int ary_size;
{
	int k;

	for (k = 0; k < ary_size; ++k)
		printf("%.2f ", ary[k]);
	printf("\n");
}

View File

@@ -0,0 +1,60 @@
The Gaussian Elimination application solves systems of linear equations using the
Gaussian elimination method.
The application analyzes an n x n matrix and an associated 1 x n vector to solve a
system of n equations in n unknowns. The matrix and vector describe equations
of the form:
a0x + b0y + c0z + d0w = e0
a1x + b1y + c1z + d1w = e1
a2x + b2y + c2z + d2w = e2
a3x + b3y + c3z + d3w = e3
where in this case n=4. The matrix for the above equations would be as follows:
[a0 b0 c0 d0]
[a1 b1 c1 d1]
[a2 b2 c2 d2]
[a3 b3 c3 d3]
and the vector would be:
[e0]
[e1]
[e2]
[e3]
The application creates a solution vector:
[x]
[y]
[z]
[w]
The Makefile may need to be adjusted for different machines, but it was written for Mac OS X and
Linux with either NVIDIA or AMD OpenCL SDKs.
Additional input files can be created with the matrixGenerator.py file in the data folder.
Gaussian Elimination Usage
gaussianElimination [filename] [-hqt] [-p [int] -d [int]]
example:
$ ./gaussianElimination matrix4.txt
filename the filename that holds the matrix data
-h, --help Display the help file
-q Quiet mode. Suppress all text output.
-t Print timing information.
-p [int] Choose the platform (must choose both platform and device)
-d [int] Choose the device (must choose both platform and device)
Notes: 1. The filename is required as the first parameter.
2. If you declare either the device or the platform,
you must declare both.

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,281 @@
/****************************************************************************\
* Copyright (c) 2011, Advanced Micro Devices, Inc. *
* All rights reserved. *
* *
* Redistribution and use in source and binary forms, with or without *
* modification, are permitted provided that the following conditions *
* are met: *
* *
* Redistributions of source code must retain the above copyright notice, *
* this list of conditions and the following disclaimer. *
* *
* Redistributions in binary form must reproduce the above copyright notice, *
* this list of conditions and the following disclaimer in the documentation *
* and/or other materials provided with the distribution. *
* *
* Neither the name of the copyright holder nor the names of its contributors *
* may be used to endorse or promote products derived from this software *
* without specific prior written permission. *
* *
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS *
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED *
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR *
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR *
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, *
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, *
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR *
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF *
* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING *
* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS *
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *
* *
* If you use the software (in whole or in part), you shall adhere to all *
* applicable U.S., European, and other export laws, including but not *
* limited to the U.S. Export Administration Regulations (“EAR”), (15 C.F.R. *
* Sections 730 through 774), and E.U. Council Regulation (EC) No 1334/2000 *
* of 22 June 2000. Further, pursuant to Section 740.6 of the EAR, you *
* hereby certify that, except pursuant to a license granted by the United *
* States Department of Commerce Bureau of Industry and Security or as *
* otherwise permitted pursuant to a License Exception under the U.S. Export *
* Administration Regulations ("EAR"), you will not (1) export, re-export or *
* release to a national of a country in Country Groups D:1, E:1 or E:2 any *
* restricted technology, software, or source code you receive hereunder, *
* or (2) export to Country Groups D:1, E:1 or E:2 the direct product of such *
* technology or software, if such foreign produced direct product is subject *
* to national security controls as identified on the Commerce Control List *
*(currently found in Supplement 1 to Part 774 of EAR). For the most current *
* Country Group listings, or for additional information about the EAR or *
* your obligations under those regulations, please refer to the U.S. Bureau *
* of Industry and Security’s website at http://www.bis.doc.gov/. *
\****************************************************************************/
#ifndef __CL_UTILS_H__
#define __CL_UTILS_H__
#include <CL/cl.h>
// The cl_time type is OS specific: a 64-bit integer on Windows builds and a
// double everywhere else.
// NOTE(review): the unit of a cl_time value is not visible in this header --
// check the implementations of cl_getTime() and cl_computeTime().
#ifdef _WIN32
#include <tchar.h>
#include <Windows.h>
typedef __int64 cl_time;
#else
#include <sys/time.h>
typedef double cl_time;
#endif
//-------------------------------------------------------
// Initialization and Cleanup
//-------------------------------------------------------
// Public clutils API. Several declarations below use default arguments, so
// this header can only be consumed from C++ translation units.
// Detects platforms and devices, creates context and command queue
cl_context cl_init(char devicePreference='\0');
// Creates a context given a platform and a device
// (quiet defaults to 0; NOTE(review): presumably nonzero suppresses the
// informational output -- confirm in clutils.cpp)
cl_context cl_init_context(int platform,int dev,int quiet=0);
// Releases resources used by clutils
void cl_cleanup();
// Releases a kernel object
void cl_freeKernel(cl_kernel kernel);
// Releases a memory object
void cl_freeMem(cl_mem mem);
// Releases a program object
void cl_freeProgram(cl_program program);
// Returns the global command queue
cl_command_queue cl_getCommandQueue();
//-------------------------------------------------------
// Synchronization functions
//-------------------------------------------------------
// Performs a clFinish on the command queue
void cl_sync();
//-------------------------------------------------------
// Memory allocation
//-------------------------------------------------------
// Allocates a regular buffer on the device
// (flags defaults to CL_MEM_READ_WRITE)
cl_mem cl_allocBuffer(size_t mem_size,
cl_mem_flags flags = CL_MEM_READ_WRITE);
// XXX I don't think this does exactly what we want it to do
// Allocates a read-only buffer and transfers the data
cl_mem cl_allocBufferConst(size_t mem_size, void* host_ptr);
// Allocates pinned memory on the host
cl_mem cl_allocBufferPinned(size_t mem_size);
// Allocates an image on the device
// NOTE(review): the meaning of the `type` character is not visible in this
// header -- confirm the accepted values in clutils.cpp.
cl_mem cl_allocImage(size_t height, size_t width, char type,
cl_mem_flags flags = CL_MEM_READ_WRITE);
//-------------------------------------------------------
// Data transfers
//-------------------------------------------------------
// Copies a buffer from the device to pinned memory on the host and
// maps it so it can be read
void* cl_copyAndMapBuffer(cl_mem dst, cl_mem src, size_t size);
// Copies from one buffer to another
void cl_copyBufferToBuffer(cl_mem dst, cl_mem src, size_t size);
// Copies data to a buffer on the device
// (blocking defaults to CL_TRUE)
void cl_copyBufferToDevice(cl_mem dst, void *src, size_t mem_size,
cl_bool blocking = CL_TRUE);
// Copies data to an image on the device
void cl_copyImageToDevice(cl_mem dst, void* src, size_t height, size_t width);
// Copies an image from the device to the host
void cl_copyImageToHost(void* dst, cl_mem src, size_t height, size_t width);
// Copies data from a device buffer to the host
// (blocking defaults to CL_TRUE)
void cl_copyBufferToHost(void *dst, cl_mem src, size_t mem_size,
cl_bool blocking = CL_TRUE);
// Copies data from a buffer on the device to an image on the device
// NOTE: the parameter order here is (src, dst), unlike the other copy helpers
// in this section, which take (dst, src). Height and width are also int here
// rather than size_t.
void cl_copyBufferToImage(cl_mem src, cl_mem dst, int height, int width);
// Maps a buffer
// NOTE(review): `flags` is declared as cl_mem_flags; if it is forwarded to
// clEnqueueMapBuffer the expected type would be cl_map_flags -- confirm.
void* cl_mapBuffer(cl_mem mem, size_t mem_size, cl_mem_flags flags);
// Unmaps a buffer
void cl_unmapBuffer(cl_mem mem, void *ptr);
// Writes data to a zero-copy buffer on the device
void cl_writeToZCBuffer(cl_mem mem, void* data, size_t size);
//-------------------------------------------------------
// Program and kernels
//-------------------------------------------------------
// Compiles a program
// (verboseoptions defaults to 0, i.e. false)
cl_program cl_compileProgram(char* kernelPath, char* compileoptions,
bool verboseoptions = 0);
// Creates a kernel
cl_kernel cl_createKernel(cl_program program, const char* kernelName);
// Sets a kernel argument
void cl_setKernelArg(cl_kernel kernel, unsigned int index, size_t size,
void* data);
//-------------------------------------------------------
// Profiling/events
//-------------------------------------------------------
// Computes the execution time (start to end) for an event
double cl_computeExecTime(cl_event);
// Compute the elapsed time between two CPU timer values
double cl_computeTime(cl_time start, cl_time end);
// Creates an event from CPU timers
void cl_createUserEvent(cl_time start, cl_time end, char* desc);
// Disable logging of events
void cl_disableEvents();
// Enable logging of events
void cl_enableEvents();
// Query the current system time
void cl_getTime(cl_time* time);
// Calls a function which prints events to the terminal
void cl_printEvents();
// Calls a function which writes the events to a file
void cl_writeEventsToFile(char* path);
//-------------------------------------------------------
// Error handling
//-------------------------------------------------------
// Compare a status value to CL_SUCCESS and optionally exit on error
// NOTE(review): the return convention is not stated here; presumably nonzero
// means failure -- confirm in clutils.cpp.
int cl_errChk(const cl_int status, const char *msg, bool exitOnErr);
// Queries the supported image formats for the device and prints
// them to the screen
void printSupportedImageFormats();
//-------------------------------------------------------
// Platform and device information
//-------------------------------------------------------
// NOTE(review): none of the helpers in this section are documented in the
// header. Judging by their names they test or report vendor / name / version
// strings; what a NULL `dev` / `plat` default selects, and who owns the
// returned char* buffers, cannot be told from here -- confirm in clutils.cpp.
bool cl_deviceIsAMD(cl_device_id dev=NULL);
bool cl_deviceIsNVIDIA(cl_device_id dev=NULL);
bool cl_platformIsNVIDIA(cl_platform_id plat=NULL);
char* cl_getDeviceDriverVersion(cl_device_id dev=NULL);
char* cl_getDeviceName(cl_device_id dev=NULL);
char* cl_getDeviceVendor(cl_device_id dev=NULL);
char* cl_getDeviceVersion(cl_device_id dev=NULL);
char* cl_getPlatformName(cl_platform_id platform);
char* cl_getPlatformVendor(cl_platform_id platform);
//-------------------------------------------------------
// Utility functions
//-------------------------------------------------------
// NOTE(review): presumably returns `str` with the decimal text of `integer`
// appended; ownership of the returned buffer is not visible here -- confirm.
char* catStringWithInt(const char* str, int integer);
// NOTE(review): presumably a portable itoa() that writes `value` in the given
// `base` into `result` and returns it; required buffer size is not visible
// here -- confirm.
char* itoa_portable(int value, char* result, int base);
//-------------------------------------------------------
// Data types
//-------------------------------------------------------
// Plain host-side aggregates of 2 ints, 2 floats and 4 floats.
// NOTE(review): the names match OpenCL C vector types, but these are ordinary
// structs with no alignment attributes; whether they are passed to kernels as
// vector arguments cannot be told from this header -- confirm before relying
// on layout compatibility.
typedef struct{
int x;
int y;
} int2;
typedef struct{
float x;
float y;
}float2;
typedef struct{
float x;
float y;
float z;
float w;
}float4;
//-------------------------------------------------------
// Defines
//-------------------------------------------------------
// NOTE(review): MAX_ERR_VAL and NUM_PROGRAMS are not used in the visible part
// of this header; their meaning must be confirmed in clutils.cpp.
#define MAX_ERR_VAL 64
#define NUM_PROGRAMS 7
// Number of KERNEL_* identifiers below; they are numbered 0 .. NUM_KERNELS-1,
// so adding a kernel means appending an index and bumping this count.
#define NUM_KERNELS 13
// Kernel identifiers (presumably indices into tables sized NUM_KERNELS --
// confirm against the code that consumes them).
#define KERNEL_INIT_DET 0
#define KERNEL_BUILD_DET 1
#define KERNEL_SURF_DESC 2
#define KERNEL_NORM_DESC 3
#define KERNEL_NON_MAX_SUP 4
#define KERNEL_GET_ORIENT1 5
#define KERNEL_GET_ORIENT2 6
#define KERNEL_NN 7
#define KERNEL_SCAN 8
#define KERNEL_SCAN4 9
#define KERNEL_TRANSPOSE 10
#define KERNEL_SCANIMAGE 11
#define KERNEL_TRANSPOSEIMAGE 12
#endif

View File

@@ -0,0 +1,40 @@
#ifndef _GAUSSIANELIM
#define _GAUSSIANELIM
#include <iostream>
#include <vector>
#include <float.h>
#include <string.h>
#include <stdio.h>
#include <stdlib.h>
#include <algorithm>
#include "clutils.h"
// All OpenCL headers
#if defined (__APPLE__) || defined(MACOSX)
#include <OpenCL/opencl.h>
#else
#include <CL/opencl.h>
#endif
// NOTE(review): the only reference to this function in gaussianElim.cpp is a
// commented-out call at the end of main(); no definition is visible there.
float *OpenClGaussianElimination(
cl_context context,
int timing);
// Prints the command-line help text to stdout.
void printUsage();
// Parses argv into the output parameters: filename (caller-provided buffer),
// *q quiet flag, *t timing flag, *p platform index, *d device index.
// Returns nonzero when the usage text should be shown instead of running.
int parseCommandline(int argc, char *argv[], char* filename,
int *q, int *t, int *p, int *d);
// Zero-fills the size*size multiplier matrix m.
void InitPerRun(int size,float *m);
// Runs the forward-elimination kernels on the device; a, b and m are both
// inputs and outputs (they are uploaded, updated by the kernels and read back).
void ForwardSub(cl_context context, float *a, float *b, float *m, int size,int timing);
// Host-side back substitution: solves the upper-triangular system a*x = b
// and stores x in finalVec.
void BackSub(float *a, float *b, float *finalVec, int size);
// NOTE(review): Fan1/Fan2 share their names with the kernels in
// gaussianElim_kernels.cl; no host-side definitions with these signatures
// appear in gaussianElim.cpp, so these prototypes look like leftovers.
void Fan1(float *m, float *a, int Size, int t);
void Fan2(float *m, float *a, float *b,int Size, int j1, int t);
//void Fan3(float *m, float *b, int Size, int t);
// Reads nrow*ncol floats from fp into ary, using `size` as the row stride.
void InitMat(FILE *fp, int size, float *ary, int nrow, int ncol);
// Reads ary_size floats from fp into ary.
void InitAry(FILE *fp, float *ary, int ary_size);
// Prints an nrow x ncolumn matrix (row stride `size`) to stdout.
void PrintMat(float *ary, int size, int nrow, int ncolumn);
// Prints ary_size floats on one line to stdout.
void PrintAry(float *ary, int ary_size);
// Waits for the queue to finish and returns the event's profiled
// start-to-end duration, scaled by 1e-9.
float eventTime(cl_event event,cl_command_queue command_queue);
#endif

View File

@@ -0,0 +1,74 @@
#include "stdio.h"
#include <time.h>
#include <windows.h>
#include <iostream>
//using namespace System;
using namespace std;
#if defined(_MSC_VER) || defined(_MSC_EXTENSIONS)
#define DELTA_EPOCH_IN_MICROSECS 11644473600000000Ui64
#else
#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
#endif
// Local definition of the timezone record filled in by gettimeofday() below.
// It is declared here because this translation unit only includes Windows and
// standard C/C++ headers.
struct timezone
{
int tz_minuteswest; /* minutes W of Greenwich */
int tz_dsttime; /* type of dst correction */
};
// Definition of a gettimeofday function
int gettimeofday(struct timeval *tv, struct timezone *tz)
{
// Define a structure to receive the current Windows filetime
FILETIME ft;
// Initialize the present time to 0 and the timezone to UTC
unsigned __int64 tmpres = 0;
static int tzflag = 0;
if (NULL != tv)
{
GetSystemTimeAsFileTime(&ft);
// The GetSystemTimeAsFileTime returns the number of 100 nanosecond
// intervals since Jan 1, 1601 in a structure. Copy the high bits to
// the 64 bit tmpres, shift it left by 32 then or in the low 32 bits.
tmpres |= ft.dwHighDateTime;
tmpres <<= 32;
tmpres |= ft.dwLowDateTime;
// Convert to microseconds by dividing by 10
tmpres /= 10;
// The Unix epoch starts on Jan 1 1970. Need to subtract the difference
// in seconds from Jan 1 1601.
tmpres -= DELTA_EPOCH_IN_MICROSECS;
// Finally change microseconds to seconds and place in the seconds value.
// The modulus picks up the microseconds.
tv->tv_sec = (long)(tmpres / 1000000UL);
tv->tv_usec = (long)(tmpres % 1000000UL);
}
if (NULL != tz)
{
if (!tzflag)
{
_tzset();
tzflag++;
}
// Adjust for the timezone west of Greenwich
long seconds_diff;
_get_timezone(&seconds_diff);
tz->tz_minuteswest = seconds_diff / 60;
int hours_offset;
_get_daylight(&hours_offset);
tz->tz_dsttime = hours_offset;
}
return 0;
}

View File

@@ -0,0 +1,17 @@
#ifdef _WIN32
#include <WinSock.h>
/**
Based on code seen at.
http://www.winehq.org/pipermail/wine-devel/2003-June/018082.html
http://msdn.microsoft.com/en-us/library/ms740560
*/
// Windows-only declaration of gettimeofday(); on every other platform the
// #else branch of this header pulls the function from <sys/time.h> instead.
// NOTE(review): `struct timezone` is not defined by this header; the
// accompanying .cpp file defines it locally -- callers that dereference `tz`
// need a definition of their own.
int gettimeofday(struct timeval *tv, struct timezone *tz);
#else
#include <sys/time.h>
#endif

View File

@@ -0,0 +1,49 @@
//#pragma OPENCL EXTENSION cl_khr_byte_addressable_store : enable
// Latitude/longitude pair. Neither kernel in this file references this type.
typedef struct latLong
{
float lat;
float lng;
} LatLong;
// Fan1: computes the elimination multipliers for pivot column t.
// Work-item g (1-D range) handles row g + t + 1 and stores
//   m[row][t] = a[row][t] / a[t][t]
// for every row below the pivot row. Matrices are row-major with stride
// `size`. b_dev is part of the kernel signature but is not used here.
__kernel void Fan1(__global float *m_dev,
                   __global float *a_dev,
                   __global float *b_dev,
                   const int size,
                   const int t) {
  const int gid = get_global_id(0);

  // Work-items past the last row below the pivot have nothing to do.
  if (gid >= size - 1 - t) {
    return;
  }

  const int row = gid + t + 1;
  m_dev[size * row + t] = a_dev[size * row + t] / a_dev[size * t + t];
}
// Fan2: applies the multipliers produced by Fan1 for pivot t.
// Launched over a 2-D range; work-item (x, y) updates
//   a[x+1+t][y+t] -= m[x+1+t][t] * a[t][y+t]
// and the work-items with y == 0 additionally update the right-hand side
//   b[x+1+t]      -= m[x+1+t][t] * b[t].
// Matrices are row-major with stride `size`. Only rows below the pivot row
// are written; the pivot row itself is only read.
__kernel void Fan2(__global float *m_dev,
                   __global float *a_dev,
                   __global float *b_dev,
                   const int size,
                   const int t) {
  const int globalIdx = get_global_id(0);
  const int globalIdy = get_global_id(1);

  if (globalIdx < size - 1 - t && globalIdy < size - t) {
    const int row = globalIdx + 1 + t; // row being eliminated
    const int col = globalIdy + t;     // column being updated

    // Same element the original read twice (for y == 0, col equals t).
    const float multiplier = m_dev[size * row + t];

    a_dev[size * row + col] -= multiplier * a_dev[size * t + col];

    // Exactly one work-item per row updates the right-hand side.
    if (globalIdy == 0) {
      b_dev[row] -= multiplier * b_dev[t];
    }
  }
}

Binary file not shown.

View File

@@ -0,0 +1,412 @@
#ifndef __GAUSSIAN_ELIMINATION__
#define __GAUSSIAN_ELIMINATION__
#include "gaussianElim.h"
cl_context context = NULL;
// Program entry point: parses the command line, creates the OpenCL context,
// loads the matrix A and vector b from the input file, runs the forward
// elimination on the device and, unless quiet mode is on, prints the
// intermediate results, performs back substitution and prints the solution.
// Returns 0 on success (and after printing the usage text), 1 when the input
// file cannot be opened or parsed or when host memory cannot be allocated.
int main(int argc, char *argv[]) {
  printf("enter demo main\n");
  float *a = NULL, *b = NULL, *finalVec = NULL;
  float *m = NULL;
  int size = 0;
  FILE *fp = NULL;
  // args
  char filename[100];
  int quiet = 0, timing = 0, platform = -1, device = -1;

  // parse command line
  if (parseCommandline(argc, argv, filename, &quiet, &timing, &platform,
                       &device)) {
    printUsage();
    return 0;
  }

  context = cl_init_context(platform, device, quiet);

  fp = fopen(filename, "r");
  if (fp == NULL) {
    // Without this check the fscanf below would dereference a NULL stream.
    perror(filename);
    return 1;
  }

  // The first token of the file is the matrix dimension.
  if (fscanf(fp, "%d", &size) != 1 || size <= 0) {
    fprintf(stderr, "Error: could not read a positive matrix size from %s\n",
            filename);
    fclose(fp);
    return 1;
  }

  // Element count computed in size_t so size*size cannot overflow an int.
  const size_t matElems = (size_t)size * (size_t)size;
  a = (float *)malloc(matElems * sizeof(float));
  b = (float *)malloc((size_t)size * sizeof(float));
  // multiplier matrix filled in by the kernels
  m = (float *)malloc(matElems * sizeof(float));
  // vector that will hold the final answer
  finalVec = (float *)malloc((size_t)size * sizeof(float));
  if (a == NULL || b == NULL || m == NULL || finalVec == NULL) {
    perror("malloc");
    free(m);
    free(a);
    free(b);
    free(finalVec);
    fclose(fp);
    return 1;
  }

  printf("OK\n");
  InitMat(fp, size, a, size, size);
  InitAry(fp, b, size);
  InitPerRun(size, m);

  // run kernels
  ForwardSub(context, a, b, m, size, timing);

  if (!quiet) {
    printf("The result of matrix m is: \n");
    PrintMat(m, size, size, size);
    printf("The result of matrix a is: \n");
    PrintMat(a, size, size, size);
    printf("The result of array b is: \n");
    PrintAry(b, size);
    BackSub(a, b, finalVec, size);
    printf("The final solution is: \n");
    PrintAry(finalVec, size);
  }

  fclose(fp);
  free(m);
  free(a);
  free(b);
  free(finalVec);
  return 0;
}
/*------------------------------------------------------
** ForwardSub() -- Forward substitution of Gaussian
** elimination.
**------------------------------------------------------
*/
void ForwardSub(cl_context context, float *a, float *b, float *m, int size,
int timing) {
// 1. set up kernels
cl_kernel fan1_kernel, fan2_kernel;
cl_int status = 0;
cl_program gaussianElim_program;
cl_event writeEvent, kernelEvent, readEvent;
float writeTime = 0, readTime = 0, kernelTime = 0;
float writeMB = 0, readMB = 0;
gaussianElim_program =
cl_compileProgram((char *)"gaussianElim_kernels.cl", NULL);
fan1_kernel = clCreateKernel(gaussianElim_program, "Fan1", &status);
status = cl_errChk(status, (char *)"Error Creating Fan1 kernel", true);
if (status)
exit(1);
fan2_kernel = clCreateKernel(gaussianElim_program, "Fan2", &status);
status = cl_errChk(status, (char *)"Error Creating Fan2 kernel", true);
if (status)
exit(1);
// 2. set up memory on device and send ipts data to device
cl_mem a_dev, b_dev, m_dev;
cl_int error = 0;
a_dev = clCreateBuffer(context, CL_MEM_READ_WRITE,
sizeof(float) * size * size, NULL, &error);
b_dev = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(float) * size, NULL,
&error);
m_dev = clCreateBuffer(context, CL_MEM_READ_WRITE,
sizeof(float) * size * size, NULL, &error);
cl_command_queue command_queue = cl_getCommandQueue();
error = clEnqueueWriteBuffer(command_queue, a_dev,
1, // change to 0 for nonblocking write
0, // offset
sizeof(float) * size * size, a, 0, NULL,
&writeEvent);
if (timing)
writeTime += eventTime(writeEvent, command_queue);
clReleaseEvent(writeEvent);
error = clEnqueueWriteBuffer(command_queue, b_dev,
1, // change to 0 for nonblocking write
0, // offset
sizeof(float) * size, b, 0, NULL, &writeEvent);
if (timing)
writeTime += eventTime(writeEvent, command_queue);
clReleaseEvent(writeEvent);
error = clEnqueueWriteBuffer(command_queue, m_dev,
1, // change to 0 for nonblocking write
0, // offset
sizeof(float) * size * size, m, 0, NULL,
&writeEvent);
if (timing)
writeTime += eventTime(writeEvent, command_queue);
clReleaseEvent(writeEvent);
writeMB = (float)(sizeof(float) * size * (size + size + 1) / 1e6);
// 3. Determine block sizes
size_t globalWorksizeFan1[1];
size_t globalWorksizeFan2[2];
globalWorksizeFan1[0] = size;
globalWorksizeFan2[0] = size;
globalWorksizeFan2[1] = size;
int t;
// 4. Setup and Run kernels
for (t = 0; t < (size - 1); t++) {
// kernel args
cl_int argchk;
argchk = clSetKernelArg(fan1_kernel, 0, sizeof(cl_mem), (void *)&m_dev);
argchk |= clSetKernelArg(fan1_kernel, 1, sizeof(cl_mem), (void *)&a_dev);
argchk |= clSetKernelArg(fan1_kernel, 2, sizeof(cl_mem), (void *)&b_dev);
argchk |= clSetKernelArg(fan1_kernel, 3, sizeof(int), (void *)&size);
argchk |= clSetKernelArg(fan1_kernel, 4, sizeof(int), (void *)&t);
cl_errChk(argchk, "ERROR in Setting Fan1 kernel args", true);
// launch kernel
error =
clEnqueueNDRangeKernel(command_queue, fan1_kernel, 1, 0,
globalWorksizeFan1, NULL, 0, NULL, &kernelEvent);
cl_errChk(error, "ERROR in Executing Fan1 Kernel", true);
if (timing) {
// printf("here1a\n");
kernelTime += eventTime(kernelEvent, command_queue);
// printf("here1b\n");
}
clReleaseEvent(kernelEvent);
// Fan1<<<dimGrid,dimBlock>>>(m_cuda,a_cuda,Size,t);
// cudaThreadSynchronize();
// kernel args
argchk = clSetKernelArg(fan2_kernel, 0, sizeof(cl_mem), (void *)&m_dev);
argchk |= clSetKernelArg(fan2_kernel, 1, sizeof(cl_mem), (void *)&a_dev);
argchk |= clSetKernelArg(fan2_kernel, 2, sizeof(cl_mem), (void *)&b_dev);
argchk |= clSetKernelArg(fan2_kernel, 3, sizeof(int), (void *)&size);
argchk |= clSetKernelArg(fan2_kernel, 4, sizeof(int), (void *)&t);
cl_errChk(argchk, "ERROR in Setting Fan2 kernel args", true);
// launch kernel
error =
clEnqueueNDRangeKernel(command_queue, fan2_kernel, 2, 0,
globalWorksizeFan2, NULL, 0, NULL, &kernelEvent);
cl_errChk(error, "ERROR in Executing Fan1 Kernel", true);
if (timing) {
// printf("here2a\n");
kernelTime += eventTime(kernelEvent, command_queue);
// printf("here2b\n");
}
clReleaseEvent(kernelEvent);
// Fan2<<<dimGridXY,dimBlockXY>>>(m_cuda,a_cuda,b_cuda,Size,Size-t,t);
// cudaThreadSynchronize();
}
// 5. transfer data off of device
error =
clEnqueueReadBuffer(command_queue, a_dev,
1, // change to 0 for nonblocking write
0, // offset
sizeof(float) * size * size, a, 0, NULL, &readEvent);
cl_errChk(error, "ERROR with clEnqueueReadBuffer", true);
if (timing)
readTime += eventTime(readEvent, command_queue);
clReleaseEvent(readEvent);
error = clEnqueueReadBuffer(command_queue, b_dev,
1, // change to 0 for nonblocking write
0, // offset
sizeof(float) * size, b, 0, NULL, &readEvent);
cl_errChk(error, "ERROR with clEnqueueReadBuffer", true);
if (timing)
readTime += eventTime(readEvent, command_queue);
clReleaseEvent(readEvent);
error =
clEnqueueReadBuffer(command_queue, m_dev,
1, // change to 0 for nonblocking write
0, // offset
sizeof(float) * size * size, m, 0, NULL, &readEvent);
cl_errChk(error, "ERROR with clEnqueueReadBuffer", true);
if (timing)
readTime += eventTime(readEvent, command_queue);
clReleaseEvent(readEvent);
readMB = (float)(sizeof(float) * size * (size + size + 1) / 1e6);
if (timing) {
printf("Matrix Size\tWrite(s) [size]\t\tKernel(s)\tRead(s) "
"[size]\t\tTotal(s)\n");
printf("%dx%d \t", size, size);
printf("%f [%.2fMB]\t", writeTime, writeMB);
printf("%f\t", kernelTime);
printf("%f [%.2fMB]\t", readTime, readMB);
printf("%f\n\n", writeTime + kernelTime + readTime);
}
}
// Returns the profiled start-to-end duration of `event`, scaled by 1e-9
// (the callers print the result as seconds). The whole command queue is
// drained first so the event is complete before its counters are read.
float eventTime(cl_event event, cl_command_queue command_queue) {
  cl_ulong startStamp, endStamp;

  clFinish(command_queue);

  cl_int rc = clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_START,
                                      sizeof(cl_ulong), &startStamp, NULL);
  cl_errChk(rc, "ERROR in Event Profiling.", true);

  rc = clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END,
                               sizeof(cl_ulong), &endStamp, NULL);
  cl_errChk(rc, "ERROR in Event Profiling.", true);

  const cl_ulong elapsed = endStamp - startStamp;
  return (float)(elapsed / 1e9);
}
// Parses the command-line flags.
//   filename - caller-provided buffer of at least 100 bytes; always set to
//              "matrix4.txt".
//   q, t     - set to 1 when -q (quiet) / -t (timing) is present.
//   p, d     - receive the integers following -p (platform) / -d (device).
// Returns 1 when the usage text should be shown: -h was given, -p or -d is
// missing its value, or only one of platform/device ended up non-negative.
// Returns 0 otherwise. Unknown flags are ignored.
// NOTE(review): arguments that do not start with '-' (such as a positional
// file name) are ignored, although printUsage() describes the file name as a
// required first parameter -- confirm whether that is intentional.
int parseCommandline(int argc, char *argv[], char *filename, int *q, int *t,
                     int *p, int *d) {
  strncpy(filename, "matrix4.txt", 100);

  for (int i = 1; i < argc; i++) {
    if (argv[i][0] != '-')
      continue; // not a flag

    switch (argv[i][1]) {
    case 'h': // help
      return 1;
    case 'q': // quiet
      *q = 1;
      break;
    case 't': // timing
      *t = 1;
      break;
    case 'p': // platform
      // The value is the next argument; without this check a trailing "-p"
      // would pass argv[argc] (NULL) to atoi.
      if (++i >= argc)
        return 1;
      *p = atoi(argv[i]);
      break;
    case 'd': // device
      if (++i >= argc)
        return 1;
      *d = atoi(argv[i]);
      break;
    }
  }

  // both p and d must be specified if either are specified
  if ((*d >= 0) != (*p >= 0))
    return 1;
  return 0;
}
// Writes the command-line help text to stdout.
void printUsage() {
  // The text holds no conversion specifiers, so it is emitted verbatim.
  static const char usageText[] =
      "Gaussian Elimination Usage\n"
      "\n"
      "gaussianElimination [filename] [-hqt] [-p [int] -d [int]]\n"
      "\n"
      "example:\n"
      "$ ./gaussianElimination matrix4.txt\n"
      "\n"
      "filename the filename that holds the matrix data\n"
      "\n"
      "-h Display the help file\n"
      "-q Quiet mode. Suppress all text output.\n"
      "-t Print timing information.\n"
      "\n"
      "-p [int] Choose the platform (must choose both platform and "
      "device)\n"
      "-d [int] Choose the device (must choose both platform and "
      "device)\n"
      "\n"
      "\n"
      "Notes: 1. The filename is required as the first parameter.\n"
      " 2. If you declare either the device or the platform,\n"
      " you must declare both.\n\n";
  fputs(usageText, stdout);
}
/*------------------------------------------------------
 ** InitPerRun() -- Reset the multiplier matrix m:
 ** every one of its size*size entries is set to zero.
 **------------------------------------------------------
 */
void InitPerRun(int size, float *m) {
  const int total = size * size;
  int k = 0;
  while (k < total) {
    m[k] = 0.0;
    ++k;
  }
}
// Back substitution for an upper-triangular system a * x = b.
//   a        - size x size matrix, row-major; only the diagonal and the
//              entries to its right are read.
//   b        - right-hand side of length size.
//   finalVec - receives the solution x.
// Rows are solved from the last to the first; within a row the already-known
// unknowns are eliminated from the rightmost column towards the diagonal.
void BackSub(float *a, float *b, float *finalVec, int size) {
  for (int row = size - 1; row >= 0; --row) {
    finalVec[row] = b[row];
    for (int col = size - 1; col > row; --col) {
      finalVec[row] -= *(a + size * row + col) * finalVec[col];
    }
    finalVec[row] = finalVec[row] / *(a + size * row + row);
  }
}
// Reads nrow * ncol floats from fp into ary, row by row. `size` is the row
// stride of ary, so element (r, c) is stored at ary[size * r + c]. The result
// of fscanf is not checked: entries that cannot be read are left untouched.
void InitMat(FILE *fp, int size, float *ary, int nrow, int ncol) {
  for (int row = 0; row < nrow; ++row) {
    for (int col = 0; col < ncol; ++col) {
      float *cell = &ary[size * row + col];
      fscanf(fp, "%f", cell);
    }
  }
}
/*------------------------------------------------------
 ** InitAry() -- Fill the vector ary with ary_size floats
 ** read in order from the data file fp. The result of
 ** fscanf is not checked.
 **------------------------------------------------------
 */
void InitAry(FILE *fp, float *ary, int ary_size) {
  int idx = 0;
  while (idx < ary_size) {
    fscanf(fp, "%f", ary + idx);
    ++idx;
  }
}
/*------------------------------------------------------
 ** PrintMat() -- Write an nrow x ncol matrix to stdout,
 ** one row per line, followed by a blank line. `size`
 ** is the row stride of ary.
 **------------------------------------------------------
 */
void PrintMat(float *ary, int size, int nrow, int ncol) {
  for (int row = 0; row < nrow; ++row) {
    for (int col = 0; col < ncol; ++col) {
      const float value = ary[size * row + col];
      printf("%8.2f ", value);
    }
    printf("\n");
  }
  printf("\n");
}
/*------------------------------------------------------
 ** PrintAry() -- Write the ary_size entries of the
 ** vector to stdout on a single line, followed by a
 ** blank line.
 **------------------------------------------------------
 */
void PrintAry(float *ary, int ary_size) {
  int idx = 0;
  while (idx < ary_size) {
    printf("%.2f ", ary[idx]);
    ++idx;
  }
  printf("\n\n");
}
#endif

View File

@@ -0,0 +1,11 @@
4
-0.6 -0.5 0.7 0.3
-0.3 -0.9 0.3 0.7
-0.4 -0.5 -0.3 -0.8
0.0 -0.1 0.2 0.9
-0.85 -0.68 0.24 -0.53
0.7 0.0 -0.4 -0.5

View File

@@ -0,0 +1 @@
./gaussian ../../data/gaussian/matrix4.txt

View File

@@ -0,0 +1,204 @@
/****************************************************************************\
* Copyright (c) 2011, Advanced Micro Devices, Inc. *
* All rights reserved. *
* *
* Redistribution and use in source and binary forms, with or without *
* modification, are permitted provided that the following conditions *
* are met: *
* *
* Redistributions of source code must retain the above copyright notice, *
* this list of conditions and the following disclaimer. *
* *
* Redistributions in binary form must reproduce the above copyright notice, *
* this list of conditions and the following disclaimer in the documentation *
* and/or other materials provided with the distribution. *
* *
* Neither the name of the copyright holder nor the names of its contributors *
* may be used to endorse or promote products derived from this software *
* without specific prior written permission. *
* *
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS *
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED *
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR *
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR *
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, *
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, *
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR *
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF *
* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING *
* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS *
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *
* *
* If you use the software (in whole or in part), you shall adhere to all *
* applicable U.S., European, and other export laws, including but not *
* limited to the U.S. Export Administration Regulations (“EAR”), (15 C.F.R. *
* Sections 730 through 774), and E.U. Council Regulation (EC) No 1334/2000 *
* of 22 June 2000. Further, pursuant to Section 740.6 of the EAR, you *
* hereby certify that, except pursuant to a license granted by the United *
* States Department of Commerce Bureau of Industry and Security or as *
* otherwise permitted pursuant to a License Exception under the U.S. Export *
* Administration Regulations ("EAR"), you will not (1) export, re-export or *
* release to a national of a country in Country Groups D:1, E:1 or E:2 any *
* restricted technology, software, or source code you receive hereunder, *
* or (2) export to Country Groups D:1, E:1 or E:2 the direct product of such *
* technology or software, if such foreign produced direct product is subject *
* to national security controls as identified on the Commerce Control List *
*(currently found in Supplement 1 to Part 774 of EAR). For the most current *
* Country Group listings, or for additional information about the EAR or *
* your obligations under those regulations, please refer to the U.S. Bureau *
* of Industry and Security’s website at http://www.bis.doc.gov/. *
\****************************************************************************/
#include <stdio.h>
#include <sys/stat.h>
#include <string.h>
#include <stdlib.h>
#include "utils.h"
// File-local flag recording whether OpenCL images are in use. It starts out
// true; within this file it is written by setUsingImages() and read by
// isUsingImages().
static bool usingImages = true;
//! malloc() wrapper that never returns NULL: when the allocation fails it
//! reports the error with perror() and terminates the process via exit(-1).
void* alloc(size_t size) {
    void* block = malloc(size);
    if(block != NULL) {
        return block;
    }
    perror("malloc");
    exit(-1);
}
// Verifies that `filename` exists and is a regular file. On failure a message
// is printed to stdout and the process terminates via exit(-1); on success
// the function simply returns.
void checkFile(char* filename)
{
    struct stat fileStatus;
    if(stat(filename, &fileStatus) != 0) {
        printf("Error opening file: %s\n", filename);
        exit(-1);
    }
    // The file type is a multi-bit field, so it has to be masked with S_IFMT
    // and compared for equality; testing the S_IFREG bit alone also accepts
    // other types whose code shares that bit (e.g. sockets).
    if((fileStatus.st_mode & S_IFMT) != S_IFREG) {
        printf("File %s is not a regular file\n", filename);
        exit(-1);
    }
}
// Verifies that `dirpath` exists and is a directory. On failure a message is
// printed to stdout and the process terminates via exit(-1); on success the
// function simply returns.
void checkDir(char* dirpath)
{
    struct stat fileStatus;
    if(stat(dirpath, &fileStatus) != 0) {
        printf("Directory does not exist: %s\n", dirpath);
        exit(-1);
    }
    // Compare the masked file-type field for equality; testing the S_IFDIR
    // bit alone also accepts other types whose code shares that bit
    // (e.g. block devices).
    if((fileStatus.st_mode & S_IFMT) != S_IFDIR) {
        printf("Directory was not provided: %s\n", dirpath);
        exit(-1);
    }
}
// Parse the command line arguments.
// Recognized options (each value option consumes the following argument):
//   -d <pref>  first character of <pref> is stored in devicePref[0]
//   -e <path>  *events is pointed at <path>
//   -i <path>  *input is pointed at <path>
//   -l <path>  *ipts is pointed at <path>
//   -n         disables OpenCL images via setUsingImages(false)
//   -v         sets *verifyResults to true
// A value option given as the last argument prints a usage message and
// terminates via exit(-1). Unrecognized arguments are ignored. Outputs for
// options that do not appear are left untouched.
// NOTE(review): scanning starts at argv[2], so argv[1] is never interpreted
// as an option -- confirm that callers reserve it for something else.
void parseArguments(int argc, char** argv, char** input, char** events,
    char** ipts, char* devicePref, bool* verifyResults)
{
    for(int i = 2; i < argc; i++) {
        if(strcmp(argv[i], "-d") == 0) { // Device preference found
            if(i == argc-1) {
                printf("Usage: -d Needs device preference\n");
                exit(-1);
            }
            devicePref[0] = argv[i+1][0];
            i++;
            continue;
        }
        if(strcmp(argv[i], "-e") == 0) { // Event dump found
            if(i == argc-1) {
                printf("Usage: -e Needs directory path\n");
                exit(-1);
            }
            *events = argv[i+1];
            i++;
            continue;
        }
        if(strcmp(argv[i], "-i") == 0) { // Input found
            if(i == argc-1) {
                printf("Usage: -i Needs directory path\n");
                exit(-1);
            }
            *input = argv[i+1];
            i++;
            continue;
        }
        if(strcmp(argv[i], "-l") == 0) { // Ipts dump found
            if(i == argc-1) {
                printf("Usage: -l Needs directory path\n");
                exit(-1);
            }
            *ipts = argv[i+1];
            i++;
            continue;
        }
        if(strcmp(argv[i], "-n") == 0) { // Don't use OpenCL images
            setUsingImages(false);
            continue;
        }
        if(strcmp(argv[i], "-v") == 0) { // Verify results
            *verifyResults = true;
            continue;
        }
    }
}
// Rounds 'value' up to the smallest multiple of 'multiple' that is greater
// than or equal to it (used for padding columns). A value that is already a
// multiple is returned unchanged. When 'multiple' is 0 there is nothing to
// align to, so 'value' is returned as is instead of dividing by zero.
unsigned int roundUp(unsigned int value, unsigned int multiple) {
    if(multiple == 0) {
        return value;
    }
    unsigned int remainder = value % multiple;
    // Add whatever is missing to reach the next multiple
    if(remainder != 0) {
        value += (multiple-remainder);
    }
    return value;
}
// Returns a newly allocated string holding str1 followed by str2. The buffer
// comes from alloc(), so the call never returns NULL; neither input is
// modified.
char* smartStrcat(char* str1, char* str2)
{
    const size_t firstLen = strlen(str1);
    const size_t secondLen = strlen(str2);
    char* joined = (char*)alloc((firstLen+secondLen+1)*sizeof(char));
    memcpy(joined, str1, firstLen);
    // Copy the second string together with its terminating NUL
    memcpy(joined + firstLen, str2, secondLen + 1);
    return joined;
}
// Records whether OpenCL images are in use: pass true to enable them,
// false to disable them.
void setUsingImages(bool val) {
    usingImages = val;
}
// Reports the current images-in-use setting.
bool isUsingImages() {
    return usingImages;
}

View File

@@ -0,0 +1,84 @@
/****************************************************************************\
* Copyright (c) 2011, Advanced Micro Devices, Inc. *
* All rights reserved. *
* *
* Redistribution and use in source and binary forms, with or without *
* modification, are permitted provided that the following conditions *
* are met: *
* *
* Redistributions of source code must retain the above copyright notice, *
* this list of conditions and the following disclaimer. *
* *
* Redistributions in binary form must reproduce the above copyright notice, *
* this list of conditions and the following disclaimer in the documentation *
* and/or other materials provided with the distribution. *
* *
* Neither the name of the copyright holder nor the names of its contributors *
* may be used to endorse or promote products derived from this software *
* without specific prior written permission. *
* *
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS *
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED *
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR *
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR *
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, *
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, *
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR *
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF *
* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING *
* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS *
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *
* *
* If you use the software (in whole or in part), you shall adhere to all *
* applicable U.S., European, and other export laws, including but not *
* limited to the U.S. Export Administration Regulations (“EAR”), (15 C.F.R. *
* Sections 730 through 774), and E.U. Council Regulation (EC) No 1334/2000 *
* of 22 June 2000. Further, pursuant to Section 740.6 of the EAR, you *
* hereby certify that, except pursuant to a license granted by the United *
* States Department of Commerce Bureau of Industry and Security or as *
* otherwise permitted pursuant to a License Exception under the U.S. Export *
* Administration Regulations ("EAR"), you will not (1) export, re-export or *
* release to a national of a country in Country Groups D:1, E:1 or E:2 any *
* restricted technology, software, or source code you receive hereunder, *
* or (2) export to Country Groups D:1, E:1 or E:2 the direct product of such *
* technology or software, if such foreign produced direct product is subject *
* to national security controls as identified on the Commerce Control List *
*(currently found in Supplement 1 to Part 774 of EAR). For the most current *
* Country Group listings, or for additional information about the EAR or *
* your obligations under those regulations, please refer to the U.S. Bureau *
* of Industry and Security’s website at http://www.bis.doc.gov/. *
\****************************************************************************/
#ifndef _UTILS_
#define _UTILS_
// size_t is used in the declarations below; including its header here keeps
// this file usable regardless of what the includer pulled in beforehand.
#include <stddef.h>
// Wrapper for malloc: exits the process instead of returning NULL
void* alloc(size_t size);
// Checks for existence of directory (exits the process if the check fails)
void checkDir(char* dirpath);
// Check for existence of file (exits the process if the check fails)
void checkFile(char* filename);
// Parse the input command line options to the program
void parseArguments(int argc, char** argv, char** input, char** events,
    char** ipts, char* devicePref, bool* verifyResults);
// Print the program usage information
void printUsage();
// Rounds value up to the next multiple of multiple
unsigned int roundUp(unsigned int value, unsigned int multiple);
// Concatenate two strings, creating a new one
char* smartStrcat(char* str1, char* str2);
// Set the value of usingImages
void setUsingImages(bool val);
// Return whether or not images are being used
bool isUsingImages();
#endif

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,131 @@
/**********************************************************************************
* Copyright (c) 2008-2015 The Khronos Group Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and/or associated documentation files (the
* "Materials"), to deal in the Materials without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Materials, and to
* permit persons to whom the Materials are furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be included
* in all copies or substantial portions of the Materials.
*
* MODIFICATIONS TO THIS FILE MAY MEAN IT NO LONGER ACCURATELY REFLECTS
* KHRONOS STANDARDS. THE UNMODIFIED, NORMATIVE VERSIONS OF KHRONOS
* SPECIFICATIONS AND HEADER INFORMATION ARE LOCATED AT
* https://www.khronos.org/registry/
*
* THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
**********************************************************************************/
/* $Revision: 11708 $ on $Date: 2010-06-13 23:36:24 -0700 (Sun, 13 Jun 2010) $ */
#ifndef __OPENCL_CL_D3D10_H
#define __OPENCL_CL_D3D10_H
#include <d3d10.h>
#include <CL/cl.h>
#include <CL/cl_platform.h>
#ifdef __cplusplus
extern "C" {
#endif
/******************************************************************************
* cl_khr_d3d10_sharing */
/* Macro named after the extension; this header defines it to 1. */
#define cl_khr_d3d10_sharing 1
/* Selector types taken by clGetDeviceIDsFromD3D10KHR_fn; both are plain cl_uint. */
typedef cl_uint cl_d3d10_device_source_khr;
typedef cl_uint cl_d3d10_device_set_khr;
/******************************************************************************/
/* Error Codes */
#define CL_INVALID_D3D10_DEVICE_KHR -1002
#define CL_INVALID_D3D10_RESOURCE_KHR -1003
#define CL_D3D10_RESOURCE_ALREADY_ACQUIRED_KHR -1004
#define CL_D3D10_RESOURCE_NOT_ACQUIRED_KHR -1005
/* cl_d3d10_device_source_khr */
#define CL_D3D10_DEVICE_KHR 0x4010
#define CL_D3D10_DXGI_ADAPTER_KHR 0x4011
/* cl_d3d10_device_set_khr */
#define CL_PREFERRED_DEVICES_FOR_D3D10_KHR 0x4012
#define CL_ALL_DEVICES_FOR_D3D10_KHR 0x4013
/* cl_context_info */
#define CL_CONTEXT_D3D10_DEVICE_KHR 0x4014
#define CL_CONTEXT_D3D10_PREFER_SHARED_RESOURCES_KHR 0x402C
/* cl_mem_info */
#define CL_MEM_D3D10_RESOURCE_KHR 0x4015
/* cl_image_info */
#define CL_IMAGE_D3D10_SUBRESOURCE_KHR 0x4016
/* cl_command_type */
#define CL_COMMAND_ACQUIRE_D3D10_OBJECTS_KHR 0x4017
#define CL_COMMAND_RELEASE_D3D10_OBJECTS_KHR 0x4018
/******************************************************************************/
/* Pointer type for the clGetDeviceIDsFromD3D10KHR entry point. This header
 * declares only the pointer type, not a prototype.
 * d3d_object is an untyped pointer -- presumably interpreted according to
 * d3d_device_source (TODO confirm against the extension specification).
 * devices / num_devices are non-const pointers, presumably outputs. */
typedef CL_API_ENTRY cl_int (CL_API_CALL *clGetDeviceIDsFromD3D10KHR_fn)(
cl_platform_id platform,
cl_d3d10_device_source_khr d3d_device_source,
void * d3d_object,
cl_d3d10_device_set_khr d3d_device_set,
cl_uint num_entries,
cl_device_id * devices,
cl_uint * num_devices) CL_API_SUFFIX__VERSION_1_0;
/* Pointer type for the clCreateFromD3D10BufferKHR entry point: takes a context,
 * memory flags and an ID3D10Buffer, and returns a cl_mem. errcode_ret is a
 * non-const cl_int pointer, presumably an optional error output (TODO confirm). */
typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D10BufferKHR_fn)(
cl_context context,
cl_mem_flags flags,
ID3D10Buffer * resource,
cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0;
/* Pointer type for the clCreateFromD3D10Texture2DKHR entry point: like the
 * buffer variant but takes an ID3D10Texture2D plus a UINT subresource index,
 * and returns a cl_mem. */
typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D10Texture2DKHR_fn)(
cl_context context,
cl_mem_flags flags,
ID3D10Texture2D * resource,
UINT subresource,
cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0;
/* Pointer type for the clCreateFromD3D10Texture3DKHR entry point: same
 * parameter list as the Texture2D variant, with an ID3D10Texture3D resource. */
typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D10Texture3DKHR_fn)(
cl_context context,
cl_mem_flags flags,
ID3D10Texture3D * resource,
UINT subresource,
cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0;
/* Pointer type for the clEnqueueAcquireD3D10ObjectsKHR entry point.
 * Takes a command queue, an array of num_objects cl_mem handles (read-only),
 * a read-only event wait list, and a non-const cl_event pointer that is
 * presumably an optional output event (TODO confirm). Returns cl_int. */
typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueAcquireD3D10ObjectsKHR_fn)(
cl_command_queue command_queue,
cl_uint num_objects,
const cl_mem * mem_objects,
cl_uint num_events_in_wait_list,
const cl_event * event_wait_list,
cl_event * event) CL_API_SUFFIX__VERSION_1_0;
/* Pointer type for the clEnqueueReleaseD3D10ObjectsKHR entry point; its
 * parameter list is identical to clEnqueueAcquireD3D10ObjectsKHR_fn. */
typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueReleaseD3D10ObjectsKHR_fn)(
cl_command_queue command_queue,
cl_uint num_objects,
const cl_mem * mem_objects,
cl_uint num_events_in_wait_list,
const cl_event * event_wait_list,
cl_event * event) CL_API_SUFFIX__VERSION_1_0;
#ifdef __cplusplus
}
#endif
#endif /* __OPENCL_CL_D3D10_H */

View File

@@ -0,0 +1,131 @@
/**********************************************************************************
* Copyright (c) 2008-2015 The Khronos Group Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and/or associated documentation files (the
* "Materials"), to deal in the Materials without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Materials, and to
* permit persons to whom the Materials are furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be included
* in all copies or substantial portions of the Materials.
*
* MODIFICATIONS TO THIS FILE MAY MEAN IT NO LONGER ACCURATELY REFLECTS
* KHRONOS STANDARDS. THE UNMODIFIED, NORMATIVE VERSIONS OF KHRONOS
* SPECIFICATIONS AND HEADER INFORMATION ARE LOCATED AT
* https://www.khronos.org/registry/
*
* THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
**********************************************************************************/
/* $Revision: 11708 $ on $Date: 2010-06-13 23:36:24 -0700 (Sun, 13 Jun 2010) $ */
#ifndef __OPENCL_CL_D3D11_H
#define __OPENCL_CL_D3D11_H
#include <d3d11.h>
#include <CL/cl.h>
#include <CL/cl_platform.h>
#ifdef __cplusplus
extern "C" {
#endif
/******************************************************************************
* cl_khr_d3d11_sharing */
/* Macro named after the extension; this header defines it to 1. */
#define cl_khr_d3d11_sharing 1
/* Selector types taken by clGetDeviceIDsFromD3D11KHR_fn; both are plain cl_uint. */
typedef cl_uint cl_d3d11_device_source_khr;
typedef cl_uint cl_d3d11_device_set_khr;
/******************************************************************************/
/* Error Codes */
#define CL_INVALID_D3D11_DEVICE_KHR -1006
#define CL_INVALID_D3D11_RESOURCE_KHR -1007
#define CL_D3D11_RESOURCE_ALREADY_ACQUIRED_KHR -1008
#define CL_D3D11_RESOURCE_NOT_ACQUIRED_KHR -1009
/* cl_d3d11_device_source_khr */
#define CL_D3D11_DEVICE_KHR 0x4019
#define CL_D3D11_DXGI_ADAPTER_KHR 0x401A
/* cl_d3d11_device_set_khr */
#define CL_PREFERRED_DEVICES_FOR_D3D11_KHR 0x401B
#define CL_ALL_DEVICES_FOR_D3D11_KHR 0x401C
/* cl_context_info */
#define CL_CONTEXT_D3D11_DEVICE_KHR 0x401D
#define CL_CONTEXT_D3D11_PREFER_SHARED_RESOURCES_KHR 0x402D
/* cl_mem_info */
#define CL_MEM_D3D11_RESOURCE_KHR 0x401E
/* cl_image_info */
#define CL_IMAGE_D3D11_SUBRESOURCE_KHR 0x401F
/* cl_command_type */
#define CL_COMMAND_ACQUIRE_D3D11_OBJECTS_KHR 0x4020
#define CL_COMMAND_RELEASE_D3D11_OBJECTS_KHR 0x4021
/******************************************************************************/
/* Pointer type for the clGetDeviceIDsFromD3D11KHR entry point. This header
 * declares only the pointer type, not a prototype.
 * d3d_object is an untyped pointer -- presumably interpreted according to
 * d3d_device_source (TODO confirm against the extension specification).
 * devices / num_devices are non-const pointers, presumably outputs. */
typedef CL_API_ENTRY cl_int (CL_API_CALL *clGetDeviceIDsFromD3D11KHR_fn)(
cl_platform_id platform,
cl_d3d11_device_source_khr d3d_device_source,
void * d3d_object,
cl_d3d11_device_set_khr d3d_device_set,
cl_uint num_entries,
cl_device_id * devices,
cl_uint * num_devices) CL_API_SUFFIX__VERSION_1_2;
/* Pointer type for the clCreateFromD3D11BufferKHR entry point: takes a context,
 * memory flags and an ID3D11Buffer, and returns a cl_mem. errcode_ret is a
 * non-const cl_int pointer, presumably an optional error output (TODO confirm). */
typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D11BufferKHR_fn)(
cl_context context,
cl_mem_flags flags,
ID3D11Buffer * resource,
cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_2;
/* Pointer type for the clCreateFromD3D11Texture2DKHR entry point: like the
 * buffer variant but takes an ID3D11Texture2D plus a UINT subresource index,
 * and returns a cl_mem. */
typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D11Texture2DKHR_fn)(
cl_context context,
cl_mem_flags flags,
ID3D11Texture2D * resource,
UINT subresource,
cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_2;
/* Pointer type for the clCreateFromD3D11Texture3DKHR entry point: same
 * parameter list as the Texture2D variant, with an ID3D11Texture3D resource. */
typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D11Texture3DKHR_fn)(
cl_context context,
cl_mem_flags flags,
ID3D11Texture3D * resource,
UINT subresource,
cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_2;
/* Pointer type for the clEnqueueAcquireD3D11ObjectsKHR entry point.
 * Takes a command queue, an array of num_objects cl_mem handles (read-only),
 * a read-only event wait list, and a non-const cl_event pointer that is
 * presumably an optional output event (TODO confirm). Returns cl_int. */
typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueAcquireD3D11ObjectsKHR_fn)(
cl_command_queue command_queue,
cl_uint num_objects,
const cl_mem * mem_objects,
cl_uint num_events_in_wait_list,
const cl_event * event_wait_list,
cl_event * event) CL_API_SUFFIX__VERSION_1_2;
/* Pointer type for the clEnqueueReleaseD3D11ObjectsKHR entry point; its
 * parameter list is identical to clEnqueueAcquireD3D11ObjectsKHR_fn. */
typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueReleaseD3D11ObjectsKHR_fn)(
cl_command_queue command_queue,
cl_uint num_objects,
const cl_mem * mem_objects,
cl_uint num_events_in_wait_list,
const cl_event * event_wait_list,
cl_event * event) CL_API_SUFFIX__VERSION_1_2;
#ifdef __cplusplus
}
#endif
#endif /* __OPENCL_CL_D3D11_H */

View File

@@ -0,0 +1,132 @@
/**********************************************************************************
* Copyright (c) 2008-2015 The Khronos Group Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and/or associated documentation files (the
* "Materials"), to deal in the Materials without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Materials, and to
* permit persons to whom the Materials are furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be included
* in all copies or substantial portions of the Materials.
*
* MODIFICATIONS TO THIS FILE MAY MEAN IT NO LONGER ACCURATELY REFLECTS
* KHRONOS STANDARDS. THE UNMODIFIED, NORMATIVE VERSIONS OF KHRONOS
* SPECIFICATIONS AND HEADER INFORMATION ARE LOCATED AT
* https://www.khronos.org/registry/
*
* THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
**********************************************************************************/
/* $Revision: 11708 $ on $Date: 2010-06-13 23:36:24 -0700 (Sun, 13 Jun 2010) $ */
#ifndef __OPENCL_CL_DX9_MEDIA_SHARING_H
#define __OPENCL_CL_DX9_MEDIA_SHARING_H
#include <CL/cl.h>
#include <CL/cl_platform.h>
#ifdef __cplusplus
extern "C" {
#endif
/******************************************************************************/
/* cl_khr_dx9_media_sharing */
/* Macro named after the extension; this header defines it to 1. */
#define cl_khr_dx9_media_sharing 1
/* Selector types used by the function-pointer typedefs below; both are plain cl_uint. */
typedef cl_uint cl_dx9_media_adapter_type_khr;
typedef cl_uint cl_dx9_media_adapter_set_khr;
/* The surface descriptor needs Direct3D 9 types, so it (and <d3d9.h>) is only
 * pulled in when _WIN32 is defined. */
#if defined(_WIN32)
#include <d3d9.h>
/* Pairs a Direct3D 9 surface pointer with a HANDLE. Presumably this is what the
 * untyped surface_info argument of clCreateFromDX9MediaSurfaceKHR_fn points to
 * (TODO confirm against the extension specification). */
typedef struct _cl_dx9_surface_info_khr
{
IDirect3DSurface9 *resource;
HANDLE shared_handle;
} cl_dx9_surface_info_khr;
#endif
/******************************************************************************/
/* Error Codes */
#define CL_INVALID_DX9_MEDIA_ADAPTER_KHR -1010
#define CL_INVALID_DX9_MEDIA_SURFACE_KHR -1011
#define CL_DX9_MEDIA_SURFACE_ALREADY_ACQUIRED_KHR -1012
#define CL_DX9_MEDIA_SURFACE_NOT_ACQUIRED_KHR -1013
/* cl_dx9_media_adapter_type_khr */
#define CL_ADAPTER_D3D9_KHR 0x2020
#define CL_ADAPTER_D3D9EX_KHR 0x2021
#define CL_ADAPTER_DXVA_KHR 0x2022
/* cl_dx9_media_adapter_set_khr */
#define CL_PREFERRED_DEVICES_FOR_DX9_MEDIA_ADAPTER_KHR 0x2023
#define CL_ALL_DEVICES_FOR_DX9_MEDIA_ADAPTER_KHR 0x2024
/* cl_context_info */
#define CL_CONTEXT_ADAPTER_D3D9_KHR 0x2025
#define CL_CONTEXT_ADAPTER_D3D9EX_KHR 0x2026
#define CL_CONTEXT_ADAPTER_DXVA_KHR 0x2027
/* cl_mem_info */
#define CL_MEM_DX9_MEDIA_ADAPTER_TYPE_KHR 0x2028
#define CL_MEM_DX9_MEDIA_SURFACE_INFO_KHR 0x2029
/* cl_image_info */
#define CL_IMAGE_DX9_MEDIA_PLANE_KHR 0x202A
/* cl_command_type */
#define CL_COMMAND_ACQUIRE_DX9_MEDIA_SURFACES_KHR 0x202B
#define CL_COMMAND_RELEASE_DX9_MEDIA_SURFACES_KHR 0x202C
/******************************************************************************/
/* Pointer type for the clGetDeviceIDsFromDX9MediaAdapterKHR entry point. This
 * header declares only the pointer type, not a prototype.
 * media_adapter_type is a pointer and media_adapters is untyped -- presumably
 * parallel arrays of num_media_adapters entries (TODO confirm against the
 * extension specification). devices / num_devices are presumably outputs. */
typedef CL_API_ENTRY cl_int (CL_API_CALL *clGetDeviceIDsFromDX9MediaAdapterKHR_fn)(
cl_platform_id platform,
cl_uint num_media_adapters,
cl_dx9_media_adapter_type_khr * media_adapter_type,
void * media_adapters,
cl_dx9_media_adapter_set_khr media_adapter_set,
cl_uint num_entries,
cl_device_id * devices,
cl_uint * num_devices) CL_API_SUFFIX__VERSION_1_2;
/* Pointer type for the clCreateFromDX9MediaSurfaceKHR entry point; returns a
 * cl_mem. surface_info is untyped here -- presumably a pointer to
 * cl_dx9_surface_info_khr (TODO confirm). plane is a cl_uint index. */
typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromDX9MediaSurfaceKHR_fn)(
cl_context context,
cl_mem_flags flags,
cl_dx9_media_adapter_type_khr adapter_type,
void * surface_info,
cl_uint plane,
cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_2;
/* Pointer type for the clEnqueueAcquireDX9MediaSurfacesKHR entry point.
 * Takes a command queue, an array of num_objects cl_mem handles (read-only),
 * a read-only event wait list, and a non-const cl_event pointer that is
 * presumably an optional output event (TODO confirm). Returns cl_int. */
typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueAcquireDX9MediaSurfacesKHR_fn)(
cl_command_queue command_queue,
cl_uint num_objects,
const cl_mem * mem_objects,
cl_uint num_events_in_wait_list,
const cl_event * event_wait_list,
cl_event * event) CL_API_SUFFIX__VERSION_1_2;
/* Pointer type for the clEnqueueReleaseDX9MediaSurfacesKHR entry point; its
 * parameter list is identical to clEnqueueAcquireDX9MediaSurfacesKHR_fn. */
typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueReleaseDX9MediaSurfacesKHR_fn)(
cl_command_queue command_queue,
cl_uint num_objects,
const cl_mem * mem_objects,
cl_uint num_events_in_wait_list,
const cl_event * event_wait_list,
cl_event * event) CL_API_SUFFIX__VERSION_1_2;
#ifdef __cplusplus
}
#endif
#endif /* __OPENCL_CL_DX9_MEDIA_SHARING_H */

View File

@@ -0,0 +1,182 @@
/**********************************************************************************
* Copyright (c) 2008-2019 The Khronos Group Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and/or associated documentation files (the
* "Materials"), to deal in the Materials without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Materials, and to
* permit persons to whom the Materials are furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be included
* in all copies or substantial portions of the Materials.
*
* MODIFICATIONS TO THIS FILE MAY MEAN IT NO LONGER ACCURATELY REFLECTS
* KHRONOS STANDARDS. THE UNMODIFIED, NORMATIVE VERSIONS OF KHRONOS
* SPECIFICATIONS AND HEADER INFORMATION ARE LOCATED AT
* https://www.khronos.org/registry/
*
* THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
**********************************************************************************/
/*****************************************************************************\
Copyright (c) 2013-2019 Intel Corporation All Rights Reserved.
THESE MATERIALS ARE PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THESE
MATERIALS, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
File Name: cl_dx9_media_sharing_intel.h
Abstract:
Notes:
\*****************************************************************************/
#ifndef __OPENCL_CL_DX9_MEDIA_SHARING_INTEL_H
#define __OPENCL_CL_DX9_MEDIA_SHARING_INTEL_H
#include <CL/cl.h>
#include <CL/cl_platform.h>
#include <d3d9.h>
#include <dxvahd.h>
#include <wtypes.h>
#include <d3d9types.h>
#ifdef __cplusplus
extern "C" {
#endif
/***************************************
* cl_intel_dx9_media_sharing extension *
****************************************/
/* Macro named after the extension; this header defines it to 1. */
#define cl_intel_dx9_media_sharing 1
/* Selector types taken by clGetDeviceIDsFromDX9INTEL; both are plain cl_uint. */
typedef cl_uint cl_dx9_device_source_intel;
typedef cl_uint cl_dx9_device_set_intel;
/* error codes */
/* NOTE(review): these four use the same numeric values (-1010..-1013) as the
 * CL_*_DX9_MEDIA_*_KHR error codes in cl_dx9_media_sharing.h. */
#define CL_INVALID_DX9_DEVICE_INTEL -1010
#define CL_INVALID_DX9_RESOURCE_INTEL -1011
#define CL_DX9_RESOURCE_ALREADY_ACQUIRED_INTEL -1012
#define CL_DX9_RESOURCE_NOT_ACQUIRED_INTEL -1013
/* cl_dx9_device_source_intel */
#define CL_D3D9_DEVICE_INTEL 0x4022
#define CL_D3D9EX_DEVICE_INTEL 0x4070
#define CL_DXVA_DEVICE_INTEL 0x4071
/* cl_dx9_device_set_intel */
#define CL_PREFERRED_DEVICES_FOR_DX9_INTEL 0x4024
#define CL_ALL_DEVICES_FOR_DX9_INTEL 0x4025
/* cl_context_info */
#define CL_CONTEXT_D3D9_DEVICE_INTEL 0x4026
#define CL_CONTEXT_D3D9EX_DEVICE_INTEL 0x4072
#define CL_CONTEXT_DXVA_DEVICE_INTEL 0x4073
/* cl_mem_info */
#define CL_MEM_DX9_RESOURCE_INTEL 0x4027
#define CL_MEM_DX9_SHARED_HANDLE_INTEL 0x4074
/* cl_image_info */
#define CL_IMAGE_DX9_PLANE_INTEL 0x4075
/* cl_command_type */
#define CL_COMMAND_ACQUIRE_DX9_OBJECTS_INTEL 0x402A
#define CL_COMMAND_RELEASE_DX9_OBJECTS_INTEL 0x402B
/******************************************************************************/
/* Prototype for clGetDeviceIDsFromDX9INTEL, followed by the matching
 * function-pointer type (same parameter list and return type).
 * dx9_object is an untyped pointer -- presumably interpreted according to
 * dx9_device_source (TODO confirm against the extension specification).
 * devices / num_devices are non-const pointers, presumably outputs. */
extern CL_API_ENTRY cl_int CL_API_CALL
clGetDeviceIDsFromDX9INTEL(
cl_platform_id platform,
cl_dx9_device_source_intel dx9_device_source,
void* dx9_object,
cl_dx9_device_set_intel dx9_device_set,
cl_uint num_entries,
cl_device_id* devices,
cl_uint* num_devices) CL_EXT_SUFFIX__VERSION_1_1;
typedef CL_API_ENTRY cl_int (CL_API_CALL* clGetDeviceIDsFromDX9INTEL_fn)(
cl_platform_id platform,
cl_dx9_device_source_intel dx9_device_source,
void* dx9_object,
cl_dx9_device_set_intel dx9_device_set,
cl_uint num_entries,
cl_device_id* devices,
cl_uint* num_devices) CL_EXT_SUFFIX__VERSION_1_1;
/* Prototype for clCreateFromDX9MediaSurfaceINTEL, followed by the matching
 * function-pointer type. Takes the Direct3D 9 surface and a HANDLE as typed
 * arguments plus a UINT plane index, and returns a cl_mem. errcode_ret is a
 * non-const cl_int pointer, presumably an optional error output (TODO confirm). */
extern CL_API_ENTRY cl_mem CL_API_CALL
clCreateFromDX9MediaSurfaceINTEL(
cl_context context,
cl_mem_flags flags,
IDirect3DSurface9* resource,
HANDLE sharedHandle,
UINT plane,
cl_int* errcode_ret) CL_EXT_SUFFIX__VERSION_1_1;
typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromDX9MediaSurfaceINTEL_fn)(
cl_context context,
cl_mem_flags flags,
IDirect3DSurface9* resource,
HANDLE sharedHandle,
UINT plane,
cl_int* errcode_ret) CL_EXT_SUFFIX__VERSION_1_1;
/* Prototype for clEnqueueAcquireDX9ObjectsINTEL, followed by the matching
 * function-pointer type. Takes a command queue, an array of num_objects
 * cl_mem handles (read-only), a read-only event wait list, and a non-const
 * cl_event pointer that is presumably an optional output event (TODO confirm). */
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueAcquireDX9ObjectsINTEL(
cl_command_queue command_queue,
cl_uint num_objects,
const cl_mem* mem_objects,
cl_uint num_events_in_wait_list,
const cl_event* event_wait_list,
cl_event* event) CL_EXT_SUFFIX__VERSION_1_1;
typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueAcquireDX9ObjectsINTEL_fn)(
cl_command_queue command_queue,
cl_uint num_objects,
const cl_mem* mem_objects,
cl_uint num_events_in_wait_list,
const cl_event* event_wait_list,
cl_event* event) CL_EXT_SUFFIX__VERSION_1_1;
/* Prototype for clEnqueueReleaseDX9ObjectsINTEL, followed by the matching
 * function-pointer type.
 * NOTE(review): mem_objects is a non-const cl_mem* here, whereas
 * clEnqueueAcquireDX9ObjectsINTEL takes const cl_mem*. Left as declared
 * because the qualifier is part of the published signature. */
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueReleaseDX9ObjectsINTEL(
cl_command_queue command_queue,
cl_uint num_objects,
cl_mem* mem_objects,
cl_uint num_events_in_wait_list,
const cl_event* event_wait_list,
cl_event* event) CL_EXT_SUFFIX__VERSION_1_1;
typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueReleaseDX9ObjectsINTEL_fn)(
cl_command_queue command_queue,
cl_uint num_objects,
cl_mem* mem_objects,
cl_uint num_events_in_wait_list,
const cl_event* event_wait_list,
cl_event* event) CL_EXT_SUFFIX__VERSION_1_1;
#ifdef __cplusplus
}
#endif
#endif /* __OPENCL_CL_DX9_MEDIA_SHARING_INTEL_H */

View File

@@ -0,0 +1,132 @@
/*******************************************************************************
* Copyright (c) 2008-2019 The Khronos Group Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and/or associated documentation files (the
* "Materials"), to deal in the Materials without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Materials, and to
* permit persons to whom the Materials are furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be included
* in all copies or substantial portions of the Materials.
*
* MODIFICATIONS TO THIS FILE MAY MEAN IT NO LONGER ACCURATELY REFLECTS
* KHRONOS STANDARDS. THE UNMODIFIED, NORMATIVE VERSIONS OF KHRONOS
* SPECIFICATIONS AND HEADER INFORMATION ARE LOCATED AT
* https://www.khronos.org/registry/
*
* THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
******************************************************************************/
#ifndef __OPENCL_CL_EGL_H
#define __OPENCL_CL_EGL_H
#include <CL/cl.h>
#ifdef __cplusplus
extern "C" {
#endif
/* Command types for events created by the EGL sharing entry points declared in this header */
#define CL_COMMAND_EGL_FENCE_SYNC_OBJECT_KHR 0x202F
#define CL_COMMAND_ACQUIRE_EGL_OBJECTS_KHR 0x202D
#define CL_COMMAND_RELEASE_EGL_OBJECTS_KHR 0x202E
/* Error type for clCreateFromEGLImageKHR */
#define CL_INVALID_EGL_OBJECT_KHR -1093
#define CL_EGL_RESOURCE_NOT_ACQUIRED_KHR -1092
/* The three handle types below are all void*, so the compiler will not
 * diagnose passing one kind of handle where another is expected. */
/* CLeglImageKHR is an opaque handle to an EGLImage */
typedef void* CLeglImageKHR;
/* CLeglDisplayKHR is an opaque handle to an EGLDisplay */
typedef void* CLeglDisplayKHR;
/* CLeglSyncKHR is an opaque handle to an EGLSync object */
typedef void* CLeglSyncKHR;
/* properties passed to clCreateFromEGLImageKHR */
typedef intptr_t cl_egl_image_properties_khr;
/* Macro named after the cl_khr_egl_image extension; this header defines it to 1. */
#define cl_khr_egl_image 1
/* Prototype for clCreateFromEGLImageKHR, followed by the matching
 * function-pointer type. Takes a context, an EGL display handle, an EGL image
 * handle, memory flags and a read-only array of cl_egl_image_properties_khr;
 * returns a cl_mem. How the properties array is terminated is not visible
 * here (TODO confirm against the extension specification).
 * The prototype carries CL_API_SUFFIX__VERSION_1_0; the typedef carries no
 * version suffix. */
extern CL_API_ENTRY cl_mem CL_API_CALL
clCreateFromEGLImageKHR(cl_context context,
CLeglDisplayKHR egldisplay,
CLeglImageKHR eglimage,
cl_mem_flags flags,
const cl_egl_image_properties_khr * properties,
cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0;
typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromEGLImageKHR_fn)(
cl_context context,
CLeglDisplayKHR egldisplay,
CLeglImageKHR eglimage,
cl_mem_flags flags,
const cl_egl_image_properties_khr * properties,
cl_int * errcode_ret);
/* Prototype for clEnqueueAcquireEGLObjectsKHR, followed by the matching
 * function-pointer type. Takes a command queue, an array of num_objects
 * cl_mem handles (read-only), a read-only event wait list, and a non-const
 * cl_event pointer that is presumably an optional output event (TODO confirm).
 * The typedef carries no version suffix, unlike the prototype. */
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueAcquireEGLObjectsKHR(cl_command_queue command_queue,
cl_uint num_objects,
const cl_mem * mem_objects,
cl_uint num_events_in_wait_list,
const cl_event * event_wait_list,
cl_event * event) CL_API_SUFFIX__VERSION_1_0;
typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueAcquireEGLObjectsKHR_fn)(
cl_command_queue command_queue,
cl_uint num_objects,
const cl_mem * mem_objects,
cl_uint num_events_in_wait_list,
const cl_event * event_wait_list,
cl_event * event);
/* Prototype for clEnqueueReleaseEGLObjectsKHR, followed by the matching
 * function-pointer type. The parameter list is identical to
 * clEnqueueAcquireEGLObjectsKHR. The typedef carries no version suffix,
 * unlike the prototype. */
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueReleaseEGLObjectsKHR(cl_command_queue command_queue,
cl_uint num_objects,
const cl_mem * mem_objects,
cl_uint num_events_in_wait_list,
const cl_event * event_wait_list,
cl_event * event) CL_API_SUFFIX__VERSION_1_0;
typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueReleaseEGLObjectsKHR_fn)(
cl_command_queue command_queue,
cl_uint num_objects,
const cl_mem * mem_objects,
cl_uint num_events_in_wait_list,
const cl_event * event_wait_list,
cl_event * event);
/* Macro named after the cl_khr_egl_event extension; this header defines it to 1. */
#define cl_khr_egl_event 1
/* Prototype for clCreateEventFromEGLSyncKHR, followed by the matching
 * function-pointer type. Takes a context, an EGL sync handle and an EGL
 * display handle (note the order: sync first, then display) and returns a
 * cl_event. The typedef carries no version suffix, unlike the prototype. */
extern CL_API_ENTRY cl_event CL_API_CALL
clCreateEventFromEGLSyncKHR(cl_context context,
CLeglSyncKHR sync,
CLeglDisplayKHR display,
cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0;
typedef CL_API_ENTRY cl_event (CL_API_CALL *clCreateEventFromEGLSyncKHR_fn)(
cl_context context,
CLeglSyncKHR sync,
CLeglDisplayKHR display,
cl_int * errcode_ret);
#ifdef __cplusplus
}
#endif
#endif /* __OPENCL_CL_EGL_H */

View File

@@ -0,0 +1,762 @@
/*******************************************************************************
* Copyright (c) 2008-2019 The Khronos Group Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and/or associated documentation files (the
* "Materials"), to deal in the Materials without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Materials, and to
* permit persons to whom the Materials are furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be included
* in all copies or substantial portions of the Materials.
*
* MODIFICATIONS TO THIS FILE MAY MEAN IT NO LONGER ACCURATELY REFLECTS
* KHRONOS STANDARDS. THE UNMODIFIED, NORMATIVE VERSIONS OF KHRONOS
* SPECIFICATIONS AND HEADER INFORMATION ARE LOCATED AT
* https://www.khronos.org/registry/
*
* THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
******************************************************************************/
/* cl_ext.h contains OpenCL extensions which don't have external */
/* (OpenGL, D3D) dependencies. */
#ifndef __CL_EXT_H
#define __CL_EXT_H
#ifdef __cplusplus
extern "C" {
#endif
#include <CL/cl.h>
/* cl_khr_fp64 extension - no extension #define since it has no functions */
/* CL_DEVICE_DOUBLE_FP_CONFIG is defined in <CL/cl.h> for OpenCL >= 120, so it */
/* is only defined here when the target version is 110 or lower. */
#if CL_TARGET_OPENCL_VERSION <= 110
#define CL_DEVICE_DOUBLE_FP_CONFIG 0x1032
#endif
/* cl_khr_fp16 extension - no extension #define since it has no functions */
#define CL_DEVICE_HALF_FP_CONFIG 0x1033
/* Memory object destruction
*
* Apple extension for use to manage externally allocated buffers used with cl_mem objects with CL_MEM_USE_HOST_PTR
*
* Registers a user callback function that will be called when the memory object is deleted and its resources
* freed. Each call to clSetMemObjectCallbackFn registers the specified user callback function on a callback
* stack associated with memobj. The registered user callback functions are called in the reverse order in
* which they were registered. The user callback functions are called and then the memory object is deleted
* and its resources freed. This provides a mechanism for the application (and libraries) using memobj to be
* notified when the memory referenced by host_ptr, specified when the memory object is created and used as
* the storage bits for the memory object, can be reused or freed.
*
* The application may not call CL api's with the cl_mem object passed to the pfn_notify.
*
* Please check for the "cl_APPLE_SetMemObjectDestructor" extension using clGetDeviceInfo(CL_DEVICE_EXTENSIONS)
* before using.
*/
#define cl_APPLE_SetMemObjectDestructor 1
cl_int CL_API_ENTRY clSetMemObjectDestructorAPPLE( cl_mem memobj,
void (* pfn_notify)(cl_mem memobj, void * user_data),
void * user_data) CL_EXT_SUFFIX__VERSION_1_0;
/* Context Logging Functions
*
* The next three convenience functions are intended to be used as the pfn_notify parameter to clCreateContext().
* Please check for the "cl_APPLE_ContextLoggingFunctions" extension using clGetDeviceInfo(CL_DEVICE_EXTENSIONS)
* before using.
*
* clLogMessagesToSystemLog forwards on all log messages to the Apple System Logger
*/
#define cl_APPLE_ContextLoggingFunctions 1
extern void CL_API_ENTRY clLogMessagesToSystemLogAPPLE( const char * errstr,
const void * private_info,
size_t cb,
void * user_data) CL_EXT_SUFFIX__VERSION_1_0;
/* clLogMessagesToStdout sends all log messages to the file descriptor stdout */
extern void CL_API_ENTRY clLogMessagesToStdoutAPPLE( const char * errstr,
const void * private_info,
size_t cb,
void * user_data) CL_EXT_SUFFIX__VERSION_1_0;
/* clLogMessagesToStderr sends all log messages to the file descriptor stderr */
extern void CL_API_ENTRY clLogMessagesToStderrAPPLE( const char * errstr,
const void * private_info,
size_t cb,
void * user_data) CL_EXT_SUFFIX__VERSION_1_0;
/************************
* cl_khr_icd extension *
************************/
/* Macro named after the extension; this header defines it to 1. */
#define cl_khr_icd 1
/* cl_platform_info */
#define CL_PLATFORM_ICD_SUFFIX_KHR 0x0920
/* Additional Error Codes */
#define CL_PLATFORM_NOT_FOUND_KHR -1001
/* Prototype for clIcdGetPlatformIDsKHR, followed by the matching
 * function-pointer type. Takes an entry count plus two non-const pointers
 * (platforms, num_platforms) that are presumably outputs (TODO confirm);
 * returns cl_int. Neither declaration carries a version suffix. */
extern CL_API_ENTRY cl_int CL_API_CALL
clIcdGetPlatformIDsKHR(cl_uint num_entries,
cl_platform_id * platforms,
cl_uint * num_platforms);
typedef CL_API_ENTRY cl_int
(CL_API_CALL *clIcdGetPlatformIDsKHR_fn)(cl_uint num_entries,
cl_platform_id * platforms,
cl_uint * num_platforms);
/*******************************
* cl_khr_il_program extension *
*******************************/
/* Macro named after the extension; this header defines it to 1. */
#define cl_khr_il_program 1
/* New property to clGetDeviceInfo for retrieving supported intermediate
* languages
*/
#define CL_DEVICE_IL_VERSION_KHR 0x105B
/* New property to clGetProgramInfo for retrieving the IL of a
* program
*/
#define CL_PROGRAM_IL_KHR 0x1169
/* Prototype for clCreateProgramWithILKHR, followed by the matching
 * function-pointer type.
 *   context      context the program is created in
 *   il, length   read-only pointer to the intermediate-language blob and its
 *                size (size_t; presumably in bytes -- TODO confirm)
 *   errcode_ret  non-const cl_int pointer, presumably an optional error output
 * Returns a cl_program.
 * The prototype carries the same CL_EXT_SUFFIX__VERSION_1_2 annotation as its
 * typedef and as the other 1.2-era extension entry points in this header. */
extern CL_API_ENTRY cl_program CL_API_CALL
clCreateProgramWithILKHR(cl_context context,
                         const void * il,
                         size_t length,
                         cl_int * errcode_ret) CL_EXT_SUFFIX__VERSION_1_2;
typedef CL_API_ENTRY cl_program
(CL_API_CALL *clCreateProgramWithILKHR_fn)(cl_context context,
                                           const void * il,
                                           size_t length,
                                           cl_int * errcode_ret) CL_EXT_SUFFIX__VERSION_1_2;
/* Extension: cl_khr_image2d_from_buffer
*
* This extension allows a 2D image to be created from a cl_mem buffer without
* a copy. The type associated with a 2D image created from a buffer in an
* OpenCL program is image2d_t. Both the sampler and sampler-less read_image
* built-in functions are supported for 2D images and 2D images created from
* a buffer. Similarly, the write_image built-ins are also supported for 2D
* images created from a buffer.
*
* When the 2D image from buffer is created, the client must specify the
* width, height, image format (i.e. channel order and channel data type)
* and optionally the row pitch.
*
* The pitch specified must be a multiple of
* CL_DEVICE_IMAGE_PITCH_ALIGNMENT_KHR pixels.
* The base address of the buffer must be aligned to
* CL_DEVICE_IMAGE_BASE_ADDRESS_ALIGNMENT_KHR pixels.
*/
/* Tokens naming the two alignment values referred to above; presumably
 * clGetDeviceInfo query selectors (TODO confirm). */
#define CL_DEVICE_IMAGE_PITCH_ALIGNMENT_KHR 0x104A
#define CL_DEVICE_IMAGE_BASE_ADDRESS_ALIGNMENT_KHR 0x104B
/**************************************
* cl_khr_initialize_memory extension *
**************************************/
/* The only token this header declares for the extension; judging by the name
 * it is a context property (TODO confirm against the extension specification). */
#define CL_CONTEXT_MEMORY_INITIALIZE_KHR 0x2030
/**************************************
* cl_khr_terminate_context extension *
**************************************/
#define CL_DEVICE_TERMINATE_CAPABILITY_KHR 0x2031
#define CL_CONTEXT_TERMINATE_KHR 0x2032
/* Macro named after the extension; this header defines it to 1. */
#define cl_khr_terminate_context 1
/* Prototype for clTerminateContextKHR, followed by the matching
 * function-pointer type. Takes the context as its only argument and returns a
 * cl_int status code. */
extern CL_API_ENTRY cl_int CL_API_CALL
clTerminateContextKHR(cl_context context) CL_EXT_SUFFIX__VERSION_1_2;
typedef CL_API_ENTRY cl_int
(CL_API_CALL *clTerminateContextKHR_fn)(cl_context context) CL_EXT_SUFFIX__VERSION_1_2;
/*
* Extension: cl_khr_spir
*
* This extension adds support to create an OpenCL program object from a
* Standard Portable Intermediate Representation (SPIR) instance
*/
/* Note: unlike most extension tokens in this header, these two names carry no
 * _KHR suffix. */
#define CL_DEVICE_SPIR_VERSIONS 0x40E0
#define CL_PROGRAM_BINARY_TYPE_INTERMEDIATE 0x40E1
/*****************************************
* cl_khr_create_command_queue extension *
*****************************************/
/* Macro named after the extension; this header defines it to 1. */
#define cl_khr_create_command_queue 1
/* Element type of the properties array taken by the entry point below. */
typedef cl_bitfield cl_queue_properties_khr;
/* Prototype for clCreateCommandQueueWithPropertiesKHR, followed by the
 * matching function-pointer type. Takes a context, a device and a read-only
 * array of cl_queue_properties_khr; returns a cl_command_queue. How the
 * properties array is terminated is not visible here (TODO confirm against
 * the extension specification). */
extern CL_API_ENTRY cl_command_queue CL_API_CALL
clCreateCommandQueueWithPropertiesKHR(cl_context context,
cl_device_id device,
const cl_queue_properties_khr* properties,
cl_int* errcode_ret) CL_EXT_SUFFIX__VERSION_1_2;
typedef CL_API_ENTRY cl_command_queue
(CL_API_CALL *clCreateCommandQueueWithPropertiesKHR_fn)(cl_context context,
cl_device_id device,
const cl_queue_properties_khr* properties,
cl_int* errcode_ret) CL_EXT_SUFFIX__VERSION_1_2;
/******************************************
* cl_nv_device_attribute_query extension *
******************************************/
/* cl_nv_device_attribute_query extension - no extension #define since it has no functions */
/* Seven consecutive tokens, 0x4000..0x4006; presumably clGetDeviceInfo query
 * selectors (TODO confirm). */
#define CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV 0x4000
#define CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV 0x4001
#define CL_DEVICE_REGISTERS_PER_BLOCK_NV 0x4002
#define CL_DEVICE_WARP_SIZE_NV 0x4003
#define CL_DEVICE_GPU_OVERLAP_NV 0x4004
#define CL_DEVICE_KERNEL_EXEC_TIMEOUT_NV 0x4005
#define CL_DEVICE_INTEGRATED_MEMORY_NV 0x4006
/*********************************
* cl_amd_device_attribute_query *
*********************************/
/* Only one token is declared for this extension here, and no extension
 * #define or function. */
#define CL_DEVICE_PROFILING_TIMER_OFFSET_AMD 0x4036
/*********************************
* cl_arm_printf extension
*********************************/
/* Two tokens named after a printf callback and a printf buffer size; where
 * they are passed is not visible here (TODO confirm against the extension
 * specification). */
#define CL_PRINTF_CALLBACK_ARM 0x40B0
#define CL_PRINTF_BUFFERSIZE_ARM 0x40B1
/***********************************
* cl_ext_device_fission extension
***********************************/
/* Macro named after the extension; this header defines it to 1. */
#define cl_ext_device_fission 1
/* Prototype and function-pointer type for clReleaseDeviceEXT: takes a device
 * and returns a cl_int status code. */
extern CL_API_ENTRY cl_int CL_API_CALL
clReleaseDeviceEXT(cl_device_id device) CL_EXT_SUFFIX__VERSION_1_1;
typedef CL_API_ENTRY cl_int
(CL_API_CALL *clReleaseDeviceEXT_fn)(cl_device_id device) CL_EXT_SUFFIX__VERSION_1_1;
/* Prototype and function-pointer type for clRetainDeviceEXT: same signature
 * as clReleaseDeviceEXT. */
extern CL_API_ENTRY cl_int CL_API_CALL
clRetainDeviceEXT(cl_device_id device) CL_EXT_SUFFIX__VERSION_1_1;
typedef CL_API_ENTRY cl_int
(CL_API_CALL *clRetainDeviceEXT_fn)(cl_device_id device) CL_EXT_SUFFIX__VERSION_1_1;
/* Element type of the properties list taken by clCreateSubDevicesEXT. */
typedef cl_ulong cl_device_partition_property_ext;
/* Prototype for clCreateSubDevicesEXT, followed by the matching
 * function-pointer type. Takes the device to partition, a read-only
 * properties list, an entry count, and two non-const pointers (out_devices,
 * num_devices) that are presumably outputs (TODO confirm). Returns cl_int. */
extern CL_API_ENTRY cl_int CL_API_CALL
clCreateSubDevicesEXT(cl_device_id in_device,
const cl_device_partition_property_ext * properties,
cl_uint num_entries,
cl_device_id * out_devices,
cl_uint * num_devices) CL_EXT_SUFFIX__VERSION_1_1;
typedef CL_API_ENTRY cl_int
(CL_API_CALL * clCreateSubDevicesEXT_fn)(cl_device_id in_device,
const cl_device_partition_property_ext * properties,
cl_uint num_entries,
cl_device_id * out_devices,
cl_uint * num_devices) CL_EXT_SUFFIX__VERSION_1_1;
/* cl_device_partition_property_ext */
#define CL_DEVICE_PARTITION_EQUALLY_EXT 0x4050
#define CL_DEVICE_PARTITION_BY_COUNTS_EXT 0x4051
#define CL_DEVICE_PARTITION_BY_NAMES_EXT 0x4052
#define CL_DEVICE_PARTITION_BY_AFFINITY_DOMAIN_EXT 0x4053
/* clDeviceGetInfo selectors */
#define CL_DEVICE_PARENT_DEVICE_EXT 0x4054
#define CL_DEVICE_PARTITION_TYPES_EXT 0x4055
#define CL_DEVICE_AFFINITY_DOMAINS_EXT 0x4056
#define CL_DEVICE_REFERENCE_COUNT_EXT 0x4057
#define CL_DEVICE_PARTITION_STYLE_EXT 0x4058
/* error codes */
#define CL_DEVICE_PARTITION_FAILED_EXT -1057
#define CL_INVALID_PARTITION_COUNT_EXT -1058
#define CL_INVALID_PARTITION_NAME_EXT -1059
/* CL_AFFINITY_DOMAINs */
/* Note: the cache levels are numbered 0x1-0x4 sequentially; they are not
 * one-bit-per-level flags (0x3 == 0x1 | 0x2). */
#define CL_AFFINITY_DOMAIN_L1_CACHE_EXT 0x1
#define CL_AFFINITY_DOMAIN_L2_CACHE_EXT 0x2
#define CL_AFFINITY_DOMAIN_L3_CACHE_EXT 0x3
#define CL_AFFINITY_DOMAIN_L4_CACHE_EXT 0x4
#define CL_AFFINITY_DOMAIN_NUMA_EXT 0x10
#define CL_AFFINITY_DOMAIN_NEXT_FISSIONABLE_EXT 0x100
/* cl_device_partition_property_ext list terminators */
/* The first two terminators are 0. In the third, the cast binds to 0 before
 * the subtraction, so the value is (cl_device_partition_property_ext)0 minus 1;
 * assuming cl_ulong is unsigned this wraps to the type's maximum value. */
#define CL_PROPERTIES_LIST_END_EXT ((cl_device_partition_property_ext) 0)
#define CL_PARTITION_BY_COUNTS_LIST_END_EXT ((cl_device_partition_property_ext) 0)
#define CL_PARTITION_BY_NAMES_LIST_END_EXT ((cl_device_partition_property_ext) 0 - 1)
/***********************************
* cl_ext_migrate_memobject extension definitions
***********************************/
/* Extension define: set to 1 where this header declares the extension's entry point. */
#define cl_ext_migrate_memobject 1
/* Bitfield type of the `flags` parameter of clEnqueueMigrateMemObjectEXT. */
typedef cl_bitfield cl_mem_migration_flags_ext;
#define CL_MIGRATE_MEM_OBJECT_HOST_EXT 0x1
#define CL_COMMAND_MIGRATE_MEM_OBJECT_EXT 0x4040
/* clEnqueueMigrateMemObjectEXT: takes a command queue, an array of
 * num_mem_objects cl_mem handles, migration flags, an event wait list and an
 * output event pointer; returns a cl_int.
 * Unlike most declarations in this header it carries no CL_EXT_SUFFIX__VERSION_* tag. */
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueMigrateMemObjectEXT(cl_command_queue command_queue,
cl_uint num_mem_objects,
const cl_mem * mem_objects,
cl_mem_migration_flags_ext flags,
cl_uint num_events_in_wait_list,
const cl_event * event_wait_list,
cl_event * event);
/* Function-pointer type with the same signature as clEnqueueMigrateMemObjectEXT. */
typedef CL_API_ENTRY cl_int
(CL_API_CALL *clEnqueueMigrateMemObjectEXT_fn)(cl_command_queue command_queue,
cl_uint num_mem_objects,
const cl_mem * mem_objects,
cl_mem_migration_flags_ext flags,
cl_uint num_events_in_wait_list,
const cl_event * event_wait_list,
cl_event * event);
/*********************************
* cl_qcom_ext_host_ptr extension
*********************************/
/* Extension define: set to 1 where this header declares the extension's entry point. */
#define cl_qcom_ext_host_ptr 1
/* Single-bit flag (bit 29).
 * NOTE(review): presumably a cl_mem_flags bit - confirm. */
#define CL_MEM_EXT_HOST_PTR_QCOM (1 << 29)
/* Tokens 0x40A0-0x40A7.
 * NOTE(review): by name these look like device-info selectors (DEVICE_*),
 * image pitch-info selectors (IMAGE_*) and host cache policies (MEM_HOST_*) -
 * confirm against the extension spec. */
#define CL_DEVICE_EXT_MEM_PADDING_IN_BYTES_QCOM 0x40A0
#define CL_DEVICE_PAGE_SIZE_QCOM 0x40A1
#define CL_IMAGE_ROW_ALIGNMENT_QCOM 0x40A2
#define CL_IMAGE_SLICE_ALIGNMENT_QCOM 0x40A3
#define CL_MEM_HOST_UNCACHED_QCOM 0x40A4
#define CL_MEM_HOST_WRITEBACK_QCOM 0x40A5
#define CL_MEM_HOST_WRITETHROUGH_QCOM 0x40A6
#define CL_MEM_HOST_WRITE_COMBINING_QCOM 0x40A7
/* Type of the `param_name` selector of clGetDeviceImageInfoQCOM. */
typedef cl_uint cl_image_pitch_info_qcom;
/* clGetDeviceImageInfoQCOM: takes a device, an image width and height, an image
 * format, a selector and the usual (size, value, size_ret) query triple;
 * returns a cl_int. No function-pointer typedef is declared for it here. */
extern CL_API_ENTRY cl_int CL_API_CALL
clGetDeviceImageInfoQCOM(cl_device_id device,
size_t image_width,
size_t image_height,
const cl_image_format *image_format,
cl_image_pitch_info_qcom param_name,
size_t param_value_size,
void *param_value,
size_t *param_value_size_ret);
/* Common header struct describing an external host allocation. The ION and
 * Android-native-buffer structs later in this header embed it as their first
 * member. */
typedef struct _cl_mem_ext_host_ptr
{
/* Type of external memory allocation. */
/* Legal values will be defined in layered extensions. */
cl_uint allocation_type;
/* Host cache policy for this external memory allocation. */
cl_uint host_cache_policy;
} cl_mem_ext_host_ptr;
/*******************************************
* cl_qcom_ext_host_ptr_iocoherent extension
********************************************/
/* Cache policy specifying io-coherence */
/* NOTE(review): presumably a value for cl_mem_ext_host_ptr.host_cache_policy - confirm. */
#define CL_MEM_HOST_IOCOHERENT_QCOM 0x40A9
/*********************************
* cl_qcom_ion_host_ptr extension
*********************************/
/* Allocation-type token for ION allocations (see the member comment below). */
#define CL_MEM_ION_HOST_PTR_QCOM 0x40A8
/* Describes an ION-backed host allocation: the common cl_mem_ext_host_ptr
 * header followed by the ION file descriptor and the host pointer. */
typedef struct _cl_mem_ion_host_ptr
{
/* Type of external memory allocation. */
/* Must be CL_MEM_ION_HOST_PTR_QCOM for ION allocations. */
cl_mem_ext_host_ptr ext_host_ptr;
/* ION file descriptor */
int ion_filedesc;
/* Host pointer to the ION allocated memory */
void* ion_hostptr;
} cl_mem_ion_host_ptr;
/*********************************
* cl_qcom_android_native_buffer_host_ptr extension
*********************************/
/* Allocation-type token for Android native buffers (see the member comment below). */
#define CL_MEM_ANDROID_NATIVE_BUFFER_HOST_PTR_QCOM 0x40C6
/* Describes an Android-native-buffer host allocation: the common
 * cl_mem_ext_host_ptr header followed by a pointer to the native buffer. */
typedef struct _cl_mem_android_native_buffer_host_ptr
{
/* Type of external memory allocation. */
/* Must be CL_MEM_ANDROID_NATIVE_BUFFER_HOST_PTR_QCOM for Android native buffers. */
cl_mem_ext_host_ptr ext_host_ptr;
/* Virtual pointer to the android native buffer */
void* anb_ptr;
} cl_mem_android_native_buffer_host_ptr;
/******************************************
* cl_img_yuv_image extension *
******************************************/
/* Image formats used in clCreateImage */
/* NOTE(review): presumably cl_channel_order values for cl_image_format - confirm. */
#define CL_NV21_IMG 0x40D0
#define CL_YV12_IMG 0x40D1
/******************************************
* cl_img_cached_allocations extension *
******************************************/
/* Flag values used by clCreateBuffer */
/* Two single-bit flags (bits 26 and 27). */
#define CL_MEM_USE_UNCACHED_CPU_MEMORY_IMG (1 << 26)
#define CL_MEM_USE_CACHED_CPU_MEMORY_IMG (1 << 27)
/******************************************
* cl_img_use_gralloc_ptr extension *
******************************************/
/* Extension define: set to 1 where this header declares the extension's entry points. */
#define cl_img_use_gralloc_ptr 1
/* Flag values used by clCreateBuffer */
#define CL_MEM_USE_GRALLOC_PTR_IMG (1 << 28)
/* To be used by clGetEventInfo: */
#define CL_COMMAND_ACQUIRE_GRALLOC_OBJECTS_IMG 0x40D2
#define CL_COMMAND_RELEASE_GRALLOC_OBJECTS_IMG 0x40D3
/* Error code from clEnqueueReleaseGrallocObjectsIMG */
/* NOTE(review): this is a positive token value, unlike the negative error
 * codes defined elsewhere in this header - confirm it is intended. */
#define CL_GRALLOC_RESOURCE_NOT_ACQUIRED_IMG 0x40D4
/* clEnqueueAcquireGrallocObjectsIMG: takes a command queue, an array of
 * num_objects cl_mem handles, an event wait list and an output event pointer;
 * returns a cl_int. No function-pointer typedef is declared for it here. */
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueAcquireGrallocObjectsIMG(cl_command_queue command_queue,
cl_uint num_objects,
const cl_mem * mem_objects,
cl_uint num_events_in_wait_list,
const cl_event * event_wait_list,
cl_event * event) CL_EXT_SUFFIX__VERSION_1_2;
/* clEnqueueReleaseGrallocObjectsIMG: same parameter list and return type as
 * the acquire call above. */
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueReleaseGrallocObjectsIMG(cl_command_queue command_queue,
cl_uint num_objects,
const cl_mem * mem_objects,
cl_uint num_events_in_wait_list,
const cl_event * event_wait_list,
cl_event * event) CL_EXT_SUFFIX__VERSION_1_2;
/*********************************
* cl_khr_subgroups extension
*********************************/
/* Extension define: set to 1 where this header declares the extension's entry point. */
#define cl_khr_subgroups 1
/* The typedef is only provided here when CL_VERSION_2_1 is not defined. */
#if !defined(CL_VERSION_2_1)
/* For OpenCL 2.1 and newer, cl_kernel_sub_group_info is declared in cl.h.
In hindsight, there should have been a khr suffix on this type for
the extension, but keeping it un-suffixed to maintain backwards
compatibility. */
typedef cl_uint cl_kernel_sub_group_info;
#endif
/* cl_kernel_sub_group_info */
#define CL_KERNEL_MAX_SUB_GROUP_SIZE_FOR_NDRANGE_KHR 0x2033
#define CL_KERNEL_SUB_GROUP_COUNT_FOR_NDRANGE_KHR 0x2034
/* clGetKernelSubGroupInfoKHR: takes a kernel, a device, a
 * cl_kernel_sub_group_info selector, an input (size, value) pair and the usual
 * (size, value, size_ret) output triple; returns a cl_int.
 * The declaration is tagged CL_EXT_SUFFIX__VERSION_2_0_DEPRECATED. */
extern CL_API_ENTRY cl_int CL_API_CALL
clGetKernelSubGroupInfoKHR(cl_kernel in_kernel,
cl_device_id in_device,
cl_kernel_sub_group_info param_name,
size_t input_value_size,
const void * input_value,
size_t param_value_size,
void * param_value,
size_t * param_value_size_ret) CL_EXT_SUFFIX__VERSION_2_0_DEPRECATED;
/* Function-pointer type with the same signature as clGetKernelSubGroupInfoKHR. */
typedef CL_API_ENTRY cl_int
(CL_API_CALL * clGetKernelSubGroupInfoKHR_fn)(cl_kernel in_kernel,
cl_device_id in_device,
cl_kernel_sub_group_info param_name,
size_t input_value_size,
const void * input_value,
size_t param_value_size,
void * param_value,
size_t * param_value_size_ret) CL_EXT_SUFFIX__VERSION_2_0_DEPRECATED;
/*********************************
* cl_khr_mipmap_image extension
*********************************/
/* cl_sampler_properties */
/* Three consecutive tokens (0x1155-0x1157); no extension #define and no
 * functions are declared in this section. */
#define CL_SAMPLER_MIP_FILTER_MODE_KHR 0x1155
#define CL_SAMPLER_LOD_MIN_KHR 0x1156
#define CL_SAMPLER_LOD_MAX_KHR 0x1157
/*********************************
* cl_khr_priority_hints extension
*********************************/
/* This extension define is for backwards compatibility.
It shouldn't be required since this extension has no new functions. */
#define cl_khr_priority_hints 1
/* Type of the priority values listed below. */
typedef cl_uint cl_queue_priority_khr;
/* cl_command_queue_properties */
#define CL_QUEUE_PRIORITY_KHR 0x1096
/* cl_queue_priority_khr */
/* Each level is a distinct single bit (bits 0, 1 and 2). */
#define CL_QUEUE_PRIORITY_HIGH_KHR (1<<0)
#define CL_QUEUE_PRIORITY_MED_KHR (1<<1)
#define CL_QUEUE_PRIORITY_LOW_KHR (1<<2)
/*********************************
* cl_khr_throttle_hints extension
*********************************/
/* This extension define is for backwards compatibility.
It shouldn't be required since this extension has no new functions. */
#define cl_khr_throttle_hints 1
/* Type of the throttle values listed below. */
typedef cl_uint cl_queue_throttle_khr;
/* cl_command_queue_properties */
#define CL_QUEUE_THROTTLE_KHR 0x1097
/* cl_queue_throttle_khr */
/* Each level is a distinct single bit (bits 0, 1 and 2). */
#define CL_QUEUE_THROTTLE_HIGH_KHR (1<<0)
#define CL_QUEUE_THROTTLE_MED_KHR (1<<1)
#define CL_QUEUE_THROTTLE_LOW_KHR (1<<2)
/*********************************
* cl_khr_subgroup_named_barrier
*********************************/
/* This extension define is for backwards compatibility.
It shouldn't be required since this extension has no new functions. */
#define cl_khr_subgroup_named_barrier 1
/* cl_device_info */
/* The only token this section adds. */
#define CL_DEVICE_MAX_NAMED_BARRIER_COUNT_KHR 0x2035
/**********************************
* cl_arm_import_memory extension *
**********************************/
/* Extension define: set to 1 where this header declares the extension's entry point. */
#define cl_arm_import_memory 1
/* Element type of the property list passed to clImportMemoryARM. */
typedef intptr_t cl_import_properties_arm;
/* Default and valid properties name for cl_arm_import_memory */
#define CL_IMPORT_TYPE_ARM 0x40B2
/* Host process memory type default value for CL_IMPORT_TYPE_ARM property */
#define CL_IMPORT_TYPE_HOST_ARM 0x40B3
/* DMA BUF memory type value for CL_IMPORT_TYPE_ARM property */
#define CL_IMPORT_TYPE_DMA_BUF_ARM 0x40B4
/* Protected DMA BUF memory type value for CL_IMPORT_TYPE_ARM property */
#define CL_IMPORT_TYPE_PROTECTED_ARM 0x40B5
/* This extension adds a new function that allows for direct memory import into
* OpenCL via the clImportMemoryARM function.
*
* Memory imported through this interface will be mapped into the device's page
* tables directly, providing zero copy access. It will never fall back to copy
* operations and aliased buffers.
*
* Types of memory supported for import are specified as additional extension
* strings.
*
* This extension produces cl_mem allocations which are compatible with all other
* users of cl_mem in the standard API.
*
* This extension maps pages with the same properties as the normal buffer creation
* function clCreateBuffer.
*/
/* Signature: takes a context, cl_mem_flags, a pointer to const
 * cl_import_properties_arm values, the memory pointer with its size, and a
 * cl_int pointer named errcode_ret; returns a cl_mem. No function-pointer
 * typedef is declared for it here.
 * NOTE(review): `properties` is presumably a 0-terminated name/value list
 * using the CL_IMPORT_TYPE_* tokens - confirm against the extension spec. */
extern CL_API_ENTRY cl_mem CL_API_CALL
clImportMemoryARM( cl_context context,
cl_mem_flags flags,
const cl_import_properties_arm *properties,
void *memory,
size_t size,
cl_int *errcode_ret) CL_EXT_SUFFIX__VERSION_1_0;
/******************************************
* cl_arm_shared_virtual_memory extension *
******************************************/
/* Extension define: set to 1 where this header declares the extension's entry points. */
#define cl_arm_shared_virtual_memory 1
/* The tokens in this section occupy the contiguous range 0x40B6-0x40BE. */
/* Used by clGetDeviceInfo */
#define CL_DEVICE_SVM_CAPABILITIES_ARM 0x40B6
/* Used by clGetMemObjectInfo */
#define CL_MEM_USES_SVM_POINTER_ARM 0x40B7
/* Used by clSetKernelExecInfoARM: */
#define CL_KERNEL_EXEC_INFO_SVM_PTRS_ARM 0x40B8
#define CL_KERNEL_EXEC_INFO_SVM_FINE_GRAIN_SYSTEM_ARM 0x40B9
/* To be used by clGetEventInfo: */
#define CL_COMMAND_SVM_FREE_ARM 0x40BA
#define CL_COMMAND_SVM_MEMCPY_ARM 0x40BB
#define CL_COMMAND_SVM_MEMFILL_ARM 0x40BC
#define CL_COMMAND_SVM_MAP_ARM 0x40BD
#define CL_COMMAND_SVM_UNMAP_ARM 0x40BE
/* Flag values returned by clGetDeviceInfo with CL_DEVICE_SVM_CAPABILITIES_ARM as the param_name. */
#define CL_DEVICE_SVM_COARSE_GRAIN_BUFFER_ARM (1 << 0)
#define CL_DEVICE_SVM_FINE_GRAIN_BUFFER_ARM (1 << 1)
#define CL_DEVICE_SVM_FINE_GRAIN_SYSTEM_ARM (1 << 2)
#define CL_DEVICE_SVM_ATOMICS_ARM (1 << 3)
/* Flag values used by clSVMAllocARM: */
#define CL_MEM_SVM_FINE_GRAIN_BUFFER_ARM (1 << 10)
#define CL_MEM_SVM_ATOMICS_ARM (1 << 11)
/* Types used by the declarations that follow: the flags argument of
 * clSVMAllocARM, the selector of clSetKernelExecInfoARM, and a bitfield type
 * for the capability flags above. */
typedef cl_bitfield cl_svm_mem_flags_arm;
typedef cl_uint cl_kernel_exec_info_arm;
typedef cl_bitfield cl_device_svm_capabilities_arm;
/* clSVMAllocARM: takes a context, cl_svm_mem_flags_arm flags, a size and an
 * alignment; returns a void pointer.
 * NOTE(review): presumably returns NULL on failure - confirm against the spec. */
extern CL_API_ENTRY void * CL_API_CALL
clSVMAllocARM(cl_context context,
cl_svm_mem_flags_arm flags,
size_t size,
cl_uint alignment) CL_EXT_SUFFIX__VERSION_1_2;
/* clSVMFreeARM: takes a context and an SVM pointer; returns nothing. */
extern CL_API_ENTRY void CL_API_CALL
clSVMFreeARM(cl_context context,
void * svm_pointer) CL_EXT_SUFFIX__VERSION_1_2;
/* clEnqueueSVMFreeARM: takes a command queue, an array of num_svm_pointers
 * SVM pointers, a callback (pfn_free_func) with its user_data, an event wait
 * list and an output event pointer; returns a cl_int.
 * The callback receives a queue, a pointer count, a pointer array and user_data.
 * NOTE(review): presumably pfn_free_func may be NULL to request a default
 * free - confirm against the spec. */
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueSVMFreeARM(cl_command_queue command_queue,
cl_uint num_svm_pointers,
void * svm_pointers[],
void (CL_CALLBACK * pfn_free_func)(cl_command_queue queue,
cl_uint num_svm_pointers,
void * svm_pointers[],
void * user_data),
void * user_data,
cl_uint num_events_in_wait_list,
const cl_event * event_wait_list,
cl_event * event) CL_EXT_SUFFIX__VERSION_1_2;
/* clEnqueueSVMMemcpyARM: takes a command queue, a cl_bool blocking_copy, a
 * destination pointer, a const source pointer, a size, an event wait list and
 * an output event pointer; returns a cl_int. */
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueSVMMemcpyARM(cl_command_queue command_queue,
cl_bool blocking_copy,
void * dst_ptr,
const void * src_ptr,
size_t size,
cl_uint num_events_in_wait_list,
const cl_event * event_wait_list,
cl_event * event) CL_EXT_SUFFIX__VERSION_1_2;
/* clEnqueueSVMMemFillARM: takes a command queue, an SVM pointer, a const
 * pattern with its size, a total size, an event wait list and an output event
 * pointer; returns a cl_int. It has no blocking parameter. */
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueSVMMemFillARM(cl_command_queue command_queue,
void * svm_ptr,
const void * pattern,
size_t pattern_size,
size_t size,
cl_uint num_events_in_wait_list,
const cl_event * event_wait_list,
cl_event * event) CL_EXT_SUFFIX__VERSION_1_2;
/* clEnqueueSVMMapARM: takes a command queue, a cl_bool blocking_map,
 * cl_map_flags, an SVM pointer with a size, an event wait list and an output
 * event pointer; returns a cl_int. */
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueSVMMapARM(cl_command_queue command_queue,
cl_bool blocking_map,
cl_map_flags flags,
void * svm_ptr,
size_t size,
cl_uint num_events_in_wait_list,
const cl_event * event_wait_list,
cl_event * event) CL_EXT_SUFFIX__VERSION_1_2;
/* clEnqueueSVMUnmapARM: takes a command queue, an SVM pointer, an event wait
 * list and an output event pointer; returns a cl_int. */
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueSVMUnmapARM(cl_command_queue command_queue,
void * svm_ptr,
cl_uint num_events_in_wait_list,
const cl_event * event_wait_list,
cl_event * event) CL_EXT_SUFFIX__VERSION_1_2;
/* clSetKernelArgSVMPointerARM: takes a kernel, an argument index and a const
 * pointer value; returns a cl_int. */
extern CL_API_ENTRY cl_int CL_API_CALL
clSetKernelArgSVMPointerARM(cl_kernel kernel,
cl_uint arg_index,
const void * arg_value) CL_EXT_SUFFIX__VERSION_1_2;
/* clSetKernelExecInfoARM: takes a kernel, a cl_kernel_exec_info_arm selector
 * and a (size, value) pair; returns a cl_int. */
extern CL_API_ENTRY cl_int CL_API_CALL
clSetKernelExecInfoARM(cl_kernel kernel,
cl_kernel_exec_info_arm param_name,
size_t param_value_size,
const void * param_value) CL_EXT_SUFFIX__VERSION_1_2;
/********************************
* cl_arm_get_core_id extension *
********************************/
/* Both the extension define and its token are only provided when
 * CL_VERSION_1_2 is defined. */
#ifdef CL_VERSION_1_2
#define cl_arm_get_core_id 1
/* Device info property for bitfield of cores present */
#define CL_DEVICE_COMPUTE_UNITS_BITFIELD_ARM 0x40BF
#endif /* CL_VERSION_1_2 */
/*********************************
* cl_arm_job_slot_selection
*********************************/
/* Extension define; this section declares two tokens and no functions. */
#define cl_arm_job_slot_selection 1
/* cl_device_info */
#define CL_DEVICE_JOB_SLOTS_ARM 0x41E0
/* cl_command_queue_properties */
#define CL_QUEUE_JOB_SLOT_ARM 0x41E1
#ifdef __cplusplus
}
#endif
#endif /* __CL_EXT_H */

View File

@@ -0,0 +1,423 @@
/*******************************************************************************
* Copyright (c) 2008-2019 The Khronos Group Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and/or associated documentation files (the
* "Materials"), to deal in the Materials without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Materials, and to
* permit persons to whom the Materials are furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be included
* in all copies or substantial portions of the Materials.
*
* MODIFICATIONS TO THIS FILE MAY MEAN IT NO LONGER ACCURATELY REFLECTS
* KHRONOS STANDARDS. THE UNMODIFIED, NORMATIVE VERSIONS OF KHRONOS
* SPECIFICATIONS AND HEADER INFORMATION ARE LOCATED AT
* https://www.khronos.org/registry/
*
* THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
******************************************************************************/
/*****************************************************************************\
Copyright (c) 2013-2019 Intel Corporation All Rights Reserved.
THESE MATERIALS ARE PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THESE
MATERIALS, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
File Name: cl_ext_intel.h
Abstract:
Notes:
\*****************************************************************************/
#ifndef __CL_EXT_INTEL_H
#define __CL_EXT_INTEL_H
#include <CL/cl.h>
#include <CL/cl_platform.h>
#ifdef __cplusplus
extern "C" {
#endif
/***************************************
* cl_intel_thread_local_exec extension *
****************************************/
#define cl_intel_thread_local_exec 1
/* Bit 31, with the constant 1 cast to cl_bitfield before the shift so the
 * shift is performed in that type rather than in int. */
#define CL_QUEUE_THREAD_LOCAL_EXEC_ENABLE_INTEL (((cl_bitfield)1) << 31)
/***********************************************
* cl_intel_device_partition_by_names extension *
************************************************/
#define cl_intel_device_partition_by_names 1
/* Partition-type token; 0x4052 is the same value that cl_ext.h in this
 * change gives CL_DEVICE_PARTITION_BY_NAMES_EXT. */
#define CL_DEVICE_PARTITION_BY_NAMES_INTEL 0x4052
/* List terminator: expands to the plain int expression -1 (no cast and no
 * enclosing parentheses). */
#define CL_PARTITION_BY_NAMES_LIST_END_INTEL -1
/************************************************
* cl_intel_accelerator extension *
* cl_intel_motion_estimation extension *
* cl_intel_advanced_motion_estimation extension *
*************************************************/
/* Extension defines for the three extensions covered by this section. */
#define cl_intel_accelerator 1
#define cl_intel_motion_estimation 1
#define cl_intel_advanced_motion_estimation 1
/* Opaque accelerator handle: a pointer to a struct that is not defined here. */
typedef struct _cl_accelerator_intel* cl_accelerator_intel;
/* Selector types used by clCreateAcceleratorINTEL (type) and
 * clGetAcceleratorInfoINTEL (info). */
typedef cl_uint cl_accelerator_type_intel;
typedef cl_uint cl_accelerator_info_intel;
/* Motion-estimation descriptor made of four cl_uint fields.
 * NOTE(review): presumably each field takes one of the matching CL_ME_*
 * tokens (MB_TYPE, SUBPIXEL_MODE, SAD_ADJUST_MODE, SEARCH_PATH_RADIUS) and the
 * struct is passed as the `descriptor` of clCreateAcceleratorINTEL - confirm. */
typedef struct _cl_motion_estimation_desc_intel {
cl_uint mb_block_type;
cl_uint subpixel_mode;
cl_uint sad_adjust_mode;
cl_uint search_path_type;
} cl_motion_estimation_desc_intel;
/* error codes */
/* Four consecutive negative values, -1094 down to -1097. */
#define CL_INVALID_ACCELERATOR_INTEL -1094
#define CL_INVALID_ACCELERATOR_TYPE_INTEL -1095
#define CL_INVALID_ACCELERATOR_DESCRIPTOR_INTEL -1096
#define CL_ACCELERATOR_TYPE_NOT_SUPPORTED_INTEL -1097
/* cl_accelerator_type_intel */
/* The only accelerator type defined in this header. */
#define CL_ACCELERATOR_TYPE_MOTION_ESTIMATION_INTEL 0x0
/* cl_accelerator_info_intel */
#define CL_ACCELERATOR_DESCRIPTOR_INTEL 0x4090
#define CL_ACCELERATOR_REFERENCE_COUNT_INTEL 0x4091
#define CL_ACCELERATOR_CONTEXT_INTEL 0x4092
#define CL_ACCELERATOR_TYPE_INTEL 0x4093
/* cl_motion_estimation_desc_intel flags */
/* The tokens below are grouped by name prefix; values restart within each
 * group, so a value is only meaningful together with its group.
 * NOTE(review): which field or kernel argument each group applies to is not
 * visible in this header - confirm against the extension specs. */
#define CL_ME_MB_TYPE_16x16_INTEL 0x0
#define CL_ME_MB_TYPE_8x8_INTEL 0x1
#define CL_ME_MB_TYPE_4x4_INTEL 0x2
#define CL_ME_SUBPIXEL_MODE_INTEGER_INTEL 0x0
#define CL_ME_SUBPIXEL_MODE_HPEL_INTEL 0x1
#define CL_ME_SUBPIXEL_MODE_QPEL_INTEL 0x2
#define CL_ME_SAD_ADJUST_MODE_NONE_INTEL 0x0
#define CL_ME_SAD_ADJUST_MODE_HAAR_INTEL 0x1
/* Note: the search-path values are not consecutive (0x0, 0x1, 0x5). */
#define CL_ME_SEARCH_PATH_RADIUS_2_2_INTEL 0x0
#define CL_ME_SEARCH_PATH_RADIUS_4_4_INTEL 0x1
#define CL_ME_SEARCH_PATH_RADIUS_16_12_INTEL 0x5
/* Note: SKIP_BLOCK_TYPE_8x8 (0x4) and the two INTRA_PREDICT_ENABLED values
 * (0x1, 0x2) are distinct single bits; SKIP_BLOCK_TYPE_16x16 is 0. */
#define CL_ME_SKIP_BLOCK_TYPE_16x16_INTEL 0x0
#define CL_ME_CHROMA_INTRA_PREDICT_ENABLED_INTEL 0x1
#define CL_ME_LUMA_INTRA_PREDICT_ENABLED_INTEL 0x2
#define CL_ME_SKIP_BLOCK_TYPE_8x8_INTEL 0x4
#define CL_ME_FORWARD_INPUT_MODE_INTEL 0x1
#define CL_ME_BACKWARD_INPUT_MODE_INTEL 0x2
#define CL_ME_BIDIRECTION_INPUT_MODE_INTEL 0x3
/* Bidirectional weights are decimal; the values match the named fraction of 64
 * (16/64, 21/64, 32/64, 43/64, 48/64). */
#define CL_ME_BIDIR_WEIGHT_QUARTER_INTEL 16
#define CL_ME_BIDIR_WEIGHT_THIRD_INTEL 21
#define CL_ME_BIDIR_WEIGHT_HALF_INTEL 32
#define CL_ME_BIDIR_WEIGHT_TWO_THIRD_INTEL 43
#define CL_ME_BIDIR_WEIGHT_THREE_QUARTER_INTEL 48
#define CL_ME_COST_PENALTY_NONE_INTEL 0x0
#define CL_ME_COST_PENALTY_LOW_INTEL 0x1
#define CL_ME_COST_PENALTY_NORMAL_INTEL 0x2
#define CL_ME_COST_PENALTY_HIGH_INTEL 0x3
#define CL_ME_COST_PRECISION_QPEL_INTEL 0x0
#define CL_ME_COST_PRECISION_HPEL_INTEL 0x1
#define CL_ME_COST_PRECISION_PEL_INTEL 0x2
#define CL_ME_COST_PRECISION_DPEL_INTEL 0x3
/* NOTE(review): in the luma predictor group, DIAGONAL_DOWN_RIGHT and PLANE
 * both expand to 0x4 - confirm this aliasing is intended. */
#define CL_ME_LUMA_PREDICTOR_MODE_VERTICAL_INTEL 0x0
#define CL_ME_LUMA_PREDICTOR_MODE_HORIZONTAL_INTEL 0x1
#define CL_ME_LUMA_PREDICTOR_MODE_DC_INTEL 0x2
#define CL_ME_LUMA_PREDICTOR_MODE_DIAGONAL_DOWN_LEFT_INTEL 0x3
#define CL_ME_LUMA_PREDICTOR_MODE_DIAGONAL_DOWN_RIGHT_INTEL 0x4
#define CL_ME_LUMA_PREDICTOR_MODE_PLANE_INTEL 0x4
#define CL_ME_LUMA_PREDICTOR_MODE_VERTICAL_RIGHT_INTEL 0x5
#define CL_ME_LUMA_PREDICTOR_MODE_HORIZONTAL_DOWN_INTEL 0x6
#define CL_ME_LUMA_PREDICTOR_MODE_VERTICAL_LEFT_INTEL 0x7
#define CL_ME_LUMA_PREDICTOR_MODE_HORIZONTAL_UP_INTEL 0x8
#define CL_ME_CHROMA_PREDICTOR_MODE_DC_INTEL 0x0
#define CL_ME_CHROMA_PREDICTOR_MODE_HORIZONTAL_INTEL 0x1
#define CL_ME_CHROMA_PREDICTOR_MODE_VERTICAL_INTEL 0x2
#define CL_ME_CHROMA_PREDICTOR_MODE_PLANE_INTEL 0x3
/* cl_device_info */
#define CL_DEVICE_ME_VERSION_INTEL 0x407E
/* NOTE(review): presumably the values reported for CL_DEVICE_ME_VERSION_INTEL - confirm. */
#define CL_ME_VERSION_LEGACY_INTEL 0x0
#define CL_ME_VERSION_ADVANCED_VER_1_INTEL 0x1
#define CL_ME_VERSION_ADVANCED_VER_2_INTEL 0x2
/* clCreateAcceleratorINTEL: takes a context, an accelerator type, a
 * (descriptor_size, descriptor) pair and a cl_int pointer named errcode_ret;
 * returns a cl_accelerator_intel handle.
 * NOTE(review): presumably `descriptor` points to a
 * cl_motion_estimation_desc_intel for the motion-estimation type - confirm. */
extern CL_API_ENTRY cl_accelerator_intel CL_API_CALL
clCreateAcceleratorINTEL(
cl_context context,
cl_accelerator_type_intel accelerator_type,
size_t descriptor_size,
const void* descriptor,
cl_int* errcode_ret) CL_EXT_SUFFIX__VERSION_1_2;
/* Function-pointer type with the same signature as clCreateAcceleratorINTEL. */
typedef CL_API_ENTRY cl_accelerator_intel (CL_API_CALL *clCreateAcceleratorINTEL_fn)(
cl_context context,
cl_accelerator_type_intel accelerator_type,
size_t descriptor_size,
const void* descriptor,
cl_int* errcode_ret) CL_EXT_SUFFIX__VERSION_1_2;
/* clGetAcceleratorInfoINTEL: takes an accelerator handle, a
 * cl_accelerator_info_intel selector and the usual (size, value, size_ret)
 * query triple; returns a cl_int. */
extern CL_API_ENTRY cl_int CL_API_CALL
clGetAcceleratorInfoINTEL(
cl_accelerator_intel accelerator,
cl_accelerator_info_intel param_name,
size_t param_value_size,
void* param_value,
size_t* param_value_size_ret) CL_EXT_SUFFIX__VERSION_1_2;
/* Function-pointer type with the same signature as clGetAcceleratorInfoINTEL. */
typedef CL_API_ENTRY cl_int (CL_API_CALL *clGetAcceleratorInfoINTEL_fn)(
cl_accelerator_intel accelerator,
cl_accelerator_info_intel param_name,
size_t param_value_size,
void* param_value,
size_t* param_value_size_ret) CL_EXT_SUFFIX__VERSION_1_2;
/* clRetainAcceleratorINTEL: takes an accelerator handle and returns a cl_int.
 * NOTE(review): by its name, presumably adds one reference - confirm. */
extern CL_API_ENTRY cl_int CL_API_CALL
clRetainAcceleratorINTEL(
cl_accelerator_intel accelerator) CL_EXT_SUFFIX__VERSION_1_2;
/* Function-pointer type with the same signature as clRetainAcceleratorINTEL. */
typedef CL_API_ENTRY cl_int (CL_API_CALL *clRetainAcceleratorINTEL_fn)(
cl_accelerator_intel accelerator) CL_EXT_SUFFIX__VERSION_1_2;
/* clReleaseAcceleratorINTEL: takes an accelerator handle and returns a cl_int.
 * NOTE(review): by its name, presumably drops one reference - confirm. */
extern CL_API_ENTRY cl_int CL_API_CALL
clReleaseAcceleratorINTEL(
cl_accelerator_intel accelerator) CL_EXT_SUFFIX__VERSION_1_2;
/* Function-pointer type with the same signature as clReleaseAcceleratorINTEL. */
typedef CL_API_ENTRY cl_int (CL_API_CALL *clReleaseAcceleratorINTEL_fn)(
cl_accelerator_intel accelerator) CL_EXT_SUFFIX__VERSION_1_2;
/******************************************
* cl_intel_simultaneous_sharing extension *
*******************************************/
#define cl_intel_simultaneous_sharing 1
/* Two adjacent tokens (0x4104, 0x4105).
 * NOTE(review): presumably cl_device_info selectors - confirm. */
#define CL_DEVICE_SIMULTANEOUS_INTEROPS_INTEL 0x4104
#define CL_DEVICE_NUM_SIMULTANEOUS_INTEROPS_INTEL 0x4105
/***********************************
* cl_intel_egl_image_yuv extension *
************************************/
#define cl_intel_egl_image_yuv 1
/* The only token this section adds.
 * NOTE(review): presumably an EGL image property name - confirm. */
#define CL_EGL_YUV_PLANE_INTEL 0x4107
/********************************
* cl_intel_packed_yuv extension *
*********************************/
#define cl_intel_packed_yuv 1
/* Four consecutive tokens (0x4076-0x4079), one per packed byte ordering.
 * NOTE(review): presumably cl_channel_order values - confirm. */
#define CL_YUYV_INTEL 0x4076
#define CL_UYVY_INTEL 0x4077
#define CL_YVYU_INTEL 0x4078
#define CL_VYUY_INTEL 0x4079
/********************************************
* cl_intel_required_subgroup_size extension *
*********************************************/
#define cl_intel_required_subgroup_size 1
/* Three consecutive tokens (0x4108-0x410A).
 * NOTE(review): by name, one device-info selector and two kernel-info
 * selectors - confirm which query function each belongs to. */
#define CL_DEVICE_SUB_GROUP_SIZES_INTEL 0x4108
#define CL_KERNEL_SPILL_MEM_SIZE_INTEL 0x4109
#define CL_KERNEL_COMPILE_SUB_GROUP_SIZE_INTEL 0x410A
/****************************************
* cl_intel_driver_diagnostics extension *
*****************************************/
#define cl_intel_driver_diagnostics 1
/* Type for the diagnostics level values below. */
typedef cl_uint cl_diagnostics_verbose_level;
/* NOTE(review): presumably a context property name whose value is a
 * combination of the level bits below - confirm. */
#define CL_CONTEXT_SHOW_DIAGNOSTICS_INTEL 0x4106
/* GOOD, BAD and NEUTRAL are bits 0, 1 and 2; ALL is 0xff, which includes
 * those three bits plus bits 3-7. */
#define CL_CONTEXT_DIAGNOSTICS_LEVEL_ALL_INTEL ( 0xff )
#define CL_CONTEXT_DIAGNOSTICS_LEVEL_GOOD_INTEL ( 1 )
#define CL_CONTEXT_DIAGNOSTICS_LEVEL_BAD_INTEL ( 1 << 1 )
#define CL_CONTEXT_DIAGNOSTICS_LEVEL_NEUTRAL_INTEL ( 1 << 2 )
/********************************
* cl_intel_planar_yuv extension *
*********************************/
/* This section provides no extension #define. */
#define CL_NV12_INTEL 0x410E
/* Two single-bit flags (bits 24 and 25).
 * NOTE(review): presumably cl_mem_flags bits - confirm. */
#define CL_MEM_NO_ACCESS_INTEL ( 1 << 24 )
#define CL_MEM_ACCESS_FLAGS_UNRESTRICTED_INTEL ( 1 << 25 )
#define CL_DEVICE_PLANAR_YUV_MAX_WIDTH_INTEL 0x417E
#define CL_DEVICE_PLANAR_YUV_MAX_HEIGHT_INTEL 0x417F
/*******************************************************
* cl_intel_device_side_avc_motion_estimation extension *
********************************************************/
/* Device tokens for this extension (0x410B-0x410D).
 * NOTE(review): presumably cl_device_info selectors - confirm. */
#define CL_DEVICE_AVC_ME_VERSION_INTEL 0x410B
#define CL_DEVICE_AVC_ME_SUPPORTS_TEXTURE_SAMPLER_USE_INTEL 0x410C
#define CL_DEVICE_AVC_ME_SUPPORTS_PREEMPTION_INTEL 0x410D
/* Version values. These expand to a bare constant with no trailing semicolon,
 * so they can be used inside expressions, comparisons, initializers and
 * argument lists. Comments use the block form like the rest of this header. */
#define CL_AVC_ME_VERSION_0_INTEL 0x0 /* No support. */
#define CL_AVC_ME_VERSION_1_INTEL 0x1 /* First supported version. */
/* Device-side AVC motion-estimation tokens, grouped by name prefix. Values
 * restart within each group, so a value is only meaningful together with its
 * group. Several groups use different values from the similarly named CL_ME_*
 * host-side tokens earlier in this header (e.g. SAD_ADJUST_MODE_HAAR and
 * SUBPIXEL_MODE_QPEL), so the two families are not interchangeable. */
#define CL_AVC_ME_MAJOR_16x16_INTEL 0x0
#define CL_AVC_ME_MAJOR_16x8_INTEL 0x1
#define CL_AVC_ME_MAJOR_8x16_INTEL 0x2
#define CL_AVC_ME_MAJOR_8x8_INTEL 0x3
#define CL_AVC_ME_MINOR_8x8_INTEL 0x0
#define CL_AVC_ME_MINOR_8x4_INTEL 0x1
#define CL_AVC_ME_MINOR_4x8_INTEL 0x2
#define CL_AVC_ME_MINOR_4x4_INTEL 0x3
#define CL_AVC_ME_MAJOR_FORWARD_INTEL 0x0
#define CL_AVC_ME_MAJOR_BACKWARD_INTEL 0x1
#define CL_AVC_ME_MAJOR_BIDIRECTIONAL_INTEL 0x2
/* Each named partition mask is 0x7F with exactly one bit cleared (bit 0 for
 * 16x16 up to bit 6 for 4x4); ALL is 0x0. */
#define CL_AVC_ME_PARTITION_MASK_ALL_INTEL 0x0
#define CL_AVC_ME_PARTITION_MASK_16x16_INTEL 0x7E
#define CL_AVC_ME_PARTITION_MASK_16x8_INTEL 0x7D
#define CL_AVC_ME_PARTITION_MASK_8x16_INTEL 0x7B
#define CL_AVC_ME_PARTITION_MASK_8x8_INTEL 0x77
#define CL_AVC_ME_PARTITION_MASK_8x4_INTEL 0x6F
#define CL_AVC_ME_PARTITION_MASK_4x8_INTEL 0x5F
#define CL_AVC_ME_PARTITION_MASK_4x4_INTEL 0x3F
/* NOTE(review): SEARCH_WINDOW_4x4_RADIUS (0x2) has the same value as
 * SEARCH_WINDOW_TINY (0x2), while its neighbours are 0x9 and 0xa - confirm
 * whether this aliasing is intended. */
#define CL_AVC_ME_SEARCH_WINDOW_EXHAUSTIVE_INTEL 0x0
#define CL_AVC_ME_SEARCH_WINDOW_SMALL_INTEL 0x1
#define CL_AVC_ME_SEARCH_WINDOW_TINY_INTEL 0x2
#define CL_AVC_ME_SEARCH_WINDOW_EXTRA_TINY_INTEL 0x3
#define CL_AVC_ME_SEARCH_WINDOW_DIAMOND_INTEL 0x4
#define CL_AVC_ME_SEARCH_WINDOW_LARGE_DIAMOND_INTEL 0x5
#define CL_AVC_ME_SEARCH_WINDOW_RESERVED0_INTEL 0x6
#define CL_AVC_ME_SEARCH_WINDOW_RESERVED1_INTEL 0x7
#define CL_AVC_ME_SEARCH_WINDOW_CUSTOM_INTEL 0x8
#define CL_AVC_ME_SEARCH_WINDOW_16x12_RADIUS_INTEL 0x9
#define CL_AVC_ME_SEARCH_WINDOW_4x4_RADIUS_INTEL 0x2
#define CL_AVC_ME_SEARCH_WINDOW_2x2_RADIUS_INTEL 0xa
#define CL_AVC_ME_SAD_ADJUST_MODE_NONE_INTEL 0x0
#define CL_AVC_ME_SAD_ADJUST_MODE_HAAR_INTEL 0x2
#define CL_AVC_ME_SUBPIXEL_MODE_INTEGER_INTEL 0x0
#define CL_AVC_ME_SUBPIXEL_MODE_HPEL_INTEL 0x1
#define CL_AVC_ME_SUBPIXEL_MODE_QPEL_INTEL 0x3
#define CL_AVC_ME_COST_PRECISION_QPEL_INTEL 0x0
#define CL_AVC_ME_COST_PRECISION_HPEL_INTEL 0x1
#define CL_AVC_ME_COST_PRECISION_PEL_INTEL 0x2
#define CL_AVC_ME_COST_PRECISION_DPEL_INTEL 0x3
/* Bidirectional weights in hex; numerically 16, 21, 32, 43 and 48. */
#define CL_AVC_ME_BIDIR_WEIGHT_QUARTER_INTEL 0x10
#define CL_AVC_ME_BIDIR_WEIGHT_THIRD_INTEL 0x15
#define CL_AVC_ME_BIDIR_WEIGHT_HALF_INTEL 0x20
#define CL_AVC_ME_BIDIR_WEIGHT_TWO_THIRD_INTEL 0x2B
#define CL_AVC_ME_BIDIR_WEIGHT_THREE_QUARTER_INTEL 0x30
/* NOTE(review): RIGHT, TOP and BOTTOM are single bits (0x2, 0x4, 0x8) but LEFT
 * is 0x0, so it cannot be detected with a bitwise AND - confirm against the spec. */
#define CL_AVC_ME_BORDER_REACHED_LEFT_INTEL 0x0
#define CL_AVC_ME_BORDER_REACHED_RIGHT_INTEL 0x2
#define CL_AVC_ME_BORDER_REACHED_TOP_INTEL 0x4
#define CL_AVC_ME_BORDER_REACHED_BOTTOM_INTEL 0x8
#define CL_AVC_ME_SKIP_BLOCK_PARTITION_16x16_INTEL 0x0
#define CL_AVC_ME_SKIP_BLOCK_PARTITION_8x8_INTEL 0x4000
/* Skip-block enable masks occupying bits 24-31. The 16x16 masks use bits
 * 24-25; the 8x8 masks use one forward/backward bit pair per block index
 * (block 0: bits 24-25, block 1: 26-27, block 2: 28-29, block 3: 30-31), and
 * the 8x8 FORWARD/BACKWARD/DUAL masks (0x55, 0xAA, 0xFF) cover all four blocks.
 * NOTE(review): the operands are plain int constants, so where int is 32 bits
 * the results of (0xAA << 24), (0xFF << 24) and (0x2 << 30) do not fit in int
 * and the shift is formally undefined in ISO C. Unsigned operands (e.g. 0xFFu)
 * would avoid that but change the macros' type; confirm with callers before
 * changing. */
#define CL_AVC_ME_SKIP_BLOCK_16x16_FORWARD_ENABLE_INTEL ( 0x1 << 24 )
#define CL_AVC_ME_SKIP_BLOCK_16x16_BACKWARD_ENABLE_INTEL ( 0x2 << 24 )
#define CL_AVC_ME_SKIP_BLOCK_16x16_DUAL_ENABLE_INTEL ( 0x3 << 24 )
#define CL_AVC_ME_SKIP_BLOCK_8x8_FORWARD_ENABLE_INTEL ( 0x55 << 24 )
#define CL_AVC_ME_SKIP_BLOCK_8x8_BACKWARD_ENABLE_INTEL ( 0xAA << 24 )
#define CL_AVC_ME_SKIP_BLOCK_8x8_DUAL_ENABLE_INTEL ( 0xFF << 24 )
#define CL_AVC_ME_SKIP_BLOCK_8x8_0_FORWARD_ENABLE_INTEL ( 0x1 << 24 )
#define CL_AVC_ME_SKIP_BLOCK_8x8_0_BACKWARD_ENABLE_INTEL ( 0x2 << 24 )
#define CL_AVC_ME_SKIP_BLOCK_8x8_1_FORWARD_ENABLE_INTEL ( 0x1 << 26 )
#define CL_AVC_ME_SKIP_BLOCK_8x8_1_BACKWARD_ENABLE_INTEL ( 0x2 << 26 )
#define CL_AVC_ME_SKIP_BLOCK_8x8_2_FORWARD_ENABLE_INTEL ( 0x1 << 28 )
#define CL_AVC_ME_SKIP_BLOCK_8x8_2_BACKWARD_ENABLE_INTEL ( 0x2 << 28 )
#define CL_AVC_ME_SKIP_BLOCK_8x8_3_FORWARD_ENABLE_INTEL ( 0x1 << 30 )
#define CL_AVC_ME_SKIP_BLOCK_8x8_3_BACKWARD_ENABLE_INTEL ( 0x2 << 30 )
/* Remaining device-side AVC motion-estimation tokens, grouped by name prefix;
 * values restart within each group. */
#define CL_AVC_ME_BLOCK_BASED_SKIP_4x4_INTEL 0x00
#define CL_AVC_ME_BLOCK_BASED_SKIP_8x8_INTEL 0x80
#define CL_AVC_ME_INTRA_16x16_INTEL 0x0
#define CL_AVC_ME_INTRA_8x8_INTEL 0x1
#define CL_AVC_ME_INTRA_4x4_INTEL 0x2
/* Each intra luma partition mask is 0x7 with exactly one bit cleared. */
#define CL_AVC_ME_INTRA_LUMA_PARTITION_MASK_16x16_INTEL 0x6
#define CL_AVC_ME_INTRA_LUMA_PARTITION_MASK_8x8_INTEL 0x5
#define CL_AVC_ME_INTRA_LUMA_PARTITION_MASK_4x4_INTEL 0x3
/* The LEFT mask sets two bits (0x60); the other neighbor masks set one bit each. */
#define CL_AVC_ME_INTRA_NEIGHBOR_LEFT_MASK_ENABLE_INTEL 0x60
#define CL_AVC_ME_INTRA_NEIGHBOR_UPPER_MASK_ENABLE_INTEL 0x10
#define CL_AVC_ME_INTRA_NEIGHBOR_UPPER_RIGHT_MASK_ENABLE_INTEL 0x8
#define CL_AVC_ME_INTRA_NEIGHBOR_UPPER_LEFT_MASK_ENABLE_INTEL 0x4
/* NOTE(review): in the luma predictor group, DIAGONAL_DOWN_RIGHT and PLANE
 * both expand to 0x4 - confirm this aliasing is intended. */
#define CL_AVC_ME_LUMA_PREDICTOR_MODE_VERTICAL_INTEL 0x0
#define CL_AVC_ME_LUMA_PREDICTOR_MODE_HORIZONTAL_INTEL 0x1
#define CL_AVC_ME_LUMA_PREDICTOR_MODE_DC_INTEL 0x2
#define CL_AVC_ME_LUMA_PREDICTOR_MODE_DIAGONAL_DOWN_LEFT_INTEL 0x3
#define CL_AVC_ME_LUMA_PREDICTOR_MODE_DIAGONAL_DOWN_RIGHT_INTEL 0x4
#define CL_AVC_ME_LUMA_PREDICTOR_MODE_PLANE_INTEL 0x4
#define CL_AVC_ME_LUMA_PREDICTOR_MODE_VERTICAL_RIGHT_INTEL 0x5
#define CL_AVC_ME_LUMA_PREDICTOR_MODE_HORIZONTAL_DOWN_INTEL 0x6
#define CL_AVC_ME_LUMA_PREDICTOR_MODE_VERTICAL_LEFT_INTEL 0x7
#define CL_AVC_ME_LUMA_PREDICTOR_MODE_HORIZONTAL_UP_INTEL 0x8
#define CL_AVC_ME_CHROMA_PREDICTOR_MODE_DC_INTEL 0x0
#define CL_AVC_ME_CHROMA_PREDICTOR_MODE_HORIZONTAL_INTEL 0x1
#define CL_AVC_ME_CHROMA_PREDICTOR_MODE_VERTICAL_INTEL 0x2
#define CL_AVC_ME_CHROMA_PREDICTOR_MODE_PLANE_INTEL 0x3
/* FRAME_DUAL (0x3) equals FRAME_FORWARD | FRAME_BACKWARD. */
#define CL_AVC_ME_FRAME_FORWARD_INTEL 0x1
#define CL_AVC_ME_FRAME_BACKWARD_INTEL 0x2
#define CL_AVC_ME_FRAME_DUAL_INTEL 0x3
#define CL_AVC_ME_SLICE_TYPE_PRED_INTEL 0x0
#define CL_AVC_ME_SLICE_TYPE_BPRED_INTEL 0x1
#define CL_AVC_ME_SLICE_TYPE_INTRA_INTEL 0x2
#define CL_AVC_ME_INTERLACED_SCAN_TOP_FIELD_INTEL 0x0
#define CL_AVC_ME_INTERLACED_SCAN_BOTTOM_FIELD_INTEL 0x1
#ifdef __cplusplus
}
#endif
#endif /* __CL_EXT_INTEL_H */

View File

@@ -0,0 +1,171 @@
/**********************************************************************************
* Copyright (c) 2008-2019 The Khronos Group Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and/or associated documentation files (the
* "Materials"), to deal in the Materials without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Materials, and to
* permit persons to whom the Materials are furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be included
* in all copies or substantial portions of the Materials.
*
* MODIFICATIONS TO THIS FILE MAY MEAN IT NO LONGER ACCURATELY REFLECTS
* KHRONOS STANDARDS. THE UNMODIFIED, NORMATIVE VERSIONS OF KHRONOS
* SPECIFICATIONS AND HEADER INFORMATION ARE LOCATED AT
* https://www.khronos.org/registry/
*
* THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
**********************************************************************************/
#ifndef __OPENCL_CL_GL_H
#define __OPENCL_CL_GL_H
#include <CL/cl.h>
#ifdef __cplusplus
extern "C" {
#endif
/* Kind of GL object behind a cl_mem, as written by clGetGLObjectInfo;
 * values are the CL_GL_OBJECT_* constants defined below. */
typedef cl_uint cl_gl_object_type;
/* Query selector passed as param_name to clGetGLTextureInfo; values are the
 * "cl_gl_texture_info" constants below (CL_GL_TEXTURE_TARGET, ...). */
typedef cl_uint cl_gl_texture_info;
/* Not referenced by any declaration in this header. */
typedef cl_uint cl_gl_platform_info;
/* Pointer to the incomplete type struct __GLsync, which is never defined in
 * this header, so it can only be passed around, not dereferenced.
 * Consumed by clCreateEventFromGLsyncKHR in cl_gl_ext.h. */
typedef struct __GLsync *cl_GLsync;
/* cl_gl_object_type = 0x2000 - 0x200F enum values are currently taken */
#define CL_GL_OBJECT_BUFFER 0x2000
#define CL_GL_OBJECT_TEXTURE2D 0x2001
#define CL_GL_OBJECT_TEXTURE3D 0x2002
#define CL_GL_OBJECT_RENDERBUFFER 0x2003
#ifdef CL_VERSION_1_2
#define CL_GL_OBJECT_TEXTURE2D_ARRAY 0x200E
#define CL_GL_OBJECT_TEXTURE1D 0x200F
#define CL_GL_OBJECT_TEXTURE1D_ARRAY 0x2010
#define CL_GL_OBJECT_TEXTURE_BUFFER 0x2011
#endif
/* cl_gl_texture_info */
#define CL_GL_TEXTURE_TARGET 0x2004
#define CL_GL_MIPMAP_LEVEL 0x2005
#ifdef CL_VERSION_1_2
#define CL_GL_NUM_SAMPLES 0x2012
#endif
/* Constructors that return a cl_mem for an existing GL object.
 * All of them take the owning context, cl_mem_flags, the GL object name and
 * an errcode_ret out-pointer. */

/* From a GL buffer object name (bufobj). Tagged as available since 1.0. */
extern CL_API_ENTRY cl_mem CL_API_CALL
clCreateFromGLBuffer(cl_context context,
cl_mem_flags flags,
cl_GLuint bufobj,
cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0;
/* From a GL texture, given its target and mip level. Only declared when
 * CL_VERSION_1_2 is defined. */
#ifdef CL_VERSION_1_2
extern CL_API_ENTRY cl_mem CL_API_CALL
clCreateFromGLTexture(cl_context context,
cl_mem_flags flags,
cl_GLenum target,
cl_GLint miplevel,
cl_GLuint texture,
cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_2;
#endif
/* From a GL renderbuffer object name. Tagged as available since 1.0. */
extern CL_API_ENTRY cl_mem CL_API_CALL
clCreateFromGLRenderbuffer(cl_context context,
cl_mem_flags flags,
cl_GLuint renderbuffer,
cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0;
/* Query for a GL-backed cl_mem: writes the object kind (a cl_gl_object_type)
 * and the GL object name through the two out-pointers.
 * Returns a cl_int status code. */
extern CL_API_ENTRY cl_int CL_API_CALL
clGetGLObjectInfo(cl_mem memobj,
cl_gl_object_type * gl_object_type,
cl_GLuint * gl_object_name) CL_API_SUFFIX__VERSION_1_0;
/* Generic "get info" query for a GL-texture-backed cl_mem. param_name is a
 * cl_gl_texture_info selector; the result goes into the caller-supplied
 * buffer param_value of param_value_size bytes.
 * NOTE(review): param_value_size_ret presumably receives the size actually
 * needed/written, as in other clGet*Info calls -- confirm against the spec. */
extern CL_API_ENTRY cl_int CL_API_CALL
clGetGLTextureInfo(cl_mem memobj,
cl_gl_texture_info param_name,
size_t param_value_size,
void * param_value,
size_t * param_value_size_ret) CL_API_SUFFIX__VERSION_1_0;
/* Acquire/release pair for GL-backed memory objects. Both calls share one
 * signature: a command queue, an array of num_objects cl_mem handles, an
 * event wait list of num_events_in_wait_list entries, and an out-pointer
 * for the event that tracks the enqueued command.
 * NOTE(review): per the names, acquire is expected before OpenCL uses the
 * objects and release before GL uses them again -- confirm against the
 * cl_khr_gl_sharing specification. */
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueAcquireGLObjects(cl_command_queue command_queue,
cl_uint num_objects,
const cl_mem * mem_objects,
cl_uint num_events_in_wait_list,
const cl_event * event_wait_list,
cl_event * event) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueReleaseGLObjects(cl_command_queue command_queue,
cl_uint num_objects,
const cl_mem * mem_objects,
cl_uint num_events_in_wait_list,
const cl_event * event_wait_list,
cl_event * event) CL_API_SUFFIX__VERSION_1_0;
/* Deprecated OpenCL 1.1 APIs.
 * Both functions carry the CL_EXT_PREFIX/SUFFIX__VERSION_1_1_DEPRECATED
 * markers and take exactly the same parameter list as clCreateFromGLTexture
 * above, which is declared when CL_VERSION_1_2 is defined. Unlike that
 * function, these two are declared unconditionally. */
extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_mem CL_API_CALL
clCreateFromGLTexture2D(cl_context context,
cl_mem_flags flags,
cl_GLenum target,
cl_GLint miplevel,
cl_GLuint texture,
cl_int * errcode_ret) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED;
extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_mem CL_API_CALL
clCreateFromGLTexture3D(cl_context context,
cl_mem_flags flags,
cl_GLenum target,
cl_GLint miplevel,
cl_GLuint texture,
cl_int * errcode_ret) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED;
/* cl_khr_gl_sharing extension */
#define cl_khr_gl_sharing 1
typedef cl_uint cl_gl_context_info;
/* Additional Error Codes */
#define CL_INVALID_GL_SHAREGROUP_REFERENCE_KHR -1000
/* cl_gl_context_info */
#define CL_CURRENT_DEVICE_FOR_GL_CONTEXT_KHR 0x2006
#define CL_DEVICES_FOR_GL_CONTEXT_KHR 0x2007
/* Additional cl_context_properties */
#define CL_GL_CONTEXT_KHR 0x2008
#define CL_EGL_DISPLAY_KHR 0x2009
#define CL_GLX_DISPLAY_KHR 0x200A
#define CL_WGL_HDC_KHR 0x200B
#define CL_CGL_SHAREGROUP_KHR 0x200C
/* Generic "get info" query keyed by a cl_context_properties list rather than
 * by an existing cl_context. param_name is a cl_gl_context_info selector
 * (CL_CURRENT_DEVICE_FOR_GL_CONTEXT_KHR or CL_DEVICES_FOR_GL_CONTEXT_KHR
 * above); the result goes into the caller-supplied param_value buffer. */
extern CL_API_ENTRY cl_int CL_API_CALL
clGetGLContextInfoKHR(const cl_context_properties * properties,
cl_gl_context_info param_name,
size_t param_value_size,
void * param_value,
size_t * param_value_size_ret) CL_API_SUFFIX__VERSION_1_0;
/* Function-pointer type with the same parameter list and return type as
 * clGetGLContextInfoKHR.
 * NOTE(review): presumably meant for entry points obtained at run time
 * through the extension-function lookup -- confirm against callers. */
typedef CL_API_ENTRY cl_int (CL_API_CALL *clGetGLContextInfoKHR_fn)(
const cl_context_properties * properties,
cl_gl_context_info param_name,
size_t param_value_size,
void * param_value,
size_t * param_value_size_ret);
#ifdef __cplusplus
}
#endif
#endif /* __OPENCL_CL_GL_H */

View File

@@ -0,0 +1,52 @@
/**********************************************************************************
* Copyright (c) 2008-2019 The Khronos Group Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and/or associated documentation files (the
* "Materials"), to deal in the Materials without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Materials, and to
* permit persons to whom the Materials are furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be included
* in all copies or substantial portions of the Materials.
*
* MODIFICATIONS TO THIS FILE MAY MEAN IT NO LONGER ACCURATELY REFLECTS
* KHRONOS STANDARDS. THE UNMODIFIED, NORMATIVE VERSIONS OF KHRONOS
* SPECIFICATIONS AND HEADER INFORMATION ARE LOCATED AT
* https://www.khronos.org/registry/
*
* THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
**********************************************************************************/
#ifndef __OPENCL_CL_GL_EXT_H
#define __OPENCL_CL_GL_EXT_H
#ifdef __cplusplus
extern "C" {
#endif
#include <CL/cl_gl.h>
/*
* cl_khr_gl_event extension
*/
#define CL_COMMAND_GL_FENCE_SYNC_OBJECT_KHR 0x200D
/* Declares clCreateEventFromGLsyncKHR: takes a context, a cl_GLsync handle
 * and an errcode_ret out-pointer, and returns a cl_event.
 * The handle parameter is named `sync` rather than after its own type, so
 * the typedef name cl_GLsync is not shadowed inside the prototype (a
 * -Wshadow trigger). Parameter names in a C prototype are not part of the
 * ABI or of any call site, so callers are unaffected. */
extern CL_API_ENTRY cl_event CL_API_CALL
clCreateEventFromGLsyncKHR(cl_context context,
                           cl_GLsync sync,
                           cl_int * errcode_ret) CL_EXT_SUFFIX__VERSION_1_1;
#ifdef __cplusplus
}
#endif
#endif /* __OPENCL_CL_GL_EXT_H */

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,172 @@
/**********************************************************************************
* Copyright (c) 2008-2019 The Khronos Group Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and/or associated documentation files (the
* "Materials"), to deal in the Materials without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Materials, and to
* permit persons to whom the Materials are furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be included
* in all copies or substantial portions of the Materials.
*
* MODIFICATIONS TO THIS FILE MAY MEAN IT NO LONGER ACCURATELY REFLECTS
* KHRONOS STANDARDS. THE UNMODIFIED, NORMATIVE VERSIONS OF KHRONOS
* SPECIFICATIONS AND HEADER INFORMATION ARE LOCATED AT
* https://www.khronos.org/registry/
*
* THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
**********************************************************************************/
/*****************************************************************************\
Copyright (c) 2013-2019 Intel Corporation All Rights Reserved.
THESE MATERIALS ARE PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THESE
MATERIALS, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
File Name: cl_va_api_media_sharing_intel.h
Abstract:
Notes:
\*****************************************************************************/
#ifndef __OPENCL_CL_VA_API_MEDIA_SHARING_INTEL_H
#define __OPENCL_CL_VA_API_MEDIA_SHARING_INTEL_H
#include <CL/cl.h>
#include <CL/cl_platform.h>
#include <va/va.h>
#ifdef __cplusplus
extern "C" {
#endif
/******************************************
* cl_intel_va_api_media_sharing extension *
*******************************************/
#define cl_intel_va_api_media_sharing 1
/* error codes */
#define CL_INVALID_VA_API_MEDIA_ADAPTER_INTEL -1098
#define CL_INVALID_VA_API_MEDIA_SURFACE_INTEL -1099
#define CL_VA_API_MEDIA_SURFACE_ALREADY_ACQUIRED_INTEL -1100
#define CL_VA_API_MEDIA_SURFACE_NOT_ACQUIRED_INTEL -1101
/* cl_va_api_device_source_intel */
#define CL_VA_API_DISPLAY_INTEL 0x4094
/* cl_va_api_device_set_intel */
#define CL_PREFERRED_DEVICES_FOR_VA_API_INTEL 0x4095
#define CL_ALL_DEVICES_FOR_VA_API_INTEL 0x4096
/* cl_context_info */
#define CL_CONTEXT_VA_API_DISPLAY_INTEL 0x4097
/* cl_mem_info */
#define CL_MEM_VA_API_MEDIA_SURFACE_INTEL 0x4098
/* cl_image_info */
#define CL_IMAGE_VA_API_PLANE_INTEL 0x4099
/* cl_command_type */
#define CL_COMMAND_ACQUIRE_VA_API_MEDIA_SURFACES_INTEL 0x409A
#define CL_COMMAND_RELEASE_VA_API_MEDIA_SURFACES_INTEL 0x409B
typedef cl_uint cl_va_api_device_source_intel;
typedef cl_uint cl_va_api_device_set_intel;
/* Device enumeration for a VA-API media adapter.
 * media_adapter_type is a cl_va_api_device_source_intel value
 * (CL_VA_API_DISPLAY_INTEL above) and media_adapter_set a
 * cl_va_api_device_set_intel value (CL_PREFERRED_/CL_ALL_DEVICES_FOR_VA_API_INTEL).
 * devices is a caller-supplied array with room for num_entries ids.
 * NOTE(review): num_devices presumably receives the number of matching
 * devices, as in clGetDeviceIDs -- confirm against the extension spec. */
extern CL_API_ENTRY cl_int CL_API_CALL
clGetDeviceIDsFromVA_APIMediaAdapterINTEL(
cl_platform_id platform,
cl_va_api_device_source_intel media_adapter_type,
void* media_adapter,
cl_va_api_device_set_intel media_adapter_set,
cl_uint num_entries,
cl_device_id* devices,
cl_uint* num_devices) CL_EXT_SUFFIX__VERSION_1_2;
/* Function-pointer type with the same parameter list as the function above. */
typedef CL_API_ENTRY cl_int (CL_API_CALL * clGetDeviceIDsFromVA_APIMediaAdapterINTEL_fn)(
cl_platform_id platform,
cl_va_api_device_source_intel media_adapter_type,
void* media_adapter,
cl_va_api_device_set_intel media_adapter_set,
cl_uint num_entries,
cl_device_id* devices,
cl_uint* num_devices) CL_EXT_SUFFIX__VERSION_1_2;
/* Returns a cl_mem for one plane of a VA-API surface. The surface is passed
 * by pointer (VASurfaceID*), not by value. VASurfaceID is not declared in
 * the CL headers shown here; it presumably comes from the <va/va.h> include
 * at the top of this file. */
extern CL_API_ENTRY cl_mem CL_API_CALL
clCreateFromVA_APIMediaSurfaceINTEL(
cl_context context,
cl_mem_flags flags,
VASurfaceID* surface,
cl_uint plane,
cl_int* errcode_ret) CL_EXT_SUFFIX__VERSION_1_2;
/* Function-pointer type with the same parameter list as the function above. */
typedef CL_API_ENTRY cl_mem (CL_API_CALL * clCreateFromVA_APIMediaSurfaceINTEL_fn)(
cl_context context,
cl_mem_flags flags,
VASurfaceID* surface,
cl_uint plane,
cl_int* errcode_ret) CL_EXT_SUFFIX__VERSION_1_2;
/* Acquire/release pair for VA-API-backed memory objects. Both functions and
 * both _fn typedefs share a single parameter list: a command queue, an array
 * of num_objects cl_mem handles, an event wait list, and an out-pointer for
 * the resulting event. This is the same shape as clEnqueueAcquireGLObjects
 * and clEnqueueReleaseGLObjects in cl_gl.h. */
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueAcquireVA_APIMediaSurfacesINTEL(
cl_command_queue command_queue,
cl_uint num_objects,
const cl_mem* mem_objects,
cl_uint num_events_in_wait_list,
const cl_event* event_wait_list,
cl_event* event) CL_EXT_SUFFIX__VERSION_1_2;
/* Function-pointer type matching clEnqueueAcquireVA_APIMediaSurfacesINTEL. */
typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueAcquireVA_APIMediaSurfacesINTEL_fn)(
cl_command_queue command_queue,
cl_uint num_objects,
const cl_mem* mem_objects,
cl_uint num_events_in_wait_list,
const cl_event* event_wait_list,
cl_event* event) CL_EXT_SUFFIX__VERSION_1_2;
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueReleaseVA_APIMediaSurfacesINTEL(
cl_command_queue command_queue,
cl_uint num_objects,
const cl_mem* mem_objects,
cl_uint num_events_in_wait_list,
const cl_event* event_wait_list,
cl_event* event) CL_EXT_SUFFIX__VERSION_1_2;
/* Function-pointer type matching clEnqueueReleaseVA_APIMediaSurfacesINTEL. */
typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueReleaseVA_APIMediaSurfacesINTEL_fn)(
cl_command_queue command_queue,
cl_uint num_objects,
const cl_mem* mem_objects,
cl_uint num_events_in_wait_list,
const cl_event* event_wait_list,
cl_event* event) CL_EXT_SUFFIX__VERSION_1_2;
#ifdef __cplusplus
}
#endif
#endif /* __OPENCL_CL_VA_API_MEDIA_SHARING_INTEL_H */

View File

@@ -0,0 +1,86 @@
/*******************************************************************************
* Copyright (c) 2018 The Khronos Group Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and/or associated documentation files (the
* "Materials"), to deal in the Materials without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Materials, and to
* permit persons to whom the Materials are furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be included
* in all copies or substantial portions of the Materials.
*
* MODIFICATIONS TO THIS FILE MAY MEAN IT NO LONGER ACCURATELY REFLECTS
* KHRONOS STANDARDS. THE UNMODIFIED, NORMATIVE VERSIONS OF KHRONOS
* SPECIFICATIONS AND HEADER INFORMATION ARE LOCATED AT
* https://www.khronos.org/registry/
*
* THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
******************************************************************************/
#ifndef __CL_VERSION_H
#define __CL_VERSION_H
/* Detect which version to target.
 * After this section CL_TARGET_OPENCL_VERSION is always defined and is one
 * of 100, 110, 120, 200, 210 or 220. */
/* Case 1: the includer did not choose a target. Emit a compile-time note
 * and fall back to 220. */
#if !defined(CL_TARGET_OPENCL_VERSION)
#pragma message("cl_version.h: CL_TARGET_OPENCL_VERSION is not defined. Defaulting to 220 (OpenCL 2.2)")
#define CL_TARGET_OPENCL_VERSION 220
#endif
/* Case 2: a target was supplied but is not one of the six accepted values.
 * Emit a note, discard the supplied value and fall back to 220. */
#if CL_TARGET_OPENCL_VERSION != 100 && \
CL_TARGET_OPENCL_VERSION != 110 && \
CL_TARGET_OPENCL_VERSION != 120 && \
CL_TARGET_OPENCL_VERSION != 200 && \
CL_TARGET_OPENCL_VERSION != 210 && \
CL_TARGET_OPENCL_VERSION != 220
#pragma message("cl_version: CL_TARGET_OPENCL_VERSION is not a valid value (100, 110, 120, 200, 210, 220). Defaulting to 220 (OpenCL 2.2)")
#undef CL_TARGET_OPENCL_VERSION
#define CL_TARGET_OPENCL_VERSION 220
#endif
/* OpenCL Version.
 * Defines CL_VERSION_x_y to 1 for every version at or below the target, so
 * the macros are cumulative: a target of 120 defines CL_VERSION_1_0,
 * CL_VERSION_1_1 and CL_VERSION_1_2 but none of the 2.x macros.
 * Each definition is skipped if the macro is already defined, so a value
 * supplied by the includer is left untouched. */
#if CL_TARGET_OPENCL_VERSION >= 220 && !defined(CL_VERSION_2_2)
#define CL_VERSION_2_2 1
#endif
#if CL_TARGET_OPENCL_VERSION >= 210 && !defined(CL_VERSION_2_1)
#define CL_VERSION_2_1 1
#endif
#if CL_TARGET_OPENCL_VERSION >= 200 && !defined(CL_VERSION_2_0)
#define CL_VERSION_2_0 1
#endif
#if CL_TARGET_OPENCL_VERSION >= 120 && !defined(CL_VERSION_1_2)
#define CL_VERSION_1_2 1
#endif
#if CL_TARGET_OPENCL_VERSION >= 110 && !defined(CL_VERSION_1_1)
#define CL_VERSION_1_1 1
#endif
#if CL_TARGET_OPENCL_VERSION >= 100 && !defined(CL_VERSION_1_0)
#define CL_VERSION_1_0 1
#endif
/* Allow deprecated APIs for older OpenCL versions.
 * CL_USE_DEPRECATED_OPENCL_x_y_APIS is defined (with no value) whenever the
 * target is x.y or older. A target of 220 therefore defines none of these
 * macros, while a target of 100 defines all five. Macros already defined by
 * the includer are left as they are. */
#if CL_TARGET_OPENCL_VERSION <= 210 && !defined(CL_USE_DEPRECATED_OPENCL_2_1_APIS)
#define CL_USE_DEPRECATED_OPENCL_2_1_APIS
#endif
#if CL_TARGET_OPENCL_VERSION <= 200 && !defined(CL_USE_DEPRECATED_OPENCL_2_0_APIS)
#define CL_USE_DEPRECATED_OPENCL_2_0_APIS
#endif
#if CL_TARGET_OPENCL_VERSION <= 120 && !defined(CL_USE_DEPRECATED_OPENCL_1_2_APIS)
#define CL_USE_DEPRECATED_OPENCL_1_2_APIS
#endif
#if CL_TARGET_OPENCL_VERSION <= 110 && !defined(CL_USE_DEPRECATED_OPENCL_1_1_APIS)
#define CL_USE_DEPRECATED_OPENCL_1_1_APIS
#endif
#if CL_TARGET_OPENCL_VERSION <= 100 && !defined(CL_USE_DEPRECATED_OPENCL_1_0_APIS)
#define CL_USE_DEPRECATED_OPENCL_1_0_APIS
#endif
#endif /* __CL_VERSION_H */

View File

@@ -0,0 +1,47 @@
/*******************************************************************************
* Copyright (c) 2008-2015 The Khronos Group Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and/or associated documentation files (the
* "Materials"), to deal in the Materials without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Materials, and to
* permit persons to whom the Materials are furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be included
* in all copies or substantial portions of the Materials.
*
* MODIFICATIONS TO THIS FILE MAY MEAN IT NO LONGER ACCURATELY REFLECTS
* KHRONOS STANDARDS. THE UNMODIFIED, NORMATIVE VERSIONS OF KHRONOS
* SPECIFICATIONS AND HEADER INFORMATION ARE LOCATED AT
* https://www.khronos.org/registry/
*
* THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
******************************************************************************/
/* $Revision: 11708 $ on $Date: 2010-06-13 23:36:24 -0700 (Sun, 13 Jun 2010) $ */
#ifndef __OPENCL_H
#define __OPENCL_H
#ifdef __cplusplus
extern "C" {
#endif
#include <CL/cl.h>
#include <CL/cl_gl.h>
#include <CL/cl_gl_ext.h>
#include <CL/cl_ext.h>
#ifdef __cplusplus
}
#endif
#endif /* __OPENCL_H */

View File

@@ -0,0 +1,79 @@
# Locations of external tools and libraries. Each is assigned with ?= so it
# can be overridden from the environment or the make command line.
# $(wildcard ...) yields an empty string when the path does not exist, in
# which case the derived tool paths below start at "/".
RISCV_TOOL_PATH ?= $(wildcard ../../../../riscv-gnu-toolchain/drops)
POCL_CC_PATH ?= $(wildcard ../../../../pocl/drops_riscv_cc)
POCL_INC_PATH ?= $(wildcard ../include)
POCL_LIB_PATH ?= $(wildcard ../lib)
VX_RT_PATH ?= $(wildcard ../../../runtime)
VX_SIMX_PATH ?= $(wildcard ../../../simX/obj_dir)
# RISC-V cross tools, all taken from $(RISCV_TOOL_PATH)/bin.
# HEX is objcopy (used to emit Intel HEX) and DMP is objdump.
CC = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-gcc
CXX = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-g++
DMP = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-objdump
HEX = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-objcopy
GDB = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-gdb
# Runtime sources under $(VX_RT_PATH) that are compiled into the .elf
# together with the application sources.
VX_SRCS = $(VX_RT_PATH)/newlib/newlib.c
VX_SRCS += $(VX_RT_PATH)/startup/vx_start.S
VX_SRCS += $(VX_RT_PATH)/intrinsics/vx_intrinsics.s
VX_SRCS += $(VX_RT_PATH)/io/vx_io.s $(VX_RT_PATH)/io/vx_io.c
VX_SRCS += $(VX_RT_PATH)/fileio/fileio.s
VX_SRCS += $(VX_RT_PATH)/tests/tests.c
VX_SRCS += $(VX_RT_PATH)/vx_api/vx_api.c
# NOTE(review): none of the variables on the next line is assigned in this
# Makefile, so they expand to nothing unless supplied externally. This looks
# like a leftover from an earlier layout -- confirm before removing.
VX_SRCS += $(VX_STR) $(VX_FIO) $(VX_NEWLIB) $(VX_INT) $(VX_IO) $(VX_API) $(VX_TEST)
# Link-only flags for the .elf: no default startup files, static linking,
# and the linker script from the runtime tree.
VX_CFLAGS = -nostartfiles -Wl,-Bstatic,-T,$(VX_RT_PATH)/mains/vortex_link.ld
# Flags shared by every compile and link command below. They are also passed
# to $(CC) in the rules that compile the .c files.
CXXFLAGS = -g -O0 -march=rv32im -mabi=ilp32
CXXFLAGS += -ffreestanding # program may not begin at main()
CXXFLAGS += -Wl,--gc-sections # enable garbage collection of unused input sections
CXXFLAGS += -fno-rtti -fno-non-call-exceptions # disable RTTI and exceptions
CXXFLAGS += -I$(POCL_INC_PATH)
# Libraries for the two link variants. Both reference $(PROJECT), which is
# only assigned further down; that works because "=" defers expansion until
# the variable is used in a recipe.
VX_LIBS = -Wl,--whole-archive lib$(PROJECT).a -Wl,--no-whole-archive $(POCL_LIB_PATH)/libOpenCL.a
QEMU_LIBS = $(VX_RT_PATH)/qemu/vx_api.c -Wl,--whole-archive lib$(PROJECT).a -Wl,--no-whole-archive $(POCL_LIB_PATH)/libOpenCL.a
# Base name of every build product, and the host-side application sources.
PROJECT = kmeans
SRCS = main.cc read_input.c rmse.c cluster.c kmeans_clustering.c
# These goals are commands, not files. Declaring them phony keeps a stray
# file named e.g. "clean" or "run" from making the goal look up to date.
.PHONY: all run qemu gdb-s gdb-c clean

# Remove a target whose recipe failed after modifying it, so a truncated
# .dump/.hex/.elf is never mistaken for a finished build product.
.DELETE_ON_ERROR:

# Default goal: disassembly listing plus Intel HEX image of the Vortex build.
all: $(PROJECT).dump $(PROJECT).hex

# Compile the OpenCL kernel into a static library with the POCL offline
# compiler (poclcc), with POCL debug output enabled.
lib$(PROJECT).a: kernel.cl
	POCL_DEBUG=all POCL_DEBUG_LLVM_PASSES=1 LD_LIBRARY_PATH=$(RISCV_TOOL_PATH)/lib:$(POCL_CC_PATH)/lib $(POCL_CC_PATH)/bin/poclcc -o $@ $<

# Stand-alone objects for the C sources. The link rules below compile
# $(SRCS) directly and do not consume these objects, so they are convenience
# targets only (e.g. for a quick syntax check of one file).
kmeans_clustering.o cluster.o read_input.o rmse.o: %.o: %.c
	$(CC) $(CXXFLAGS) -c $< -o $@

# Vortex executable: application sources plus the runtime sources, linked
# with the Vortex linker script. $(VX_SRCS) is listed as a prerequisite so a
# change in the runtime triggers a relink.
$(PROJECT).elf: $(SRCS) $(VX_SRCS) lib$(PROJECT).a
	$(CXX) $(CXXFLAGS) $(VX_CFLAGS) $(VX_SRCS) $(SRCS) $(VX_LIBS) -o $@

# QEMU user-mode executable: same application sources, linked against the
# QEMU flavour of the runtime API instead of the bare-metal runtime.
$(PROJECT).qemu: $(SRCS) lib$(PROJECT).a
	$(CXX) $(CXXFLAGS) $(SRCS) $(QEMU_LIBS) -o $@

# Intel HEX image of the executable.
$(PROJECT).hex: $(PROJECT).elf
	$(HEX) -O ihex $< $@

# Full disassembly of the executable.
$(PROJECT).dump: $(PROJECT).elf
	$(DMP) -D $< > $@

# Run the HEX image on the simX simulator; stdout goes to emulator.debug.
# NOTE(review): the simulator is given "-a rv32i" while the code is compiled
# with -march=rv32im -- confirm this mismatch is intentional.
run: $(PROJECT).hex
	POCL_DEBUG=all $(VX_SIMX_PATH)/Vcache_simX -E -a rv32i --core $< -s -b 1> emulator.debug

# Run under QEMU user-mode emulation with syscall and instruction tracing.
qemu: $(PROJECT).qemu
	POCL_DEBUG=all $(RISCV_TOOL_PATH)/bin/qemu-riscv32 -strace -d in_asm -D debug.log $<

# Debug server: start QEMU waiting for a debugger on port 1234.
gdb-s: $(PROJECT).qemu
	POCL_DEBUG=all $(RISCV_TOOL_PATH)/bin/qemu-riscv32 -g 1234 -d in_asm -D debug.log $<

# Debug client: start gdb on the QEMU executable.
gdb-c: $(PROJECT).qemu
	$(GDB) $<

# Remove build products. lib$(PROJECT).a is not removed here.
# NOTE(review): presumably because a prebuilt archive is kept in the tree --
# confirm before adding *.a to this list.
clean:
	rm -rf *.o *.elf *.dump *.hex *.qemu *.log *.debug

View File

View File

@@ -0,0 +1,155 @@
/*****************************************************************************/
/*IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. */
/*By downloading, copying, installing or using the software you agree */
/*to this license. If you do not agree to this license, do not download, */
/*install, copy or use the software. */
/* */
/* */
/*Copyright (c) 2005 Northwestern University */
/*All rights reserved. */
/*Redistribution of the software in source and binary forms, */
/*with or without modification, is permitted provided that the */
/*following conditions are met: */
/* */
/*1 Redistributions of source code must retain the above copyright */
/* notice, this list of conditions and the following disclaimer. */
/* */
/*2 Redistributions in binary form must reproduce the above copyright */
/* notice, this list of conditions and the following disclaimer in the */
/* documentation and/or other materials provided with the distribution.*/
/* */
/*3 Neither the name of Northwestern University nor the names of its */
/* contributors may be used to endorse or promote products derived */
/* from this software without specific prior written permission. */
/* */
/*THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ``AS */
/*IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED */
/*TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY, NON-INFRINGEMENT AND */
/*FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL */
/*NORTHWESTERN UNIVERSITY OR ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, */
/*INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/*(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR */
/*SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) */
/*HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, */
/*STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN */
/*ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/*POSSIBILITY OF SUCH DAMAGE. */
/******************************************************************************/
/*************************************************************************/
/** File: cluster.c **/
/** Description: Takes as input a file, containing 1 data point **/
/** per line, and performs a fuzzy c-means clustering **/
/** on the data. Fuzzy clustering is performed using **/
/** min to max clusters and the clustering that gets **/
/** the best score according to a compactness and **/
/** separation criterion are returned. **/
/** Author: Brendan McCane **/
/** James Cook University of North Queensland. **/
/** Australia. email: mccane@cs.jcu.edu.au **/
/** **/
/** Edited by: Jay Pisharath, Wei-keng Liao **/
/** Northwestern University. **/
/** **/
/** ================================================================ **/
/** **/
/** Edited by: Shuai Che, David Tarjan, Sang-Ha Lee **/
/** University of Virginia **/
/** **/
/** Description: No longer supports fuzzy c-means clustering; **/
/** only regular k-means clustering. **/
/** No longer performs "validity" function to analyze **/
/** compactness and separation criteria; instead **/
/** calculate root mean squared error. **/
/** **/
/*************************************************************************/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <limits.h>
#include <math.h>
#include <float.h>
#include "kmeans.h"
float min_rmse_ref = FLT_MAX;
extern double wtime(void);
/* min_rmse_ref (defined above) holds the reference, best-so-far min_rmse value */
/*---< cluster() >-----------------------------------------------------------*/
/*
 * Runs k-means for every cluster count k in [min_nclusters, max_nclusters],
 * stopping early once k exceeds npoints, and repeats each k nloops times.
 *
 * Outputs:
 *   *cluster_centres  centres returned by the most recent
 *                     kmeans_clustering() call; any array previously stored
 *                     there is freed first.
 *   *min_rmse, *best_nclusters
 *                     updated only when isRMSE is non-zero and a run beats
 *                     the file-level running minimum min_rmse_ref.
 *
 * Returns the 0-based loop iteration at which the best RMSE was seen, or 0
 * if it was never updated.
 *
 * allocate(), kmeans_clustering(), rms_err() and deallocateMemory() are
 * defined outside this file section (presumably declared in kmeans.h).
 *
 * NOTE(review): *cluster_centres ends up holding the centres of the LAST
 * run, not necessarily those of the run with the best RMSE -- confirm this
 * is intended.
 */
int cluster(int      npoints,           /* number of data points */
            int      nfeatures,         /* number of attributes for each point */
            float  **features,          /* array: [npoints][nfeatures] */
            int      min_nclusters,     /* range of min to max number of clusters */
            int      max_nclusters,
            float    threshold,         /* loop terminating factor */
            int     *best_nclusters,    /* out: number between min and max with lowest RMSE */
            float ***cluster_centres,   /* out: [best_nclusters][nfeatures] */
            float   *min_rmse,          /* out: minimum RMSE */
            int      isRMSE,            /* calculate RMSE */
            int      nloops             /* number of iteration for each number of clusters */
            )
{
    int     nclusters;              /* number of clusters k */
    int     index = 0;              /* number of iteration to reach the best RMSE */
    float   rmse;                   /* RMSE for each clustering. Kept as float:
                                       an int would truncate the value before
                                       it is compared with and stored into the
                                       float min_rmse_ref / *min_rmse. */
    int    *membership;             /* which cluster a data point belongs to */
    float **tmp_cluster_centres;    /* hold coordinates of cluster centers */
    int     i;

    /* allocate memory for membership */
    membership = (int*) malloc(npoints * sizeof(int));
    /* malloc(0) may legitimately return NULL, so only treat NULL as a
       failure when a non-empty array was requested. */
    if (npoints > 0 && membership == NULL) {
        fprintf(stderr, "Error: unable to allocate memory for membership\n");
        exit(EXIT_FAILURE);
    }

    /* sweep k from min to max_nclusters to find the best number of clusters */
    for (nclusters = min_nclusters; nclusters <= max_nclusters; nclusters++)
    {
        if (nclusters > npoints) break;     /* cannot have more clusters than points */

        /* per-k setup; paired with deallocateMemory() at the end of the sweep step */
        allocate(npoints, nfeatures, nclusters, features);

        /* iterate nloops times for each number of clusters */
        for (i = 0; i < nloops; i++)
        {
            /* run one clustering for this k */
            tmp_cluster_centres = kmeans_clustering(features,
                                                    nfeatures,
                                                    npoints,
                                                    nclusters,
                                                    threshold,
                                                    membership);

            /* release the centres of the previous run before replacing them;
               the [0] row and the row-pointer array are freed separately */
            if (*cluster_centres) {
                free((*cluster_centres)[0]);
                free(*cluster_centres);
            }
            *cluster_centres = tmp_cluster_centres;

            /* find the number of clusters with the best RMSE */
            if (isRMSE)
            {
                rmse = rms_err(features,
                               nfeatures,
                               npoints,
                               tmp_cluster_centres,
                               nclusters);

                if (rmse < min_rmse_ref) {
                    min_rmse_ref    = rmse;          /* update reference min RMSE */
                    *min_rmse       = min_rmse_ref;  /* update return min RMSE */
                    *best_nclusters = nclusters;     /* update optimum number of clusters */
                    index           = i;             /* update number of iteration to reach best RMSE */
                }
            }
        }

        deallocateMemory();     /* per-k teardown matching allocate() above */
    }

    free(membership);

    return index;
}

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,191 @@
/* getopt.h */
/* Declarations for getopt.
Copyright (C) 1989-1994, 1996-1999, 2001 Free Software
Foundation, Inc. This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute
it and/or modify it under the terms of the GNU Lesser
General Public License as published by the Free Software
Foundation; either version 2.1 of the License, or
(at your option) any later version.
The GNU C Library is distributed in the hope that it will
be useful, but WITHOUT ANY WARRANTY; without even the
implied warranty of MERCHANTABILITY or FITNESS FOR A
PARTICULAR PURPOSE. See the GNU Lesser General Public
License for more details.
You should have received a copy of the GNU Lesser General
Public License along with the GNU C Library; if not, write
to the Free Software Foundation, Inc., 59 Temple Place,
Suite 330, Boston, MA 02111-1307 USA. */
#ifndef _GETOPT_H
#ifndef __need_getopt
# define _GETOPT_H 1
#endif
/* If __GNU_LIBRARY__ is not already defined, either we are being used
standalone, or this is the first header included in the source file.
If we are being used with glibc, we need to include <features.h>, but
that does not exist if we are standalone. So: if __GNU_LIBRARY__ is
not defined, include <ctype.h>, which will pull in <features.h> for us
if it's from glibc. (Why ctype.h? It's guaranteed to exist and it
doesn't flood the namespace with stuff the way some other headers do.) */
#if !defined __GNU_LIBRARY__
# include <ctype.h>
#endif
#ifdef __cplusplus
extern "C" {
#endif
/* For communication from `getopt' to the caller.
When `getopt' finds an option that takes an argument,
the argument value is returned here.
Also, when `ordering' is RETURN_IN_ORDER,
each non-option ARGV-element is returned here. */
extern char *optarg;
/* Index in ARGV of the next element to be scanned.
This is used for communication to and from the caller
and for communication between successive calls to `getopt'.
On entry to `getopt', zero means this is the first call; initialize.
When `getopt' returns -1, this is the index of the first of the
non-option elements that the caller should itself scan.
Otherwise, `optind' communicates from one call to the next
how much of ARGV has been scanned so far. */
extern int optind;
/* Callers store zero here to inhibit the error message `getopt' prints
for unrecognized options. */
extern int opterr;
/* Set to an option character which was unrecognized. */
extern int optopt;
#ifndef __need_getopt
/* Describe the long-named options requested by the application.
The LONG_OPTIONS argument to getopt_long or getopt_long_only is a vector
of `struct option' terminated by an element containing a name which is
zero.
The field `has_arg' is:
no_argument (or 0) if the option does not take an argument,
required_argument (or 1) if the option requires an argument,
optional_argument (or 2) if the option takes an optional argument.
If the field `flag' is not NULL, it points to a variable that is set
to the value given in the field `val' when the option is found, but
left unchanged if the option is not found.
To have a long-named option do something other than set an `int' to
a compiled-in constant, such as set a value from `optarg', set the
option's `flag' field to zero and its `val' field to a nonzero
value (the equivalent single-letter option character, if there is
one). For long options that have a zero `flag' field, `getopt'
returns the contents of the `val' field. */
/* One entry of the long-options table passed to getopt_long and
   getopt_long_only. The table is terminated by an entry whose name is
   zero. */
struct option
{
/* Long option name. It is const-qualified only when compiling as ISO C or
   C++; otherwise it is a plain char pointer. */
# if (defined __STDC__ && __STDC__) || defined __cplusplus
const char *name;
# else
char *name;
# endif
/* has_arg can't be an enum because some compilers complain about
type mismatches in all the code that assumes it is an int. */
/* One of no_argument (0), required_argument (1), optional_argument (2). */
int has_arg;
/* If not NULL, points to a variable that is set to `val' when the option
   is found and left unchanged otherwise. */
int *flag;
/* Value stored through `flag'; when `flag' is zero, getopt returns it
   instead. */
int val;
};
/* Names for the values of the `has_arg' field of `struct option'. */
# define no_argument 0
# define required_argument 1
# define optional_argument 2
#endif /* need getopt */
/* Get definitions and prototypes for functions to process the
arguments in ARGV (ARGC of them, minus the program name) for
options given in OPTS.
Return the option character from OPTS just read. Return -1 when
there are no more options. For unrecognized options, or options
missing arguments, `optopt' is set to the option letter, and '?' is
returned.
The OPTS string is a list of characters which are recognized option
letters, optionally followed by colons, specifying that that letter
takes an argument, to be placed in `optarg'.
If a letter in OPTS is followed by two colons, its argument is
optional. This behavior is specific to the GNU `getopt'.
The argument `--' causes premature termination of argument
scanning, explicitly telling `getopt' that there are no more
options.
If OPTS begins with `-', then non-option arguments are treated as
arguments to the option '\1' (character code 1). This behavior is
specific to the GNU `getopt'. */
/* Prototypes. Two variants are provided: full ISO C / C++ prototypes, and
   old-style declarations without parameter lists for pre-ISO compilers.
   The long-option functions are omitted in both variants when
   __need_getopt is defined. */
#if (defined __STDC__ && __STDC__) || defined __cplusplus
# ifdef __GNU_LIBRARY__
/* Many other libraries have conflicting prototypes for getopt, with
differences in the consts, in stdlib.h. To avoid compilation
errors, only prototype getopt for the GNU C library. */
extern int getopt (int ___argc, char *const *___argv, const char *__shortopts);
# else /* not __GNU_LIBRARY__ */
/* Declared without a parameter list so it cannot clash with another
   library's prototype. */
extern int getopt ();
# endif /* __GNU_LIBRARY__ */
# ifndef __need_getopt
/* Long-option variants: besides the short-option string they take the
   `struct option' table and an int pointer named __longind. */
extern int getopt_long (int ___argc, char *const *___argv,
const char *__shortopts,
const struct option *__longopts, int *__longind);
extern int getopt_long_only (int ___argc, char *const *___argv,
const char *__shortopts,
const struct option *__longopts, int *__longind);
/* Internal only. Users should not call this directly. */
extern int _getopt_internal (int ___argc, char *const *___argv,
const char *__shortopts,
const struct option *__longopts, int *__longind,
int __long_only);
# endif
#else /* not __STDC__ */
/* Pre-ISO compilers: declarations only, no argument checking. */
extern int getopt ();
# ifndef __need_getopt
extern int getopt_long ();
extern int getopt_long_only ();
extern int _getopt_internal ();
# endif
#endif /* __STDC__ */
#ifdef __cplusplus
}
#endif
/* Make sure we later can get all the definitions and declarations. */
#undef __need_getopt
#endif /* getopt.h */

View File

@@ -0,0 +1,61 @@
#ifndef FLT_MAX
#define FLT_MAX 3.40282347e+38
#endif
/*
 * Assign every point to the cluster whose center is closest in squared
 * Euclidean distance.  One work-item handles one point.
 *
 *   feature    in:  read as feature[f * npoints + point_id]
 *   clusters   in:  read as clusters[c * nfeatures + f]
 *   membership out: membership[point_id] receives the winning cluster index
 *   npoints, nclusters, nfeatures: loop bounds for the accesses above
 *   offset, size: accepted but not used by this kernel
 */
__kernel void
kmeans_kernel_c(__global float *feature,
                __global float *clusters,
                __global int   *membership,
                int npoints,
                int nclusters,
                int nfeatures,
                int offset,
                int size
                )
{
    unsigned int point_id = get_global_id(0);

    /* Work-items beyond the last point have nothing to do. */
    if (point_id >= npoints)
        return;

    int   best_cluster = 0;        /* stays 0 when no cluster beats FLT_MAX */
    float best_dist    = FLT_MAX;

    for (int c = 0; c < nclusters; c++) {
        /* Accumulate the squared distance between this point and center c. */
        float sq_dist = 0;
        for (int f = 0; f < nfeatures; f++) {
            float diff = feature[f * npoints + point_id] - clusters[c * nfeatures + f];
            sq_dist += diff * diff;
        }

        /* Strict '<' keeps the lowest-numbered cluster on ties. */
        if (sq_dist < best_dist) {
            best_dist    = sq_dist;
            best_cluster = c;
        }
    }

    membership[point_id] = best_cluster;
}
/*
 * Transpose the feature matrix: element (tid, f) of the source, read as
 * feature[tid * nfeatures + f], is written to feature_swap[f * npoints + tid].
 * One work-item copies all nfeatures values of one point.
 */
__kernel void
kmeans_swap(__global float *feature,
            __global float *feature_swap,
            int npoints,
            int nfeatures
            )
{
    unsigned int tid = get_global_id(0);

    /* Bounds guard (added by Lingjie Zhang, 11/05/2015): work-items with
       tid >= npoints must not touch either buffer. */
    if (tid >= npoints)
        return;

    int f = 0;
    while (f < nfeatures) {
        feature_swap[f * npoints + tid] = feature[tid * nfeatures + f];
        f++;
    }
}

Binary file not shown.

View File

@@ -0,0 +1,65 @@
/*****************************************************************************/
/*IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. */
/*By downloading, copying, installing or using the software you agree */
/*to this license. If you do not agree to this license, do not download, */
/*install, copy or use the software. */
/* */
/* */
/*Copyright (c) 2005 Northwestern University */
/*All rights reserved. */
/*Redistribution of the software in source and binary forms, */
/*with or without modification, is permitted provided that the */
/*following conditions are met: */
/* */
/*1 Redistributions of source code must retain the above copyright */
/* notice, this list of conditions and the following disclaimer. */
/* */
/*2 Redistributions in binary form must reproduce the above copyright */
/* notice, this list of conditions and the following disclaimer in the */
/* documentation and/or other materials provided with the distribution.*/
/* */
/*3 Neither the name of Northwestern University nor the names of its */
/* contributors may be used to endorse or promote products derived */
/* from this software without specific prior written permission. */
/* */
/*THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ``AS */
/*IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED */
/*TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY, NON-INFRINGEMENT AND */
/*FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL */
/*NORTHWESTERN UNIVERSITY OR ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, */
/*INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/*(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR */
/*SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) */
/*HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, */
/*STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN */
/*ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/*POSSIBILITY OF SUCH DAMAGE. */
/******************************************************************************/
#ifndef _H_FUZZY_KMEANS
#define _H_FUZZY_KMEANS
/* Fallback value, used only when no earlier header has defined FLT_MAX. */
#ifndef FLT_MAX
#define FLT_MAX 3.40282347e+38
#endif
/* Give the declarations below C linkage when included from C++. */
#ifdef __cplusplus
extern "C" {
#endif
/* NOTE(review): the helpers below are defined outside this header and their
   parameters are unnamed here; the meanings are presumably (point, center,
   dimension count) and similar -- confirm against the definitions. */
float euclid_dist_2 (float*, float*, int);
int find_nearest_point (float* , int, float**, int);
float rms_err(float**, int, int, float**, int);
int cluster(int, int, float**, int, int, float, int*, float***, float*, int, int);
/* NOTE(review): setup/allocate/deallocateMemory look like the benchmark's
   initialisation and teardown entry points -- confirm against the definitions. */
int setup(int argc, char** argv);
int allocate(int npoints, int nfeatures, int nclusters, float **feature);
void deallocateMemory();
/* Called once per iteration by kmeans_clustering(), which casts the int
   result to float and compares it against its threshold. */
int kmeansOCL(float **feature, int nfeatures, int npoints, int nclusters, int *membership, float **clusters, int *new_centers_len, float **new_centers);
/* Iterates kmeansOCL() and returns the [nclusters][nfeatures] array of
   cluster centers; membership is an output array with npoints entries. */
float** kmeans_clustering(float **feature, int nfeatures, int npoints, int nclusters, float threshold, int *membership);
#ifdef __cplusplus
}
#endif
#endif

View File

@@ -0,0 +1,176 @@
/*****************************************************************************/
/*IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. */
/*By downloading, copying, installing or using the software you agree */
/*to this license. If you do not agree to this license, do not download, */
/*install, copy or use the software. */
/* */
/* */
/*Copyright (c) 2005 Northwestern University */
/*All rights reserved. */
/*Redistribution of the software in source and binary forms, */
/*with or without modification, is permitted provided that the */
/*following conditions are met: */
/* */
/*1 Redistributions of source code must retain the above copyright */
/* notice, this list of conditions and the following disclaimer. */
/* */
/*2 Redistributions in binary form must reproduce the above copyright */
/* notice, this list of conditions and the following disclaimer in the */
/* documentation and/or other materials provided with the distribution.*/
/* */
/*3 Neither the name of Northwestern University nor the names of its */
/* contributors may be used to endorse or promote products derived */
/* from this software without specific prior written permission. */
/* */
/*THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ``AS */
/*IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED */
/*TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY, NON-INFRINGEMENT AND */
/*FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL */
/*NORTHWESTERN UNIVERSITY OR ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, */
/*INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/*(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR */
/*SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) */
/*HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, */
/*STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN */
/*ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/*POSSIBILITY OF SUCH DAMAGE. */
/******************************************************************************/
/*************************************************************************/
/** File: kmeans_clustering.c **/
/** Description: Implementation of regular k-means clustering **/
/** algorithm **/
/** Author: Wei-keng Liao **/
/** ECE Department, Northwestern University **/
/** email: wkliao@ece.northwestern.edu **/
/** **/
/** Edited by: Jay Pisharath **/
/** Northwestern University. **/
/** **/
/** ================================================================ **/
/** **/
/** Edited by: Shuai Che, David Tarjan, Sang-Ha Lee **/
/** University of Virginia **/
/** **/
/** Description: No longer supports fuzzy c-means clustering; **/
/** only regular k-means clustering. **/
/** No longer performs "validity" function to analyze **/
/** compactness and separation criteria; instead **/
/** calculate root mean squared error. **/
/** **/
/*************************************************************************/
#include <stdio.h>
#include <stdlib.h>
#include <float.h>
#include <math.h>
#include "kmeans.h"
#define RANDOM_MAX 2147483647
extern double wtime(void);
/*----< kmeans_clustering() >---------------------------------------------*/
/*
 * Run k-means on `feature` and return the final cluster centers.
 *
 * Parameters
 *   feature     in:  [npoints][nfeatures] input points
 *   nfeatures   number of attributes per point
 *   npoints     number of points
 *   nclusters   requested number of clusters; clamped to npoints
 *   threshold   iteration stops once the value returned by kmeansOCL()
 *               is no longer greater than this
 *   membership  out: [npoints] cluster index per point (reset to -1 here,
 *               then handed to kmeansOCL())
 *
 * Returns
 *   A [nclusters][nfeatures] array stored as one contiguous buffer; the
 *   caller releases it with free(result[0]); free(result);.
 *   NULL when nclusters (after clamping) is not positive or when an
 *   allocation fails.
 *
 * The centers are seeded with the first nclusters input points, so every
 * seed comes from a distinct point.
 */
float** kmeans_clustering(float **feature,    /* in: [npoints][nfeatures] */
                          int     nfeatures,
                          int     npoints,
                          int     nclusters,
                          float   threshold,
                          int    *membership) /* out: [npoints] */
{
    int     i, j;                  /* counters */
    int     loop = 0;              /* guard counter for the iteration cap */
    int     c = 0;                 /* iterations actually executed */
    int    *new_centers_len;       /* [nclusters]: no. of points in each cluster */
    float   delta;                 /* kmeansOCL() result, compared to threshold */
    float **clusters;              /* out: [nclusters][nfeatures] */
    float **new_centers = NULL;    /* [nclusters][nfeatures] */

    /* nclusters should never be > npoints
       that would guarantee a cluster without points */
    if (nclusters > npoints)
        nclusters = npoints;

    /* Nothing to cluster; also avoids writing to a zero-sized allocation. */
    if (nclusters <= 0)
        return NULL;

    /* allocate space for the returned clusters[]: one row-pointer table plus
       one contiguous data block (sizes computed in size_t to avoid int
       overflow of nclusters * nfeatures) */
    clusters = (float**) malloc((size_t)nclusters * sizeof(float*));
    if (clusters == NULL) {
        fprintf(stderr, "kmeans_clustering: out of memory\n");
        return NULL;
    }
    clusters[0] = (float*) malloc((size_t)nclusters * (size_t)nfeatures * sizeof(float));
    if (clusters[0] == NULL) {
        fprintf(stderr, "kmeans_clustering: out of memory\n");
        free(clusters);
        return NULL;
    }
    for (i = 1; i < nclusters; i++)
        clusters[i] = clusters[i-1] + nfeatures;

    /* Seed the centers with the first nclusters points.  nclusters <= npoints
       is guaranteed above, so every index is valid and no point is used
       twice. */
    for (i = 0; i < nclusters; i++) {
        for (j = 0; j < nfeatures; j++)
            clusters[i][j] = feature[i][j];
    }

    /* initialize the membership to -1 for all */
    for (i = 0; i < npoints; i++)
        membership[i] = -1;

    /* allocate space for and zero-initialize new_centers_len and new_centers */
    new_centers_len = (int*) calloc((size_t)nclusters, sizeof(int));
    new_centers     = (float**) malloc((size_t)nclusters * sizeof(float*));
    if (new_centers != NULL)
        new_centers[0] = (float*) calloc((size_t)nclusters * (size_t)nfeatures, sizeof(float));
    if (new_centers_len == NULL || new_centers == NULL || new_centers[0] == NULL) {
        fprintf(stderr, "kmeans_clustering: out of memory\n");
        if (new_centers != NULL)
            free(new_centers[0]);
        free(new_centers);
        free(new_centers_len);
        free(clusters[0]);
        free(clusters);
        return NULL;
    }
    for (i = 1; i < nclusters; i++)
        new_centers[i] = new_centers[i-1] + nfeatures;

    /* iterate until convergence */
    do {
        /* assignment step, delegated to kmeansOCL() */
        delta = (float) kmeansOCL(feature,          /* in: [npoints][nfeatures] */
                                  nfeatures,        /* number of attributes for each point */
                                  npoints,          /* number of data points */
                                  nclusters,        /* number of clusters */
                                  membership,       /* which cluster the point belongs to */
                                  clusters,         /* out: [nclusters][nfeatures] */
                                  new_centers_len,  /* out: number of points in each cluster */
                                  new_centers       /* sum of points in each cluster */
                                  );

        /* replace old cluster centers with new_centers */
        /* CPU side of reduction */
        for (i = 0; i < nclusters; i++) {
            for (j = 0; j < nfeatures; j++) {
                /* an empty cluster keeps its previous center */
                if (new_centers_len[i] > 0)
                    clusters[i][j] = new_centers[i][j] / new_centers_len[i]; /* take average i.e. sum/n */
                new_centers[i][j] = 0.0;   /* set back to 0 */
            }
            new_centers_len[i] = 0;        /* set back to 0 */
        }
        c++;
    } while ((delta > threshold) && (loop++ < 500)); /* makes sure loop terminates */

    printf("iterated %d times\n", c);

    free(new_centers[0]);
    free(new_centers);
    free(new_centers_len);
    return clusters;
}

Some files were not shown because too many files have changed in this diff Show More