project directories reorganization
This commit is contained in:
@@ -1,101 +0,0 @@
|
||||
/*
|
||||
* Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
|
||||
*
|
||||
* Please refer to the NVIDIA end user license agreement (EULA) associated
|
||||
* with this source code for terms and conditions that govern your use of
|
||||
* this software. Any use, reproduction, disclosure, or distribution of
|
||||
* this software and related documentation outside the terms of the EULA
|
||||
* is strictly prohibited.
|
||||
*
|
||||
*/
|
||||
|
||||
// Math wrappers used by the pricing code below.
// By default they map to the regular exp/log/sqrt functions. Define
// USE_NATIVE_MATH at build time (e.g. pass -DUSE_NATIVE_MATH in the program
// build options) to select the native_* variants instead of editing this file.
#if defined(USE_NATIVE_MATH)
    #define EXP(a) native_exp(a)
    #define LOG(a) native_log(a)
    #define SQRT(a) native_sqrt(a)
#else
    #define EXP(a) exp(a)
    #define LOG(a) log(a)
    #define SQRT(a) sqrt(a)
#endif
|
||||
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
// Predefine functions to avoid bug in OpenCL compiler on Mac OSX 10.7 systems
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
float CND(float d);
|
||||
void BlackScholesBody(__global float *call, __global float *put, float S,
|
||||
float X, float T, float R, float V);
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
// Rational approximation of cumulative normal distribution function
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
float CND(float d){
    // Polynomial coefficients of the rational approximation
    const float c1 = 0.31938153f;
    const float c2 = -0.356563782f;
    const float c3 = 1.781477937f;
    const float c4 = -1.821255978f;
    const float c5 = 1.330274429f;
    const float invSqrt2Pi = 0.39894228040143267793994605993438f;

    // The polynomial is evaluated in k, which depends only on |d|
    const float k = 1.0f / (1.0f + 0.2316419f * fabs(d));
    const float poly = k * (c1 + k * (c2 + k * (c3 + k * (c4 + k * c5))));

    // Value computed from |d|; it is returned as-is for d <= 0
    const float tail = invSqrt2Pi * EXP(-0.5f * d * d) * poly;

    // For positive d the result is the complement of that value
    return (d > 0) ? (1.0f - tail) : tail;
}
|
||||
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
// Black-Scholes formula for both call and put
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
void BlackScholesBody(
|
||||
__global float *call, //Call option price
|
||||
__global float *put, //Put option price
|
||||
float S, //Current stock price
|
||||
float X, //Option strike price
|
||||
float T, //Option years
|
||||
float R, //Riskless rate of return
|
||||
float V //Stock volatility
|
||||
){
|
||||
float sqrtT = SQRT(T);
|
||||
float d1 = (LOG(S / X) + (R + 0.5f * V * V) * T) / (V * sqrtT);
|
||||
float d2 = d1 - V * sqrtT;
|
||||
float CNDD1 = CND(d1);
|
||||
float CNDD2 = CND(d2);
|
||||
|
||||
//Calculate Call and Put simultaneously
|
||||
float expRT = EXP(- R * T);
|
||||
*call = (S * CNDD1 - X * expRT * CNDD2);
|
||||
*put = (X * expRT * (1.0f - CNDD2) - S * (1.0f - CNDD1));
|
||||
}
|
||||
|
||||
|
||||
|
||||
__kernel void BlackScholes(
|
||||
__global float *d_Call, //Call option price
|
||||
__global float *d_Put, //Put option price
|
||||
__global float *d_S, //Current stock price
|
||||
__global float *d_X, //Option strike price
|
||||
__global float *d_T, //Option years
|
||||
float R, //Riskless rate of return
|
||||
float V, //Stock volatility
|
||||
unsigned int optN
|
||||
){
|
||||
for(unsigned int opt = get_global_id(0); opt < optN; opt += get_global_size(0))
|
||||
BlackScholesBody(
|
||||
&d_Call[opt],
|
||||
&d_Put[opt],
|
||||
d_S[opt],
|
||||
d_X[opt],
|
||||
d_T[opt],
|
||||
R,
|
||||
V
|
||||
);
|
||||
}
|
||||
@@ -1,66 +0,0 @@
|
||||
# Builds the BlackScholes OpenCL sample with the riscv32 toolchain:
#   lib$(PROJECT).a  - kernel library produced by poclcc from BlackScholes.cl
#   $(PROJECT).elf   - host program linked with the runtime sources in VX_SRCS
#   $(PROJECT).qemu  - host program linked against the qemu vx_api stub
PROJECT = BlackScholes

# Tool and library locations (override from the command line or environment)
RISCV_TOOL_PATH ?= $(wildcard ../../../../riscv-gnu-toolchain/drops)
POCL_CC_PATH    ?= $(wildcard ../../../../pocl/drops_riscv_cc)
POCL_INC_PATH   ?= $(wildcard ../include)
POCL_LIB_PATH   ?= $(wildcard ../lib)
VX_RT_PATH      ?= $(wildcard ../../../runtime)
VX_SIMX_PATH    ?= $(wildcard ../../../simX/obj_dir)

CC  = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-gcc
CXX = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-g++
DMP = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-objdump
HEX = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-objcopy
GDB = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-gdb

# Runtime sources compiled into the .elf image
VX_SRCS  = $(VX_RT_PATH)/newlib/newlib.c
VX_SRCS += $(VX_RT_PATH)/startup/vx_start.S
VX_SRCS += $(VX_RT_PATH)/intrinsics/vx_intrinsics.s
VX_SRCS += $(VX_RT_PATH)/io/vx_io.s $(VX_RT_PATH)/io/vx_io.c
VX_SRCS += $(VX_RT_PATH)/fileio/fileio.s
VX_SRCS += $(VX_RT_PATH)/tests/tests.c
VX_SRCS += $(VX_RT_PATH)/vx_api/vx_api.c
# These variables are not set in this Makefile; they expand to nothing unless
# supplied externally.
VX_SRCS += $(VX_STR) $(VX_FIO) $(VX_NEWLIB) $(VX_INT) $(VX_IO) $(VX_API) $(VX_TEST)

VX_CFLAGS = -nostartfiles -Wl,-Bstatic,-T,$(VX_RT_PATH)/mains/vortex_link.ld

CXXFLAGS  = -g -O0 -march=rv32im -mabi=ilp32
CXXFLAGS += -ffreestanding # program may not begin at main()
CXXFLAGS += -Wl,--gc-sections # enable garbage collection of unused input sections
CXXFLAGS += -fno-rtti -fno-non-call-exceptions # disable RTTI and exceptions
CXXFLAGS += -I$(POCL_INC_PATH) -I.

VX_LIBS   = -Wl,--whole-archive lib$(PROJECT).a -Wl,--no-whole-archive $(POCL_LIB_PATH)/libOpenCL.a
QEMU_LIBS = $(VX_RT_PATH)/qemu/vx_api.c -Wl,--whole-archive lib$(PROJECT).a -Wl,--no-whole-archive $(POCL_LIB_PATH)/libOpenCL.a

# Host translation units. The header is tracked as a prerequisite only; it is
# not passed to the compiler as an input file.
HOST_SRCS = main.cc oclBlackScholes_launcher.cpp oclBlackScholes_gold.cpp
HOST_HDRS = oclBlackScholes_common.h

.PHONY: all run qemu gdb-s gdb-c clean

all: $(PROJECT).dump $(PROJECT).hex

# Compile the prerequisite ($<) so the rule and its dependency cannot drift apart.
lib$(PROJECT).a: BlackScholes.cl
	POCL_DEBUG=all POCL_DEBUG_LLVM_PASSES=1 LD_LIBRARY_PATH=$(RISCV_TOOL_PATH)/lib:$(POCL_CC_PATH)/lib $(POCL_CC_PATH)/bin/poclcc -o lib$(PROJECT).a $<

$(PROJECT).elf: $(HOST_SRCS) $(HOST_HDRS) lib$(PROJECT).a
	$(CXX) $(CXXFLAGS) $(VX_CFLAGS) $(VX_SRCS) $(HOST_SRCS) $(VX_LIBS) -o $(PROJECT).elf

$(PROJECT).qemu: $(HOST_SRCS) $(HOST_HDRS) lib$(PROJECT).a
	$(CXX) $(CXXFLAGS) $(HOST_SRCS) $(QEMU_LIBS) -o $(PROJECT).qemu

$(PROJECT).hex: $(PROJECT).elf
	$(HEX) -O ihex $(PROJECT).elf $(PROJECT).hex

$(PROJECT).dump: $(PROJECT).elf
	$(DMP) -D $(PROJECT).elf > $(PROJECT).dump

run: $(PROJECT).hex
	POCL_DEBUG=all $(VX_SIMX_PATH)/Vcache_simX -E -a rv32i --core $(PROJECT).hex -s -b 1> emulator.debug

qemu: $(PROJECT).qemu
	POCL_DEBUG=all $(RISCV_TOOL_PATH)/bin/qemu-riscv32 -d in_asm -D debug.log $(PROJECT).qemu

gdb-s: $(PROJECT).qemu
	POCL_DEBUG=all $(RISCV_TOOL_PATH)/bin/qemu-riscv32 -g 1234 -d in_asm -D debug.log $(PROJECT).qemu

gdb-c: $(PROJECT).qemu
	$(GDB) $(PROJECT).qemu

# lib$(PROJECT).a is left in place, as in the original rule.
clean:
	rm -rf *.elf *.dump *.hex $(PROJECT).qemu
|
||||
@@ -1,248 +0,0 @@
|
||||
/*
|
||||
* Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
|
||||
*
|
||||
* Please refer to the NVIDIA end user license agreement (EULA) associated
|
||||
* with this source code for terms and conditions that govern your use of
|
||||
* this software. Any use, reproduction, disclosure, or distribution of
|
||||
* this software and related documentation outside the terms of the EULA
|
||||
* is strictly prohibited.
|
||||
*
|
||||
*/
|
||||
|
||||
// standard utilities and systems includes
|
||||
#include <oclUtils.h>
|
||||
#include <shrQATest.h>
|
||||
#include "oclBlackScholes_common.h"
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
// Helper functions
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
// Returns the time between the CL_PROFILING_COMMAND_START and
// CL_PROFILING_COMMAND_END timestamps of `event`, in seconds.
// Returns 0.0 when either timestamp cannot be queried, instead of computing
// a difference from uninitialized values.
double executionTime(cl_event &event){
    cl_ulong start = 0, end = 0;

    cl_int status = clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &end, NULL);
    status |= clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &start, NULL);

    if(status != CL_SUCCESS)
        return 0.0;

    return (double)1.0e-9 * (end - start); // convert nanoseconds to seconds on return
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
// Random float helper
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
// Returns a pseudo-random value obtained by blending `low` and `high` with a
// weight drawn from rand().
float randFloat(float low, float high){
    // Weight in [0, 1] taken from the C library generator
    const float weight = (float)rand() / (float)RAND_MAX;

    const float lowShare  = (1.0f - weight) * low;
    const float highShare = weight * high;
    return lowShare + highShare;
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
// Main program
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
int main(int argc, char **argv)
|
||||
{
|
||||
cl_platform_id cpPlatform; //OpenCL platform
|
||||
cl_device_id* cdDevices = NULL; //OpenCL devices list (array)
|
||||
cl_context cxGPUContext; //OpenCL context
|
||||
cl_command_queue cqCommandQueue; //OpenCL command que
|
||||
cl_mem //OpenCL memory buffer objects
|
||||
d_Call,
|
||||
d_Put,
|
||||
d_S,
|
||||
d_X,
|
||||
d_T;
|
||||
|
||||
cl_int ciErrNum;
|
||||
|
||||
float
|
||||
*h_CallCPU,
|
||||
*h_PutCPU,
|
||||
*h_CallGPU,
|
||||
*h_PutGPU,
|
||||
*h_S,
|
||||
*h_X,
|
||||
*h_T;
|
||||
|
||||
const unsigned int optionCount = 4000000;
|
||||
const float R = 0.02f;
|
||||
const float V = 0.30f;
|
||||
|
||||
shrQAStart(argc, argv);
|
||||
|
||||
// Get the NVIDIA platform
|
||||
ciErrNum = oclGetPlatformID(&cpPlatform);
|
||||
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL);
|
||||
shrLog("clGetPlatformID...\n");
|
||||
|
||||
//Get all the devices
|
||||
cl_uint uiNumDevices = 0; // Number of devices available
|
||||
cl_uint uiTargetDevice = 0; // Default Device to compute on
|
||||
cl_uint uiNumComputeUnits; // Number of compute units (SM's on NV GPU)
|
||||
shrLog("Get the Device info and select Device...\n");
|
||||
ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_DEFAULT, 0, NULL, &uiNumDevices);
|
||||
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL);
|
||||
cdDevices = (cl_device_id *)malloc(uiNumDevices * sizeof(cl_device_id) );
|
||||
ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_DEFAULT, uiNumDevices, cdDevices, NULL);
|
||||
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL);
|
||||
|
||||
// Get command line device options and config accordingly
|
||||
shrLog(" # of Devices Available = %u\n", uiNumDevices);
|
||||
if(shrGetCmdLineArgumentu(argc, (const char**)argv, "device", &uiTargetDevice)== shrTRUE)
|
||||
{
|
||||
uiTargetDevice = CLAMP(uiTargetDevice, 0, (uiNumDevices - 1));
|
||||
}
|
||||
shrLog(" Using Device %u: ", uiTargetDevice);
|
||||
oclPrintDevName(LOGBOTH, cdDevices[uiTargetDevice]);
|
||||
ciErrNum = clGetDeviceInfo(cdDevices[uiTargetDevice], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(uiNumComputeUnits), &uiNumComputeUnits, NULL);
|
||||
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL);
|
||||
shrLog("\n # of Compute Units = %u\n", uiNumComputeUnits);
|
||||
|
||||
// set logfile name and start logs
|
||||
shrSetLogFileName ("oclBlackScholes.txt");
|
||||
shrLog("%s Starting...\n\n", argv[0]);
|
||||
|
||||
shrLog("Allocating and initializing host memory...\n");
|
||||
h_CallCPU = (float *)malloc(optionCount * sizeof(float));
|
||||
h_PutCPU = (float *)malloc(optionCount * sizeof(float));
|
||||
h_CallGPU = (float *)malloc(optionCount * sizeof(float));
|
||||
h_PutGPU = (float *)malloc(optionCount * sizeof(float));
|
||||
h_S = (float *)malloc(optionCount * sizeof(float));
|
||||
h_X = (float *)malloc(optionCount * sizeof(float));
|
||||
h_T = (float *)malloc(optionCount * sizeof(float));
|
||||
|
||||
srand(2009);
|
||||
for(unsigned int i = 0; i < optionCount; i++){
|
||||
h_CallCPU[i] = -1.0f;
|
||||
h_PutCPU[i] = -1.0f;
|
||||
h_S[i] = randFloat(5.0f, 30.0f);
|
||||
h_X[i] = randFloat(1.0f, 100.0f);
|
||||
h_T[i] = randFloat(0.25f, 10.0f);
|
||||
}
|
||||
|
||||
shrLog("Initializing OpenCL...\n");
|
||||
// Get the NVIDIA platform
|
||||
ciErrNum = oclGetPlatformID(&cpPlatform);
|
||||
//oclCheckError(ciErrNum, CL_SUCCESS);
|
||||
|
||||
// Get a GPU device
|
||||
ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_DEFAULT, 1, &cdDevices[uiTargetDevice], NULL);
|
||||
//oclCheckError(ciErrNum, CL_SUCCESS);
|
||||
|
||||
// Create the context
|
||||
cxGPUContext = clCreateContext(0, 1, &cdDevices[uiTargetDevice], NULL, NULL, &ciErrNum);
|
||||
//oclCheckError(ciErrNum, CL_SUCCESS);
|
||||
|
||||
//Create a command-queue
|
||||
cqCommandQueue = clCreateCommandQueue(cxGPUContext, cdDevices[uiTargetDevice], CL_QUEUE_PROFILING_ENABLE, &ciErrNum);
|
||||
//oclCheckError(ciErrNum, CL_SUCCESS);
|
||||
|
||||
shrLog("Creating OpenCL memory objects...\n");
|
||||
d_Call = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE, optionCount * sizeof(float), NULL, &ciErrNum);
|
||||
//oclCheckError(ciErrNum, CL_SUCCESS);
|
||||
d_Put = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE, optionCount * sizeof(float), NULL, &ciErrNum);
|
||||
//oclCheckError(ciErrNum, CL_SUCCESS);
|
||||
d_S = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, optionCount * sizeof(float), h_S, &ciErrNum);
|
||||
//oclCheckError(ciErrNum, CL_SUCCESS);
|
||||
d_X = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, optionCount * sizeof(float), h_X, &ciErrNum);
|
||||
//oclCheckError(ciErrNum, CL_SUCCESS);
|
||||
d_T = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, optionCount * sizeof(float), h_T, &ciErrNum);
|
||||
//oclCheckError(ciErrNum, CL_SUCCESS);
|
||||
|
||||
shrLog("Starting up BlackScholes...\n");
|
||||
initBlackScholes(cxGPUContext, cqCommandQueue, (const char **)argv);
|
||||
|
||||
shrLog("Running OpenCL BlackScholes...\n\n");
|
||||
//Just a single run or a warmup iteration
|
||||
BlackScholes(
|
||||
NULL,
|
||||
d_Call,
|
||||
d_Put,
|
||||
d_S,
|
||||
d_X,
|
||||
d_T,
|
||||
R,
|
||||
V,
|
||||
optionCount
|
||||
);
|
||||
|
||||
#ifdef GPU_PROFILING
|
||||
const int numIterations = 16;
|
||||
cl_event startMark, endMark;
|
||||
ciErrNum = clEnqueueMarker(cqCommandQueue, &startMark);
|
||||
ciErrNum |= clFinish(cqCommandQueue);
|
||||
shrCheckError(ciErrNum, CL_SUCCESS);
|
||||
shrDeltaT(0);
|
||||
|
||||
for(int i = 0; i < numIterations; i++){
|
||||
BlackScholes(
|
||||
cqCommandQueue,
|
||||
d_Call,
|
||||
d_Put,
|
||||
d_S,
|
||||
d_X,
|
||||
d_T,
|
||||
R,
|
||||
V,
|
||||
optionCount
|
||||
);
|
||||
}
|
||||
|
||||
ciErrNum = clEnqueueMarker(cqCommandQueue, &endMark);
|
||||
ciErrNum |= clFinish(cqCommandQueue);
|
||||
shrCheckError(ciErrNum, CL_SUCCESS);
|
||||
|
||||
//Calculate performance metrics by wallclock time
|
||||
double gpuTime = shrDeltaT(0) / numIterations;
|
||||
shrLogEx(LOGBOTH | MASTER, 0, "oclBlackScholes, Throughput = %.4f GOptions/s, Time = %.5f s, Size = %u options, NumDevsUsed = %i, Workgroup = %u\n",
|
||||
(double)(2.0 * optionCount * 1.0e-9)/gpuTime, gpuTime, (2 * optionCount), 1, 0);
|
||||
|
||||
//Get profiling info
|
||||
cl_ulong startTime = 0, endTime = 0;
|
||||
ciErrNum = clGetEventProfilingInfo(startMark, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &startTime, NULL);
|
||||
ciErrNum |= clGetEventProfilingInfo(endMark, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &endTime, NULL);
|
||||
shrCheckError(ciErrNum, CL_SUCCESS);
|
||||
shrLog("\nOpenCL time: %.5f s\n\n", 1.0e-9 * ((double)endTime - (double)startTime) / (double)numIterations);
|
||||
#endif
|
||||
|
||||
shrLog("\nReading back OpenCL BlackScholes results...\n");
|
||||
ciErrNum = clEnqueueReadBuffer(cqCommandQueue, d_Call, CL_TRUE, 0, optionCount * sizeof(float), h_CallGPU, 0, NULL, NULL);
|
||||
//oclCheckError(ciErrNum, CL_SUCCESS);
|
||||
ciErrNum = clEnqueueReadBuffer(cqCommandQueue, d_Put, CL_TRUE, 0, optionCount * sizeof(float), h_PutGPU, 0, NULL, NULL);
|
||||
//oclCheckError(ciErrNum, CL_SUCCESS);
|
||||
|
||||
shrLog("Comparing against Host/C++ computation...\n");
|
||||
BlackScholesCPU(h_CallCPU, h_PutCPU, h_S, h_X, h_T, R, V, optionCount);
|
||||
double deltaCall = 0, deltaPut = 0, sumCall = 0, sumPut = 0;
|
||||
double L1call, L1put;
|
||||
for(unsigned int i = 0; i < optionCount; i++)
|
||||
{
|
||||
sumCall += fabs(h_CallCPU[i]);
|
||||
sumPut += fabs(h_PutCPU[i]);
|
||||
deltaCall += fabs(h_CallCPU[i] - h_CallGPU[i]);
|
||||
deltaPut += fabs(h_PutCPU[i] - h_PutGPU[i]);
|
||||
}
|
||||
L1call = deltaCall / sumCall;
|
||||
L1put = deltaPut / sumPut;
|
||||
shrLog("Relative L1 (call, put) = (%.3e, %.3e)\n\n", L1call, L1put);
|
||||
|
||||
shrLog("Shutting down...\n");
|
||||
closeBlackScholes();
|
||||
ciErrNum = clReleaseMemObject(d_T);
|
||||
ciErrNum |= clReleaseMemObject(d_X);
|
||||
ciErrNum |= clReleaseMemObject(d_S);
|
||||
ciErrNum |= clReleaseMemObject(d_Put);
|
||||
ciErrNum |= clReleaseMemObject(d_Call);
|
||||
ciErrNum |= clReleaseCommandQueue(cqCommandQueue);
|
||||
ciErrNum |= clReleaseContext(cxGPUContext);
|
||||
//oclCheckError(ciErrNum, CL_SUCCESS);
|
||||
|
||||
free(h_T);
|
||||
free(h_X);
|
||||
free(h_S);
|
||||
free(h_PutGPU);
|
||||
free(h_CallGPU);
|
||||
free(h_PutCPU);
|
||||
free(h_CallCPU);
|
||||
|
||||
if(cdDevices)free(cdDevices);
|
||||
|
||||
shrQAFinishExit(argc, (const char **)argv, ((L1call < 1E-6) && (L1put < 1E-6)) ? QA_PASSED : QA_FAILED );
|
||||
}
|
||||
Binary file not shown.
@@ -1,50 +0,0 @@
|
||||
/*
|
||||
* Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
|
||||
*
|
||||
* Please refer to the NVIDIA end user license agreement (EULA) associated
|
||||
* with this source code for terms and conditions that govern your use of
|
||||
* this software. Any use, reproduction, disclosure, or distribution of
|
||||
* this software and related documentation outside the terms of the EULA
|
||||
* is strictly prohibited.
|
||||
*
|
||||
*/
|
||||
|
||||
|
||||
|
||||
#ifndef OCL_BLACKSCHOLES_COMMON_H
#define OCL_BLACKSCHOLES_COMMON_H

#include <oclUtils.h>

////////////////////////////////////////////////////////////////////////////////
// Process an array of optN options on CPU
////////////////////////////////////////////////////////////////////////////////
extern "C" void BlackScholesCPU(
    float *h_Call, //Call option price
    float *h_Put,  //Put option price
    float *h_S,    //Current stock price
    float *h_X,    //Option strike price
    float *h_T,    //Option years
    float R,       //Riskless rate of return
    float V,       //Stock volatility
    unsigned int optionCount
);

////////////////////////////////////////////////////////////////////////////////
// OpenCL Black-Scholes kernel launcher
////////////////////////////////////////////////////////////////////////////////
// Parameter name matches the definition (cqParamCommandQueue).
extern "C" void initBlackScholes(cl_context cxGPUContext, cl_command_queue cqParamCommandQueue, const char **argv);

extern "C" void closeBlackScholes(void);

extern "C" void BlackScholes(
    cl_command_queue cqCommandQueue,
    cl_mem d_Call, //Call option price
    cl_mem d_Put,  //Put option price
    cl_mem d_S,    //Current stock price
    cl_mem d_X,    //Option strike price
    cl_mem d_T,    //Option years
    cl_float R,    //Riskless rate of return
    cl_float V,    //Stock volatility
    cl_uint optionCount
);

#endif // OCL_BLACKSCHOLES_COMMON_H
|
||||
@@ -1,92 +0,0 @@
|
||||
/*
|
||||
* Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
|
||||
*
|
||||
* Please refer to the NVIDIA end user license agreement (EULA) associated
|
||||
* with this source code for terms and conditions that govern your use of
|
||||
* this software. Any use, reproduction, disclosure, or distribution of
|
||||
* this software and related documentation outside the terms of the EULA
|
||||
* is strictly prohibited.
|
||||
*
|
||||
*/
|
||||
|
||||
|
||||
|
||||
#include <math.h>
|
||||
#include "oclBlackScholes_common.h"
|
||||
|
||||
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
// Rational approximation of cumulative normal distribution function
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
static double CND(double d){
    // Polynomial coefficients of the rational approximation
    const double c1 = 0.31938153;
    const double c2 = -0.356563782;
    const double c3 = 1.781477937;
    const double c4 = -1.821255978;
    const double c5 = 1.330274429;
    const double invSqrt2Pi = 0.39894228040143267793994605993438;

    // The polynomial is evaluated in k, which depends only on |d|
    const double k = 1.0 / (1.0 + 0.2316419 * fabs(d));
    const double poly = k * (c1 + k * (c2 + k * (c3 + k * (c4 + k * c5))));

    // Value computed from |d|; it is returned as-is for d <= 0
    const double tail = invSqrt2Pi * exp(-0.5 * d * d) * poly;

    // For positive d the result is the complement of that value
    if(d > 0)
        return 1.0 - tail;
    return tail;
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
// Black-Scholes formula for both call and put
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
static void BlackScholesBodyCPU(
|
||||
float& call, //Call option price
|
||||
float& put, //Put option price
|
||||
float Sf, //Current stock price
|
||||
float Xf, //Option strike price
|
||||
float Tf, //Option years
|
||||
float Rf, //Riskless rate of return
|
||||
float Vf //Stock volatility
|
||||
){
|
||||
double S = Sf, X = Xf, T = Tf, R = Rf, V = Vf;
|
||||
|
||||
double sqrtT = sqrt(T);
|
||||
double d1 = (log(S / X) + (R + 0.5 * V * V) * T) / (V * sqrtT);
|
||||
double d2 = d1 - V * sqrtT;
|
||||
double CNDD1 = CND(d1);
|
||||
double CNDD2 = CND(d2);
|
||||
|
||||
//Calculate Call and Put simultaneously
|
||||
double expRT = exp(- R * T);
|
||||
call = (float)(S * CNDD1 - X * expRT * CNDD2);
|
||||
put = (float)(X * expRT * (1.0 - CNDD2) - S * (1.0 - CNDD1));
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
// Process an array of optN options
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" void BlackScholesCPU(
|
||||
float *h_Call, //Call option price
|
||||
float *h_Put, //Put option price
|
||||
float *h_S, //Current stock price
|
||||
float *h_X, //Option strike price
|
||||
float *h_T, //Option years
|
||||
float R, //Riskless rate of return
|
||||
float V, //Stock volatility
|
||||
unsigned int optionCount
|
||||
){
|
||||
for(unsigned int i = 0; i < optionCount; i++)
|
||||
BlackScholesBodyCPU(
|
||||
h_Call[i],
|
||||
h_Put[i],
|
||||
h_S[i],
|
||||
h_X[i],
|
||||
h_T[i],
|
||||
R,
|
||||
V
|
||||
);
|
||||
}
|
||||
@@ -1,125 +0,0 @@
|
||||
/*
|
||||
* Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
|
||||
*
|
||||
* Please refer to the NVIDIA end user license agreement (EULA) associated
|
||||
* with this source code for terms and conditions that govern your use of
|
||||
* this software. Any use, reproduction, disclosure, or distribution of
|
||||
* this software and related documentation outside the terms of the EULA
|
||||
* is strictly prohibited.
|
||||
*
|
||||
*/
|
||||
|
||||
|
||||
|
||||
#include <oclUtils.h>
|
||||
#include "oclBlackScholes_common.h"
|
||||
|
||||
static cl_program cpBlackScholes; //OpenCL program
|
||||
static cl_kernel ckBlackScholes; //OpenCL kernel
|
||||
static cl_command_queue cqDefaultCommandQueue;
|
||||
|
||||
// Creates the BlackScholes program and kernel objects for cxGPUContext and
// stores cqParamCommandQueue as the default queue used by BlackScholes().
// Terminates the process (exit(666)) after printing the build log when the
// program fails to build.
extern "C" void initBlackScholes(cl_context cxGPUContext, cl_command_queue cqParamCommandQueue, const char **argv){
    cl_int ciErrNum = CL_SUCCESS;
    size_t kernelLength;

    shrLog("...loading BlackScholes.cl\n");
    char *cPathAndName = shrFindFilePath("BlackScholes.cl", argv[0]);
    shrCheckError(cPathAndName != NULL, shrTRUE);
    // The source text is still loaded (and checked) as before, although the
    // program itself is created from the built-in kernel name below rather
    // than with clCreateProgramWithSource.
    char *cBlackScholes = oclLoadProgSource(cPathAndName, "// My comment\n", &kernelLength);
    shrCheckError(cBlackScholes != NULL, shrTRUE);

    shrLog("...creating BlackScholes program\n");
    // Use the context passed in and its first device; request the error code
    // so the check below inspects a value that was actually written.
    cl_device_id cdDevice = oclGetFirstDev(cxGPUContext);
    cpBlackScholes = clCreateProgramWithBuiltInKernels(cxGPUContext, 1, &cdDevice, "BlackScholes", &ciErrNum);
    shrCheckError(ciErrNum, CL_SUCCESS);

    shrLog("...building BlackScholes program\n");
    ciErrNum = clBuildProgram(cpBlackScholes, 0, NULL, "-cl-fast-relaxed-math -Werror", NULL, NULL);

    // clBuildProgram returns a status code, so compare against CL_SUCCESS
    if(ciErrNum != CL_SUCCESS){
        shrLog("*** Compilation failure ***\n");

        // First query the log size, then fetch the log itself
        size_t logSize = 0;
        ciErrNum = clGetProgramBuildInfo(cpBlackScholes, cdDevice, CL_PROGRAM_BUILD_LOG, 0, NULL, &logSize);
        shrCheckError(ciErrNum, CL_SUCCESS);

        // One extra byte keeps the allocation non-empty and guarantees a
        // terminating NUL for the "%s" below
        char *logTxt = (char *)malloc(logSize + 1);
        shrCheckError(logTxt != NULL, shrTRUE);

        ciErrNum = clGetProgramBuildInfo(cpBlackScholes, cdDevice, CL_PROGRAM_BUILD_LOG, logSize, logTxt, NULL);
        shrCheckError(ciErrNum, CL_SUCCESS);
        logTxt[logSize] = '\0';

        shrLog("%s\n", logTxt);
        shrLog("*** Exiting ***\n");
        free(logTxt);
        free(cBlackScholes);
        free(cPathAndName);
        exit(666);
    }

    //Save ptx code to separate file
    oclLogPtx(cpBlackScholes, cdDevice, "BlackScholes.ptx");

    shrLog("...creating BlackScholes kernels\n");
    ckBlackScholes = clCreateKernel(cpBlackScholes, "BlackScholes", &ciErrNum);
    shrCheckError(ciErrNum, CL_SUCCESS);

    cqDefaultCommandQueue = cqParamCommandQueue;
    free(cBlackScholes);
    free(cPathAndName);
}
|
||||
|
||||
extern "C" void closeBlackScholes(void){
|
||||
cl_int ciErrNum;
|
||||
ciErrNum = clReleaseKernel(ckBlackScholes);
|
||||
ciErrNum |= clReleaseProgram(cpBlackScholes);
|
||||
shrCheckError(ciErrNum, CL_SUCCESS);
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
// OpenCL Black-Scholes kernel launcher
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" void BlackScholes(
    cl_command_queue cqCommandQueue,
    cl_mem d_Call, //Call option price
    cl_mem d_Put,  //Put option price
    cl_mem d_S,    //Current stock price
    cl_mem d_X,    //Option strike price
    cl_mem d_T,    //Option years
    cl_float R,    //Riskless rate of return
    cl_float V,    //Stock volatility
    cl_uint optionCount
){
    // A NULL queue selects the one stored by initBlackScholes()
    cl_command_queue queue = cqCommandQueue ? cqCommandQueue : cqDefaultCommandQueue;

    // Kernel arguments in positional order (index == position in the table)
    const struct { size_t size; const void *value; } kernelArgs[] = {
        { sizeof(cl_mem),   &d_Call      },
        { sizeof(cl_mem),   &d_Put       },
        { sizeof(cl_mem),   &d_S         },
        { sizeof(cl_mem),   &d_X         },
        { sizeof(cl_mem),   &d_T         },
        { sizeof(cl_float), &R           },
        { sizeof(cl_float), &V           },
        { sizeof(cl_uint),  &optionCount }
    };
    const cl_uint argCount = (cl_uint)(sizeof(kernelArgs) / sizeof(kernelArgs[0]));

    // Accumulate the status of every clSetKernelArg call and check once
    cl_int ciErrNum = CL_SUCCESS;
    for(cl_uint arg = 0; arg < argCount; arg++)
        ciErrNum |= clSetKernelArg(ckBlackScholes, arg, kernelArgs[arg].size, kernelArgs[arg].value);
    shrCheckError(ciErrNum, CL_SUCCESS);

    //Run the kernel
    size_t localWorkSize = 128;
    size_t globalWorkSize = 60 * 1024;
    ciErrNum = clEnqueueNDRangeKernel(queue, ckBlackScholes, 1, NULL, &globalWorkSize, &localWorkSize, 0, NULL, NULL);
    shrCheckError(ciErrNum, CL_SUCCESS);
}
|
||||
@@ -1,198 +0,0 @@
|
||||
/*
|
||||
* Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
|
||||
*
|
||||
* Please refer to the NVIDIA end user license agreement (EULA) associated
|
||||
* with this source code for terms and conditions that govern your use of
|
||||
* this software. Any use, reproduction, disclosure, or distribution of
|
||||
* this software and related documentation outside the terms of the EULA
|
||||
* is strictly prohibited.
|
||||
*
|
||||
*/
|
||||
|
||||
#ifndef OCL_UTILS_H
|
||||
#define OCL_UTILS_H
|
||||
|
||||
// *********************************************************************
|
||||
// Utilities specific to OpenCL samples in NVIDIA GPU Computing SDK
|
||||
// *********************************************************************
|
||||
|
||||
// Common headers: Cross-API utilities and OpenCL header
|
||||
#include <shrUtils.h>
|
||||
|
||||
// All OpenCL headers
|
||||
#if defined (__APPLE__) || defined(MACOSX)
|
||||
#include <OpenCL/opencl.h>
|
||||
#else
|
||||
#include <CL/opencl.h>
|
||||
#endif
|
||||
|
||||
// Includes
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
// For systems with CL_EXT that are not updated with these extensions, we copied these
|
||||
// extensions from <CL/cl_ext.h>
|
||||
#ifndef CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV
|
||||
/* cl_nv_device_attribute_query extension - no extension #define since it has no functions */
|
||||
#define CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV 0x4000
|
||||
#define CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV 0x4001
|
||||
#define CL_DEVICE_REGISTERS_PER_BLOCK_NV 0x4002
|
||||
#define CL_DEVICE_WARP_SIZE_NV 0x4003
|
||||
#define CL_DEVICE_GPU_OVERLAP_NV 0x4004
|
||||
#define CL_DEVICE_KERNEL_EXEC_TIMEOUT_NV 0x4005
|
||||
#define CL_DEVICE_INTEGRATED_MEMORY_NV 0x4006
|
||||
#endif
|
||||
|
||||
// reminders for build output window and log
|
||||
#ifdef _WIN32
|
||||
#pragma message ("Note: including shrUtils.h")
|
||||
#pragma message ("Note: including opencl.h")
|
||||
#endif
|
||||
|
||||
// SDK Revision #
|
||||
#define OCL_SDKREVISION "7027912"
|
||||
|
||||
// Error and Exit Handling Macros...
|
||||
// *********************************************************************
|
||||
// Full error handling macro with Cleanup() callback (if supplied)...
|
||||
// (Companion Inline Function lower on page)
|
||||
#define oclCheckErrorEX(a, b, c) __oclCheckErrorEX(a, b, c, __FILE__ , __LINE__)
|
||||
|
||||
// Short version without Cleanup() callback pointer
|
||||
// Both Input (a) and Reference (b) are specified as args
|
||||
#define oclCheckError(a, b) oclCheckErrorEX(a, b, 0)
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
//! Gets the platform ID for NVIDIA if available, otherwise default to platform 0
|
||||
//!
|
||||
//! @return the id
|
||||
//! @param clSelectedPlatformID OpenCL platform ID
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" cl_int oclGetPlatformID(cl_platform_id* clSelectedPlatformID);
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
//! Print info about the device
|
||||
//!
|
||||
//! @param iLogMode enum LOGBOTH, LOGCONSOLE, LOGFILE
|
||||
//! @param device OpenCL id of the device
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" void oclPrintDevInfo(int iLogMode, cl_device_id device);
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
//! Get and return device capability
|
||||
//!
|
||||
//! @return the 2 digit integer representation of device Cap (major minor). return -1 if NA
|
||||
//! @param device OpenCL id of the device
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" int oclGetDevCap(cl_device_id device);
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
//! Print the device name
|
||||
//!
|
||||
//! @param iLogMode enum LOGBOTH, LOGCONSOLE, LOGFILE
|
||||
//! @param device OpenCL id of the device
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" void oclPrintDevName(int iLogMode, cl_device_id device);
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
//! Gets the id of the first device from the context
|
||||
//!
|
||||
//! @return the id
|
||||
//! @param cxGPUContext OpenCL context
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" cl_device_id oclGetFirstDev(cl_context cxGPUContext);
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
//! Gets the id of the nth device from the context
|
||||
//!
|
||||
//! @return the id or -1 when out of range
|
||||
//! @param cxGPUContext OpenCL context
|
||||
//! @param device_idx index of the device of interest
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" cl_device_id oclGetDev(cl_context cxGPUContext, unsigned int device_idx);
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
//! Gets the id of device with maximal FLOPS from the context
|
||||
//!
|
||||
//! @return the id
|
||||
//! @param cxGPUContext OpenCL context
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" cl_device_id oclGetMaxFlopsDev(cl_context cxGPUContext);
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
//! Loads a Program file and prepends the cPreamble to the code.
|
||||
//!
|
||||
//! @return the source string if succeeded, 0 otherwise
|
||||
//! @param cFilename program filename
|
||||
//! @param cPreamble code that is prepended to the loaded file, typically a set of #defines or a header
|
||||
//! @param szFinalLength returned length of the code string
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" char* oclLoadProgSource(const char* cFilename, const char* cPreamble, size_t* szFinalLength);
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
//! Get the binary (PTX) of the program associated with the device
|
||||
//!
|
||||
//! @param cpProgram OpenCL program
|
||||
//! @param cdDevice device of interest
|
||||
//! @param binary returned code
|
||||
//! @param length length of returned code
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" void oclGetProgBinary( cl_program cpProgram, cl_device_id cdDevice, char** binary, size_t* length);
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
//! Get and log the binary (PTX) from the OpenCL compiler for the requested program & device
|
||||
//!
|
||||
//! @param cpProgram OpenCL program
|
||||
//! @param cdDevice device of interest
|
||||
//! @param const char* cPtxFileName optional PTX file name
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" void oclLogPtx(cl_program cpProgram, cl_device_id cdDevice, const char* cPtxFileName);
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
//! Get and log the Build Log from the OpenCL compiler for the requested program & device
|
||||
//!
|
||||
//! @param cpProgram OpenCL program
|
||||
//! @param cdDevice device of interest
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" void oclLogBuildInfo(cl_program cpProgram, cl_device_id cdDevice);
|
||||
|
||||
// Helper function for De-allocating cl objects
|
||||
// *********************************************************************
|
||||
extern "C" void oclDeleteMemObjs(cl_mem* cmMemObjs, int iNumObjs);
|
||||
|
||||
// Helper function to get OpenCL error string from constant
|
||||
// *********************************************************************
|
||||
extern "C" const char* oclErrorString(cl_int error);
|
||||
|
||||
// Helper function to get OpenCL image format string (channel order and type) from constant
|
||||
// *********************************************************************
|
||||
extern "C" const char* oclImageFormatString(cl_uint uiImageFormat);
|
||||
|
||||
// companion inline function for error checking and exit on error WITH Cleanup Callback (if supplied)
|
||||
// *********************************************************************
|
||||
// Companion function for the oclCheckErrorEX / oclCheckError macros.
// Compares a sample value against a reference value; when they differ it logs
// the error together with the file/line supplied by the macro, then either
// hands control to the caller's cleanup callback or terminates the process.
//
// iSample     value under test (typically an OpenCL return code)
// iReference  value iSample is expected to equal
// pCleanup    optional callback invoked with the error code; may be NULL
// cFile       source file name reported in the log message
// iLine       source line number reported in the log message
inline void __oclCheckErrorEX(cl_int iSample, cl_int iReference, void (*pCleanup)(int), const char* cFile, const int iLine)
{
    // Matching values mean there is nothing to report
    if (iSample == iReference)
    {
        return;
    }

    // A mismatch is an error by definition, so a sample value of 0 is replaced by -9999
    if (iSample == 0)
    {
        iSample = -9999;
    }

    // Log the error info
    shrLog("\n !!! Error # %i (%s) at line %i , in file %s !!!\n\n", iSample, oclErrorString(iSample), iLine, cFile);

    // Without a cleanup callback: close the log and exit, using the error code as process exit code.
    // With a callback: pass it the error code and return to the caller once it returns.
    if (pCleanup == NULL)
    {
        shrLogEx(LOGBOTH | CLOSELOG, 0, "Exiting...\n");
        exit(iSample);
    }
    else
    {
        pCleanup(iSample);
    }
}
|
||||
|
||||
#endif
|
||||
@@ -1,238 +0,0 @@
|
||||
/*
|
||||
* Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
|
||||
*
|
||||
* Please refer to the NVIDIA end user license agreement (EULA) associated
|
||||
* with this source code for terms and conditions that govern your use of
|
||||
* this software. Any use, reproduction, disclosure, or distribution of
|
||||
* this software and related documentation outside the terms of the EULA
|
||||
* is strictly prohibited.
|
||||
*
|
||||
*/
|
||||
|
||||
#ifndef SHR_QATEST_H
|
||||
#define SHR_QATEST_H
|
||||
|
||||
// *********************************************************************
|
||||
// Generic utilities for NVIDIA GPU Computing SDK
|
||||
// *********************************************************************
|
||||
|
||||
// OS dependent includes
|
||||
#ifdef _WIN32
|
||||
#pragma message ("Note: including windows.h")
|
||||
#pragma message ("Note: including math.h")
|
||||
#pragma message ("Note: including assert.h")
|
||||
#pragma message ("Note: including time.h")
|
||||
|
||||
// Headers needed for Windows
|
||||
#include <windows.h>
|
||||
#include <time.h>
|
||||
#else
|
||||
// Headers needed for Linux
|
||||
#include <sys/stat.h>
|
||||
#include <sys/types.h>
|
||||
#include <sys/time.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <stdarg.h>
|
||||
#include <unistd.h>
|
||||
#include <time.h>
|
||||
#endif
|
||||
|
||||
#ifndef STRCASECMP
|
||||
#ifdef _WIN32
|
||||
#define STRCASECMP _stricmp
|
||||
#else
|
||||
#define STRCASECMP strcasecmp
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifndef STRNCASECMP
|
||||
#ifdef _WIN32
|
||||
#define STRNCASECMP _strnicmp
|
||||
#else
|
||||
#define STRNCASECMP strncasecmp
|
||||
#endif
|
||||
#endif
|
||||
|
||||
|
||||
// Standardized QA Start/Finish for CUDA SDK tests
|
||||
#define shrQAStart(a, b) __shrQAStart(a, b)
|
||||
#define shrQAFinish(a, b, c) __shrQAFinish(a, b, c)
|
||||
#define shrQAFinish2(a, b, c, d) __shrQAFinish2(a, b, c, d)
|
||||
|
||||
// Returns the index in exec_name at which the bare executable name begins,
// i.e. the position just after the last '\\' or '/' path separator.
// Returns 0 when the string contains no separator.
inline int findExeNameStart(const char *exec_name)
{
    // Scan backwards from the terminating NUL towards the start of the string
    int pos = (int)strlen(exec_name);
    for (; pos > 0; --pos)
    {
        const char ch = exec_name[pos];
        if (ch == '\\' || ch == '/')
        {
            break;
        }
    }

    // pos is either on a separator or at index 0 (which may itself be a separator)
    const char stop = exec_name[pos];
    const bool on_separator = (stop == '\\') || (stop == '/');
    return on_separator ? pos + 1 : pos;
}
|
||||
|
||||
inline int __shrQAStart(int argc, char **argv)
|
||||
{
|
||||
bool bQATest = false;
|
||||
// First clear the output buffer
|
||||
fflush(stdout);
|
||||
fflush(stdout);
|
||||
|
||||
for (int i=1; i < argc; i++) {
|
||||
int string_start = 0;
|
||||
while (argv[i][string_start] == '-')
|
||||
string_start++;
|
||||
char *string_argv = &argv[i][string_start];
|
||||
|
||||
if (!STRCASECMP(string_argv, "qatest")) {
|
||||
bQATest = true;
|
||||
}
|
||||
}
|
||||
|
||||
// We don't want to print the entire path, so we search for the first
|
||||
int exename_start = findExeNameStart(argv[0]);
|
||||
if (bQATest) {
|
||||
fprintf(stdout, "&&&& RUNNING %s", &(argv[0][exename_start]));
|
||||
for (int i=1; i < argc; i++) fprintf(stdout, " %s", argv[i]);
|
||||
fprintf(stdout, "\n");
|
||||
} else {
|
||||
fprintf(stdout, "[%s] starting...\n", &(argv[0][exename_start]));
|
||||
}
|
||||
fflush(stdout);
|
||||
printf("\n"); fflush(stdout);
|
||||
return exename_start;
|
||||
}
|
||||
|
||||
// Result codes reported by the QA finish helpers.
// The numeric values are significant: they are used as indices into the
// { "FAILED", "PASSED", "WAIVED" } status-name tables.
enum eQAstatus {
    QA_FAILED,   // 0
    QA_PASSED,   // 1
    QA_WAIVED    // 2
};
|
||||
|
||||
// Prints a countdown to stdout and blocks for roughly `seconds` seconds,
// printing one "<n>..." tick per one-second sleep, then prints "done!".
//
// seconds  length of the delay; a value <= 0 prints the header and "done!"
//          without sleeping.
inline void __ExitInTime(int seconds)
{
    fprintf(stdout, "> exiting in %d seconds: ", seconds);
    fflush(stdout);

    const time_t deadline = time(0) + seconds;
    for (int count = seconds; time(0) < deadline; count--) {
        fprintf(stdout, "%d...", count);
        // The tick has no trailing newline, so flush explicitly; otherwise a
        // buffered stdout shows the whole countdown only after the delay.
        fflush(stdout);
        // _WIN32 matches the guard used for the <windows.h> include
#ifdef _WIN32
        Sleep(1000);
#else
        sleep(1);
#endif
    }
    fprintf(stdout,"done!\n\n");
    fflush(stdout);
}
|
||||
|
||||
|
||||
inline void __shrQAFinish(int argc, const char **argv, int iStatus)
|
||||
{
|
||||
// By default QATest is disabled and NoPrompt is Enabled (times out at seconds passed into __ExitInTime() )
|
||||
bool bQATest = false, bNoPrompt = true, bQuitInTime = true;
|
||||
const char *sStatus[] = { "FAILED", "PASSED", "WAIVED", NULL };
|
||||
|
||||
for (int i=1; i < argc; i++) {
|
||||
int string_start = 0;
|
||||
while (argv[i][string_start] == '-')
|
||||
string_start++;
|
||||
|
||||
const char *string_argv = &argv[i][string_start];
|
||||
if (!STRCASECMP(string_argv, "qatest")) {
|
||||
bQATest = true;
|
||||
}
|
||||
// For SDK individual samples that don't specify -noprompt or -prompt,
|
||||
// a 3 second delay will happen before exiting, giving a user time to view results
|
||||
if (!STRCASECMP(string_argv, "noprompt") || !STRCASECMP(string_argv, "help")) {
|
||||
bNoPrompt = true;
|
||||
bQuitInTime = false;
|
||||
}
|
||||
if (!STRCASECMP(string_argv, "prompt")) {
|
||||
bNoPrompt = false;
|
||||
bQuitInTime = false;
|
||||
}
|
||||
}
|
||||
|
||||
int exename_start = findExeNameStart(argv[0]);
|
||||
if (bQATest) {
|
||||
fprintf(stdout, "&&&& %s %s", sStatus[iStatus], &(argv[0][exename_start]));
|
||||
for (int i=1; i < argc; i++) fprintf(stdout, " %s", argv[i]);
|
||||
fprintf(stdout, "\n");
|
||||
} else {
|
||||
fprintf(stdout, "[%s] test results...\n%s\n", &(argv[0][exename_start]), sStatus[iStatus]);
|
||||
}
|
||||
fflush(stdout);
|
||||
printf("\n"); fflush(stdout);
|
||||
if (bQuitInTime) {
|
||||
__ExitInTime(3);
|
||||
} else {
|
||||
if (!bNoPrompt) {
|
||||
fprintf(stdout, "\nPress <Enter> to exit...\n");
|
||||
fflush(stdout);
|
||||
getchar();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
inline void __shrQAFinish2(bool bQATest, int argc, const char **argv, int iStatus)
|
||||
{
|
||||
bool bQuitInTime = true;
|
||||
const char *sStatus[] = { "FAILED", "PASSED", "WAIVED", NULL };
|
||||
|
||||
for (int i=1; i < argc; i++) {
|
||||
int string_start = 0;
|
||||
while (argv[i][string_start] == '-')
|
||||
string_start++;
|
||||
|
||||
const char *string_argv = &argv[i][string_start];
|
||||
// For SDK individual samples that don't specify -noprompt or -prompt,
|
||||
// a 3 second delay will happen before exiting, giving a user time to view results
|
||||
if (!STRCASECMP(string_argv, "noprompt") || !STRCASECMP(string_argv, "help")) {
|
||||
bQuitInTime = false;
|
||||
}
|
||||
if (!STRCASECMP(string_argv, "prompt")) {
|
||||
bQuitInTime = false;
|
||||
}
|
||||
}
|
||||
|
||||
int exename_start = findExeNameStart(argv[0]);
|
||||
if (bQATest) {
|
||||
fprintf(stdout, "&&&& %s %s", sStatus[iStatus], &(argv[0][exename_start]));
|
||||
for (int i=1; i < argc; i++) fprintf(stdout, " %s", argv[i]);
|
||||
fprintf(stdout, "\n");
|
||||
} else {
|
||||
fprintf(stdout, "[%s] test results...\n%s\n", &(argv[0][exename_start]), sStatus[iStatus]);
|
||||
}
|
||||
fflush(stdout);
|
||||
|
||||
if (bQuitInTime) {
|
||||
__ExitInTime(3);
|
||||
}
|
||||
}
|
||||
|
||||
inline void shrQAFinishExit(int argc, const char **argv, int iStatus)
|
||||
{
|
||||
__shrQAFinish(argc, argv, iStatus);
|
||||
|
||||
exit(iStatus ? EXIT_SUCCESS : EXIT_FAILURE);
|
||||
}
|
||||
|
||||
inline void shrQAFinishExit2(bool bQAtest, int argc, const char **argv, int iStatus)
|
||||
{
|
||||
__shrQAFinish2(bQAtest, argc, argv, iStatus);
|
||||
|
||||
exit(iStatus ? EXIT_SUCCESS : EXIT_FAILURE);
|
||||
}
|
||||
|
||||
#endif
|
||||
@@ -1,642 +0,0 @@
|
||||
/*
|
||||
* Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
|
||||
*
|
||||
* Please refer to the NVIDIA end user license agreement (EULA) associated
|
||||
* with this source code for terms and conditions that govern your use of
|
||||
* this software. Any use, reproduction, disclosure, or distribution of
|
||||
* this software and related documentation outside the terms of the EULA
|
||||
* is strictly prohibited.
|
||||
*
|
||||
*/
|
||||
|
||||
#ifndef SHR_UTILS_H
|
||||
#define SHR_UTILS_H
|
||||
|
||||
// *********************************************************************
|
||||
// Generic utilities for NVIDIA GPU Computing SDK
|
||||
// *********************************************************************
|
||||
|
||||
// reminders for output window and build log
|
||||
#ifdef _WIN32
|
||||
#pragma message ("Note: including windows.h")
|
||||
#pragma message ("Note: including math.h")
|
||||
#pragma message ("Note: including assert.h")
|
||||
#endif
|
||||
|
||||
// OS dependent includes
|
||||
#ifdef _WIN32
|
||||
// Headers needed for Windows
|
||||
#include <windows.h>
|
||||
#else
|
||||
// Headers needed for Linux
|
||||
#include <sys/stat.h>
|
||||
#include <sys/types.h>
|
||||
#include <sys/time.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <stdarg.h>
|
||||
#endif
|
||||
|
||||
// Other headers needed for both Windows and Linux
|
||||
#include <math.h>
|
||||
#include <assert.h>
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
// Un-comment the following #define to enable profiling code in SDK apps
|
||||
//#define GPU_PROFILING
|
||||
|
||||
// Beginning of GPU Architecture definitions
|
||||
// Maps a GPU SM (compute capability) version to the number of cores per SM.
//
// major, minor  SM version, e.g. (2, 1) for SM 2.1
// Returns the cores-per-SM value from the table below, or -1 (after printing
// a diagnostic to stdout) when the version is not listed.
inline int ConvertSMVer2Cores(int major, int minor)
{
    // One table row: SM version packed as 0xMm (hexadecimal notation),
    // M = SM major version, m = SM minor version
    typedef struct {
        int SM;
        int Cores;
    } sArchEntry;

    const sArchEntry archTable[] =
    {
        { 0x10,   8 }, // Tesla Generation  (SM 1.0) G80 class
        { 0x11,   8 }, // Tesla Generation  (SM 1.1) G8x class
        { 0x12,   8 }, // Tesla Generation  (SM 1.2) G9x class
        { 0x13,   8 }, // Tesla Generation  (SM 1.3) GT200 class
        { 0x20,  32 }, // Fermi Generation  (SM 2.0) GF100 class
        { 0x21,  48 }, // Fermi Generation  (SM 2.1) GF10x class
        { 0x30, 192 }  // Kepler Generation (SM 3.0) GK10x class
    };
    const int numEntries = (int)(sizeof(archTable) / sizeof(archTable[0]));

    // Pack the requested version the same way the table keys are packed
    const int wanted = (major << 4) + minor;

    for (int i = 0; i < numEntries; ++i) {
        if (archTable[i].SM == wanted) {
            return archTable[i].Cores;
        }
    }

    printf("MapSMtoCores SM %d.%d is undefined (please update to the latest SDK)!\n", major, minor);
    return -1;
}
|
||||
// end of GPU Architecture definitions
|
||||
|
||||
|
||||
// Defines and enum for use with logging functions
|
||||
// *********************************************************************
|
||||
#define DEFAULTLOGFILE "SdkConsoleLog.txt"
|
||||
#define MASTERLOGFILE "SdkMasterLog.csv"
|
||||
// Bit flags selecting the destination and options for the logging functions.
// Flags are combined with bitwise OR, e.g. LOGBOTH | CLOSELOG.
enum LOGMODES
{
    LOGCONSOLE = 1 << 0,                // 1:  log to console
    LOGFILE    = 1 << 1,                // 2:  log to file
    LOGBOTH    = LOGCONSOLE | LOGFILE,  // 3:  convenience union of the first 2 bits, log to both
    APPENDMODE = 1 << 2,                // 4:  open the file in append mode instead of replace mode
    MASTER     = 1 << 3,                // 8:  master .csv log output
    ERRORMSG   = 1 << 4,                // 16: pre-pend "Error"
    CLOSELOG   = 1 << 5                 // 32: close log file, if open, after any requested file write
};
|
||||
#define HDASHLINE "-----------------------------------------------------------\n"
|
||||
|
||||
// Standardized boolean
|
||||
// Standardized boolean used as the success/failure return type of the
// shr* file helpers.
enum shrBOOL
{
    shrFALSE,   // 0
    shrTRUE     // 1
};
|
||||
|
||||
// Standardized MAX, MIN and CLAMP
|
||||
// Every macro argument is parenthesized so that operands containing
// lower-precedence operators (e.g. MAX(x & 1, y)) expand correctly.
// NOTE: arguments may be evaluated more than once; do not pass expressions
// with side effects such as i++.
#define MAX(a, b) (((a) > (b)) ? (a) : (b))
#define MIN(a, b) (((a) < (b)) ? (a) : (b))
#define CLAMP(a, b, c) MIN(MAX(a, b), c)            // double sided clip of input a to [b, c]
#define TOPCLAMP(a, b) (((a) < (b)) ? (a) : (b))    // single top side clip of input a to at most b
|
||||
|
||||
// Error and Exit Handling Macros...
|
||||
// *********************************************************************
|
||||
// Full error handling macro with Cleanup() callback (if supplied)...
|
||||
// (Companion Inline Function lower on page)
|
||||
#define shrCheckErrorEX(a, b, c) __shrCheckErrorEX(a, b, c, __FILE__ , __LINE__)
|
||||
|
||||
// Short version without Cleanup() callback pointer
|
||||
// Both Input (a) and Reference (b) are specified as args
|
||||
#define shrCheckError(a, b) shrCheckErrorEX(a, b, 0)
|
||||
|
||||
// Standardized Exit Macro for leaving main()... extended version
|
||||
// (Companion Inline Function lower on page)
|
||||
#define shrExitEX(a, b, c) __shrExitEX(a, b, c)
|
||||
|
||||
// Standardized Exit Macro for leaving main()... short version
|
||||
// (Companion Inline Function lower on page)
|
||||
#define shrEXIT(a, b) __shrExitEX(a, b, EXIT_SUCCESS)
|
||||
|
||||
// Simple argument checker macro
|
||||
#define ARGCHECK(a) if((a) != shrTRUE)return shrFALSE
|
||||
|
||||
// Define for user-customized error handling
|
||||
#define STDERROR "file %s, line %i\n\n" , __FILE__ , __LINE__
|
||||
|
||||
// Function to deallocate memory allocated within shrUtils
|
||||
// *********************************************************************
|
||||
extern "C" void shrFree(void* ptr);
|
||||
|
||||
// *********************************************************************
|
||||
// Helper function to log standardized information to Console, to File or to both
|
||||
//! Examples: shrLogEx(LOGBOTH, 0, "Function A\n");
|
||||
//! : shrLogEx(LOGBOTH | ERRORMSG, ciErrNum, STDERROR);
|
||||
//!
|
||||
//! Automatically opens file and stores handle if needed and not done yet
|
||||
//! Closes file and nulls handle on request
|
||||
//!
|
||||
//! @param 0 iLogMode: LOGCONSOLE, LOGFILE, LOGBOTH, APPENDMODE, MASTER, ERRORMSG, CLOSELOG.
|
||||
//! LOGFILE and LOGBOTH may be | 'd with APPENDMODE to select file append mode instead of overwrite mode
|
||||
//! LOGFILE and LOGBOTH may be | 'd with CLOSELOG to "write and close"
|
||||
//! First 3 options may be | 'd with MASTER to enable independent write to master data log file
|
||||
//! First 3 options may be | 'd with ERRORMSG to start line with standard error message
|
||||
//! @param 2 dValue:
|
||||
//! Positive val = double value for time in secs to be formatted to 6 decimals.
|
||||
//! Negative val is an error code and this give error preformatting.
|
||||
//! @param 3 cFormatString: String with formatting specifiers like printf or fprintf.
|
||||
//! ALL printf flags, width, precision and type specifiers are supported with this exception:
|
||||
//! Wide char type specifiers intended for wprintf (%S and %C) are NOT supported
|
||||
//! Single byte char type specifiers (%s and %c) ARE supported
|
||||
//! @param 4... variable args: like printf or fprintf. Must match format specifier type above.
|
||||
//! @return 0 if OK, negative value on error or if error occurs or was passed in.
|
||||
// *********************************************************************
|
||||
extern "C" int shrLogEx(int iLogMode, int iErrNum, const char* cFormatString, ...);
|
||||
|
||||
// Short version of shrLogEx defaulting to shrLogEx(LOGBOTH, 0, ...)
|
||||
// *********************************************************************
|
||||
extern "C" int shrLog(const char* cFormatString, ...);
|
||||
|
||||
// *********************************************************************
|
||||
// Delta timer function for up to 3 independent timers using host high performance counters
|
||||
// Maintains state for 3 independent counters
|
||||
//! Example: double dElapsedTime = shrDeltaT(0);
|
||||
//!
|
||||
//! @param 0 iCounterID: Which timer to check/reset. (0, 1, 2)
|
||||
//! @return delta time of specified counter since last call in seconds. Otherwise -9999.0 if error
|
||||
// *********************************************************************
|
||||
extern "C" double shrDeltaT(int iCounterID);
|
||||
|
||||
// Optional LogFileNameOverride function
|
||||
// *********************************************************************
|
||||
extern "C" void shrSetLogFileName (const char* cOverRideName);
|
||||
|
||||
// Helper function to init data arrays
|
||||
// *********************************************************************
|
||||
extern "C" void shrFillArray(float* pfData, int iSize);
|
||||
|
||||
// Helper function to print data arrays
|
||||
// *********************************************************************
|
||||
extern "C" void shrPrintArray(float* pfData, int iSize);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Find the path for a filename
|
||||
//! @return the path if succeeded, otherwise 0
|
||||
//! @param filename name of the file
|
||||
//! @param executablePath optional absolute path of the executable
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" char* shrFindFilePath(const char* filename, const char* executablePath);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Read file \filename containing single precision floating point data
|
||||
//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE
|
||||
//! @param filename name of the source file
|
||||
//! @param data uninitialized pointer, returned initialized and pointing to
|
||||
//! the data read
|
||||
//! @param len number of data elements in data, -1 on error
|
||||
//! @note If a NULL pointer is passed to this function and it is initialized
|
||||
//! within shrUtils, then free() has to be used to deallocate the memory
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrReadFilef( const char* filename, float** data, unsigned int* len,
|
||||
bool verbose = false);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Read file \filename containing double precision floating point data
|
||||
//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE
|
||||
//! @param filename name of the source file
|
||||
//! @param data uninitialized pointer, returned initialized and pointing to
|
||||
//! the data read
|
||||
//! @param len number of data elements in data, -1 on error
|
||||
//! @note If a NULL pointer is passed to this function and it is initialized
|
||||
//! within shrUtils, then free() has to be used to deallocate the memory
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrReadFiled( const char* filename, double** data, unsigned int* len,
|
||||
bool verbose = false);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Read file \filename containing integer data
|
||||
//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE
|
||||
//! @param filename name of the source file
|
||||
//! @param data uninitialized pointer, returned initialized and pointing to
|
||||
//! the data read
|
||||
//! @param len number of data elements in data, -1 on error
|
||||
//! @note If a NULL pointer is passed to this function and it is initialized
|
||||
//! within shrUtils, then free() has to be used to deallocate the memory
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrReadFilei( const char* filename, int** data, unsigned int* len, bool verbose = false);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Read file \filename containing unsigned integer data
|
||||
//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE
|
||||
//! @param filename name of the source file
|
||||
//! @param data uninitialized pointer, returned initialized and pointing to
|
||||
//! the data read
|
||||
//! @param len number of data elements in data, -1 on error
|
||||
//! @note If a NULL pointer is passed to this function and it is initialized
|
||||
//! within shrUtils, then free() has to be used to deallocate the memory
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrReadFileui( const char* filename, unsigned int** data,
|
||||
unsigned int* len, bool verbose = false);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Read file \filename containing char / byte data
|
||||
//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE
|
||||
//! @param filename name of the source file
|
||||
//! @param data uninitialized pointer, returned initialized and pointing to
|
||||
//! the data read
|
||||
//! @param len number of data elements in data, -1 on error
|
||||
//! @note If a NULL pointer is passed to this function and it is initialized
|
||||
//! within shrUtils, then free() has to be used to deallocate the memory
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrReadFileb( const char* filename, char** data, unsigned int* len,
|
||||
bool verbose = false);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Read file \filename containing unsigned char / byte data
|
||||
//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE
|
||||
//! @param filename name of the source file
|
||||
//! @param data uninitialized pointer, returned initialized and pointing to
|
||||
//! the data read
|
||||
//! @param len number of data elements in data, -1 on error
|
||||
//! @note If a NULL pointer is passed to this function and it is initialized
|
||||
//! within shrUtils, then free() has to be used to deallocate the memory
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrReadFileub( const char* filename, unsigned char** data,
|
||||
unsigned int* len, bool verbose = false);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Write a data file \filename containing single precision floating point
|
||||
//! data
|
||||
//! @return shrTRUE if writing the file succeeded, otherwise shrFALSE
|
||||
//! @param filename name of the file to write
|
||||
//! @param data pointer to data to write
|
||||
//! @param len number of data elements in data, -1 on error
|
||||
//! @param epsilon epsilon for comparison
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrWriteFilef( const char* filename, const float* data, unsigned int len,
|
||||
const float epsilon, bool verbose = false);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Write a data file \filename containing double precision floating point
|
||||
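//! data (NOTE(review): the prototype below takes `const float*` data with a `double` epsilon - confirm against the implementation)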
|
||||
//! @return shrTRUE if writing the file succeeded, otherwise shrFALSE
|
||||
//! @param filename name of the file to write
|
||||
//! @param data pointer to data to write
|
||||
//! @param len number of data elements in data, -1 on error
|
||||
//! @param epsilon epsilon for comparison
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrWriteFiled( const char* filename, const float* data, unsigned int len,
|
||||
const double epsilon, bool verbose = false);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Write a data file \filename containing integer data
|
||||
//! @return shrTRUE if writing the file succeeded, otherwise shrFALSE
|
||||
//! @param filename name of the file to write
|
||||
//! @param data pointer to data to write
|
||||
//! @param len number of data elements in data, -1 on error
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrWriteFilei( const char* filename, const int* data, unsigned int len,
|
||||
bool verbose = false);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Write a data file \filename containing unsigned integer data
|
||||
//! @return shrTRUE if writing the file succeeded, otherwise shrFALSE
|
||||
//! @param filename name of the file to write
|
||||
//! @param data pointer to data to write
|
||||
//! @param len number of data elements in data, -1 on error
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrWriteFileui( const char* filename, const unsigned int* data,
|
||||
unsigned int len, bool verbose = false);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Write a data file \filename containing char / byte data
|
||||
//! @return shrTRUE if writing the file succeeded, otherwise shrFALSE
|
||||
//! @param filename name of the file to write
|
||||
//! @param data pointer to data to write
|
||||
//! @param len number of data elements in data, -1 on error
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrWriteFileb( const char* filename, const char* data, unsigned int len,
|
||||
bool verbose = false);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Write a data file \filename containing unsigned char / byte data
|
||||
//! @return shrTRUE if writing the file succeeded, otherwise shrFALSE
|
||||
//! @param filename name of the file to write
|
||||
//! @param data pointer to data to write
|
||||
//! @param len number of data elements in data, -1 on error
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrWriteFileub( const char* filename, const unsigned char* data,
|
||||
unsigned int len, bool verbose = false);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Load PPM image file (with unsigned char as data element type), padding
|
||||
//! 4th component
|
||||
//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE
|
||||
//! @param file name of the image file
|
||||
//! @param OutData handle to the data read
|
||||
//! @param w width of the image
|
||||
//! @param h height of the image
|
||||
//!
|
||||
//! Note: If *OutData is NULL this function allocates buffer that must be freed by caller
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrLoadPPM4ub(const char* file, unsigned char** OutData,
|
||||
unsigned int *w, unsigned int *h);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Save PPM image file (with unsigned char as data element type, padded to
|
||||
//! 4 bytes)
|
||||
//! @return shrTRUE if saving the file succeeded, otherwise shrFALSE
|
||||
//! @param file name of the image file
|
||||
//! @param data handle to the data read
|
||||
//! @param w width of the image
|
||||
//! @param h height of the image
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrSavePPM4ub( const char* file, unsigned char *data,
|
||||
unsigned int w, unsigned int h);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
//! Save PGM image file (with unsigned char as data element type)
|
||||
//! @return shrTRUE if saving the file succeeded, otherwise shrFALSE
|
||||
//! @param file name of the image file
|
||||
//! @param data handle to the data read
|
||||
//! @param w width of the image
|
||||
//! @param h height of the image
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrSavePGMub( const char* file, unsigned char *data,
|
||||
unsigned int w, unsigned int h);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Load PGM image file (with unsigned char as data element type)
|
||||
//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE
|
||||
//! @param file name of the image file
|
||||
//! @param data handle to the data read
|
||||
//! @param w width of the image
|
||||
//! @param h height of the image
|
||||
//! @note If a NULL pointer is passed to this function and it is initialized
|
||||
//! within shrUtils, then free() has to be used to deallocate the memory
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrLoadPGMub( const char* file, unsigned char** data,
|
||||
unsigned int *w,unsigned int *h);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
// Command line arguments: General notes
|
||||
// * All command line arguments begin with '--' followed by the token;
|
||||
// token and value are separated by '='; example --samples=50
|
||||
// * Arrays have the form --model=[one.obj,two.obj,three.obj]
|
||||
// (without whitespaces)
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Check if command line argument \a flag-name is given
|
||||
//! @return shrTRUE if command line argument \a flag_name has been given,
|
||||
//! otherwise shrFALSE
|
||||
//! @param argc argc as passed to main()
|
||||
//! @param argv argv as passed to main()
|
||||
//! @param flag_name name of command line flag
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrCheckCmdLineFlag( const int argc, const char** argv,
|
||||
const char* flag_name);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Get the value of a command line argument of type int
|
||||
//! @return shrTRUE if command line argument \a arg_name has been given and
|
||||
//! is of the requested type, otherwise shrFALSE
|
||||
//! @param argc argc as passed to main()
|
||||
//! @param argv argv as passed to main()
|
||||
//! @param arg_name name of the command line argument
|
||||
//! @param val value of the command line argument
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrGetCmdLineArgumenti( const int argc, const char** argv,
|
||||
const char* arg_name, int* val);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Get the value of a command line argument of type unsigned int
|
||||
//! @return shrTRUE if command line argument \a arg_name has been given and
|
||||
//! is of the requested type, otherwise shrFALSE
|
||||
//! @param argc argc as passed to main()
|
||||
//! @param argv argv as passed to main()
|
||||
//! @param arg_name name of the command line argument
|
||||
//! @param val value of the command line argument
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrGetCmdLineArgumentu( const int argc, const char** argv,
|
||||
const char* arg_name, unsigned int* val);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Get the value of a command line argument of type float
|
||||
//! @return shrTRUE if command line argument \a arg_name has been given and
|
||||
//! is of the requested type, otherwise shrFALSE
|
||||
//! @param argc argc as passed to main()
|
||||
//! @param argv argv as passed to main()
|
||||
//! @param arg_name name of the command line argument
|
||||
//! @param val value of the command line argument
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrGetCmdLineArgumentf( const int argc, const char** argv,
|
||||
const char* arg_name, float* val);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Get the value of a command line argument of type string
|
||||
//! @return shrTRUE if command line argument \a arg_name has been given and
|
||||
//! is of the requested type, otherwise shrFALSE
|
||||
//! @param argc argc as passed to main()
|
||||
//! @param argv argv as passed to main()
|
||||
//! @param arg_name name of the command line argument
|
||||
//! @param val value of the command line argument
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrGetCmdLineArgumentstr( const int argc, const char** argv,
|
||||
const char* arg_name, char** val);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Get the value of a command line argument list whose elements are strings
|
||||
//! @return shrTRUE if command line argument \a arg_name has been given and
|
||||
//! is of the requested type, otherwise shrFALSE
|
||||
//! @param argc argc as passed to main()
|
||||
//! @param argv argv as passed to main()
|
||||
//! @param arg_name name of the command line argument
|
||||
//! @param val command line argument list
|
||||
//! @param len length of the list / number of elements
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrGetCmdLineArgumentListstr( const int argc, const char** argv,
|
||||
const char* arg_name, char** val,
|
||||
unsigned int* len);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Compare two float arrays
|
||||
//! @return shrTRUE if \a reference and \a data are identical, otherwise shrFALSE
|
||||
//! @param reference handle to the reference data / gold image
|
||||
//! @param data handle to the computed data
|
||||
//! @param len number of elements in reference and data
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrComparef( const float* reference, const float* data,
|
||||
const unsigned int len);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Compare two integer arrays
|
||||
//! @return shrTRUE if \a reference and \a data are identical, otherwise shrFALSE
|
||||
//! @param reference handle to the reference data / gold image
|
||||
//! @param data handle to the computed data
|
||||
//! @param len number of elements in reference and data
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrComparei( const int* reference, const int* data,
|
||||
const unsigned int len );
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
//! Compare two unsigned integer arrays, with epsilon and threshold
|
||||
//! @return shrTRUE if \a reference and \a data are identical, otherwise shrFALSE
|
||||
//! @param reference handle to the reference data / gold image
|
||||
//! @param data handle to the computed data
|
||||
//! @param len number of elements in reference and data
|
||||
//! @param threshold tolerance % # of comparison errors (0.15f = 15%)
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrCompareuit( const unsigned int* reference, const unsigned int* data,
|
||||
const unsigned int len, const float epsilon, const float threshold );
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Compare two unsigned char arrays
|
||||
//! @return shrTRUE if \a reference and \a data are identical, otherwise shrFALSE
|
||||
//! @param reference handle to the reference data / gold image
|
||||
//! @param data handle to the computed data
|
||||
//! @param len number of elements in reference and data
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrCompareub( const unsigned char* reference, const unsigned char* data,
|
||||
const unsigned int len );
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
//! Compare two unsigned char arrays with a tolerance for # of byte errors
|
||||
//! @return shrTRUE if \a reference and \a data are identical, otherwise shrFALSE
|
||||
//! @param reference handle to the reference data / gold image
|
||||
//! @param data handle to the computed data
|
||||
//! @param len number of elements in reference and data
|
||||
//! @param epsilon epsilon to use for the comparison
|
||||
//! @param threshold tolerance % # of comparison errors (0.15f = 15%)
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrCompareubt( const unsigned char* reference, const unsigned char* data,
|
||||
const unsigned int len, const float epsilon, const float threshold );
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
//! Compare two unsigned char arrays with an epsilon tolerance for equality
|
||||
//! @return shrTRUE if \a reference and \a data are identical, otherwise shrFALSE
|
||||
//! @param reference handle to the reference data / gold image
|
||||
//! @param data handle to the computed data
|
||||
//! @param len number of elements in reference and data
|
||||
//! @param epsilon epsilon to use for the comparison
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrCompareube( const unsigned char* reference, const unsigned char* data,
|
||||
const unsigned int len, const float epsilon );
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Compare two float arrays with an epsilon tolerance for equality
|
||||
//! @return shrTRUE if \a reference and \a data are identical, otherwise shrFALSE
|
||||
//! @param reference handle to the reference data / gold image
|
||||
//! @param data handle to the computed data
|
||||
//! @param len number of elements in reference and data
|
||||
//! @param epsilon epsilon to use for the comparison
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrComparefe( const float* reference, const float* data,
|
||||
const unsigned int len, const float epsilon );
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
//! Compare two float arrays with an epsilon tolerance for equality and a
|
||||
//! threshold for # pixel errors
|
||||
//! @return shrTRUE if \a reference and \a data are identical, otherwise shrFALSE
|
||||
//! @param reference handle to the reference data / gold image
|
||||
//! @param data handle to the computed data
|
||||
//! @param len number of elements in reference and data
|
||||
//! @param epsilon epsilon to use for the comparison
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrComparefet( const float* reference, const float* data,
|
||||
const unsigned int len, const float epsilon, const float threshold );
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Compare two float arrays using L2-norm with an epsilon tolerance for
|
||||
//! equality
|
||||
//! @return shrTRUE if \a reference and \a data are identical, otherwise shrFALSE
|
||||
//! @param reference handle to the reference data / gold image
|
||||
//! @param data handle to the computed data
|
||||
//! @param len number of elements in reference and data
|
||||
//! @param epsilon epsilon to use for the comparison
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrCompareL2fe( const float* reference, const float* data,
|
||||
const unsigned int len, const float epsilon );
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
//! Compare two PPM image files with an epsilon tolerance for equality
|
||||
//! @return shrTRUE if \a reference and \a data are identical, otherwise shrFALSE
|
||||
//! @param src_file filename for the image to be compared
|
||||
//! @param ref_file filename for the reference data / gold image
|
||||
//! @param epsilon epsilon to use for the comparison
|
||||
//! @param threshold threshold of pixels that can still mismatch to pass (i.e. 0.15f = 15% must pass)
|
||||
//! $param verboseErrors output details of image mismatch to std::err
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrComparePPM( const char *src_file, const char *ref_file, const float epsilon, const float threshold);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
//! Compare two PGM image files with an epsilon tolerance for equality
|
||||
//! @return shrTRUE if \a reference and \a data are identical, otherwise shrFALSE
|
||||
//! @param src_file filename for the image to be compared
|
||||
//! @param ref_file filename for the reference data / gold image
|
||||
//! @param epsilon epsilon to use for the comparison
|
||||
//! @param threshold threshold of pixels that can still mismatch to pass (i.e. 0.15f = 15% must pass)
|
||||
//! $param verboseErrors output details of image mismatch to std::err
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrComparePGM( const char *src_file, const char *ref_file, const float epsilon, const float threshold);
|
||||
|
||||
extern "C" unsigned char* shrLoadRawFile(const char* filename, size_t size);
|
||||
|
||||
extern "C" size_t shrRoundUp(int group_size, int global_size);
|
||||
|
||||
// companion inline function for error checking and exit on error WITH Cleanup Callback (if supplied)
|
||||
// *********************************************************************
|
||||
inline void __shrCheckErrorEX(int iSample, int iReference, void (*pCleanup)(int), const char* cFile, const int iLine)
|
||||
{
|
||||
if (iReference != iSample)
|
||||
{
|
||||
shrLogEx(LOGBOTH | ERRORMSG, iSample, "line %i , in file %s !!!\n\n" , iLine, cFile);
|
||||
if (pCleanup != NULL)
|
||||
{
|
||||
pCleanup(EXIT_FAILURE);
|
||||
}
|
||||
else
|
||||
{
|
||||
shrLogEx(LOGBOTH | CLOSELOG, 0, "Exiting...\n");
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Standardized Exit
|
||||
// *********************************************************************
|
||||
inline void __shrExitEX(int argc, const char** argv, int iExitCode)
|
||||
{
|
||||
#ifdef WIN32
|
||||
if (!shrCheckCmdLineFlag(argc, argv, "noprompt") && !shrCheckCmdLineFlag(argc, argv, "qatest"))
|
||||
#else
|
||||
if (shrCheckCmdLineFlag(argc, argv, "prompt") && !shrCheckCmdLineFlag(argc, argv, "qatest"))
|
||||
#endif
|
||||
{
|
||||
shrLogEx(LOGBOTH | CLOSELOG, 0, "\nPress <Enter> to Quit...\n");
|
||||
getchar();
|
||||
}
|
||||
else
|
||||
{
|
||||
shrLogEx(LOGBOTH | CLOSELOG, 0, "%s Exiting...\n", argv[0]);
|
||||
}
|
||||
fflush(stderr);
|
||||
exit(iExitCode);
|
||||
}
|
||||
|
||||
#endif
|
||||
@@ -1,29 +0,0 @@
|
||||
/*
|
||||
* Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
|
||||
*
|
||||
* Please refer to the NVIDIA end user license agreement (EULA) associated
|
||||
* with this source code for terms and conditions that govern your use of
|
||||
* this software. Any use, reproduction, disclosure, or distribution of
|
||||
* this software and related documentation outside the terms of the EULA
|
||||
* is strictly prohibited.
|
||||
*
|
||||
*/
|
||||
|
||||
// Computes one 4-component dot product per work-item:
//   c[i] = a[4i]*b[4i] + a[4i+1]*b[4i+1] + a[4i+2]*b[4i+2] + a[4i+3]*b[4i+3]
// a, b          input arrays, read at indices 4*i .. 4*i+3
// c             output array, written at index i
// iNumElements  number of outputs; work-items with id >= iNumElements do nothing
__kernel void DotProduct (__global float* a, __global float* b, __global float* c, int iNumElements)
{
    // index of this work-item in dimension 0
    const int iWorkItem = get_global_id(0);

    // bound check (equivalent to the limit on a 'for' loop in serial C code)
    if (iWorkItem < iNumElements)
    {
        // first of the four consecutive input floats belonging to this output
        const int iBase = iWorkItem << 2;

        c[iWorkItem] = a[iBase] * b[iBase]
                     + a[iBase + 1] * b[iBase + 1]
                     + a[iBase + 2] * b[iBase + 2]
                     + a[iBase + 3] * b[iBase + 3];
    }
}
|
||||
@@ -1,66 +0,0 @@
|
||||
RISCV_TOOL_PATH ?= $(wildcard ../../../../riscv-gnu-toolchain/drops)
|
||||
POCL_CC_PATH ?= $(wildcard ../../../../pocl/drops_riscv_cc)
|
||||
POCL_INC_PATH ?= $(wildcard ../include)
|
||||
POCL_LIB_PATH ?= $(wildcard ../lib)
|
||||
VX_RT_PATH ?= $(wildcard ../../../runtime)
|
||||
VX_SIMX_PATH ?= $(wildcard ../../../simX/obj_dir)
|
||||
|
||||
CC = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-gcc
|
||||
CXX = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-g++
|
||||
DMP = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-objdump
|
||||
HEX = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-objcopy
|
||||
GDB = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-gdb
|
||||
|
||||
VX_SRCS = $(VX_RT_PATH)/newlib/newlib.c
|
||||
VX_SRCS += $(VX_RT_PATH)/startup/vx_start.S
|
||||
VX_SRCS += $(VX_RT_PATH)/intrinsics/vx_intrinsics.s
|
||||
VX_SRCS += $(VX_RT_PATH)/io/vx_io.s $(VX_RT_PATH)/io/vx_io.c
|
||||
VX_SRCS += $(VX_RT_PATH)/fileio/fileio.s
|
||||
VX_SRCS += $(VX_RT_PATH)/tests/tests.c
|
||||
VX_SRCS += $(VX_RT_PATH)/vx_api/vx_api.c
|
||||
VX_SRCS += $(VX_STR) $(VX_FIO) $(VX_NEWLIB) $(VX_INT) $(VX_IO) $(VX_API) $(VX_TEST)
|
||||
|
||||
VX_CFLAGS = -nostartfiles -Wl,-Bstatic,-T,$(VX_RT_PATH)/mains/vortex_link.ld
|
||||
|
||||
CXXFLAGS = -g -O0 -march=rv32im -mabi=ilp32
|
||||
CXXFLAGS += -ffreestanding # program may not begin at main()
|
||||
CXXFLAGS += -Wl,--gc-sections # enable garbage collection of unused input sections
|
||||
CXXFLAGS += -fno-rtti -fno-non-call-exceptions # disable RTTI and exceptions
|
||||
CXXFLAGS += -I$(POCL_INC_PATH) -I.
|
||||
|
||||
VX_LIBS = -Wl,--whole-archive lib$(PROJECT).a -Wl,--no-whole-archive $(POCL_LIB_PATH)/libOpenCL.a
|
||||
QEMU_LIBS = $(VX_RT_PATH)/qemu/vx_api.c -Wl,--whole-archive lib$(PROJECT).a -Wl,--no-whole-archive $(POCL_LIB_PATH)/libOpenCL.a
|
||||
|
||||
PROJECT=DotProduct
|
||||
|
||||
all: $(PROJECT).dump $(PROJECT).hex
|
||||
|
||||
# Compile the OpenCL kernel source into a static library with poclcc.
# The recipe compiles the rule's own prerequisite ($<) so that the file that
# triggers the rebuild is the file that actually gets compiled.
lib$(PROJECT).a: DotProduct.cl
	POCL_DEBUG=all POCL_DEBUG_LLVM_PASSES=1 LD_LIBRARY_PATH=$(RISCV_TOOL_PATH)/lib:$(POCL_CC_PATH)/lib $(POCL_CC_PATH)/bin/poclcc -o lib$(PROJECT).a $<
|
||||
|
||||
$(PROJECT).elf: main.cc lib$(PROJECT).a
|
||||
$(CXX) $(CXXFLAGS) $(VX_CFLAGS) $(VX_SRCS) main.cc $(VX_LIBS) -o $(PROJECT).elf
|
||||
|
||||
$(PROJECT).qemu: main.cc lib$(PROJECT).a
|
||||
$(CXX) $(CXXFLAGS) main.cc $(QEMU_LIBS) -o $(PROJECT).qemu
|
||||
|
||||
$(PROJECT).hex: $(PROJECT).elf
|
||||
$(HEX) -O ihex $(PROJECT).elf $(PROJECT).hex
|
||||
|
||||
$(PROJECT).dump: $(PROJECT).elf
|
||||
$(DMP) -D $(PROJECT).elf > $(PROJECT).dump
|
||||
|
||||
run: $(PROJECT).hex
|
||||
POCL_DEBUG=all $(VX_SIMX_PATH)/Vcache_simX -E -a rv32i --core $(PROJECT).hex -s -b 1> emulator.debug
|
||||
|
||||
qemu: $(PROJECT).qemu
|
||||
POCL_DEBUG=all $(RISCV_TOOL_PATH)/bin/qemu-riscv32 -d in_asm -D debug.log $(PROJECT).qemu
|
||||
|
||||
gdb-s: $(PROJECT).qemu
|
||||
POCL_DEBUG=all $(RISCV_TOOL_PATH)/bin/qemu-riscv32 -g 1234 -d in_asm -D debug.log $(PROJECT).qemu
|
||||
|
||||
gdb-c: $(PROJECT).qemu
|
||||
$(GDB) $(PROJECT).qemu
|
||||
|
||||
# Remove every artifact produced by the rules in this Makefile, including the
# kernel archive and the QEMU executable.
clean:
	rm -rf *.elf *.dump *.hex lib$(PROJECT).a $(PROJECT).qemu
|
||||
@@ -1,270 +0,0 @@
|
||||
/*
|
||||
* Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
|
||||
*
|
||||
* Please refer to the NVIDIA end user license agreement (EULA) associated
|
||||
* with this source code for terms and conditions that govern your use of
|
||||
* this software. Any use, reproduction, disclosure, or distribution of
|
||||
* this software and related documentation outside the terms of the EULA
|
||||
* is strictly prohibited.
|
||||
*
|
||||
*/
|
||||
|
||||
// *********************************************************************
|
||||
// oclDotProduct Notes:
|
||||
//
|
||||
// A simple OpenCL API demo application that implements a
|
||||
// vector dot product computation between 2 float arrays.
|
||||
//
|
||||
// Runs computations with OpenCL on the GPU device and then checks results
|
||||
// against basic host CPU/C++ computation.
|
||||
//
|
||||
// Uses 'shr' and 'ocl' functions from oclUtils and shrUtils libraries for compactness.
|
||||
// But these are NOT required libs for OpenCL development in general.
|
||||
// *********************************************************************
|
||||
|
||||
// standard utilities and systems includes
|
||||
#include <oclUtils.h>
|
||||
#include <shrQATest.h>
|
||||
|
||||
// Name of the file with the source code for the computation kernel
|
||||
// *********************************************************************
|
||||
const char* cSourceFile = "DotProduct.cl";
|
||||
|
||||
// Host buffers for demo
|
||||
// *********************************************************************
|
||||
void *srcA, *srcB, *dst; // Host buffers for OpenCL test
|
||||
void* Golden; // Host buffer for host golden processing cross check
|
||||
|
||||
// OpenCL Vars
|
||||
cl_platform_id cpPlatform; // OpenCL platform
|
||||
cl_device_id *cdDevices; // OpenCL device
|
||||
cl_context cxGPUContext; // OpenCL context
|
||||
cl_command_queue cqCommandQueue;// OpenCL command que
|
||||
cl_program program; // OpenCL program
|
||||
cl_kernel ckKernel; // OpenCL kernel
|
||||
cl_mem cmDevSrcA; // OpenCL device source buffer A
|
||||
cl_mem cmDevSrcB; // OpenCL device source buffer B
|
||||
cl_mem cmDevDst; // OpenCL device destination buffer
|
||||
size_t szGlobalWorkSize; // Total # of work items in the 1D range
|
||||
size_t szLocalWorkSize; // # of work items in the 1D work group
|
||||
size_t szParmDataBytes; // Byte size of context information
|
||||
size_t szKernelLength; // Byte size of kernel code
|
||||
cl_int ciErrNum; // Error code var
|
||||
char* cPathAndName = NULL; // var for full paths to data, src, etc.
|
||||
char* cSourceCL = NULL; // Buffer to hold source for compilation
|
||||
const char* cExecutableName = NULL;
|
||||
|
||||
// demo config vars
|
||||
int iNumElements= 1277944; // Length of float arrays to process (odd # for illustration)
|
||||
shrBOOL bNoPrompt = shrFALSE;
|
||||
|
||||
// Forward Declarations
|
||||
// *********************************************************************
|
||||
void DotProductHost(const float* pfData1, const float* pfData2, float* pfResult, int iNumElements);
|
||||
void Cleanup (int iExitCode);
|
||||
void (*pCleanup)(int) = &Cleanup;
|
||||
|
||||
int *gp_argc = NULL;
|
||||
char ***gp_argv = NULL;
|
||||
|
||||
// Main function
|
||||
// *********************************************************************
|
||||
int main(int argc, char **argv)
|
||||
{
|
||||
gp_argc = &argc;
|
||||
gp_argv = &argv;
|
||||
|
||||
shrQAStart(argc, argv);
|
||||
|
||||
// Get the NVIDIA platform
|
||||
ciErrNum = oclGetPlatformID(&cpPlatform);
|
||||
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL);
|
||||
shrLog("clGetPlatformID...\n");
|
||||
|
||||
// Get the NVIDIA platform
|
||||
ciErrNum = oclGetPlatformID(&cpPlatform);
|
||||
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL);
|
||||
shrLog("clGetPlatformID...\n");
|
||||
|
||||
//Get all the devices
|
||||
cl_uint uiNumDevices = 0; // Number of devices available
|
||||
cl_uint uiTargetDevice = 0; // Default Device to compute on
|
||||
cl_uint uiNumComputeUnits; // Number of compute units (SM's on NV GPU)
|
||||
shrLog("Get the Device info and select Device...\n");
|
||||
ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_DEFAULT, 0, NULL, &uiNumDevices);
|
||||
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL);
|
||||
cdDevices = (cl_device_id *)malloc(uiNumDevices * sizeof(cl_device_id) );
|
||||
ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_DEFAULT, uiNumDevices, cdDevices, NULL);
|
||||
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL);
|
||||
|
||||
// Get command line device options and config accordingly
|
||||
shrLog(" # of Devices Available = %u\n", uiNumDevices);
|
||||
if(shrGetCmdLineArgumentu(argc, (const char**)argv, "device", &uiTargetDevice)== shrTRUE)
|
||||
{
|
||||
uiTargetDevice = CLAMP(uiTargetDevice, 0, (uiNumDevices - 1));
|
||||
}
|
||||
shrLog(" Using Device %u: ", uiTargetDevice);
|
||||
oclPrintDevName(LOGBOTH, cdDevices[uiTargetDevice]);
|
||||
ciErrNum = clGetDeviceInfo(cdDevices[uiTargetDevice], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(uiNumComputeUnits), &uiNumComputeUnits, NULL);
|
||||
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL);
|
||||
shrLog("\n # of Compute Units = %u\n", uiNumComputeUnits);
|
||||
|
||||
// get command line arg for quick test, if provided
|
||||
bNoPrompt = shrCheckCmdLineFlag(argc, (const char**)argv, "noprompt");
|
||||
|
||||
// start logs
|
||||
cExecutableName = argv[0];
|
||||
shrSetLogFileName ("oclDotProduct.txt");
|
||||
shrLog("%s Starting...\n\n# of float elements per Array \t= %u\n", argv[0], iNumElements);
|
||||
|
||||
// set and log Global and Local work size dimensions
|
||||
szLocalWorkSize = 256;
|
||||
szGlobalWorkSize = shrRoundUp((int)szLocalWorkSize, iNumElements); // rounded up to the nearest multiple of the LocalWorkSize
|
||||
shrLog("Global Work Size \t\t= %u\nLocal Work Size \t\t= %u\n# of Work Groups \t\t= %u\n\n",
|
||||
szGlobalWorkSize, szLocalWorkSize, (szGlobalWorkSize % szLocalWorkSize + szGlobalWorkSize/szLocalWorkSize));
|
||||
|
||||
// Allocate and initialize host arrays
|
||||
shrLog( "Allocate and Init Host Mem...\n");
|
||||
srcA = (void *)malloc(sizeof(cl_float4) * szGlobalWorkSize);
|
||||
srcB = (void *)malloc(sizeof(cl_float4) * szGlobalWorkSize);
|
||||
dst = (void *)malloc(sizeof(cl_float) * szGlobalWorkSize);
|
||||
Golden = (void *)malloc(sizeof(cl_float) * iNumElements);
|
||||
shrFillArray((float*)srcA, 4 * iNumElements);
|
||||
shrFillArray((float*)srcB, 4 * iNumElements);
|
||||
|
||||
// Get the NVIDIA platform
|
||||
ciErrNum = oclGetPlatformID(&cpPlatform);
|
||||
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
|
||||
|
||||
// Get a GPU device
|
||||
ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_DEFAULT, 1, &cdDevices[uiTargetDevice], NULL);
|
||||
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
|
||||
|
||||
// Create the context
|
||||
cxGPUContext = clCreateContext(0, 1, &cdDevices[uiTargetDevice], NULL, NULL, &ciErrNum);
|
||||
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
|
||||
|
||||
// Create a command-queue
|
||||
shrLog("clCreateCommandQueue...\n");
|
||||
cqCommandQueue = clCreateCommandQueue(cxGPUContext, cdDevices[uiTargetDevice], 0, &ciErrNum);
|
||||
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
|
||||
|
||||
// Allocate the OpenCL buffer memory objects for source and result on the device GMEM
|
||||
shrLog("clCreateBuffer (SrcA, SrcB and Dst in Device GMEM)...\n");
|
||||
cmDevSrcA = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY, sizeof(cl_float) * szGlobalWorkSize * 4, NULL, &ciErrNum);
|
||||
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
|
||||
cmDevSrcB = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY, sizeof(cl_float) * szGlobalWorkSize * 4, NULL, &ciErrNum);
|
||||
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
|
||||
cmDevDst = clCreateBuffer(cxGPUContext, CL_MEM_WRITE_ONLY, sizeof(cl_float) * szGlobalWorkSize, NULL, &ciErrNum);
|
||||
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
|
||||
|
||||
// Read the OpenCL kernel in from source file
|
||||
shrLog("oclLoadProgSource (%s)...\n", cSourceFile);
|
||||
cPathAndName = shrFindFilePath(cSourceFile, argv[0]);
|
||||
//oclCheckErrorEX(cPathAndName != NULL, shrTRUE, pCleanup);
|
||||
cSourceCL = oclLoadProgSource(cPathAndName, "", &szKernelLength);
|
||||
//oclCheckErrorEX(cSourceCL != NULL, shrTRUE, pCleanup);
|
||||
|
||||
// Create the program
|
||||
shrLog("clCreateProgramWithSource...\n");
|
||||
//program = clCreateProgramWithSource(cxGPUContext, 1, (const char **)&cSourceCL, &szKernelLength, &ciErrNum);
|
||||
cl_program program =
|
||||
clCreateProgramWithBuiltInKernels(context, 1, &device_id, "sgemm", NULL);
|
||||
// Build the program with 'mad' Optimization option
|
||||
#ifdef MAC
|
||||
char* flags = "-cl-fast-relaxed-math -DMAC";
|
||||
#else
|
||||
char* flags = "-cl-fast-relaxed-math";
|
||||
#endif
|
||||
shrLog("clBuildProgram...\n");
|
||||
ciErrNum = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
|
||||
if (ciErrNum != CL_SUCCESS)
|
||||
{
|
||||
// write out standard error, Build Log and PTX, then cleanup and exit
|
||||
shrLogEx(LOGBOTH | ERRORMSG, ciErrNum, STDERROR);
|
||||
oclLogBuildInfo(program, oclGetFirstDev(cxGPUContext));
|
||||
oclLogPtx(program, oclGetFirstDev(cxGPUContext), "oclDotProduct.ptx");
|
||||
Cleanup(EXIT_FAILURE);
|
||||
}
|
||||
|
||||
// Create the kernel
|
||||
shrLog("clCreateKernel (DotProduct)...\n");
|
||||
ckKernel = clCreateKernel(program, "DotProduct", &ciErrNum);
|
||||
|
||||
// Set the Argument values
|
||||
shrLog("clSetKernelArg 0 - 3...\n\n");
|
||||
ciErrNum = clSetKernelArg(ckKernel, 0, sizeof(cl_mem), (void*)&cmDevSrcA);
|
||||
ciErrNum |= clSetKernelArg(ckKernel, 1, sizeof(cl_mem), (void*)&cmDevSrcB);
|
||||
ciErrNum |= clSetKernelArg(ckKernel, 2, sizeof(cl_mem), (void*)&cmDevDst);
|
||||
ciErrNum |= clSetKernelArg(ckKernel, 3, sizeof(cl_int), (void*)&iNumElements);
|
||||
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
|
||||
|
||||
// --------------------------------------------------------
|
||||
// Core sequence... copy input data to GPU, compute, copy results back
|
||||
|
||||
// Asynchronous write of data to GPU device
|
||||
shrLog("clEnqueueWriteBuffer (SrcA and SrcB)...\n");
|
||||
ciErrNum = clEnqueueWriteBuffer(cqCommandQueue, cmDevSrcA, CL_FALSE, 0, sizeof(cl_float) * szGlobalWorkSize * 4, srcA, 0, NULL, NULL);
|
||||
ciErrNum |= clEnqueueWriteBuffer(cqCommandQueue, cmDevSrcB, CL_FALSE, 0, sizeof(cl_float) * szGlobalWorkSize * 4, srcB, 0, NULL, NULL);
|
||||
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
|
||||
|
||||
// Launch kernel
|
||||
shrLog("clEnqueueNDRangeKernel (DotProduct)...\n");
|
||||
ciErrNum = clEnqueueNDRangeKernel(cqCommandQueue, ckKernel, 1, NULL, &szGlobalWorkSize, &szLocalWorkSize, 0, NULL, NULL);
|
||||
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
|
||||
|
||||
// Read back results and check accumulated errors
|
||||
shrLog("clEnqueueReadBuffer (Dst)...\n\n");
|
||||
ciErrNum = clEnqueueReadBuffer(cqCommandQueue, cmDevDst, CL_TRUE, 0, sizeof(cl_float) * szGlobalWorkSize, dst, 0, NULL, NULL);
|
||||
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
|
||||
|
||||
// Compute and compare results for golden-host and report errors and pass/fail
|
||||
shrLog("Comparing against Host/C++ computation...\n\n");
|
||||
DotProductHost ((const float*)srcA, (const float*)srcB, (float*)Golden, iNumElements);
|
||||
shrBOOL bMatch = shrComparefet((const float*)Golden, (const float*)dst, (unsigned int)iNumElements, 0.0f, 0);
|
||||
|
||||
// Cleanup and leave
|
||||
Cleanup (EXIT_SUCCESS);
|
||||
}
|
||||
|
||||
// "Golden" Host processing dot product function for comparison purposes
|
||||
// *********************************************************************
|
||||
// Host reference implementation: for each of the iNumElements outputs, sums the
// products of the next four consecutive values of pfData1 and pfData2.
//   pfData1, pfData2 : inputs, read at indices [0, 4 * iNumElements)
//   pfResult         : output, written at indices [0, iNumElements)
//   iNumElements     : number of four-wide dot products to compute
void DotProductHost(const float* pfData1, const float* pfData2, float* pfResult, int iNumElements)
{
    int src = 0;    // running index into the packed input arrays
    int out = 0;    // index of the result currently being accumulated

    while (out < iNumElements)
    {
        const int groupEnd = src + 4;

        pfResult[out] = 0.0f;
        while (src < groupEnd)
        {
            pfResult[out] += pfData1[src] * pfData2[src];
            ++src;
        }
        ++out;
    }
}
|
||||
|
||||
// Cleanup and exit code
|
||||
// *********************************************************************
|
||||
void Cleanup(int iExitCode)
|
||||
{
|
||||
// Cleanup allocated objects
|
||||
shrLog("Starting Cleanup...\n\n");
|
||||
if(cPathAndName)free(cPathAndName);
|
||||
if(cSourceCL)free(cSourceCL);
|
||||
if(ckKernel)clReleaseKernel(ckKernel);
|
||||
if(program)clReleaseProgram(program);
|
||||
if(cqCommandQueue)clReleaseCommandQueue(cqCommandQueue);
|
||||
if(cxGPUContext)clReleaseContext(cxGPUContext);
|
||||
if (cmDevSrcA)clReleaseMemObject(cmDevSrcA);
|
||||
if (cmDevSrcB)clReleaseMemObject(cmDevSrcB);
|
||||
if (cmDevDst)clReleaseMemObject(cmDevDst);
|
||||
|
||||
// Free host memory
|
||||
free(srcA);
|
||||
free(srcB);
|
||||
free (dst);
|
||||
free(Golden);
|
||||
|
||||
if (cdDevices) free(cdDevices);
|
||||
|
||||
shrQAFinishExit(*gp_argc, (const char **)*gp_argv, (iExitCode == EXIT_SUCCESS) ? QA_PASSED : QA_FAILED);
|
||||
}
|
||||
@@ -1,198 +0,0 @@
|
||||
/*
|
||||
* Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
|
||||
*
|
||||
* Please refer to the NVIDIA end user license agreement (EULA) associated
|
||||
* with this source code for terms and conditions that govern your use of
|
||||
* this software. Any use, reproduction, disclosure, or distribution of
|
||||
* this software and related documentation outside the terms of the EULA
|
||||
* is strictly prohibited.
|
||||
*
|
||||
*/
|
||||
|
||||
#ifndef OCL_UTILS_H
|
||||
#define OCL_UTILS_H
|
||||
|
||||
// *********************************************************************
|
||||
// Utilities specific to OpenCL samples in NVIDIA GPU Computing SDK
|
||||
// *********************************************************************
|
||||
|
||||
// Common headers: Cross-API utilities and OpenCL header
|
||||
#include <shrUtils.h>
|
||||
|
||||
// All OpenCL headers
|
||||
#if defined (__APPLE__) || defined(MACOSX)
|
||||
#include <OpenCL/opencl.h>
|
||||
#else
|
||||
#include <CL/opencl.h>
|
||||
#endif
|
||||
|
||||
// Includes
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
// For systems with CL_EXT that are not updated with these extensions, we copied these
|
||||
// extensions from <CL/cl_ext.h>
|
||||
#ifndef CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV
|
||||
/* cl_nv_device_attribute_query extension - no extension #define since it has no functions */
|
||||
#define CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV 0x4000
|
||||
#define CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV 0x4001
|
||||
#define CL_DEVICE_REGISTERS_PER_BLOCK_NV 0x4002
|
||||
#define CL_DEVICE_WARP_SIZE_NV 0x4003
|
||||
#define CL_DEVICE_GPU_OVERLAP_NV 0x4004
|
||||
#define CL_DEVICE_KERNEL_EXEC_TIMEOUT_NV 0x4005
|
||||
#define CL_DEVICE_INTEGRATED_MEMORY_NV 0x4006
|
||||
#endif
|
||||
|
||||
// reminders for build output window and log
|
||||
#ifdef _WIN32
|
||||
#pragma message ("Note: including shrUtils.h")
|
||||
#pragma message ("Note: including opencl.h")
|
||||
#endif
|
||||
|
||||
// SDK Revision #
|
||||
#define OCL_SDKREVISION "7027912"
|
||||
|
||||
// Error and Exit Handling Macros...
|
||||
// *********************************************************************
|
||||
// Full error handling macro with Cleanup() callback (if supplied)...
|
||||
// (Companion Inline Function lower on page)
|
||||
#define oclCheckErrorEX(a, b, c) __oclCheckErrorEX(a, b, c, __FILE__ , __LINE__)
|
||||
|
||||
// Short version without Cleanup() callback pointer
|
||||
// Both Input (a) and Reference (b) are specified as args
|
||||
#define oclCheckError(a, b) oclCheckErrorEX(a, b, 0)
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
//! Gets the platform ID for NVIDIA if available, otherwise default to platform 0
|
||||
//!
|
||||
//! @return the id
|
||||
//! @param clSelectedPlatformID OpenCL platform ID
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" cl_int oclGetPlatformID(cl_platform_id* clSelectedPlatformID);
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
//! Print info about the device
|
||||
//!
|
||||
//! @param iLogMode enum LOGBOTH, LOGCONSOLE, LOGFILE
|
||||
//! @param device OpenCL id of the device
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" void oclPrintDevInfo(int iLogMode, cl_device_id device);
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
//! Get and return device capability
|
||||
//!
|
||||
//! @return the 2 digit integer representation of device Cap (major minor). return -1 if NA
|
||||
//! @param device OpenCL id of the device
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" int oclGetDevCap(cl_device_id device);
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
//! Print the device name
|
||||
//!
|
||||
//! @param iLogMode enum LOGBOTH, LOGCONSOLE, LOGFILE
|
||||
//! @param device OpenCL id of the device
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" void oclPrintDevName(int iLogMode, cl_device_id device);
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
//! Gets the id of the first device from the context
|
||||
//!
|
||||
//! @return the id
|
||||
//! @param cxGPUContext OpenCL context
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" cl_device_id oclGetFirstDev(cl_context cxGPUContext);
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
//! Gets the id of the nth device from the context
|
||||
//!
|
||||
//! @return the id or -1 when out of range
|
||||
//! @param cxGPUContext OpenCL context
|
||||
//! @param device_idx index of the device of interest
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" cl_device_id oclGetDev(cl_context cxGPUContext, unsigned int device_idx);
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
//! Gets the id of device with maximal FLOPS from the context
|
||||
//!
|
||||
//! @return the id
|
||||
//! @param cxGPUContext OpenCL context
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" cl_device_id oclGetMaxFlopsDev(cl_context cxGPUContext);
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
//! Loads a Program file and prepends the cPreamble to the code.
|
||||
//!
|
||||
//! @return the source string if succeeded, 0 otherwise
|
||||
//! @param cFilename program filename
|
||||
//! @param cPreamble code that is prepended to the loaded file, typically a set of #defines or a header
|
||||
//! @param szFinalLength returned length of the code string
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" char* oclLoadProgSource(const char* cFilename, const char* cPreamble, size_t* szFinalLength);
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
//! Get the binary (PTX) of the program associated with the device
|
||||
//!
|
||||
//! @param cpProgram OpenCL program
|
||||
//! @param cdDevice device of interest
|
||||
//! @param binary returned code
|
||||
//! @param length length of returned code
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" void oclGetProgBinary( cl_program cpProgram, cl_device_id cdDevice, char** binary, size_t* length);
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
//! Get and log the binary (PTX) from the OpenCL compiler for the requested program & device
|
||||
//!
|
||||
//! @param cpProgram OpenCL program
|
||||
//! @param cdDevice device of interest
|
||||
//! @param const char* cPtxFileName optional PTX file name
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" void oclLogPtx(cl_program cpProgram, cl_device_id cdDevice, const char* cPtxFileName);
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
//! Get and log the Build Log from the OpenCL compiler for the requested program & device
|
||||
//!
|
||||
//! @param cpProgram OpenCL program
|
||||
//! @param cdDevice device of interest
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" void oclLogBuildInfo(cl_program cpProgram, cl_device_id cdDevice);
|
||||
|
||||
// Helper function for De-allocating cl objects
|
||||
// *********************************************************************
|
||||
extern "C" void oclDeleteMemObjs(cl_mem* cmMemObjs, int iNumObjs);
|
||||
|
||||
// Helper function to get OpenCL error string from constant
|
||||
// *********************************************************************
|
||||
extern "C" const char* oclErrorString(cl_int error);
|
||||
|
||||
// Helper function to get OpenCL image format string (channel order and type) from constant
|
||||
// *********************************************************************
|
||||
extern "C" const char* oclImageFormatString(cl_uint uiImageFormat);
|
||||
|
||||
// companion inline function for error checking and exit on error WITH Cleanup Callback (if supplied)
|
||||
// *********************************************************************
|
||||
// Companion of the oclCheckErrorEX macro.
//   iSample    : value being tested
//   iReference : value iSample is expected to equal
//   pCleanup   : optional callback invoked with the error code; when NULL the process exits
//   cFile/iLine: source location included in the log message
inline void __oclCheckErrorEX(cl_int iSample, cl_int iReference, void (*pCleanup)(int), const char* cFile, const int iLine)
{
    // Matching values mean there is nothing to report
    if (iReference == iSample)
    {
        return;
    }

    // A mismatch is an error by definition, so a sample of 0 is replaced with -9999
    if (iSample == 0)
    {
        iSample = -9999;
    }

    // Log the error info
    shrLog("\n !!! Error # %i (%s) at line %i , in file %s !!!\n\n", iSample, oclErrorString(iSample), iLine, cFile);

    // Without a cleanup callback, close the log and exit with the error code as the process exit code
    if (pCleanup == NULL)
    {
        shrLogEx(LOGBOTH | CLOSELOG, 0, "Exiting...\n");
        exit(iSample);
    }

    // Otherwise hand the error code to the caller-supplied cleanup routine
    pCleanup(iSample);
}
|
||||
|
||||
#endif
|
||||
@@ -1,238 +0,0 @@
|
||||
/*
|
||||
* Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
|
||||
*
|
||||
* Please refer to the NVIDIA end user license agreement (EULA) associated
|
||||
* with this source code for terms and conditions that govern your use of
|
||||
* this software. Any use, reproduction, disclosure, or distribution of
|
||||
* this software and related documentation outside the terms of the EULA
|
||||
* is strictly prohibited.
|
||||
*
|
||||
*/
|
||||
|
||||
#ifndef SHR_QATEST_H
|
||||
#define SHR_QATEST_H
|
||||
|
||||
// *********************************************************************
|
||||
// Generic utilities for NVIDIA GPU Computing SDK
|
||||
// *********************************************************************
|
||||
|
||||
// OS dependent includes
|
||||
#ifdef _WIN32
|
||||
#pragma message ("Note: including windows.h")
|
||||
#pragma message ("Note: including math.h")
|
||||
#pragma message ("Note: including assert.h")
|
||||
#pragma message ("Note: including time.h")
|
||||
|
||||
// Headers needed for Windows
|
||||
#include <windows.h>
|
||||
#include <time.h>
|
||||
#else
|
||||
// Headers needed for Linux
|
||||
#include <sys/stat.h>
|
||||
#include <sys/types.h>
|
||||
#include <sys/time.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <stdarg.h>
|
||||
#include <unistd.h>
|
||||
#include <time.h>
|
||||
#endif
|
||||
|
||||
#ifndef STRCASECMP
|
||||
#ifdef _WIN32
|
||||
#define STRCASECMP _stricmp
|
||||
#else
|
||||
#define STRCASECMP strcasecmp
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifndef STRNCASECMP
|
||||
#ifdef _WIN32
|
||||
#define STRNCASECMP _strnicmp
|
||||
#else
|
||||
#define STRNCASECMP strncasecmp
|
||||
#endif
|
||||
#endif
|
||||
|
||||
|
||||
// Standardized QA Start/Finish for CUDA SDK tests
|
||||
#define shrQAStart(a, b) __shrQAStart(a, b)
|
||||
#define shrQAFinish(a, b, c) __shrQAFinish(a, b, c)
|
||||
#define shrQAFinish2(a, b, c, d) __shrQAFinish2(a, b, c, d)
|
||||
|
||||
// Returns the index in exec_name of the first character after the last '\\' or '/',
// or 0 when the string contains no path separator.
inline int findExeNameStart(const char *exec_name)
{
    // Walk backwards from the terminating NUL towards the start of the string
    for (int pos = (int)strlen(exec_name); ; --pos)
    {
        const char ch = exec_name[pos];

        if (ch == '\\' || ch == '/')
        {
            return pos + 1;     // name begins right after the separator
        }
        if (pos <= 0)
        {
            return pos;         // reached the front without finding a separator
        }
    }
}
|
||||
|
||||
inline int __shrQAStart(int argc, char **argv)
|
||||
{
|
||||
bool bQATest = false;
|
||||
// First clear the output buffer
|
||||
fflush(stdout);
|
||||
fflush(stdout);
|
||||
|
||||
for (int i=1; i < argc; i++) {
|
||||
int string_start = 0;
|
||||
while (argv[i][string_start] == '-')
|
||||
string_start++;
|
||||
char *string_argv = &argv[i][string_start];
|
||||
|
||||
if (!STRCASECMP(string_argv, "qatest")) {
|
||||
bQATest = true;
|
||||
}
|
||||
}
|
||||
|
||||
// We don't want to print the entire path, so we search for the first
|
||||
int exename_start = findExeNameStart(argv[0]);
|
||||
if (bQATest) {
|
||||
fprintf(stdout, "&&&& RUNNING %s", &(argv[0][exename_start]));
|
||||
for (int i=1; i < argc; i++) fprintf(stdout, " %s", argv[i]);
|
||||
fprintf(stdout, "\n");
|
||||
} else {
|
||||
fprintf(stdout, "[%s] starting...\n", &(argv[0][exename_start]));
|
||||
}
|
||||
fflush(stdout);
|
||||
printf("\n"); fflush(stdout);
|
||||
return exename_start;
|
||||
}
|
||||
|
||||
// Result codes accepted by the QA finish helpers (values 0, 1 and 2, in this order)
enum eQAstatus
{
    QA_FAILED = 0,
    QA_PASSED,
    QA_WAIVED
};
|
||||
|
||||
// Prints a countdown on stdout and blocks until roughly `seconds` seconds have elapsed.
// Returns immediately (after printing the header and "done!") when seconds <= 0.
inline void __ExitInTime(int seconds)
{
    fprintf(stdout, "> exiting in %d seconds: ", seconds);
    fflush(stdout);

    const time_t deadline = time(0) + seconds;
    for (int count = seconds; time(0) < deadline; count--) {
        fprintf(stdout, "%d...", count);
        // No newline is printed here, so flush explicitly to make each tick visible as it happens
        fflush(stdout);
        // Use _WIN32 (what the rest of this header tests) so the choice of sleep call
        // always matches the set of headers that were included above
#ifdef _WIN32
        Sleep(1000);
#else
        sleep(1);
#endif
    }
    fprintf(stdout,"done!\n\n");
    fflush(stdout);
}
|
||||
|
||||
|
||||
inline void __shrQAFinish(int argc, const char **argv, int iStatus)
|
||||
{
|
||||
// By default QATest is disabled and NoPrompt is Enabled (times out at seconds passed into __ExitInTime() )
|
||||
bool bQATest = false, bNoPrompt = true, bQuitInTime = true;
|
||||
const char *sStatus[] = { "FAILED", "PASSED", "WAIVED", NULL };
|
||||
|
||||
for (int i=1; i < argc; i++) {
|
||||
int string_start = 0;
|
||||
while (argv[i][string_start] == '-')
|
||||
string_start++;
|
||||
|
||||
const char *string_argv = &argv[i][string_start];
|
||||
if (!STRCASECMP(string_argv, "qatest")) {
|
||||
bQATest = true;
|
||||
}
|
||||
// For SDK individual samples that don't specify -noprompt or -prompt,
|
||||
// a 3 second delay will happen before exiting, giving a user time to view results
|
||||
if (!STRCASECMP(string_argv, "noprompt") || !STRCASECMP(string_argv, "help")) {
|
||||
bNoPrompt = true;
|
||||
bQuitInTime = false;
|
||||
}
|
||||
if (!STRCASECMP(string_argv, "prompt")) {
|
||||
bNoPrompt = false;
|
||||
bQuitInTime = false;
|
||||
}
|
||||
}
|
||||
|
||||
int exename_start = findExeNameStart(argv[0]);
|
||||
if (bQATest) {
|
||||
fprintf(stdout, "&&&& %s %s", sStatus[iStatus], &(argv[0][exename_start]));
|
||||
for (int i=1; i < argc; i++) fprintf(stdout, " %s", argv[i]);
|
||||
fprintf(stdout, "\n");
|
||||
} else {
|
||||
fprintf(stdout, "[%s] test results...\n%s\n", &(argv[0][exename_start]), sStatus[iStatus]);
|
||||
}
|
||||
fflush(stdout);
|
||||
printf("\n"); fflush(stdout);
|
||||
if (bQuitInTime) {
|
||||
__ExitInTime(3);
|
||||
} else {
|
||||
if (!bNoPrompt) {
|
||||
fprintf(stdout, "\nPress <Enter> to exit...\n");
|
||||
fflush(stdout);
|
||||
getchar();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
inline void __shrQAFinish2(bool bQATest, int argc, const char **argv, int iStatus)
|
||||
{
|
||||
bool bQuitInTime = true;
|
||||
const char *sStatus[] = { "FAILED", "PASSED", "WAIVED", NULL };
|
||||
|
||||
for (int i=1; i < argc; i++) {
|
||||
int string_start = 0;
|
||||
while (argv[i][string_start] == '-')
|
||||
string_start++;
|
||||
|
||||
const char *string_argv = &argv[i][string_start];
|
||||
// For SDK individual samples that don't specify -noprompt or -prompt,
|
||||
// a 3 second delay will happen before exiting, giving a user time to view results
|
||||
if (!STRCASECMP(string_argv, "noprompt") || !STRCASECMP(string_argv, "help")) {
|
||||
bQuitInTime = false;
|
||||
}
|
||||
if (!STRCASECMP(string_argv, "prompt")) {
|
||||
bQuitInTime = false;
|
||||
}
|
||||
}
|
||||
|
||||
int exename_start = findExeNameStart(argv[0]);
|
||||
if (bQATest) {
|
||||
fprintf(stdout, "&&&& %s %s", sStatus[iStatus], &(argv[0][exename_start]));
|
||||
for (int i=1; i < argc; i++) fprintf(stdout, " %s", argv[i]);
|
||||
fprintf(stdout, "\n");
|
||||
} else {
|
||||
fprintf(stdout, "[%s] test results...\n%s\n", &(argv[0][exename_start]), sStatus[iStatus]);
|
||||
}
|
||||
fflush(stdout);
|
||||
|
||||
if (bQuitInTime) {
|
||||
__ExitInTime(3);
|
||||
}
|
||||
}
|
||||
|
||||
inline void shrQAFinishExit(int argc, const char **argv, int iStatus)
|
||||
{
|
||||
__shrQAFinish(argc, argv, iStatus);
|
||||
|
||||
exit(iStatus ? EXIT_SUCCESS : EXIT_FAILURE);
|
||||
}
|
||||
|
||||
inline void shrQAFinishExit2(bool bQAtest, int argc, const char **argv, int iStatus)
|
||||
{
|
||||
__shrQAFinish2(bQAtest, argc, argv, iStatus);
|
||||
|
||||
exit(iStatus ? EXIT_SUCCESS : EXIT_FAILURE);
|
||||
}
|
||||
|
||||
#endif
|
||||
@@ -1,642 +0,0 @@
|
||||
/*
|
||||
* Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
|
||||
*
|
||||
* Please refer to the NVIDIA end user license agreement (EULA) associated
|
||||
* with this source code for terms and conditions that govern your use of
|
||||
* this software. Any use, reproduction, disclosure, or distribution of
|
||||
* this software and related documentation outside the terms of the EULA
|
||||
* is strictly prohibited.
|
||||
*
|
||||
*/
|
||||
|
||||
#ifndef SHR_UTILS_H
|
||||
#define SHR_UTILS_H
|
||||
|
||||
// *********************************************************************
|
||||
// Generic utilities for NVIDIA GPU Computing SDK
|
||||
// *********************************************************************
|
||||
|
||||
// reminders for output window and build log
|
||||
#ifdef _WIN32
|
||||
#pragma message ("Note: including windows.h")
|
||||
#pragma message ("Note: including math.h")
|
||||
#pragma message ("Note: including assert.h")
|
||||
#endif
|
||||
|
||||
// OS dependent includes
|
||||
#ifdef _WIN32
|
||||
// Headers needed for Windows
|
||||
#include <windows.h>
|
||||
#else
|
||||
// Headers needed for Linux
|
||||
#include <sys/stat.h>
|
||||
#include <sys/types.h>
|
||||
#include <sys/time.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <stdarg.h>
|
||||
#endif
|
||||
|
||||
// Other headers needed for both Windows and Linux
|
||||
#include <math.h>
|
||||
#include <assert.h>
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
// Un-comment the following #define to enable profiling code in SDK apps
|
||||
//#define GPU_PROFILING
|
||||
|
||||
// Beginning of GPU Architecture definitions
|
||||
// Maps an SM (compute capability) version to the number of cores per SM.
//   major, minor : SM version, e.g. (2, 1) for SM 2.1
//   returns      : cores per SM, or -1 (after printing a message) if the version is not in the table
inline int ConvertSMVer2Cores(int major, int minor)
{
    // Defines for GPU Architecture types (using the SM version to determine the # of cores per SM)
    typedef struct {
        int SM;     // 0xMm (hexadecimal notation), M = SM Major version, and m = SM minor version
        int Cores;
    } sSMtoCores;

    // Built once rather than on every call; the { -1, -1 } entry terminates the table
    static const sSMtoCores nGpuArchCoresPerSM[] =
    { { 0x10,   8 }, // Tesla Generation  (SM 1.0) G80 class
      { 0x11,   8 }, // Tesla Generation  (SM 1.1) G8x class
      { 0x12,   8 }, // Tesla Generation  (SM 1.2) G9x class
      { 0x13,   8 }, // Tesla Generation  (SM 1.3) GT200 class
      { 0x20,  32 }, // Fermi Generation  (SM 2.0) GF100 class
      { 0x21,  48 }, // Fermi Generation  (SM 2.1) GF10x class
      { 0x30, 192 }, // Kepler Generation (SM 3.0) GK10x class
      {   -1,  -1 }
    };

    // Pack the version the same way the table keys are written
    const int smVersion = (major << 4) + minor;

    for (int index = 0; nGpuArchCoresPerSM[index].SM != -1; index++) {
        if (nGpuArchCoresPerSM[index].SM == smVersion) {
            return nGpuArchCoresPerSM[index].Cores;
        }
    }

    printf("MapSMtoCores SM %d.%d is undefined (please update to the latest SDK)!\n", major, minor);
    return -1;
}
|
||||
// end of GPU Architecture definitions
|
||||
|
||||
|
||||
// Defines and enum for use with logging functions
|
||||
// *********************************************************************
|
||||
#define DEFAULTLOGFILE "SdkConsoleLog.txt"
|
||||
#define MASTERLOGFILE "SdkMasterLog.csv"
|
||||
// Bit flags combined with | to form the iLogMode argument of shrLogEx
enum LOGMODES
{
    LOGCONSOLE = 1 << 0,                // bit to signal "log to console"
    LOGFILE    = 1 << 1,                // bit to signal "log to file"
    LOGBOTH    = LOGCONSOLE | LOGFILE,  // convenience union of first 2 bits to signal "log to both"
    APPENDMODE = 1 << 2,                // bit to set "file append" mode instead of "replace mode" on open
    MASTER     = 1 << 3,                // bit to signal master .csv log output
    ERRORMSG   = 1 << 4,                // bit to signal "pre-pend Error"
    CLOSELOG   = 1 << 5                 // bit to close log file, if open, after any requested file write
};
|
||||
#define HDASHLINE "-----------------------------------------------------------\n"
|
||||
|
||||
// Standardized boolean
|
||||
// Two-valued result type returned by the shr* helper functions (shrFALSE is 0, shrTRUE is 1)
enum shrBOOL
{
    shrFALSE = 0,
    shrTRUE
};
|
||||
|
||||
// Standardized MAX, MIN and CLAMP
|
||||
// Every argument is parenthesized so that expression arguments (e.g. MAX(x & m, y))
// are compared as a whole. Arguments are still evaluated more than once, so do not
// pass expressions with side effects.
#define MAX(a, b) (((a) > (b)) ? (a) : (b))
#define MIN(a, b) (((a) < (b)) ? (a) : (b))
#define CLAMP(a, b, c) MIN(MAX((a), (b)), (c))   // double sided clip of input a
#define TOPCLAMP(a, b) (((a) < (b)) ? (a) : (b)) // single top side clip of input a
|
||||
|
||||
// Error and Exit Handling Macros...
|
||||
// *********************************************************************
|
||||
// Full error handling macro with Cleanup() callback (if supplied)...
|
||||
// (Companion Inline Function lower on page)
|
||||
#define shrCheckErrorEX(a, b, c) __shrCheckErrorEX(a, b, c, __FILE__ , __LINE__)
|
||||
|
||||
// Short version without Cleanup() callback pointer
|
||||
// Both Input (a) and Reference (b) are specified as args
|
||||
#define shrCheckError(a, b) shrCheckErrorEX(a, b, 0)
|
||||
|
||||
// Standardized Exit Macro for leaving main()... extended version
|
||||
// (Companion Inline Function lower on page)
|
||||
#define shrExitEX(a, b, c) __shrExitEX(a, b, c)
|
||||
|
||||
// Standardized Exit Macro for leaving main()... short version
|
||||
// (Companion Inline Function lower on page)
|
||||
#define shrEXIT(a, b) __shrExitEX(a, b, EXIT_SUCCESS)
|
||||
|
||||
// Simple argument checker macro
|
||||
#define ARGCHECK(a) if((a) != shrTRUE)return shrFALSE
|
||||
|
||||
// Define for user-customized error handling
|
||||
#define STDERROR "file %s, line %i\n\n" , __FILE__ , __LINE__
|
||||
|
||||
// Function to deallocate memory allocated within shrUtils
|
||||
// *********************************************************************
|
||||
extern "C" void shrFree(void* ptr);
|
||||
|
||||
// *********************************************************************
|
||||
// Helper function to log standardized information to Console, to File or to both
|
||||
//! Examples: shrLogEx(LOGBOTH, 0, "Function A\n");
|
||||
//! : shrLogEx(LOGBOTH | ERRORMSG, ciErrNum, STDERROR);
|
||||
//!
|
||||
//! Automatically opens file and stores handle if needed and not done yet
|
||||
//! Closes file and nulls handle on request
|
||||
//!
|
||||
//! @param 0 iLogMode: LOGCONSOLE, LOGFILE, LOGBOTH, APPENDMODE, MASTER, ERRORMSG, CLOSELOG.
|
||||
//! LOGFILE and LOGBOTH may be | 'd with APPENDMODE to select file append mode instead of overwrite mode
|
||||
//! LOGFILE and LOGBOTH may be | 'd with CLOSELOG to "write and close"
|
||||
//! First 3 options may be | 'd with MASTER to enable independent write to master data log file
|
||||
//! First 3 options may be | 'd with ERRORMSG to start line with standard error message
|
||||
//! @param 2 dValue:
|
||||
//! Positive val = double value for time in secs to be formatted to 6 decimals.
|
||||
//! Negative val is an error code and this give error preformatting.
|
||||
//! @param 3 cFormatString: String with formatting specifiers like printf or fprintf.
|
||||
//! ALL printf flags, width, precision and type specifiers are supported with this exception:
|
||||
//! Wide char type specifiers intended for wprintf (%S and %C) are NOT supported
|
||||
//! Single byte char type specifiers (%s and %c) ARE supported
|
||||
//! @param 4... variable args: like printf or fprintf. Must match format specifier type above.
|
||||
//! @return 0 if OK, negative value on error or if error occurs or was passed in.
|
||||
// *********************************************************************
|
||||
extern "C" int shrLogEx(int iLogMode, int iErrNum, const char* cFormatString, ...);
|
||||
|
||||
// Short version of shrLogEx defaulting to shrLogEx(LOGBOTH, 0,
|
||||
// *********************************************************************
|
||||
extern "C" int shrLog(const char* cFormatString, ...);
|
||||
|
||||
// *********************************************************************
|
||||
// Delta timer function for up to 3 independent timers using host high performance counters
|
||||
// Maintains state for 3 independent counters
|
||||
//! Example: double dElapsedTime = shrDeltaT(0);
|
||||
//!
|
||||
//! @param 0 iCounterID: Which timer to check/reset. (0, 1, 2)
|
||||
//! @return delta time of specified counter since last call in seconds. Otherwise -9999.0 if error
|
||||
// *********************************************************************
|
||||
extern "C" double shrDeltaT(int iCounterID);
|
||||
|
||||
// Optional LogFileNameOverride function
|
||||
// *********************************************************************
|
||||
extern "C" void shrSetLogFileName (const char* cOverRideName);
|
||||
|
||||
// Helper function to init data arrays
|
||||
// *********************************************************************
|
||||
extern "C" void shrFillArray(float* pfData, int iSize);
|
||||
|
||||
// Helper function to print data arrays
|
||||
// *********************************************************************
|
||||
extern "C" void shrPrintArray(float* pfData, int iSize);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Find the path for a filename
|
||||
//! @return the path if succeeded, otherwise 0
|
||||
//! @param filename name of the file
|
||||
//! @param executablePath optional absolute path of the executable
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" char* shrFindFilePath(const char* filename, const char* executablePath);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Read file \filename containing single precision floating point data
|
||||
//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE
|
||||
//! @param filename name of the source file
|
||||
//! @param data uninitialized pointer, returned initialized and pointing to
|
||||
//! the data read
|
||||
//! @param len number of data elements in data, -1 on error
|
||||
//! @note If a NULL pointer is passed to this function and it is initialized
|
||||
//! within shrUtils, then free() has to be used to deallocate the memory
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrReadFilef( const char* filename, float** data, unsigned int* len,
|
||||
bool verbose = false);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Read file \filename containing double precision floating point data
|
||||
//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE
|
||||
//! @param filename name of the source file
|
||||
//! @param data uninitialized pointer, returned initialized and pointing to
|
||||
//! the data read
|
||||
//! @param len number of data elements in data, -1 on error
|
||||
//! @note If a NULL pointer is passed to this function and it is initialized
|
||||
//! within shrUtils, then free() has to be used to deallocate the memory
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrReadFiled( const char* filename, double** data, unsigned int* len,
|
||||
bool verbose = false);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Read file \filename containing integer data
|
||||
//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE
|
||||
//! @param filename name of the source file
|
||||
//! @param data uninitialized pointer, returned initialized and pointing to
|
||||
//! the data read
|
||||
//! @param len number of data elements in data, -1 on error
|
||||
//! @note If a NULL pointer is passed to this function and it is initialized
|
||||
//! within shrUtils, then free() has to be used to deallocate the memory
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrReadFilei( const char* filename, int** data, unsigned int* len, bool verbose = false);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Read file \filename containing unsigned integer data
|
||||
//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE
|
||||
//! @param filename name of the source file
|
||||
//! @param data uninitialized pointer, returned initialized and pointing to
|
||||
//! the data read
|
||||
//! @param len number of data elements in data, -1 on error
|
||||
//! @note If a NULL pointer is passed to this function and it is initialized
|
||||
//! within shrUtils, then free() has to be used to deallocate the memory
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrReadFileui( const char* filename, unsigned int** data,
|
||||
unsigned int* len, bool verbose = false);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Read file \filename containing char / byte data
|
||||
//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE
|
||||
//! @param filename name of the source file
|
||||
//! @param data uninitialized pointer, returned initialized and pointing to
|
||||
//! the data read
|
||||
//! @param len number of data elements in data, -1 on error
|
||||
//! @note If a NULL pointer is passed to this function and it is initialized
|
||||
//! within shrUtils, then free() has to be used to deallocate the memory
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrReadFileb( const char* filename, char** data, unsigned int* len,
|
||||
bool verbose = false);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Read file \filename containing unsigned char / byte data
|
||||
//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE
|
||||
//! @param filename name of the source file
|
||||
//! @param data uninitialized pointer, returned initialized and pointing to
|
||||
//! the data read
|
||||
//! @param len number of data elements in data, -1 on error
|
||||
//! @note If a NULL pointer is passed to this function and it is initialized
|
||||
//! within shrUtils, then free() has to be used to deallocate the memory
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrReadFileub( const char* filename, unsigned char** data,
|
||||
unsigned int* len, bool verbose = false);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Write a data file \filename containing single precision floating point
|
||||
//! data
|
||||
//! @return shrTRUE if writing the file succeeded, otherwise shrFALSE
|
||||
//! @param filename name of the file to write
|
||||
//! @param data pointer to data to write
|
||||
//! @param len number of data elements in data, -1 on error
|
||||
//! @param epsilon epsilon for comparison
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrWriteFilef( const char* filename, const float* data, unsigned int len,
|
||||
const float epsilon, bool verbose = false);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Write a data file \filename containing double precision floating point
|
||||
//! data
|
||||
//! @return shrTRUE if writing the file succeeded, otherwise shrFALSE
|
||||
//! @param filename name of the file to write
|
||||
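//! @param data pointer to data to write (NOTE(review): the prototype below declares this as const float* although the file is documented as double precision - confirm against the implementation)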
|
||||
//! @param len number of data elements in data, -1 on error
|
||||
//! @param epsilon epsilon for comparison
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrWriteFiled( const char* filename, const float* data, unsigned int len,
|
||||
const double epsilon, bool verbose = false);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Write a data file \filename containing integer data
|
||||
//! @return shrTRUE if writing the file succeeded, otherwise shrFALSE
|
||||
//! @param filename name of the file to write
|
||||
//! @param data pointer to data to write
|
||||
//! @param len number of data elements in data, -1 on error
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrWriteFilei( const char* filename, const int* data, unsigned int len,
|
||||
bool verbose = false);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Write a data file \filename containing unsigned integer data
|
||||
//! @return shrTRUE if writing the file succeeded, otherwise shrFALSE
|
||||
//! @param filename name of the file to write
|
||||
//! @param data pointer to data to write
|
||||
//! @param len number of data elements in data, -1 on error
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrWriteFileui( const char* filename, const unsigned int* data,
|
||||
unsigned int len, bool verbose = false);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Write a data file \filename containing char / byte data
|
||||
//! @return shrTRUE if writing the file succeeded, otherwise shrFALSE
|
||||
//! @param filename name of the file to write
|
||||
//! @param data pointer to data to write
|
||||
//! @param len number of data elements in data, -1 on error
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrWriteFileb( const char* filename, const char* data, unsigned int len,
|
||||
bool verbose = false);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Write a data file \filename containing unsigned char / byte data
|
||||
//! @return shrTRUE if writing the file succeeded, otherwise shrFALSE
|
||||
//! @param filename name of the file to write
|
||||
//! @param data pointer to data to write
|
||||
//! @param len number of data elements in data, -1 on error
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrWriteFileub( const char* filename, const unsigned char* data,
|
||||
unsigned int len, bool verbose = false);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Load PPM image file (with unsigned char as data element type), padding
|
||||
//! 4th component
|
||||
//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE
|
||||
//! @param file name of the image file
|
||||
//! @param OutData handle to the data read
|
||||
//! @param w width of the image
|
||||
//! @param h height of the image
|
||||
//!
|
||||
//! Note: If *OutData is NULL this function allocates buffer that must be freed by caller
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrLoadPPM4ub(const char* file, unsigned char** OutData,
|
||||
unsigned int *w, unsigned int *h);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Save PPM image file (with unsigned char as data element type, padded to
|
||||
//! 4 bytes)
|
||||
//! @return shrTRUE if saving the file succeeded, otherwise shrFALSE
|
||||
//! @param file name of the image file
|
||||
//! @param data handle to the data read
|
||||
//! @param w width of the image
|
||||
//! @param h height of the image
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrSavePPM4ub( const char* file, unsigned char *data,
|
||||
unsigned int w, unsigned int h);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
//! Save PGM image file (with unsigned char as data element type)
|
||||
//! @return shrTRUE if saving the file succeeded, otherwise shrFALSE
|
||||
//! @param file name of the image file
|
||||
//! @param data handle to the data read
|
||||
//! @param w width of the image
|
||||
//! @param h height of the image
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrSavePGMub( const char* file, unsigned char *data,
|
||||
unsigned int w, unsigned int h);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Load PGM image file (with unsigned char as data element type)
|
||||
//! @return shrTRUE if loading the file succeeded, otherwise shrFALSE
|
||||
//! @param file name of the image file
|
||||
//! @param data handle to the data read
|
||||
//! @param w width of the image
|
||||
//! @param h height of the image
|
||||
//! @note If a NULL pointer is passed to this function and it is initialized
|
||||
//! within shrUtils, then free() has to be used to deallocate the memory
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrLoadPGMub( const char* file, unsigned char** data,
|
||||
unsigned int *w,unsigned int *h);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
// Command line arguments: General notes
|
||||
// * All command line arguments begin with '--' followed by the token;
|
||||
// token and value are separated by '='; example --samples=50
|
||||
// * Arrays have the form --model=[one.obj,two.obj,three.obj]
|
||||
// (without whitespaces)
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Check if command line argument \a flag_name is given
|
||||
//! @return shrTRUE if command line argument \a flag_name has been given,
|
||||
//! otherwise shrFALSE
|
||||
//! @param argc argc as passed to main()
|
||||
//! @param argv argv as passed to main()
|
||||
//! @param flag_name name of command line flag
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrCheckCmdLineFlag( const int argc, const char** argv,
|
||||
const char* flag_name);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Get the value of a command line argument of type int
|
||||
//! @return shrTRUE if command line argument \a arg_name has been given and
|
||||
//! is of the requested type, otherwise shrFALSE
|
||||
//! @param argc argc as passed to main()
|
||||
//! @param argv argv as passed to main()
|
||||
//! @param arg_name name of the command line argument
|
||||
//! @param val value of the command line argument
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrGetCmdLineArgumenti( const int argc, const char** argv,
|
||||
const char* arg_name, int* val);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Get the value of a command line argument of type unsigned int
|
||||
//! @return shrTRUE if command line argument \a arg_name has been given and
|
||||
//! is of the requested type, otherwise shrFALSE
|
||||
//! @param argc argc as passed to main()
|
||||
//! @param argv argv as passed to main()
|
||||
//! @param arg_name name of the command line argument
|
||||
//! @param val value of the command line argument
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrGetCmdLineArgumentu( const int argc, const char** argv,
|
||||
const char* arg_name, unsigned int* val);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Get the value of a command line argument of type float
|
||||
//! @return shrTRUE if command line argument \a arg_name has been given and
|
||||
//! is of the requested type, otherwise shrFALSE
|
||||
//! @param argc argc as passed to main()
|
||||
//! @param argv argv as passed to main()
|
||||
//! @param arg_name name of the command line argument
|
||||
//! @param val value of the command line argument
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrGetCmdLineArgumentf( const int argc, const char** argv,
|
||||
const char* arg_name, float* val);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Get the value of a command line argument of type string
|
||||
//! @return shrTRUE if command line argument \a arg_name has been given and
|
||||
//! is of the requested type, otherwise shrFALSE
|
||||
//! @param argc argc as passed to main()
|
||||
//! @param argv argv as passed to main()
|
||||
//! @param arg_name name of the command line argument
|
||||
//! @param val value of the command line argument
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrGetCmdLineArgumentstr( const int argc, const char** argv,
|
||||
const char* arg_name, char** val);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Get the value of a command line argument list whose elements are strings
|
||||
//! @return shrTRUE if command line argument \a arg_name has been given and
|
||||
//! is of the requested type, otherwise shrFALSE
|
||||
//! @param argc argc as passed to main()
|
||||
//! @param argv argv as passed to main()
|
||||
//! @param arg_name name of the command line argument
|
||||
//! @param val command line argument list
|
||||
//! @param len length of the list / number of elements
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrGetCmdLineArgumentListstr( const int argc, const char** argv,
|
||||
const char* arg_name, char** val,
|
||||
unsigned int* len);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Compare two float arrays
|
||||
//! @return shrTRUE if \a reference and \a data are identical, otherwise shrFALSE
|
||||
//! @param reference handle to the reference data / gold image
|
||||
//! @param data handle to the computed data
|
||||
//! @param len number of elements in reference and data
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrComparef( const float* reference, const float* data,
|
||||
const unsigned int len);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Compare two integer arrays
|
||||
//! @return shrTRUE if \a reference and \a data are identical, otherwise shrFALSE
|
||||
//! @param reference handle to the reference data / gold image
|
||||
//! @param data handle to the computed data
|
||||
//! @param len number of elements in reference and data
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrComparei( const int* reference, const int* data,
|
||||
const unsigned int len );
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
//! Compare two unsigned integer arrays, with epsilon and threshold
|
||||
//! @return shrTRUE if \a reference and \a data are identical, otherwise shrFALSE
|
||||
//! @param reference handle to the reference data / gold image
|
||||
//! @param data handle to the computed data
|
||||
//! @param len number of elements in reference and data
|
||||
//! @param threshold tolerance % # of comparison errors (0.15f = 15%)
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrCompareuit( const unsigned int* reference, const unsigned int* data,
|
||||
const unsigned int len, const float epsilon, const float threshold );
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Compare two unsigned char arrays
|
||||
//! @return shrTRUE if \a reference and \a data are identical, otherwise shrFALSE
|
||||
//! @param reference handle to the reference data / gold image
|
||||
//! @param data handle to the computed data
|
||||
//! @param len number of elements in reference and data
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrCompareub( const unsigned char* reference, const unsigned char* data,
|
||||
const unsigned int len );
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
//! Compare two unsigned char arrays with a tolerance for # of byte errors
|
||||
//! @return shrTRUE if \a reference and \a data are identical, otherwise shrFALSE
|
||||
//! @param reference handle to the reference data / gold image
|
||||
//! @param data handle to the computed data
|
||||
//! @param len number of elements in reference and data
|
||||
//! @param epsilon epsilon to use for the comparison
|
||||
//! @param threshold tolerance % # of comparison errors (0.15f = 15%)
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrCompareubt( const unsigned char* reference, const unsigned char* data,
|
||||
const unsigned int len, const float epsilon, const float threshold );
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
//! Compare two unsigned char arrays with an epsilon tolerance for equality
|
||||
//! @return shrTRUE if \a reference and \a data are identical, otherwise shrFALSE
|
||||
//! @param reference handle to the reference data / gold image
|
||||
//! @param data handle to the computed data
|
||||
//! @param len number of elements in reference and data
|
||||
//! @param epsilon epsilon to use for the comparison
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrCompareube( const unsigned char* reference, const unsigned char* data,
|
||||
const unsigned int len, const float epsilon );
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Compare two float arrays with an epsilon tolerance for equality
|
||||
//! @return shrTRUE if \a reference and \a data are identical, otherwise shrFALSE
|
||||
//! @param reference handle to the reference data / gold image
|
||||
//! @param data handle to the computed data
|
||||
//! @param len number of elements in reference and data
|
||||
//! @param epsilon epsilon to use for the comparison
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrComparefe( const float* reference, const float* data,
|
||||
const unsigned int len, const float epsilon );
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
//! Compare two float arrays with an epsilon tolerance for equality and a
|
||||
//! threshold for # pixel errors
|
||||
//! @return shrTRUE if \a reference and \a data are identical, otherwise shrFALSE
|
||||
//! @param reference handle to the reference data / gold image
|
||||
//! @param data handle to the computed data
|
||||
//! @param len number of elements in reference and data
|
||||
//! @param epsilon epsilon to use for the comparison
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrComparefet( const float* reference, const float* data,
|
||||
const unsigned int len, const float epsilon, const float threshold );
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Compare two float arrays using L2-norm with an epsilon tolerance for
|
||||
//! equality
|
||||
//! @return shrTRUE if \a reference and \a data are identical, otherwise shrFALSE
|
||||
//! @param reference handle to the reference data / gold image
|
||||
//! @param data handle to the computed data
|
||||
//! @param len number of elements in reference and data
|
||||
//! @param epsilon epsilon to use for the comparison
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrCompareL2fe( const float* reference, const float* data,
|
||||
const unsigned int len, const float epsilon );
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
//! Compare two PPM image files with an epsilon tolerance for equality
|
||||
//! @return shrTRUE if \a reference and \a data are identical, otherwise shrFALSE
|
||||
//! @param src_file filename for the image to be compared
|
||||
//! @param ref_file filename for the reference data / gold image
|
||||
//! @param epsilon epsilon to use for the comparison
|
||||
//! @param threshold threshold of pixels that can still mismatch to pass (i.e. 0.15f = 15% must pass)
|
||||
//! $param verboseErrors output details of image mismatch to std::err
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrComparePPM( const char *src_file, const char *ref_file, const float epsilon, const float threshold);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
//! Compare two PGM image files with an epsilon tolerance for equality
|
||||
//! @return shrTRUE if \a reference and \a data are identical, otherwise shrFALSE
|
||||
//! @param src_file filename for the image to be compared
|
||||
//! @param ref_file filename for the reference data / gold image
|
||||
//! @param epsilon epsilon to use for the comparison
|
||||
//! @param threshold threshold of pixels that can still mismatch to pass (i.e. 0.15f = 15% must pass)
|
||||
//! $param verboseErrors output details of image mismatch to std::err
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrComparePGM( const char *src_file, const char *ref_file, const float epsilon, const float threshold);
|
||||
|
||||
extern "C" unsigned char* shrLoadRawFile(const char* filename, size_t size);
|
||||
|
||||
extern "C" size_t shrRoundUp(int group_size, int global_size);
|
||||
|
||||
// companion inline function for error checking and exit on error WITH Cleanup Callback (if supplied)
|
||||
// *********************************************************************
|
||||
inline void __shrCheckErrorEX(int iSample, int iReference, void (*pCleanup)(int), const char* cFile, const int iLine)
|
||||
{
|
||||
if (iReference != iSample)
|
||||
{
|
||||
shrLogEx(LOGBOTH | ERRORMSG, iSample, "line %i , in file %s !!!\n\n" , iLine, cFile);
|
||||
if (pCleanup != NULL)
|
||||
{
|
||||
pCleanup(EXIT_FAILURE);
|
||||
}
|
||||
else
|
||||
{
|
||||
shrLogEx(LOGBOTH | CLOSELOG, 0, "Exiting...\n");
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Standardized Exit
|
||||
// *********************************************************************
|
||||
inline void __shrExitEX(int argc, const char** argv, int iExitCode)
|
||||
{
|
||||
#ifdef WIN32
|
||||
if (!shrCheckCmdLineFlag(argc, argv, "noprompt") && !shrCheckCmdLineFlag(argc, argv, "qatest"))
|
||||
#else
|
||||
if (shrCheckCmdLineFlag(argc, argv, "prompt") && !shrCheckCmdLineFlag(argc, argv, "qatest"))
|
||||
#endif
|
||||
{
|
||||
shrLogEx(LOGBOTH | CLOSELOG, 0, "\nPress <Enter> to Quit...\n");
|
||||
getchar();
|
||||
}
|
||||
else
|
||||
{
|
||||
shrLogEx(LOGBOTH | CLOSELOG, 0, "%s Exiting...\n", argv[0]);
|
||||
}
|
||||
fflush(stderr);
|
||||
exit(iExitCode);
|
||||
}
|
||||
|
||||
#endif
|
||||
@@ -1,66 +0,0 @@
|
||||
RISCV_TOOL_PATH ?= $(wildcard ../../../../riscv-gnu-toolchain/drops)
|
||||
POCL_CC_PATH ?= $(wildcard ../../../../pocl/drops_riscv_cc)
|
||||
POCL_INC_PATH ?= $(wildcard ../include)
|
||||
POCL_LIB_PATH ?= $(wildcard ../lib)
|
||||
VX_RT_PATH ?= $(wildcard ../../../runtime)
|
||||
VX_SIMX_PATH ?= $(wildcard ../../../simX/obj_dir)
|
||||
|
||||
CC = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-gcc
|
||||
CXX = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-g++
|
||||
DMP = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-objdump
|
||||
HEX = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-objcopy
|
||||
GDB = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-gdb
|
||||
|
||||
VX_SRCS = $(VX_RT_PATH)/newlib/newlib.c
|
||||
VX_SRCS += $(VX_RT_PATH)/startup/vx_start.S
|
||||
VX_SRCS += $(VX_RT_PATH)/intrinsics/vx_intrinsics.s
|
||||
VX_SRCS += $(VX_RT_PATH)/io/vx_io.s $(VX_RT_PATH)/io/vx_io.c
|
||||
VX_SRCS += $(VX_RT_PATH)/fileio/fileio.s
|
||||
VX_SRCS += $(VX_RT_PATH)/tests/tests.c
|
||||
VX_SRCS += $(VX_RT_PATH)/vx_api/vx_api.c
|
||||
VX_SRCS += $(VX_STR) $(VX_FIO) $(VX_NEWLIB) $(VX_INT) $(VX_IO) $(VX_API) $(VX_TEST)
|
||||
|
||||
VX_CFLAGS = -nostartfiles -Wl,-Bstatic,-T,$(VX_RT_PATH)/mains/vortex_link.ld
|
||||
|
||||
CXXFLAGS = -g -O0 -march=rv32im -mabi=ilp32
|
||||
CXXFLAGS += -ffreestanding # program may not begin at main()
|
||||
CXXFLAGS += -Wl,--gc-sections # enable garbage collection of unused input sections
|
||||
CXXFLAGS += -fno-rtti -fno-non-call-exceptions # disable RTTI and exceptions
|
||||
CXXFLAGS += -I$(POCL_INC_PATH) -I.
|
||||
|
||||
VX_LIBS = -Wl,--whole-archive lib$(PROJECT).a -Wl,--no-whole-archive $(POCL_LIB_PATH)/libOpenCL.a
|
||||
QEMU_LIBS = $(VX_RT_PATH)/qemu/vx_api.c -Wl,--whole-archive lib$(PROJECT).a -Wl,--no-whole-archive $(POCL_LIB_PATH)/libOpenCL.a
|
||||
|
||||
PROJECT=VectorHypot
|
||||
|
||||
all: $(PROJECT).dump $(PROJECT).hex
|
||||
|
||||
# Kernel library: run poclcc on the OpenCL source this rule depends on.
# $< (first prerequisite) and $@ (target) keep the recipe in sync with the
# rule header, so the file that triggers a rebuild is the file that is compiled.
lib$(PROJECT).a: VectorHypot.cl
	POCL_DEBUG=all POCL_DEBUG_LLVM_PASSES=1 LD_LIBRARY_PATH=$(RISCV_TOOL_PATH)/lib:$(POCL_CC_PATH)/lib $(POCL_CC_PATH)/bin/poclcc -o $@ $<
|
||||
|
||||
$(PROJECT).elf: main.cc lib$(PROJECT).a
|
||||
$(CXX) $(CXXFLAGS) $(VX_CFLAGS) $(VX_SRCS) main.cc $(VX_LIBS) -o $(PROJECT).elf
|
||||
|
||||
$(PROJECT).qemu: main.cc lib$(PROJECT).a
|
||||
$(CXX) $(CXXFLAGS) main.cc $(QEMU_LIBS) -o $(PROJECT).qemu
|
||||
|
||||
$(PROJECT).hex: $(PROJECT).elf
|
||||
$(HEX) -O ihex $(PROJECT).elf $(PROJECT).hex
|
||||
|
||||
$(PROJECT).dump: $(PROJECT).elf
|
||||
$(DMP) -D $(PROJECT).elf > $(PROJECT).dump
|
||||
|
||||
run: $(PROJECT).hex
|
||||
POCL_DEBUG=all $(VX_SIMX_PATH)/Vcache_simX -E -a rv32i --core $(PROJECT).hex -s -b 1> emulator.debug
|
||||
|
||||
qemu: $(PROJECT).qemu
|
||||
POCL_DEBUG=all $(RISCV_TOOL_PATH)/bin/qemu-riscv32 -d in_asm -D debug.log $(PROJECT).qemu
|
||||
|
||||
gdb-s: $(PROJECT).qemu
|
||||
POCL_DEBUG=all $(RISCV_TOOL_PATH)/bin/qemu-riscv32 -g 1234 -d in_asm -D debug.log $(PROJECT).qemu
|
||||
|
||||
gdb-c: $(PROJECT).qemu
|
||||
$(GDB) $(PROJECT).qemu
|
||||
|
||||
clean:
|
||||
rm -rf *.elf *.dump *.hex
|
||||
@@ -1,41 +0,0 @@
|
||||
/*
|
||||
* Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
|
||||
*
|
||||
* Please refer to the NVIDIA end user license agreement (EULA) associated
|
||||
* with this source code for terms and conditions that govern your use of
|
||||
* this software. Any use, reproduction, disclosure, or distribution of
|
||||
* this software and related documentation outside the terms of the EULA
|
||||
* is strictly prohibited.
|
||||
*
|
||||
*/
|
||||
|
||||
// OpenCL kernel: naive element-wise hypotenuse of two float4 arrays.
//   fg4A, fg4B      input arrays holding the two 'legs'
//   fg4Hypot        output array receiving the hypotenuses
//   uiOffset        added to the work item's global id to form the array index
//   iInnerLoopCount number of times the computation is repeated; the result
//                   stays zero when this is not positive
//   uiNumElements   work items whose index is >= this value do nothing
__kernel void VectorHypot(__global float4* fg4A, __global float4* fg4B, __global float4* fg4Hypot, unsigned int uiOffset, int iInnerLoopCount, unsigned int uiNumElements)
{
    // index of this work item's float4 in the global arrays
    const size_t szIndex = get_global_id(0) + uiOffset;

    // only in-range work items touch memory
    if (szIndex < uiNumElements)
    {
        // each work item handles one float4 (4 values) from each source array
        const float4 f4LegA = fg4A[szIndex];
        const float4 f4LegB = fg4B[szIndex];
        float4 f4Result = (float4)0.0f;

        // repeat the same computation iInnerLoopCount times to lengthen compute time
        for (int iRemaining = iInnerLoopCount; iRemaining > 0; iRemaining--)
        {
            // component-wise hypotenuse via the built-in function
            f4Result.x = hypot(f4LegA.x, f4LegB.x);
            f4Result.y = hypot(f4LegA.y, f4LegB.y);
            f4Result.z = hypot(f4LegA.z, f4LegB.z);
            f4Result.w = hypot(f4LegA.w, f4LegB.w);
        }

        // store the 4 results
        fg4Hypot[szIndex] = f4Result;
    }
}
|
||||
@@ -1,686 +0,0 @@
|
||||
/*
|
||||
* Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
|
||||
*
|
||||
* Please refer to the NVIDIA end user license agreement (EULA) associated
|
||||
* with this source code for terms and conditions that govern your use of
|
||||
* this software. Any use, reproduction, disclosure, or distribution of
|
||||
* this software and related documentation outside the terms of the EULA
|
||||
* is strictly prohibited.
|
||||
*
|
||||
*/
|
||||
|
||||
// *********************************************************************
|
||||
// oclCopyComputeOverlap Notes:
|
||||
//
|
||||
// OpenCL API demo application for NVIDIA CUDA GPU's that implements an
|
||||
// element by element vector hypotenuse computation using 2 input float arrays
|
||||
// and 1 output float array.
|
||||
//
|
||||
// Demonstrates host->GPU and GPU->host copies that are asynchronous/overlapped
|
||||
// with respect to GPU computation (and with respect to host thread).
|
||||
//
|
||||
// Because the overlap achievable for this computation and data set on a given system depends upon the GPU being used and the
|
||||
// GPU/Host bandwidth, the sample adjusts the computation duration to test the most ideal case and test against a consistent standard.
|
||||
// This sample should be able to achieve up to 30% overlap on GPU's arch 1.2 and 1.3, and up to 50% on arch 2.0+ (Fermi) GPU's.
|
||||
//
|
||||
// After setup, warmup and calibration to the system, the sample runs 4 scenarios:
|
||||
// A) Computations with 2 command queues on GPU
|
||||
// A multiple-cycle sequence is executed, timed and compared against the host
|
||||
// B) Computations with 1 command queue on GPU
|
||||
// A multiple-cycle sequence is executed, timed and compared against the host
|
||||
//
|
||||
// The 2-command queue approach ought to be substantially faster
|
||||
//
|
||||
// For developmental purposes, the "iInnerLoopCount" variable passes into kernel and independently
|
||||
// increases compute time without increasing data size (via a loop inside the kernel)
|
||||
//
|
||||
// At some value of iInnerLoopCount, # of elements, workgroup size, etc the Overlap percentage should reach 30%:
|
||||
// (This ~naively assumes time H2D bandwidth is the same as D2H bandwidth, but this is close on most systems)
|
||||
//
|
||||
// If we name the time to copy a single input vector H2D (or output vector D2H) as "T", then the optimum comparison case is:
|
||||
//
|
||||
// Single Queue with all the data and all the work
|
||||
// Ttot (serial) = 4T + 4T + 2T = 10T
|
||||
//
|
||||
// Dual Queue, where each queue has 1/2 the data and 1/2 the work
|
||||
// Tq0 (overlap) = 2T + 2T + T ....
|
||||
// Tq1 (overlap) = .... 2T + 2T + T
|
||||
//
|
||||
// Ttot (elapsed, wall) = 2T + 2T + 2T + T = 7T
|
||||
//
|
||||
// Best Overlap % = 100.0 * (10T - 7T)/10T = 30.0 % (Tesla arch 1.2 or 1.3, single copy engine)
|
||||
//
|
||||
// For multiple independent cycles using arch >= 2.0 with 2 copy engines, input and output copies can also be overlapped.
|
||||
// This doesn't help for the first cycle, but theoretically can lead to 50% overlap over many independent cycles.
|
||||
// *********************************************************************
|
||||
|
||||
// common SDK header for standard utilities and system libs
|
||||
#include <oclUtils.h>
|
||||
#include <shrQATest.h>
|
||||
|
||||
// Best possible and minimum ratio of compute/copy overlap timing benefit to pass the test.
// Values greater than 0.0f represent a speed-up relative to the non-overlapped (1-queue) run.
#define EXPECTED_OVERLAP        30.0f   // expected overlap %, used for single-copy-engine devices
#define EXPECTED_OVERLAP_FERMI  45.0f   // expected average overlap %, used for arch 2.0 Tesla/Quadro (dual copy engine)
#define PASS_FACTOR             0.60f   // fraction of the expected overlap that must be measured to pass
#define RETRIES_ON_FAILURE      1       // additional timing runs allowed when the overlap criterion is missed

// Base sizes for parameters manipulated dynamically or on the command line
#define BASE_WORK_ITEMS         64      // work-group size unit (scaled by "workgroupmult")
#define BASE_ARRAY_LENGTH       40000   // array length unit (scaled by "sizemult" and BASE_WORK_ITEMS)
#define BASE_LOOP_COUNT         32      // initial kernel inner loop count before AdjustCompute calibrates it
|
||||
|
||||
// Vars
|
||||
// *********************************************************************
|
||||
cl_platform_id cpPlatform; // OpenCL platform
|
||||
cl_context cxGPUContext; // OpenCL context
|
||||
cl_command_queue cqCommandQueue[2]; // OpenCL command queues
|
||||
cl_device_id* cdDevices; // OpenCL device list
|
||||
cl_program cpProgram; // OpenCL program
|
||||
cl_kernel ckKernel[2]; // OpenCL kernel, 1 per queue
|
||||
cl_mem cmPinnedSrcA; // OpenCL pinned host source buffer A
|
||||
cl_mem cmPinnedSrcB; // OpenCL pinned host source buffer B
|
||||
cl_mem cmPinnedResult; // OpenCL pinned host result buffer
|
||||
float* fSourceA = NULL; // Mapped pointer for pinned Host source A buffer
|
||||
float* fSourceB = NULL; // Mapped pointer for pinned Host source B buffer
|
||||
float* fResult = NULL; // Mapped pointer for pinned Host result buffer
|
||||
cl_mem cmDevSrcA; // OpenCL device source buffer A
|
||||
cl_mem cmDevSrcB; // OpenCL device source buffer B
|
||||
cl_mem cmDevResult; // OpenCL device result buffer
|
||||
size_t szBuffBytes; // Size of main buffers
|
||||
size_t szGlobalWorkSize; // 1D var for Total # of work items in the launched ND range
|
||||
size_t szLocalWorkSize = BASE_WORK_ITEMS; // initial # of work items in the work group
|
||||
cl_int ciErrNum; // Error code var
|
||||
char* cPathAndName = NULL; // Var for full paths to data, src, etc.
|
||||
char* cSourceCL = NULL; // Buffer to hold source for compilation
|
||||
const char* cExecutableName = NULL;
|
||||
|
||||
// demo config vars
|
||||
const char* cSourceFile = "VectorHypot.cl"; // OpenCL computation kernel source code
|
||||
float* Golden = NULL; // temp buffer to hold golden results for cross check
|
||||
bool bNoPrompt = false; // Command line switch to skip exit prompt
|
||||
bool bQATest = false; // Command line switch to test
|
||||
|
||||
// Forward Declarations
|
||||
// *********************************************************************
|
||||
double DualQueueSequence(int iCycles, unsigned int uiNumElements, bool bShowConfig);
|
||||
double OneQueueSequence(int iCycles, unsigned int uiNumElements, bool bShowConfig);
|
||||
int AdjustCompute(cl_device_id cdTargetDevice, unsigned int uiNumElements, int iInitialLoopCount, int iCycles);
|
||||
void VectorHypotHost(const float* pfData1, const float* pfData2, float* pfResult, unsigned int uiNumElements, int iInnerLoopCount);
|
||||
void Cleanup (int iExitCode);
|
||||
void (*pCleanup)(int) = &Cleanup;
|
||||
|
||||
int *gp_argc = 0;
|
||||
const char *** gp_argv = NULL;
|
||||
|
||||
// Main function
|
||||
// *********************************************************************
|
||||
int main(int argc, const char **argv)
|
||||
{
|
||||
//Locals
|
||||
size_t szKernelLength; // Byte size of kernel code
|
||||
double dBuildTime; // Compile time
|
||||
cl_uint uiTargetDevice = 0; // Default Device to compute on
|
||||
cl_uint uiNumDevsUsed = 1; // Number of devices used in this sample
|
||||
cl_uint uiNumDevices; // Number of devices available
|
||||
int iDevCap = -1; // Capability of device
|
||||
int iInnerLoopCount = BASE_LOOP_COUNT; // Varies "compute intensity" per data within the kernel
|
||||
const int iTestCycles = 10; // How many times to run the external test loop
|
||||
const int iWarmupCycles = 8; // How many times to run the warmup sequence
|
||||
cl_uint uiWorkGroupMultiple = 4; // Command line var (using "workgroupmult=<n>") to optionally increase workgroup size
|
||||
cl_uint uiNumElements = BASE_ARRAY_LENGTH; // initial # of elements per array to process (note: procesing 4 per work item)
|
||||
cl_uint uiSizeMultiple = 4; // Command line var (using "sizemult=<n>") to optionally increase vector sizes
|
||||
bool bPassFlag = false; // Var to accumulate test pass/fail
|
||||
shrBOOL bMatch = shrFALSE; // Cross check result
|
||||
shrBOOL bTestOverlap = shrFALSE;
|
||||
double dAvgGPUTime[2] = {0.0, 0.0}; // Average time of iTestCycles calls for 2-Queue and 1-Queue test
|
||||
double dHostTime[2] = {0.0, 0.0}; // Host computation time (2nd test is redundant but a good stability indicator)
|
||||
float fMinPassCriteria[2] = {0.0f, 0.0f}; // Test pass cireria, adjusted dependant on GPU arch
|
||||
|
||||
gp_argc = &argc;
|
||||
gp_argv = &argv;
|
||||
|
||||
shrQAStart(argc, (char **)argv);
|
||||
|
||||
// start logs
|
||||
cExecutableName = argv[0];
|
||||
shrSetLogFileName ("oclCopyComputeOverlap.txt");
|
||||
shrLog("%s Starting...\n\n", argv[0]);
|
||||
|
||||
// get basic command line args
|
||||
bNoPrompt = (shrTRUE == shrCheckCmdLineFlag(argc, argv, "noprompt"));
|
||||
bQATest = (shrTRUE == shrCheckCmdLineFlag(argc, argv, "qatest"));
|
||||
shrGetCmdLineArgumentu(argc, argv, "device", &uiTargetDevice);
|
||||
|
||||
// Optional Command-line multiplier for vector size
|
||||
// Default val of 4 gives 10.24 million float elements per vector
|
||||
// Range of 3 - 16 (7.68 to 40.96 million floats) is reasonable range (if system and GPU have enough memory)
|
||||
shrGetCmdLineArgumentu(argc, argv, "sizemult", &uiSizeMultiple);
|
||||
uiSizeMultiple = CLAMP(uiSizeMultiple, 1, 50);
|
||||
uiNumElements = uiSizeMultiple * BASE_ARRAY_LENGTH * BASE_WORK_ITEMS;
|
||||
shrLog("Array sizes = %u float elements\n", uiNumElements);
|
||||
|
||||
// Optional Command-line multiplier for workgroup size (x 64 work items)
|
||||
// Default val of 4 gives szLocalWorkSize of 256.
|
||||
// Range of 1 - 8 (resulting in workgroup sizes of 64 to 512) is reasonable range
|
||||
shrGetCmdLineArgumentu(argc, argv, "workgroupmult", &uiWorkGroupMultiple);
|
||||
uiWorkGroupMultiple = CLAMP(uiWorkGroupMultiple, 1, 10);
|
||||
szLocalWorkSize = uiWorkGroupMultiple * BASE_WORK_ITEMS;
|
||||
shrLog("Workgroup Size = %u\n\n", szLocalWorkSize);
|
||||
|
||||
// Get the NVIDIA platform if available, otherwise use default
|
||||
shrLog("Get the Platform ID...\n\n");
|
||||
ciErrNum = oclGetPlatformID(&cpPlatform);
|
||||
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
|
||||
|
||||
// Get OpenCL platform name and version
|
||||
char cBuffer[256];
|
||||
ciErrNum = clGetPlatformInfo (cpPlatform, CL_PLATFORM_NAME, sizeof(cBuffer), cBuffer, NULL);
|
||||
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
|
||||
shrLog("Platform Name = %s\n\n", cBuffer);
|
||||
|
||||
// Get all the devices
|
||||
shrLog("Get the Device info and select Device...\n");
|
||||
ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_DEFAULT, 0, NULL, &uiNumDevices);
|
||||
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
|
||||
cdDevices = (cl_device_id*)malloc(uiNumDevices * sizeof(cl_device_id));
|
||||
|
||||
// Ethans changes
|
||||
CL_CHECK(clGetPlatformIDs(1, &platform_id, NULL));
|
||||
CL_CHECK(clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_DEFAULT, 1, &device_id, NULL));
|
||||
//ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, uiNumDevices, cdDevices, NULL);
|
||||
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
|
||||
|
||||
// Set target device and check capabilities
|
||||
shrLog(" # of Devices Available = %u\n", uiNumDevices);
|
||||
uiTargetDevice = CLAMP(uiTargetDevice, 0, (uiNumDevices - 1));
|
||||
shrLog(" Using Device %u, ", uiTargetDevice);
|
||||
oclPrintDevName(LOGBOTH, cdDevices[uiTargetDevice]);
|
||||
iDevCap = oclGetDevCap(cdDevices[uiTargetDevice]);
|
||||
if (iDevCap > 0) {
|
||||
shrLog(", Capability = %d.%d\n\n", iDevCap/10, iDevCap%10);
|
||||
} else {
|
||||
shrLog("\n\n", iDevCap);
|
||||
}
|
||||
if (strstr(cBuffer, "NVIDIA") != NULL)
|
||||
{
|
||||
if (iDevCap < 12)
|
||||
{
|
||||
shrLog("Device doesn't have overlap capability. Skipping test...\n");
|
||||
Cleanup (EXIT_SUCCESS);
|
||||
}
|
||||
|
||||
// Device and Platform eligible for overlap testing
|
||||
bTestOverlap = shrTRUE;
|
||||
|
||||
// If device has overlap capability, proceed
|
||||
fMinPassCriteria[0] = PASS_FACTOR * EXPECTED_OVERLAP; // 1st cycle overlap is same for 1 or 2 copy engines
|
||||
if (iDevCap != 20)
|
||||
{
|
||||
// Single copy engine
|
||||
fMinPassCriteria[1] = PASS_FACTOR * EXPECTED_OVERLAP; // avg of many cycles
|
||||
}
|
||||
else
|
||||
{
|
||||
char cDevName[1024];
|
||||
clGetDeviceInfo(cdDevices[uiTargetDevice], CL_DEVICE_NAME, sizeof(cDevName), &cDevName, NULL);
|
||||
if(strstr(cDevName, "Quadro")!=0 || strstr(cDevName, "Tesla")!=0)
|
||||
{
|
||||
// Tesla or Quadro (arch = 2.0) ... Dual copy engine
|
||||
fMinPassCriteria[1] = PASS_FACTOR * EXPECTED_OVERLAP_FERMI; // average of many cycles
|
||||
}
|
||||
else
|
||||
{
|
||||
// Geforce ... Single copy engine
|
||||
fMinPassCriteria[1] = PASS_FACTOR * EXPECTED_OVERLAP; // average of many cycles
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Create the context
|
||||
shrLog("clCreateContext...\n");
|
||||
cxGPUContext = clCreateContext(0, uiNumDevsUsed, &cdDevices[uiTargetDevice], NULL, NULL, &ciErrNum);
|
||||
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
|
||||
|
||||
// Create 2 command-queues
|
||||
cqCommandQueue[0] = clCreateCommandQueue(cxGPUContext, cdDevices[uiTargetDevice], 0, &ciErrNum);
|
||||
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
|
||||
shrLog("clCreateCommandQueue [0]...\n");
|
||||
cqCommandQueue[1] = clCreateCommandQueue(cxGPUContext, cdDevices[uiTargetDevice], 0, &ciErrNum);
|
||||
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
|
||||
shrLog("clCreateCommandQueue [1]...\n");
|
||||
|
||||
// Allocate the OpenCL source and result buffer memory objects on GPU device GMEM
|
||||
szBuffBytes = sizeof(cl_float) * uiNumElements;
|
||||
cmDevSrcA = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY, szBuffBytes, NULL, &ciErrNum);
|
||||
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
|
||||
cmDevSrcB = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY, szBuffBytes, NULL, &ciErrNum);
|
||||
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
|
||||
cmDevResult = clCreateBuffer(cxGPUContext, CL_MEM_WRITE_ONLY, szBuffBytes, NULL, &ciErrNum);
|
||||
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
|
||||
shrLog("clCreateBuffer (Src A, Src B and Result GPU Device GMEM, 3 x %u floats) ...\n", uiNumElements);
|
||||
|
||||
// Allocate pinned source and result host buffers:
|
||||
// Note: Pinned (Page Locked) memory is needed for async host<->GPU memory copy operations ***
|
||||
cmPinnedSrcA = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, szBuffBytes, NULL, &ciErrNum);
|
||||
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
|
||||
cmPinnedSrcB = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, szBuffBytes, NULL, &ciErrNum);
|
||||
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
|
||||
cmPinnedResult = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, szBuffBytes, NULL, &ciErrNum);
|
||||
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
|
||||
shrLog("clCreateBuffer (Src A, Src B and Result Pinned Host buffers, 3 x %u floats)...\n\n", uiNumElements);
|
||||
|
||||
// Get mapped pointers to pinned input host buffers
|
||||
// Note: This allows general (non-OpenCL) host functions to access pinned buffers using standard pointers
|
||||
fSourceA = (cl_float*)clEnqueueMapBuffer(cqCommandQueue[0], cmPinnedSrcA, CL_TRUE, CL_MAP_WRITE, 0, szBuffBytes, 0, NULL, NULL, &ciErrNum);
|
||||
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
|
||||
fSourceB = (cl_float*)clEnqueueMapBuffer(cqCommandQueue[0], cmPinnedSrcB, CL_TRUE, CL_MAP_WRITE, 0, szBuffBytes, 0, NULL, NULL, &ciErrNum);
|
||||
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
|
||||
fResult = (cl_float*)clEnqueueMapBuffer(cqCommandQueue[0], cmPinnedResult, CL_TRUE, CL_MAP_READ, 0, szBuffBytes, 0, NULL, NULL, &ciErrNum);
|
||||
//oclCheckErrorEX (ciErrNum, CL_SUCCESS, pCleanup);
|
||||
shrLog("clEnqueueMapBuffer (Pointers to 3 pinned host buffers)...\n");
|
||||
|
||||
// Alloc temp golden buffer for cross checks
|
||||
Golden = (float*)malloc(szBuffBytes);
|
||||
//oclCheckErrorEX(Golden != NULL, shrTRUE, pCleanup);
|
||||
|
||||
// Read the OpenCL kernel in from source file
|
||||
cPathAndName = shrFindFilePath(cSourceFile, argv[0]);
|
||||
//oclCheckError(cPathAndName != NULL, shrTRUE);
|
||||
cSourceCL = oclLoadProgSource(cPathAndName, "", &szKernelLength);
|
||||
// oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
|
||||
shrLog("oclLoadProgSource (%s)...\n", cSourceFile);
|
||||
|
||||
// Create the program object
|
||||
//cpProgram = clCreateProgramWithSource(cxGPUContext, 1, (const char **)&cSourceCL, &szKernelLength, &ciErrNum);
|
||||
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
|
||||
shrLog("clCreateProgramWithSource...\n");
|
||||
cl_program program =
|
||||
clCreateProgramWithBuiltInKernels(context, 1, &device_id, "VectorHypot", NULL);
|
||||
// Build the program for the target device
|
||||
clFinish(cqCommandQueue[0]);
|
||||
shrDeltaT(0);
|
||||
ciErrNum = clBuildProgram(program, uiNumDevsUsed, &cdDevices[uiTargetDevice], "-cl-fast-relaxed-math", NULL, NULL);
|
||||
shrLog("clBuildProgram...");
|
||||
if (ciErrNum != CL_SUCCESS)
|
||||
{
|
||||
// write out standard error, Build Log and PTX, then cleanup and exit
|
||||
shrLogEx(LOGBOTH | ERRORMSG, (double)ciErrNum, STDERROR);
|
||||
oclLogBuildInfo(program, oclGetFirstDev(cxGPUContext));
|
||||
oclLogPtx(program, oclGetFirstDev(cxGPUContext), "VectorHypot.ptx");
|
||||
Cleanup(EXIT_FAILURE);
|
||||
}
|
||||
dBuildTime = shrDeltaT(0);
|
||||
|
||||
// Ethan - Kernel Addition
|
||||
|
||||
if (program == NULL) {
|
||||
std::cerr << "Failed to write program binary" << std::endl;
|
||||
Cleanup(context, queue, program, kernel, memObjects);
|
||||
return 1;
|
||||
} else {
|
||||
std::cout << "Read program from binary." << std::endl;
|
||||
}
|
||||
|
||||
// Create the kernel
|
||||
ckKernel[0] = clCreateKernel(program, "VectorHypot", &ciErrNum);
|
||||
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
|
||||
ckKernel[1] = clCreateKernel(program, "VectorHypot", &ciErrNum);
|
||||
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
|
||||
shrLog("clCreateKernel (ckKernel[2])...\n");
|
||||
|
||||
// Offsets for 2 queues
|
||||
cl_uint uiOffset[2] = {0, uiNumElements / (2 * 4)};
|
||||
|
||||
// Set the Argument values for the 1st kernel instance (queue 0)
|
||||
ciErrNum = clSetKernelArg(ckKernel[0], 0, sizeof(cl_mem), (void*)&cmDevSrcA);
|
||||
ciErrNum |= clSetKernelArg(ckKernel[0], 1, sizeof(cl_mem), (void*)&cmDevSrcB);
|
||||
ciErrNum |= clSetKernelArg(ckKernel[0], 2, sizeof(cl_mem), (void*)&cmDevResult);
|
||||
ciErrNum |= clSetKernelArg(ckKernel[0], 3, sizeof(cl_uint), (void*)&uiOffset[0]);
|
||||
ciErrNum |= clSetKernelArg(ckKernel[0], 4, sizeof(cl_int), (void*)&iInnerLoopCount);
|
||||
ciErrNum |= clSetKernelArg(ckKernel[0], 5, sizeof(cl_uint), (void*)&uiNumElements);
|
||||
shrLog("clSetKernelArg ckKernel[0] args 0 - 5...\n");
|
||||
|
||||
// Set the Argument values for the 2d kernel instance (queue 1)
|
||||
ciErrNum |= clSetKernelArg(ckKernel[1], 0, sizeof(cl_mem), (void*)&cmDevSrcA);
|
||||
ciErrNum |= clSetKernelArg(ckKernel[1], 1, sizeof(cl_mem), (void*)&cmDevSrcB);
|
||||
ciErrNum |= clSetKernelArg(ckKernel[1], 2, sizeof(cl_mem), (void*)&cmDevResult);
|
||||
ciErrNum |= clSetKernelArg(ckKernel[1], 3, sizeof(cl_uint), (void*)&uiOffset[1]);
|
||||
ciErrNum |= clSetKernelArg(ckKernel[1], 4, sizeof(cl_int), (void*)&iInnerLoopCount);
|
||||
ciErrNum |= clSetKernelArg(ckKernel[1], 5, sizeof(cl_uint), (void*)&uiNumElements);
|
||||
oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
|
||||
shrLog("clSetKernelArg ckKernel[1] args 0 - 5...\n\n");
|
||||
|
||||
//*******************************************
|
||||
// Warmup the driver with dual queue sequence
|
||||
//*******************************************
|
||||
|
||||
// Warmup with dual queue sequence for iTestCycles
|
||||
shrLog("Warmup with 2-Queue sequence, %d cycles...\n", iWarmupCycles);
|
||||
DualQueueSequence(iWarmupCycles, uiNumElements, false);
|
||||
|
||||
// Use single queue config to adjust compute intensity
|
||||
shrLog("Adjust compute for GPU / system...\n");
|
||||
iInnerLoopCount = AdjustCompute(cdDevices[uiTargetDevice], uiNumElements, iInnerLoopCount, iTestCycles);
|
||||
shrLog(" Kernel inner loop count = %d\n", iInnerLoopCount);
|
||||
|
||||
//*******************************************
|
||||
// Run and time with 2 command-queues
|
||||
//*******************************************
|
||||
for( int iRun =0; iRun <= RETRIES_ON_FAILURE; ++iRun ) {
|
||||
|
||||
// Run the sequence iTestCycles times
|
||||
dAvgGPUTime[0] = DualQueueSequence(iTestCycles, uiNumElements, false);
|
||||
|
||||
// Warmup then Compute on host iTestCycles times (using mapped standard pointer to pinned host cl_mem buffer)
|
||||
shrLog(" Device vs Host Result Comparison\t: ");
|
||||
VectorHypotHost(fSourceA, fSourceB, Golden, uiNumElements, iInnerLoopCount);
|
||||
shrDeltaT(0);
|
||||
for (int i = 0; i < iTestCycles; i++)
|
||||
{
|
||||
VectorHypotHost (fSourceA, fSourceB, Golden, uiNumElements, iInnerLoopCount);
|
||||
}
|
||||
dHostTime[0] = shrDeltaT(0)/iTestCycles;
|
||||
|
||||
// Compare host and GPU results (using mapped standard pointer to pinned host cl_mem buffer)
|
||||
bMatch = shrComparefet(Golden, fResult, uiNumElements, 0.0f, 0);
|
||||
shrLog("gpu %s cpu\n", (bMatch == shrTRUE) ? "MATCHES" : "DOESN'T MATCH");
|
||||
bPassFlag = (bMatch == shrTRUE);
|
||||
|
||||
//*******************************************
|
||||
// Run and time with 1 command queue
|
||||
//*******************************************
|
||||
// Run the sequence iTestCycles times
|
||||
dAvgGPUTime[1] = OneQueueSequence(iTestCycles, uiNumElements, false);
|
||||
|
||||
// Compute on host iTestCycles times (using mapped standard pointer to pinned host cl_mem buffer)
|
||||
shrLog(" Device vs Host Result Comparison\t: ");
|
||||
shrDeltaT(0);
|
||||
for (int i = 0; i < iTestCycles; i++)
|
||||
{
|
||||
VectorHypotHost(fSourceA, fSourceB, Golden, (int)uiNumElements, iInnerLoopCount);
|
||||
}
|
||||
dHostTime[1] = shrDeltaT(0)/iTestCycles;
|
||||
|
||||
// Compare host and GPU results (using mapped standard pointer to pinned host cl_mem buffer)
|
||||
bMatch = shrComparefet(Golden, fResult, uiNumElements, 0.0f, 0);
|
||||
shrLog("gpu %s cpu\n", (bMatch == shrTRUE) ? "MATCHES" : "DOESN'T MATCH");
|
||||
bPassFlag &= (bMatch == shrTRUE);
|
||||
|
||||
//*******************************************
|
||||
|
||||
// Compare Single and Dual queue timing
|
||||
shrLog("\nResult Summary:\n");
|
||||
|
||||
// Log GPU and CPU Time for 2-queue scenario
|
||||
shrLog(" Avg GPU Elapsed Time for 2-Queues\t= %.5f s\n", dAvgGPUTime[0]);
|
||||
shrLog(" Avg Host Elapsed Time\t\t\t= %.5f s\n\n", dHostTime[0]);
|
||||
|
||||
// Log GPU and CPU Time for 1-queue scenario
|
||||
shrLog(" Avg GPU Elapsed Time for 1-Queue\t= %.5f s\n", dAvgGPUTime[1]);
|
||||
shrLog(" Avg Host Elapsed Time\t\t\t= %.5f s\n\n", dHostTime[1]);
|
||||
|
||||
// Log overlap % for GPU (comparison of 2-queue and 1 queue scenarios) and status
|
||||
double dAvgOverlap = 100.0 * (1.0 - dAvgGPUTime[0]/dAvgGPUTime[1]);
|
||||
|
||||
if( bTestOverlap ) {
|
||||
bool bAvgOverlapOK = (dAvgOverlap >= fMinPassCriteria[1]);
|
||||
if( iRun == RETRIES_ON_FAILURE || bAvgOverlapOK ) {
|
||||
shrLog(" Measured and (Acceptable) Avg Overlap\t= %.1f %% (%.1f %%) -> Measured Overlap is %s\n\n", dAvgOverlap, fMinPassCriteria[1], bAvgOverlapOK ? "Acceptable" : "NOT Acceptable");
|
||||
|
||||
// Log info to master log in standard format
|
||||
shrLogEx(LOGBOTH | MASTER, 0, "oclCopyComputeOverlap-Avg, Throughput = %.4f OverlapPercent, Time = %.5f s, Size = %u Elements, NumDevsUsed = %u, Workgroup = %u\n",
|
||||
dAvgOverlap, dAvgGPUTime[0], uiNumElements, uiNumDevsUsed, szLocalWorkSize);
|
||||
|
||||
bPassFlag &= bAvgOverlapOK;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
shrLog(" Measured and (Acceptable) Avg Overlap\t= %.1f %% (%.1f %%) -> Retry %d more time(s)...\n\n", dAvgOverlap, fMinPassCriteria[1], RETRIES_ON_FAILURE - iRun);
|
||||
}
|
||||
|
||||
|
||||
//*******************************************
|
||||
// Report pass/fail, cleanup and exit
|
||||
Cleanup (bPassFlag ? EXIT_SUCCESS : EXIT_FAILURE);
|
||||
}
|
||||
|
||||
// Run 1 queue sequence for n cycles
|
||||
// *********************************************************************
|
||||
double OneQueueSequence(int iCycles, unsigned int uiNumElements, bool bShowConfig)
|
||||
{
|
||||
// Use fresh source Data: (re)initialize pinned host array buffers (using mapped standard pointer to pinned host cl_mem buffer)
|
||||
shrFillArray(fSourceA, (int)uiNumElements);
|
||||
shrFillArray(fSourceB, (int)uiNumElements);
|
||||
|
||||
// Reset Global work size for 1 command-queue, and log work sizes & dimensions
|
||||
szGlobalWorkSize = shrRoundUp((int)szLocalWorkSize, (int)(uiNumElements/4));
|
||||
|
||||
// *** Make sure queues are empty and then start timer
|
||||
double dAvgTime = 0.0;
|
||||
clFinish(cqCommandQueue[0]);
|
||||
clFinish(cqCommandQueue[1]);
|
||||
shrDeltaT(0);
|
||||
|
||||
// Run the sequence iCycles times
|
||||
for (int i = 0; i < iCycles; i++)
|
||||
{
|
||||
// Nonblocking Write of all of input data from host to device in command-queue 0
|
||||
ciErrNum = clEnqueueWriteBuffer(cqCommandQueue[0], cmDevSrcA, CL_FALSE, 0, szBuffBytes, (void*)&fSourceA[0], 0, NULL, NULL);
|
||||
ciErrNum |= clEnqueueWriteBuffer(cqCommandQueue[0], cmDevSrcB, CL_FALSE, 0, szBuffBytes, (void*)&fSourceB[0], 0, NULL, NULL);
|
||||
shrCheckError(ciErrNum, CL_SUCCESS);
|
||||
|
||||
// Launch kernel computation, command-queue 0
|
||||
ciErrNum = clEnqueueNDRangeKernel(cqCommandQueue[0], ckKernel[0], 1, NULL, &szGlobalWorkSize, &szLocalWorkSize, 0, NULL, NULL);
|
||||
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
|
||||
|
||||
// Non Blocking Read of output data from device to host, command-queue 0
|
||||
ciErrNum = clEnqueueReadBuffer(cqCommandQueue[0], cmDevResult, CL_FALSE, 0, szBuffBytes, (void*)&fResult[0], 0, NULL, NULL);
|
||||
shrCheckError(ciErrNum, CL_SUCCESS);
|
||||
|
||||
// Flush sequence to device (may not be necessary on Linux or WinXP or when using the NVIDIA Tesla Computing Cluster driver)
|
||||
clFlush(cqCommandQueue[0]);
|
||||
}
|
||||
|
||||
// *** Assure sync to host and return average sequence time
|
||||
clFinish(cqCommandQueue[0]);
|
||||
dAvgTime = shrDeltaT(0)/(double)iCycles;
|
||||
|
||||
// Log config if asked for
|
||||
if (bShowConfig)
|
||||
{
|
||||
shrLog("\n1-Queue sequence Configuration:\n");
|
||||
shrLog(" Global Work Size (per command-queue)\t= %u\n Local Work Size \t\t\t= %u\n # of Work Groups (per command-queue)\t= %u\n # of command-queues\t\t\t= 1\n",
|
||||
szGlobalWorkSize, szLocalWorkSize, szGlobalWorkSize/szLocalWorkSize);
|
||||
}
|
||||
return dAvgTime;
|
||||
}
|
||||
|
||||
// Run 2 queue sequence for n cycles
|
||||
// *********************************************************************
|
||||
double DualQueueSequence(int iCycles, unsigned int uiNumElements, bool bShowConfig)
|
||||
{
|
||||
// Locals
|
||||
size_t szHalfBuffer = szBuffBytes / 2;
|
||||
size_t szHalfOffset = szHalfBuffer / sizeof(float);
|
||||
double dAvgTime = 0.0;
|
||||
|
||||
// Use fresh source Data: (re)initialize pinned host array buffers (using mapped standard pointer to pinned host cl_mem buffer)
|
||||
shrFillArray(fSourceA, (int)uiNumElements);
|
||||
shrFillArray(fSourceB, (int)uiNumElements);
|
||||
|
||||
// Set Global work size for 2 command-queues, and log work sizes & dimensions
|
||||
szGlobalWorkSize = shrRoundUp((int)szLocalWorkSize, (int)(uiNumElements/(2 * 4)));
|
||||
|
||||
// Make sure queues are empty and then start timer
|
||||
clFinish(cqCommandQueue[0]);
|
||||
clFinish(cqCommandQueue[1]);
|
||||
shrDeltaT(0);
|
||||
|
||||
for (int i = 0; i < iCycles; i++)
|
||||
{
|
||||
// Mid Phase 0
|
||||
// Nonblocking Write of 1st half of input data from host to device in command-queue 0
|
||||
ciErrNum = clEnqueueWriteBuffer(cqCommandQueue[0], cmDevSrcA, CL_FALSE, 0, szHalfBuffer, (void*)&fSourceA[0], 0, NULL, NULL);
|
||||
ciErrNum |= clEnqueueWriteBuffer(cqCommandQueue[0], cmDevSrcB, CL_FALSE, 0, szHalfBuffer, (void*)&fSourceB[0], 0, NULL, NULL);
|
||||
shrCheckError(ciErrNum, CL_SUCCESS);
|
||||
|
||||
// Push out the write for queue 0 (and prior read from queue 1 at end of loop) to the driver
|
||||
// (not necessary on Linux, Mac OSX or WinXP)
|
||||
clFlush(cqCommandQueue[0]);
|
||||
clFlush(cqCommandQueue[1]);
|
||||
|
||||
// Start Phase 1 ***********************************
|
||||
|
||||
// Launch kernel computation, command-queue 0
|
||||
// (Note: The order MATTERS here on Fermi ! THE KERNEL IN THIS PHASE SHOULD BE LAUNCHED BEFORE THE WRITE)
|
||||
ciErrNum = clEnqueueNDRangeKernel(cqCommandQueue[0], ckKernel[0], 1, NULL, &szGlobalWorkSize, &szLocalWorkSize, 0, NULL, NULL);
|
||||
oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
|
||||
|
||||
// Nonblocking Write of 2nd half of input data from host to device in command-queue 1
|
||||
// (Note: The order MATTERS here on Fermi ! THE KERNEL IN THIS PHASE SHOULD BE LAUNCHED BEFORE THE WRITE)
|
||||
ciErrNum = clEnqueueWriteBuffer(cqCommandQueue[1], cmDevSrcA, CL_FALSE, szHalfBuffer, szHalfBuffer, (void*)&fSourceA[szHalfOffset], 0, NULL, NULL);
|
||||
ciErrNum |= clEnqueueWriteBuffer(cqCommandQueue[1], cmDevSrcB, CL_FALSE, szHalfBuffer, szHalfBuffer, (void*)&fSourceB[szHalfOffset], 0, NULL, NULL);
|
||||
shrCheckError(ciErrNum, CL_SUCCESS);
|
||||
|
||||
// Push out the compute for queue 0 and write for queue 1 to the driver
|
||||
// (not necessary on Linux, Mac OSX or WinXP)
|
||||
clFlush(cqCommandQueue[0]);
|
||||
clFlush(cqCommandQueue[1]);
|
||||
|
||||
// Start Phase 2 ***********************************
|
||||
|
||||
// Launch kernel computation, command-queue 1
|
||||
ciErrNum = clEnqueueNDRangeKernel(cqCommandQueue[1], ckKernel[1], 1, NULL, &szGlobalWorkSize, &szLocalWorkSize, 0, NULL, NULL);
|
||||
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
|
||||
|
||||
// Non Blocking Read of 1st half of output data from device to host, command-queue 0
|
||||
ciErrNum = clEnqueueReadBuffer(cqCommandQueue[0], cmDevResult, CL_FALSE, 0, szHalfBuffer, (void*)&fResult[0], 0, NULL, NULL);
|
||||
shrCheckError(ciErrNum, CL_SUCCESS);
|
||||
|
||||
// Push out the compute for queue 1 and the read for queue 0 to the driver
|
||||
// (not necessary on Linux, Mac OSX or WinXP)
|
||||
clFlush(cqCommandQueue[0]);
|
||||
clFlush(cqCommandQueue[1]);
|
||||
|
||||
// Start Phase 0 (Rolls over) ***********************************
|
||||
|
||||
// Non Blocking Read of 2nd half of output data from device to host, command-queue 1
|
||||
ciErrNum = clEnqueueReadBuffer(cqCommandQueue[1], cmDevResult, CL_FALSE, szHalfBuffer, szHalfBuffer, (void*)&fResult[szHalfOffset], 0, NULL, NULL);
|
||||
shrCheckError(ciErrNum, CL_SUCCESS);
|
||||
}
|
||||
|
||||
// *** Sync to host and get average sequence time
|
||||
clFinish(cqCommandQueue[0]);
|
||||
clFinish(cqCommandQueue[1]);
|
||||
dAvgTime = shrDeltaT(0)/(double)iCycles;
|
||||
|
||||
// Log config if asked for
|
||||
if (bShowConfig)
|
||||
{
|
||||
shrLog("\n2-Queue sequence Configuration:\n");
|
||||
shrLog(" Global Work Size (per command-queue)\t= %u\n Local Work Size \t\t\t= %u\n # of Work Groups (per command-queue)\t= %u\n # of command-queues\t\t\t= 2\n",
|
||||
szGlobalWorkSize, szLocalWorkSize, szGlobalWorkSize/szLocalWorkSize);
|
||||
}
|
||||
|
||||
return dAvgTime;
|
||||
}
|
||||
|
||||
// Function to adjust compute task according to device capability
|
||||
// This allows a consistent overlap % across a wide variety of GPU's for test purposes
|
||||
// It also implitly illustrates the relationship between compute capability and overlap at fixed work size
|
||||
// *********************************************************************
|
||||
int AdjustCompute(cl_device_id cdTargetDevice, unsigned int uiNumElements, int iInitLoopCount, int iCycles)
|
||||
{
|
||||
// Locals
|
||||
double dCopyTime, dComputeTime;
|
||||
int iComputedLoopCount;
|
||||
|
||||
// Change Source Data
|
||||
shrFillArray(fSourceA, (int)uiNumElements);
|
||||
shrFillArray(fSourceB, (int)uiNumElements);
|
||||
|
||||
// Reset Global work size for 1 command-queue, and log work sizes & dimensions
|
||||
szGlobalWorkSize = shrRoundUp((int)szLocalWorkSize, (int)(uiNumElements/4));
|
||||
|
||||
// *** Make sure queues are empty and then start timer
|
||||
clFinish(cqCommandQueue[0]);
|
||||
clFinish(cqCommandQueue[1]);
|
||||
shrDeltaT(0);
|
||||
|
||||
// Run the copy iCycles times and measure copy time on this system
|
||||
for (int i = 0; i < iCycles; i++)
|
||||
{
|
||||
// Nonblocking Write of all of input data from host to device in command-queue 0
|
||||
ciErrNum = clEnqueueWriteBuffer(cqCommandQueue[0], cmDevSrcA, CL_FALSE, 0, szBuffBytes, (void*)&fSourceA[0], 0, NULL, NULL);
|
||||
ciErrNum |= clEnqueueWriteBuffer(cqCommandQueue[0], cmDevSrcB, CL_FALSE, 0, szBuffBytes, (void*)&fSourceB[0], 0, NULL, NULL);
|
||||
ciErrNum |= clFlush(cqCommandQueue[0]);
|
||||
shrCheckError(ciErrNum, CL_SUCCESS);
|
||||
}
|
||||
clFinish(cqCommandQueue[0]);
|
||||
dCopyTime = shrDeltaT(0);
|
||||
|
||||
// Run the compute iCycles times and measure compute time on this system
|
||||
for (int i = 0; i < iCycles; i++)
|
||||
{
|
||||
// Launch kernel computation, command-queue 0
|
||||
ciErrNum = clEnqueueNDRangeKernel(cqCommandQueue[0], ckKernel[0], 1, NULL, &szGlobalWorkSize, &szLocalWorkSize, 0, NULL, NULL);
|
||||
ciErrNum |= clFlush(cqCommandQueue[0]);
|
||||
oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
|
||||
}
|
||||
clFinish(cqCommandQueue[0]);
|
||||
dComputeTime = shrDeltaT(0);
|
||||
|
||||
// Determine number of core loop cycles proportional to copy/compute time ratio
|
||||
dComputeTime = MAX(dComputeTime, 1.0e-6);
|
||||
iComputedLoopCount = CLAMP(2, (int)((dCopyTime/dComputeTime) * (double)iInitLoopCount), (iInitLoopCount * 4));
|
||||
ciErrNum |= clSetKernelArg(ckKernel[0], 4, sizeof(cl_int), (void*)&iComputedLoopCount);
|
||||
ciErrNum |= clSetKernelArg(ckKernel[1], 4, sizeof(cl_int), (void*)&iComputedLoopCount);
|
||||
oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
|
||||
return (iComputedLoopCount);
|
||||
}
|
||||
|
||||
// Cleanup/Exit function
|
||||
// *********************************************************************
|
||||
void Cleanup (int iExitCode)
|
||||
{
|
||||
// Cleanup allocated objects
|
||||
shrLog("Starting Cleanup...\n\n");
|
||||
if(cPathAndName)free(cPathAndName);
|
||||
if(cSourceCL)free(cSourceCL);
|
||||
if(Golden)free(Golden);
|
||||
if(ckKernel[0])clReleaseKernel(ckKernel[0]);
|
||||
if(ckKernel[1])clReleaseKernel(ckKernel[1]);
|
||||
if(program)clReleaseProgram(program);
|
||||
if(fSourceA)clEnqueueUnmapMemObject(cqCommandQueue[0], cmPinnedSrcA, (void*)fSourceA, 0, NULL, NULL);
|
||||
if(fSourceB)clEnqueueUnmapMemObject(cqCommandQueue[0], cmPinnedSrcB, (void*)fSourceB, 0, NULL, NULL);
|
||||
if(fResult)clEnqueueUnmapMemObject(cqCommandQueue[0], cmPinnedResult, (void*)fResult, 0, NULL, NULL);
|
||||
if(cmDevSrcA)clReleaseMemObject(cmDevSrcA);
|
||||
if(cmDevSrcB)clReleaseMemObject(cmDevSrcB);
|
||||
if(cmDevResult)clReleaseMemObject(cmDevResult);
|
||||
if(cmPinnedSrcA)clReleaseMemObject(cmPinnedSrcA);
|
||||
if(cmPinnedSrcB)clReleaseMemObject(cmPinnedSrcB);
|
||||
if(cmPinnedResult)clReleaseMemObject(cmPinnedResult);
|
||||
if(cqCommandQueue[0])clReleaseCommandQueue(cqCommandQueue[0]);
|
||||
if(cqCommandQueue[1])clReleaseCommandQueue(cqCommandQueue[1]);
|
||||
if(cxGPUContext)clReleaseContext(cxGPUContext);
|
||||
if(cdDevices)free(cdDevices);
|
||||
|
||||
// Master status Pass/Fail (all tests)
|
||||
shrQAFinishExit( *gp_argc, (const char **)*gp_argv, (iExitCode == EXIT_SUCCESS) ? QA_PASSED : QA_FAILED );
|
||||
}
|
||||
|
||||
// "Golden" Host processing vector hypotenuse function for comparison purposes
|
||||
// *********************************************************************
|
||||
// Host reference implementation: for every element pair (a, b) stores
// sqrt(a*a + b*b) in the matching slot of pfResult.
//
// pfData1, pfData2: input arrays, each read for uiNumElements entries
// pfResult:         output array, written for uiNumElements entries
// uiNumElements:    number of element pairs to process (0 leaves pfResult untouched)
// iInnerLoopCount:  not used by this function
void VectorHypotHost(const float* pfData1, const float* pfData2, float* pfResult, unsigned int uiNumElements, int iInnerLoopCount)
{
    (void)iInnerLoopCount;

    const float* pfLegA = pfData1;
    const float* pfLegB = pfData2;
    float* pfOut = pfResult;

    for (unsigned int uiLeft = uiNumElements; uiLeft > 0; --uiLeft)
    {
        const float fA = *pfLegA++;
        const float fB = *pfLegB++;
        *pfOut++ = sqrtf(fA * fA + fB * fB);
    }
}
|
||||
@@ -1,198 +0,0 @@
|
||||
/*
|
||||
* Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
|
||||
*
|
||||
* Please refer to the NVIDIA end user license agreement (EULA) associated
|
||||
* with this source code for terms and conditions that govern your use of
|
||||
* this software. Any use, reproduction, disclosure, or distribution of
|
||||
* this software and related documentation outside the terms of the EULA
|
||||
* is strictly prohibited.
|
||||
*
|
||||
*/
|
||||
|
||||
#ifndef OCL_UTILS_H
|
||||
#define OCL_UTILS_H
|
||||
|
||||
// *********************************************************************
|
||||
// Utilities specific to OpenCL samples in NVIDIA GPU Computing SDK
|
||||
// *********************************************************************
|
||||
|
||||
// Common headers: Cross-API utilities and OpenCL header
|
||||
#include <shrUtils.h>
|
||||
|
||||
// All OpenCL headers
|
||||
#if defined (__APPLE__) || defined(MACOSX)
|
||||
#include <OpenCL/opencl.h>
|
||||
#else
|
||||
#include <CL/opencl.h>
|
||||
#endif
|
||||
|
||||
// Includes
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
// For systems with CL_EXT that are not updated with these extensions, we copied these
|
||||
// extensions from <CL/cl_ext.h>
|
||||
#ifndef CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV
|
||||
/* cl_nv_device_attribute_query extension - no extension #define since it has no functions */
|
||||
#define CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV 0x4000
|
||||
#define CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV 0x4001
|
||||
#define CL_DEVICE_REGISTERS_PER_BLOCK_NV 0x4002
|
||||
#define CL_DEVICE_WARP_SIZE_NV 0x4003
|
||||
#define CL_DEVICE_GPU_OVERLAP_NV 0x4004
|
||||
#define CL_DEVICE_KERNEL_EXEC_TIMEOUT_NV 0x4005
|
||||
#define CL_DEVICE_INTEGRATED_MEMORY_NV 0x4006
|
||||
#endif
|
||||
|
||||
// reminders for build output window and log
|
||||
#ifdef _WIN32
|
||||
#pragma message ("Note: including shrUtils.h")
|
||||
#pragma message ("Note: including opencl.h")
|
||||
#endif
|
||||
|
||||
// SDK Revision #
|
||||
#define OCL_SDKREVISION "7027912"
|
||||
|
||||
// Error and Exit Handling Macros...
|
||||
// *********************************************************************
|
||||
// Full error handling macro with Cleanup() callback (if supplied)...
|
||||
// (Companion Inline Function lower on page)
|
||||
#define oclCheckErrorEX(a, b, c) __oclCheckErrorEX(a, b, c, __FILE__ , __LINE__)
|
||||
|
||||
// Short version without Cleanup() callback pointer
|
||||
// Both Input (a) and Reference (b) are specified as args
|
||||
#define oclCheckError(a, b) oclCheckErrorEX(a, b, 0)
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
//! Gets the platform ID for NVIDIA if available, otherwise default to platform 0
|
||||
//!
|
||||
//! @return the id
|
||||
//! @param clSelectedPlatformID OpenCL platform ID
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" cl_int oclGetPlatformID(cl_platform_id* clSelectedPlatformID);
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
//! Print info about the device
|
||||
//!
|
||||
//! @param iLogMode enum LOGBOTH, LOGCONSOLE, LOGFILE
|
||||
//! @param device OpenCL id of the device
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" void oclPrintDevInfo(int iLogMode, cl_device_id device);
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
//! Get and return device capability
|
||||
//!
|
||||
//! @return the 2 digit integer representation of device Cap (major minor). return -1 if NA
|
||||
//! @param device OpenCL id of the device
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" int oclGetDevCap(cl_device_id device);
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
//! Print the device name
|
||||
//!
|
||||
//! @param iLogMode enum LOGBOTH, LOGCONSOLE, LOGFILE
|
||||
//! @param device OpenCL id of the device
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" void oclPrintDevName(int iLogMode, cl_device_id device);
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
//! Gets the id of the first device from the context
|
||||
//!
|
||||
//! @return the id
|
||||
//! @param cxGPUContext OpenCL context
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" cl_device_id oclGetFirstDev(cl_context cxGPUContext);
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
//! Gets the id of the nth device from the context
|
||||
//!
|
||||
//! @return the id or -1 when out of range
|
||||
//! @param cxGPUContext OpenCL context
|
||||
//! @param device_idx index of the device of interest
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" cl_device_id oclGetDev(cl_context cxGPUContext, unsigned int device_idx);
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
//! Gets the id of device with maximal FLOPS from the context
|
||||
//!
|
||||
//! @return the id
|
||||
//! @param cxGPUContext OpenCL context
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" cl_device_id oclGetMaxFlopsDev(cl_context cxGPUContext);
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
//! Loads a Program file and prepends the cPreamble to the code.
|
||||
//!
|
||||
//! @return the source string if succeeded, 0 otherwise
|
||||
//! @param cFilename program filename
|
||||
//! @param cPreamble code that is prepended to the loaded file, typically a set of #defines or a header
|
||||
//! @param szFinalLength returned length of the code string
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" char* oclLoadProgSource(const char* cFilename, const char* cPreamble, size_t* szFinalLength);
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
//! Get the binary (PTX) of the program associated with the device
|
||||
//!
|
||||
//! @param cpProgram OpenCL program
|
||||
//! @param cdDevice device of interest
|
||||
//! @param binary returned code
|
||||
//! @param length length of returned code
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" void oclGetProgBinary( cl_program cpProgram, cl_device_id cdDevice, char** binary, size_t* length);
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
//! Get and log the binary (PTX) from the OpenCL compiler for the requested program & device
|
||||
//!
|
||||
//! @param cpProgram OpenCL program
|
||||
//! @param cdDevice device of interest
|
||||
//! @param const char* cPtxFileName optional PTX file name
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" void oclLogPtx(cl_program cpProgram, cl_device_id cdDevice, const char* cPtxFileName);
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
//! Get and log the Build Log from the OpenCL compiler for the requested program & device
|
||||
//!
|
||||
//! @param cpProgram OpenCL program
|
||||
//! @param cdDevice device of interest
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" void oclLogBuildInfo(cl_program cpProgram, cl_device_id cdDevice);
|
||||
|
||||
// Helper function for De-allocating cl objects
|
||||
// *********************************************************************
|
||||
extern "C" void oclDeleteMemObjs(cl_mem* cmMemObjs, int iNumObjs);
|
||||
|
||||
// Helper function to get OpenCL error string from constant
|
||||
// *********************************************************************
|
||||
extern "C" const char* oclErrorString(cl_int error);
|
||||
|
||||
// Helper function to get OpenCL image format string (channel order and type) from constant
|
||||
// *********************************************************************
|
||||
extern "C" const char* oclImageFormatString(cl_uint uiImageFormat);
|
||||
|
||||
// companion inline function for error checking and exit on error WITH Cleanup Callback (if supplied)
|
||||
// *********************************************************************
|
||||
// Compares a sample/test value against a reference value and treats any
// mismatch as a fatal error.
//
// iSample:    value under test (typically an OpenCL return code)
// iReference: value that iSample is expected to equal
// pCleanup:   optional callback invoked with the error code; may be NULL
// cFile:      source file name reported in the log
// iLine:      source line number reported in the log
//
// On a mismatch the error is logged, then either pCleanup(error) is called or,
// when no callback was given, the log is closed and the process exits with the
// error code as its exit status.
inline void __oclCheckErrorEX(cl_int iSample, cl_int iReference, void (*pCleanup)(int), const char* cFile, const int iLine)
{
    // Equal values mean there is nothing to report
    if (iSample == iReference)
    {
        return;
    }

    // A mismatch is an error by definition, so a zero sample is replaced by a non-zero code
    if (iSample == 0)
    {
        iSample = -9999;
    }

    // Log the error info
    shrLog("\n !!! Error # %i (%s) at line %i , in file %s !!!\n\n", iSample, oclErrorString(iSample), iLine, cFile);

    // Without a cleanup callback, close the log and exit using the error code as the process exit code
    if (pCleanup == NULL)
    {
        shrLogEx(LOGBOTH | CLOSELOG, 0, "Exiting...\n");
        exit(iSample);
    }

    // Otherwise hand the error code to the caller-supplied cleanup routine
    pCleanup(iSample);
}
|
||||
|
||||
#endif
|
||||
@@ -1,238 +0,0 @@
|
||||
/*
|
||||
* Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
|
||||
*
|
||||
* Please refer to the NVIDIA end user license agreement (EULA) associated
|
||||
* with this source code for terms and conditions that govern your use of
|
||||
* this software. Any use, reproduction, disclosure, or distribution of
|
||||
* this software and related documentation outside the terms of the EULA
|
||||
* is strictly prohibited.
|
||||
*
|
||||
*/
|
||||
|
||||
#ifndef SHR_QATEST_H
|
||||
#define SHR_QATEST_H
|
||||
|
||||
// *********************************************************************
|
||||
// Generic utilities for NVIDIA GPU Computing SDK
|
||||
// *********************************************************************
|
||||
|
||||
// OS dependent includes
|
||||
#ifdef _WIN32
|
||||
#pragma message ("Note: including windows.h")
|
||||
#pragma message ("Note: including math.h")
|
||||
#pragma message ("Note: including assert.h")
|
||||
#pragma message ("Note: including time.h")
|
||||
|
||||
// Headers needed for Windows
|
||||
#include <windows.h>
|
||||
#include <time.h>
|
||||
#else
|
||||
// Headers needed for Linux
|
||||
#include <sys/stat.h>
|
||||
#include <sys/types.h>
|
||||
#include <sys/time.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <stdarg.h>
|
||||
#include <unistd.h>
|
||||
#include <time.h>
|
||||
#endif
|
||||
|
||||
#ifndef STRCASECMP
|
||||
#ifdef _WIN32
|
||||
#define STRCASECMP _stricmp
|
||||
#else
|
||||
#define STRCASECMP strcasecmp
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifndef STRNCASECMP
|
||||
#ifdef _WIN32
|
||||
#define STRNCASECMP _strnicmp
|
||||
#else
|
||||
#define STRNCASECMP strncasecmp
|
||||
#endif
|
||||
#endif
|
||||
|
||||
|
||||
// Standardized QA Start/Finish for CUDA SDK tests
|
||||
#define shrQAStart(a, b) __shrQAStart(a, b)
|
||||
#define shrQAFinish(a, b, c) __shrQAFinish(a, b, c)
|
||||
#define shrQAFinish2(a, b, c, d) __shrQAFinish2(a, b, c, d)
|
||||
|
||||
// Returns the index in exec_name at which the bare executable name begins,
// i.e. one past the last '/' or '\\' path separator, or 0 when the string
// contains no separator (including the empty string).
//
// exec_name: NUL-terminated path; must not be NULL
inline int findExeNameStart(const char *exec_name)
{
    // Scan backwards from the last character looking for a path separator
    for (int pos = (int)strlen(exec_name) - 1; pos >= 0; --pos)
    {
        const char ch = exec_name[pos];
        if (ch == '/' || ch == '\\')
        {
            return pos + 1;
        }
    }

    // No separator: the whole string is the executable name
    return 0;
}
|
||||
|
||||
inline int __shrQAStart(int argc, char **argv)
|
||||
{
|
||||
bool bQATest = false;
|
||||
// First clear the output buffer
|
||||
fflush(stdout);
|
||||
fflush(stdout);
|
||||
|
||||
for (int i=1; i < argc; i++) {
|
||||
int string_start = 0;
|
||||
while (argv[i][string_start] == '-')
|
||||
string_start++;
|
||||
char *string_argv = &argv[i][string_start];
|
||||
|
||||
if (!STRCASECMP(string_argv, "qatest")) {
|
||||
bQATest = true;
|
||||
}
|
||||
}
|
||||
|
||||
// We don't want to print the entire path, so we search for the first
|
||||
int exename_start = findExeNameStart(argv[0]);
|
||||
if (bQATest) {
|
||||
fprintf(stdout, "&&&& RUNNING %s", &(argv[0][exename_start]));
|
||||
for (int i=1; i < argc; i++) fprintf(stdout, " %s", argv[i]);
|
||||
fprintf(stdout, "\n");
|
||||
} else {
|
||||
fprintf(stdout, "[%s] starting...\n", &(argv[0][exename_start]));
|
||||
}
|
||||
fflush(stdout);
|
||||
printf("\n"); fflush(stdout);
|
||||
return exename_start;
|
||||
}
|
||||
|
||||
// Overall QA result of a test run. The numeric values are also used as
// indices into the status-name tables of the QA finish functions, so their
// order must stay FAILED, PASSED, WAIVED.
enum eQAstatus {
    QA_FAILED,  // 0
    QA_PASSED,  // 1
    QA_WAIVED   // 2
};
|
||||
|
||||
// Prints a visible countdown on stdout and blocks for roughly `seconds`
// seconds, sleeping one second per tick.
//
// seconds: length of the countdown; zero or a negative value prints the
//          header and "done!" without sleeping
inline void __ExitInTime(int seconds)
{
    fprintf(stdout, "> exiting in %d seconds: ", seconds);
    fflush(stdout);

    time_t t;
    int count;
    for (t=time(0)+seconds, count=seconds; time(0) < t; count--) {
        fprintf(stdout, "%d...", count);
        // Flush every tick so the countdown is visible while it is running
        fflush(stdout);
        // _WIN32 is the compiler-defined macro this header already uses to pull in <windows.h>
#ifdef _WIN32
        Sleep(1000);
#else
        sleep(1);
#endif
    }
    fprintf(stdout,"done!\n\n");
    fflush(stdout);
}
|
||||
|
||||
|
||||
inline void __shrQAFinish(int argc, const char **argv, int iStatus)
|
||||
{
|
||||
// By default QATest is disabled and NoPrompt is Enabled (times out at seconds passed into __ExitInTime() )
|
||||
bool bQATest = false, bNoPrompt = true, bQuitInTime = true;
|
||||
const char *sStatus[] = { "FAILED", "PASSED", "WAIVED", NULL };
|
||||
|
||||
for (int i=1; i < argc; i++) {
|
||||
int string_start = 0;
|
||||
while (argv[i][string_start] == '-')
|
||||
string_start++;
|
||||
|
||||
const char *string_argv = &argv[i][string_start];
|
||||
if (!STRCASECMP(string_argv, "qatest")) {
|
||||
bQATest = true;
|
||||
}
|
||||
// For SDK individual samples that don't specify -noprompt or -prompt,
|
||||
// a 3 second delay will happen before exiting, giving a user time to view results
|
||||
if (!STRCASECMP(string_argv, "noprompt") || !STRCASECMP(string_argv, "help")) {
|
||||
bNoPrompt = true;
|
||||
bQuitInTime = false;
|
||||
}
|
||||
if (!STRCASECMP(string_argv, "prompt")) {
|
||||
bNoPrompt = false;
|
||||
bQuitInTime = false;
|
||||
}
|
||||
}
|
||||
|
||||
int exename_start = findExeNameStart(argv[0]);
|
||||
if (bQATest) {
|
||||
fprintf(stdout, "&&&& %s %s", sStatus[iStatus], &(argv[0][exename_start]));
|
||||
for (int i=1; i < argc; i++) fprintf(stdout, " %s", argv[i]);
|
||||
fprintf(stdout, "\n");
|
||||
} else {
|
||||
fprintf(stdout, "[%s] test results...\n%s\n", &(argv[0][exename_start]), sStatus[iStatus]);
|
||||
}
|
||||
fflush(stdout);
|
||||
printf("\n"); fflush(stdout);
|
||||
if (bQuitInTime) {
|
||||
__ExitInTime(3);
|
||||
} else {
|
||||
if (!bNoPrompt) {
|
||||
fprintf(stdout, "\nPress <Enter> to exit...\n");
|
||||
fflush(stdout);
|
||||
getchar();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
inline void __shrQAFinish2(bool bQATest, int argc, const char **argv, int iStatus)
|
||||
{
|
||||
bool bQuitInTime = true;
|
||||
const char *sStatus[] = { "FAILED", "PASSED", "WAIVED", NULL };
|
||||
|
||||
for (int i=1; i < argc; i++) {
|
||||
int string_start = 0;
|
||||
while (argv[i][string_start] == '-')
|
||||
string_start++;
|
||||
|
||||
const char *string_argv = &argv[i][string_start];
|
||||
// For SDK individual samples that don't specify -noprompt or -prompt,
|
||||
// a 3 second delay will happen before exiting, giving a user time to view results
|
||||
if (!STRCASECMP(string_argv, "noprompt") || !STRCASECMP(string_argv, "help")) {
|
||||
bQuitInTime = false;
|
||||
}
|
||||
if (!STRCASECMP(string_argv, "prompt")) {
|
||||
bQuitInTime = false;
|
||||
}
|
||||
}
|
||||
|
||||
int exename_start = findExeNameStart(argv[0]);
|
||||
if (bQATest) {
|
||||
fprintf(stdout, "&&&& %s %s", sStatus[iStatus], &(argv[0][exename_start]));
|
||||
for (int i=1; i < argc; i++) fprintf(stdout, " %s", argv[i]);
|
||||
fprintf(stdout, "\n");
|
||||
} else {
|
||||
fprintf(stdout, "[%s] test results...\n%s\n", &(argv[0][exename_start]), sStatus[iStatus]);
|
||||
}
|
||||
fflush(stdout);
|
||||
|
||||
if (bQuitInTime) {
|
||||
__ExitInTime(3);
|
||||
}
|
||||
}
|
||||
|
||||
inline void shrQAFinishExit(int argc, const char **argv, int iStatus)
|
||||
{
|
||||
__shrQAFinish(argc, argv, iStatus);
|
||||
|
||||
exit(iStatus ? EXIT_SUCCESS : EXIT_FAILURE);
|
||||
}
|
||||
|
||||
inline void shrQAFinishExit2(bool bQAtest, int argc, const char **argv, int iStatus)
|
||||
{
|
||||
__shrQAFinish2(bQAtest, argc, argv, iStatus);
|
||||
|
||||
exit(iStatus ? EXIT_SUCCESS : EXIT_FAILURE);
|
||||
}
|
||||
|
||||
#endif
|
||||
@@ -1,642 +0,0 @@
|
||||
/*
|
||||
* Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
|
||||
*
|
||||
* Please refer to the NVIDIA end user license agreement (EULA) associated
|
||||
* with this source code for terms and conditions that govern your use of
|
||||
* this software. Any use, reproduction, disclosure, or distribution of
|
||||
* this software and related documentation outside the terms of the EULA
|
||||
* is strictly prohibited.
|
||||
*
|
||||
*/
|
||||
|
||||
#ifndef SHR_UTILS_H
|
||||
#define SHR_UTILS_H
|
||||
|
||||
// *********************************************************************
|
||||
// Generic utilities for NVIDIA GPU Computing SDK
|
||||
// *********************************************************************
|
||||
|
||||
// reminders for output window and build log
|
||||
#ifdef _WIN32
|
||||
#pragma message ("Note: including windows.h")
|
||||
#pragma message ("Note: including math.h")
|
||||
#pragma message ("Note: including assert.h")
|
||||
#endif
|
||||
|
||||
// OS dependent includes
|
||||
#ifdef _WIN32
|
||||
// Headers needed for Windows
|
||||
#include <windows.h>
|
||||
#else
|
||||
// Headers needed for Linux
|
||||
#include <sys/stat.h>
|
||||
#include <sys/types.h>
|
||||
#include <sys/time.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <stdarg.h>
|
||||
#endif
|
||||
|
||||
// Other headers needed for both Windows and Linux
|
||||
#include <math.h>
|
||||
#include <assert.h>
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
// Un-comment the following #define to enable profiling code in SDK apps
|
||||
//#define GPU_PROFILING
|
||||
|
||||
// Beginning of GPU Architecture definitions
|
||||
// Maps an SM (compute capability) version to the number of cores per SM.
//
// major, minor: SM version, e.g. (2, 1) for SM 2.1
// Returns the core count from the table below, or -1 after printing a
// message to stdout when the version is not listed.
inline int ConvertSMVer2Cores(int major, int minor)
{
    // One table row per known SM version
    typedef struct {
        int SM;     // 0xMm (hexadecimal notation), M = SM Major version, and m = SM minor version
        int Cores;  // cores per SM for that version
    } sSMtoCores;

    static const sSMtoCores kCoresPerSM[] =
    {
        { 0x10,   8 },  // Tesla Generation  (SM 1.0) G80 class
        { 0x11,   8 },  // Tesla Generation  (SM 1.1) G8x class
        { 0x12,   8 },  // Tesla Generation  (SM 1.2) G9x class
        { 0x13,   8 },  // Tesla Generation  (SM 1.3) GT200 class
        { 0x20,  32 },  // Fermi Generation  (SM 2.0) GF100 class
        { 0x21,  48 },  // Fermi Generation  (SM 2.1) GF10x class
        { 0x30, 192 }   // Kepler Generation (SM 3.0) GK10x class
    };
    const int iNumEntries = (int)(sizeof(kCoresPerSM) / sizeof(kCoresPerSM[0]));

    // Pack the version the same way the table keys are written
    const int iPackedVersion = (major << 4) + minor;

    for (int iEntry = 0; iEntry < iNumEntries; ++iEntry)
    {
        if (kCoresPerSM[iEntry].SM == iPackedVersion)
        {
            return kCoresPerSM[iEntry].Cores;
        }
    }

    printf("MapSMtoCores SM %d.%d is undefined (please update to the latest SDK)!\n", major, minor);
    return -1;
}
|
||||
// end of GPU Architecture definitions
|
||||
|
||||
|
||||
// Defines and enum for use with logging functions
|
||||
// *********************************************************************
|
||||
#define DEFAULTLOGFILE "SdkConsoleLog.txt"
|
||||
#define MASTERLOGFILE "SdkMasterLog.csv"
|
||||
// Bit flags selecting where and how the logging functions write.
// Flags may be combined with bitwise OR.
enum LOGMODES
{
    LOGCONSOLE = 1 << 0,                // bit to signal "log to console"
    LOGFILE    = 1 << 1,                // bit to signal "log to file"
    LOGBOTH    = LOGCONSOLE | LOGFILE,  // convenience union of first 2 bits to signal "log to both"
    APPENDMODE = 1 << 2,                // bit to set "file append" mode instead of "replace mode" on open
    MASTER     = 1 << 3,                // bit to signal master .csv log output
    ERRORMSG   = 1 << 4,                // bit to signal "pre-pend Error"
    CLOSELOG   = 1 << 5                 // bit to close log file, if open, after any requested file write
};
|
||||
#define HDASHLINE "-----------------------------------------------------------\n"
|
||||
|
||||
// Standardized boolean
|
||||
// Standardized boolean returned by the shr* helper functions
enum shrBOOL
{
    shrFALSE,  // 0
    shrTRUE    // 1
};
|
||||
|
||||
// Standardized MAX, MIN and CLAMP
|
||||
// Every argument is parenthesized so that expressions containing lower-precedence
// operators (e.g. MAX(x & mask, y)) compare and select the intended values.
// NOTE: arguments are evaluated more than once; do not pass expressions with side effects.
#define MAX(a, b) (((a) > (b)) ? (a) : (b))        // larger of a and b
#define MIN(a, b) (((a) < (b)) ? (a) : (b))        // smaller of a and b
#define CLAMP(a, b, c) MIN(MAX(a, b), c)           // a limited to the range [b, c]
#define TOPCLAMP(a, b) (((a) < (b)) ? (a) : (b))   // single top side clip of input a
|
||||
|
||||
// Error and Exit Handling Macros...
|
||||
// *********************************************************************
|
||||
// Full error handling macro with Cleanup() callback (if supplied)...
|
||||
// (Companion Inline Function lower on page)
|
||||
#define shrCheckErrorEX(a, b, c) __shrCheckErrorEX(a, b, c, __FILE__ , __LINE__)
|
||||
|
||||
// Short version without Cleanup() callback pointer
|
||||
// Both Input (a) and Reference (b) are specified as args
|
||||
#define shrCheckError(a, b) shrCheckErrorEX(a, b, 0)
|
||||
|
||||
// Standardized Exit Macro for leaving main()... extended version
|
||||
// (Companion Inline Function lower on page)
|
||||
#define shrExitEX(a, b, c) __shrExitEX(a, b, c)
|
||||
|
||||
// Standardized Exit Macro for leaving main()... short version
|
||||
// (Companion Inline Function lower on page)
|
||||
#define shrEXIT(a, b) __shrExitEX(a, b, EXIT_SUCCESS)
|
||||
|
||||
// Simple argument checker macro
|
||||
#define ARGCHECK(a) if((a) != shrTRUE)return shrFALSE
|
||||
|
||||
// Define for user-customized error handling
|
||||
#define STDERROR "file %s, line %i\n\n" , __FILE__ , __LINE__
|
||||
|
||||
// Function to deallocate memory allocated within shrUtils
|
||||
// *********************************************************************
|
||||
extern "C" void shrFree(void* ptr);
|
||||
|
||||
// *********************************************************************
|
||||
// Helper function to log standardized information to Console, to File or to both
|
||||
//! Examples: shrLogEx(LOGBOTH, 0, "Function A\n");
|
||||
//! : shrLogEx(LOGBOTH | ERRORMSG, ciErrNum, STDERROR);
|
||||
//!
|
||||
//! Automatically opens file and stores handle if needed and not done yet
|
||||
//! Closes file and nulls handle on request
|
||||
//!
|
||||
//! @param 0 iLogMode: LOGCONSOLE, LOGFILE, LOGBOTH, APPENDMODE, MASTER, ERRORMSG, CLOSELOG.
|
||||
//! LOGFILE and LOGBOTH may be | 'd with APPENDMODE to select file append mode instead of overwrite mode
|
||||
//! LOGFILE and LOGBOTH may be | 'd with CLOSELOG to "write and close"
|
||||
//! First 3 options may be | 'd with MASTER to enable independent write to master data log file
|
||||
//! First 3 options may be | 'd with ERRORMSG to start line with standard error message
|
||||
//! @param 2 dValue:
|
||||
//! Positive val = double value for time in secs to be formatted to 6 decimals.
|
||||
//! Negative val is an error code and this give error preformatting.
|
||||
//! @param 3 cFormatString: String with formatting specifiers like printf or fprintf.
|
||||
//! ALL printf flags, width, precision and type specifiers are supported with this exception:
|
||||
//! Wide char type specifiers intended for wprintf (%S and %C) are NOT supported
|
||||
//! Single byte char type specifiers (%s and %c) ARE supported
|
||||
//! @param 4... variable args: like printf or fprintf. Must match format specifier type above.
|
||||
//! @return 0 if OK, negative value on error or if error occurs or was passed in.
|
||||
// *********************************************************************
|
||||
extern "C" int shrLogEx(int iLogMode, int iErrNum, const char* cFormatString, ...);
|
||||
|
||||
// Short version of shrLogEx, defaulting to shrLogEx(LOGBOTH, 0, cFormatString, ...)
|
||||
// *********************************************************************
|
||||
extern "C" int shrLog(const char* cFormatString, ...);
|
||||
|
||||
// *********************************************************************
|
||||
// Delta timer function for up to 3 independent timers using host high performance counters
|
||||
// Maintains state for 3 independent counters
|
||||
//! Example: double dElapsedTime = shrDeltaT(0);
|
||||
//!
|
||||
//! @param 0 iCounterID: Which timer to check/reset. (0, 1, 2)
|
||||
//! @return delta time of specified counter since last call in seconds. Otherwise -9999.0 if error
|
||||
// *********************************************************************
|
||||
extern "C" double shrDeltaT(int iCounterID);
|
||||
|
||||
// Optional LogFileNameOverride function
|
||||
// *********************************************************************
|
||||
extern "C" void shrSetLogFileName (const char* cOverRideName);
|
||||
|
||||
// Helper function to init data arrays
|
||||
// *********************************************************************
|
||||
extern "C" void shrFillArray(float* pfData, int iSize);
|
||||
|
||||
// Helper function to print data arrays
|
||||
// *********************************************************************
|
||||
extern "C" void shrPrintArray(float* pfData, int iSize);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Find the path for a filename
|
||||
//! @return the path if succeeded, otherwise 0
|
||||
//! @param filename name of the file
|
||||
//! @param executablePath optional absolute path of the executable
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" char* shrFindFilePath(const char* filename, const char* executablePath);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Read file \filename containing single precision floating point data
|
||||
//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE
|
||||
//! @param filename name of the source file
|
||||
//! @param data uninitialized pointer, returned initialized and pointing to
|
||||
//! the data read
|
||||
//! @param len number of data elements in data, -1 on error
|
||||
//! @note If a NULL pointer is passed to this function and it is initialized
|
||||
//! within shrUtils, then free() has to be used to deallocate the memory
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrReadFilef( const char* filename, float** data, unsigned int* len,
|
||||
bool verbose = false);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Read file \filename containing double precision floating point data
|
||||
//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE
|
||||
//! @param filename name of the source file
|
||||
//! @param data uninitialized pointer, returned initialized and pointing to
|
||||
//! the data read
|
||||
//! @param len number of data elements in data, -1 on error
|
||||
//! @note If a NULL pointer is passed to this function and it is initialized
|
||||
//! within shrUtils, then free() has to be used to deallocate the memory
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrReadFiled( const char* filename, double** data, unsigned int* len,
|
||||
bool verbose = false);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Read file \filename containing integer data
|
||||
//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE
|
||||
//! @param filename name of the source file
|
||||
//! @param data uninitialized pointer, returned initialized and pointing to
|
||||
//! the data read
|
||||
//! @param len number of data elements in data, -1 on error
|
||||
//! @note If a NULL pointer is passed to this function and it is initialized
|
||||
//! within shrUtils, then free() has to be used to deallocate the memory
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrReadFilei( const char* filename, int** data, unsigned int* len, bool verbose = false);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Read file \filename containing unsigned integer data
|
||||
//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE
|
||||
//! @param filename name of the source file
|
||||
//! @param data uninitialized pointer, returned initialized and pointing to
|
||||
//! the data read
|
||||
//! @param len number of data elements in data, -1 on error
|
||||
//! @note If a NULL pointer is passed to this function and it is initialized
|
||||
//! within shrUtils, then free() has to be used to deallocate the memory
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrReadFileui( const char* filename, unsigned int** data,
|
||||
unsigned int* len, bool verbose = false);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Read file \filename containing char / byte data
|
||||
//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE
|
||||
//! @param filename name of the source file
|
||||
//! @param data uninitialized pointer, returned initialized and pointing to
|
||||
//! the data read
|
||||
//! @param len number of data elements in data, -1 on error
|
||||
//! @note If a NULL pointer is passed to this function and it is initialized
|
||||
//! within shrUtils, then free() has to be used to deallocate the memory
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrReadFileb( const char* filename, char** data, unsigned int* len,
|
||||
bool verbose = false);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Read file \filename containing unsigned char / byte data
|
||||
//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE
|
||||
//! @param filename name of the source file
|
||||
//! @param data uninitialized pointer, returned initialized and pointing to
|
||||
//! the data read
|
||||
//! @param len number of data elements in data, -1 on error
|
||||
//! @note If a NULL pointer is passed to this function and it is
|
||||
//! @note If a NULL pointer is passed to this function and it is initialized
|
||||
//! within shrUtils, then free() has to be used to deallocate the memory
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrReadFileub( const char* filename, unsigned char** data,
|
||||
unsigned int* len, bool verbose = false);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Write a data file \filename containing single precision floating point
|
||||
//! data
|
||||
//! @return shrTRUE if writing the file succeeded, otherwise shrFALSE
|
||||
//! @param filename name of the file to write
|
||||
//! @param data pointer to data to write
|
||||
//! @param len number of data elements in data, -1 on error
|
||||
//! @param epsilon epsilon for comparison
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrWriteFilef( const char* filename, const float* data, unsigned int len,
|
||||
const float epsilon, bool verbose = false);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Write a data file \filename containing double precision floating point
|
||||
//! data
|
||||
//! @return shrTRUE if writing the file succeeded, otherwise shrFALSE
|
||||
//! @param filename name of the file to write
|
||||
//! @param data pointer to data to write
|
||||
//! @param len number of data elements in data, -1 on error
|
||||
//! @param epsilon epsilon for comparison
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrWriteFiled( const char* filename, const float* data, unsigned int len,
|
||||
const double epsilon, bool verbose = false);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Write a data file \filename containing integer data
|
||||
//! @return shrTRUE if writing the file succeeded, otherwise shrFALSE
|
||||
//! @param filename name of the file to write
|
||||
//! @param data pointer to data to write
|
||||
//! @param len number of data elements in data, -1 on error
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrWriteFilei( const char* filename, const int* data, unsigned int len,
|
||||
bool verbose = false);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Write a data file \filename containing unsigned integer data
|
||||
//! @return shrTRUE if writing the file succeeded, otherwise shrFALSE
|
||||
//! @param filename name of the file to write
|
||||
//! @param data pointer to data to write
|
||||
//! @param len number of data elements in data, -1 on error
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrWriteFileui( const char* filename, const unsigned int* data,
|
||||
unsigned int len, bool verbose = false);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Write a data file \filename containing char / byte data
|
||||
//! @return shrTRUE if writing the file succeeded, otherwise shrFALSE
|
||||
//! @param filename name of the file to write
|
||||
//! @param data pointer to data to write
|
||||
//! @param len number of data elements in data, -1 on error
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrWriteFileb( const char* filename, const char* data, unsigned int len,
|
||||
bool verbose = false);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Write a data file \filename containing unsigned char / byte data
|
||||
//! @return shrTRUE if writing the file succeeded, otherwise shrFALSE
|
||||
//! @param filename name of the file to write
|
||||
//! @param data pointer to data to write
|
||||
//! @param len number of data elements in data, -1 on error
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrWriteFileub( const char* filename, const unsigned char* data,
|
||||
unsigned int len, bool verbose = false);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Load PPM image file (with unsigned char as data element type), padding
|
||||
//! 4th component
|
||||
//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE
|
||||
//! @param file name of the image file
|
||||
//! @param OutData handle to the data read
|
||||
//! @param w width of the image
|
||||
//! @param h height of the image
|
||||
//!
|
||||
//! Note: If *OutData is NULL this function allocates buffer that must be freed by caller
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrLoadPPM4ub(const char* file, unsigned char** OutData,
|
||||
unsigned int *w, unsigned int *h);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Save PPM image file (with unsigned char as data element type, padded to
|
||||
//! 4 bytes)
|
||||
//! @return shrTRUE if saving the file succeeded, otherwise shrFALSE
|
||||
//! @param file name of the image file
|
||||
//! @param data handle to the data read
|
||||
//! @param w width of the image
|
||||
//! @param h height of the image
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrSavePPM4ub( const char* file, unsigned char *data,
|
||||
unsigned int w, unsigned int h);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
//! Save PGM image file (with unsigned char as data element type)
|
||||
//! @return shrTRUE if saving the file succeeded, otherwise shrFALSE
|
||||
//! @param file name of the image file
|
||||
//! @param data handle to the data read
|
||||
//! @param w width of the image
|
||||
//! @param h height of the image
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrSavePGMub( const char* file, unsigned char *data,
|
||||
unsigned int w, unsigned int h);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Load PGM image file (with unsigned char as data element type)
|
||||
//! @return shrTRUE if loading the file succeeded, otherwise shrFALSE
|
||||
//! @param file name of the image file
|
||||
//! @param data handle to the data read
|
||||
//! @param w width of the image
|
||||
//! @param h height of the image
|
||||
//! @note If a NULL pointer is passed to this function and it is initialized
|
||||
//! within shrUtils, then free() has to be used to deallocate the memory
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrLoadPGMub( const char* file, unsigned char** data,
|
||||
unsigned int *w,unsigned int *h);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
// Command line arguments: General notes
|
||||
// * All command line arguments begin with '--' followed by the token;
|
||||
// token and value are separated by '='; example --samples=50
|
||||
// * Arrays have the form --model=[one.obj,two.obj,three.obj]
|
||||
// (without whitespaces)
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Check if command line argument \a flag_name is given
|
||||
//! @return shrTRUE if command line argument \a flag_name has been given,
|
||||
//! otherwise shrFALSE
|
||||
//! @param argc argc as passed to main()
|
||||
//! @param argv argv as passed to main()
|
||||
//! @param flag_name name of command line flag
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrCheckCmdLineFlag( const int argc, const char** argv,
|
||||
const char* flag_name);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Get the value of a command line argument of type int
|
||||
//! @return shrTRUE if command line argument \a arg_name has been given and
|
||||
//! is of the requested type, otherwise shrFALSE
|
||||
//! @param argc argc as passed to main()
|
||||
//! @param argv argv as passed to main()
|
||||
//! @param arg_name name of the command line argument
|
||||
//! @param val value of the command line argument
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrGetCmdLineArgumenti( const int argc, const char** argv,
|
||||
const char* arg_name, int* val);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Get the value of a command line argument of type unsigned int
|
||||
//! @return shrTRUE if command line argument \a arg_name has been given and
|
||||
//! is of the requested type, otherwise shrFALSE
|
||||
//! @param argc argc as passed to main()
|
||||
//! @param argv argv as passed to main()
|
||||
//! @param arg_name name of the command line argument
|
||||
//! @param val value of the command line argument
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrGetCmdLineArgumentu( const int argc, const char** argv,
|
||||
const char* arg_name, unsigned int* val);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Get the value of a command line argument of type float
|
||||
//! @return shrTRUE if command line argument \a arg_name has been given and
|
||||
//! is of the requested type, otherwise shrFALSE
|
||||
//! @param argc argc as passed to main()
|
||||
//! @param argv argv as passed to main()
|
||||
//! @param arg_name name of the command line argument
|
||||
//! @param val value of the command line argument
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrGetCmdLineArgumentf( const int argc, const char** argv,
|
||||
const char* arg_name, float* val);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Get the value of a command line argument of type string
|
||||
//! @return shrTRUE if command line argument \a arg_name has been given and
|
||||
//! is of the requested type, otherwise shrFALSE
|
||||
//! @param argc argc as passed to main()
|
||||
//! @param argv argv as passed to main()
|
||||
//! @param arg_name name of the command line argument
|
||||
//! @param val value of the command line argument
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrGetCmdLineArgumentstr( const int argc, const char** argv,
|
||||
const char* arg_name, char** val);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Get the value of a command line argument list whose elements are strings
|
||||
//! @return shrTRUE if command line argument \a arg_name has been given and
|
||||
//! is of the requested type, otherwise shrFALSE
|
||||
//! @param argc argc as passed to main()
|
||||
//! @param argv argv as passed to main()
|
||||
//! @param arg_name name of the command line argument
|
||||
//! @param val command line argument list
|
||||
//! @param len length of the list / number of elements
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrGetCmdLineArgumentListstr( const int argc, const char** argv,
|
||||
const char* arg_name, char** val,
|
||||
unsigned int* len);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Compare two float arrays
|
||||
//! @return shrTRUE if \a reference and \a data are identical, otherwise shrFALSE
|
||||
//! @param reference handle to the reference data / gold image
|
||||
//! @param data handle to the computed data
|
||||
//! @param len number of elements in reference and data
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrComparef( const float* reference, const float* data,
|
||||
const unsigned int len);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Compare two integer arrays
|
||||
//! @return shrTRUE if \a reference and \a data are identical, otherwise shrFALSE
|
||||
//! @param reference handle to the reference data / gold image
|
||||
//! @param data handle to the computed data
|
||||
//! @param len number of elements in reference and data
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrComparei( const int* reference, const int* data,
|
||||
const unsigned int len );
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
//! Compare two unsigned integer arrays, with epsilon and threshold
|
||||
//! @return shrTRUE if \a reference and \a data are identical, otherwise shrFALSE
|
||||
//! @param reference handle to the reference data / gold image
|
||||
//! @param data handle to the computed data
|
||||
//! @param len number of elements in reference and data
|
||||
//! @param threshold tolerance % # of comparison errors (0.15f = 15%)
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrCompareuit( const unsigned int* reference, const unsigned int* data,
|
||||
const unsigned int len, const float epsilon, const float threshold );
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Compare two unsigned char arrays
|
||||
//! @return shrTRUE if \a reference and \a data are identical, otherwise shrFALSE
|
||||
//! @param reference handle to the reference data / gold image
|
||||
//! @param data handle to the computed data
|
||||
//! @param len number of elements in reference and data
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrCompareub( const unsigned char* reference, const unsigned char* data,
|
||||
const unsigned int len );
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
//! Compare two unsigned char arrays with a tolerance for # of byte errors
|
||||
//! @return shrTRUE if \a reference and \a data are identical, otherwise shrFALSE
|
||||
//! @param reference handle to the reference data / gold image
|
||||
//! @param data handle to the computed data
|
||||
//! @param len number of elements in reference and data
|
||||
//! @param epsilon epsilon to use for the comparison
|
||||
//! @param threshold tolerance % # of comparison errors (0.15f = 15%)
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrCompareubt( const unsigned char* reference, const unsigned char* data,
|
||||
const unsigned int len, const float epsilon, const float threshold );
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
//! Compare two unsigned char arrays with an epsilon tolerance for equality
|
||||
//! @return shrTRUE if \a reference and \a data are identical, otherwise shrFALSE
|
||||
//! @param reference handle to the reference data / gold image
|
||||
//! @param data handle to the computed data
|
||||
//! @param len number of elements in reference and data
|
||||
//! @param epsilon epsilon to use for the comparison
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrCompareube( const unsigned char* reference, const unsigned char* data,
|
||||
const unsigned int len, const float epsilon );
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Compare two float arrays with an epsilon tolerance for equality
|
||||
//! @return shrTRUE if \a reference and \a data are identical, otherwise shrFALSE
|
||||
//! @param reference handle to the reference data / gold image
|
||||
//! @param data handle to the computed data
|
||||
//! @param len number of elements in reference and data
|
||||
//! @param epsilon epsilon to use for the comparison
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrComparefe( const float* reference, const float* data,
|
||||
const unsigned int len, const float epsilon );
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
//! Compare two float arrays with an epsilon tolerance for equality and a
|
||||
//! threshold for # pixel errors
|
||||
//! @return shrTRUE if \a reference and \a data are identical, otherwise shrFALSE
|
||||
//! @param reference handle to the reference data / gold image
|
||||
//! @param data handle to the computed data
|
||||
//! @param len number of elements in reference and data
|
||||
//! @param epsilon epsilon to use for the comparison
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrComparefet( const float* reference, const float* data,
|
||||
const unsigned int len, const float epsilon, const float threshold );
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
//! Compare two float arrays using L2-norm with an epsilon tolerance for
|
||||
//! equality
|
||||
//! @return shrTRUE if \a reference and \a data are identical, otherwise shrFALSE
|
||||
//! @param reference handle to the reference data / gold image
|
||||
//! @param data handle to the computed data
|
||||
//! @param len number of elements in reference and data
|
||||
//! @param epsilon epsilon to use for the comparison
|
||||
////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrCompareL2fe( const float* reference, const float* data,
|
||||
const unsigned int len, const float epsilon );
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
//! Compare two PPM image files with an epsilon tolerance for equality
|
||||
//! @return shrTRUE if \a reference and \a data are identical, otherwise shrFALSE
|
||||
//! @param src_file filename for the image to be compared
|
||||
//! @param ref_file filename for the reference data / gold image
|
||||
//! @param epsilon epsilon to use for the comparison
|
||||
//! @param threshold threshold of pixels that can still mismatch to pass (i.e. 0.15f = 15% must pass)
|
||||
//! $param verboseErrors output details of image mismatch to std::err
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrComparePPM( const char *src_file, const char *ref_file, const float epsilon, const float threshold);
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
//! Compare two PGM image files with an epsilon tolerance for equality
|
||||
//! @return shrTRUE if \a reference and \a data are identical, otherwise shrFALSE
|
||||
//! @param src_file filename for the image to be compared
|
||||
//! @param ref_file filename for the reference data / gold image
|
||||
//! @param epsilon epsilon to use for the comparison
|
||||
//! @param threshold threshold of pixels that can still mismatch to pass (i.e. 0.15f = 15% must pass)
|
||||
//! $param verboseErrors output details of image mismatch to std::err
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
extern "C" shrBOOL shrComparePGM( const char *src_file, const char *ref_file, const float epsilon, const float threshold);
|
||||
|
||||
extern "C" unsigned char* shrLoadRawFile(const char* filename, size_t size);
|
||||
|
||||
extern "C" size_t shrRoundUp(int group_size, int global_size);
|
||||
|
||||
// companion inline function for error checking and exit on error WITH Cleanup Callback (if supplied)
|
||||
// *********************************************************************
|
||||
inline void __shrCheckErrorEX(int iSample, int iReference, void (*pCleanup)(int), const char* cFile, const int iLine)
|
||||
{
|
||||
if (iReference != iSample)
|
||||
{
|
||||
shrLogEx(LOGBOTH | ERRORMSG, iSample, "line %i , in file %s !!!\n\n" , iLine, cFile);
|
||||
if (pCleanup != NULL)
|
||||
{
|
||||
pCleanup(EXIT_FAILURE);
|
||||
}
|
||||
else
|
||||
{
|
||||
shrLogEx(LOGBOTH | CLOSELOG, 0, "Exiting...\n");
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Standardized Exit
|
||||
// *********************************************************************
|
||||
inline void __shrExitEX(int argc, const char** argv, int iExitCode)
|
||||
{
|
||||
#ifdef WIN32
|
||||
if (!shrCheckCmdLineFlag(argc, argv, "noprompt") && !shrCheckCmdLineFlag(argc, argv, "qatest"))
|
||||
#else
|
||||
if (shrCheckCmdLineFlag(argc, argv, "prompt") && !shrCheckCmdLineFlag(argc, argv, "qatest"))
|
||||
#endif
|
||||
{
|
||||
shrLogEx(LOGBOTH | CLOSELOG, 0, "\nPress <Enter> to Quit...\n");
|
||||
getchar();
|
||||
}
|
||||
else
|
||||
{
|
||||
shrLogEx(LOGBOTH | CLOSELOG, 0, "%s Exiting...\n", argv[0]);
|
||||
}
|
||||
fflush(stderr);
|
||||
exit(iExitCode);
|
||||
}
|
||||
|
||||
#endif
|
||||
@@ -39,6 +39,27 @@ string kernel_names[2] = {"BFS_1", "BFS_2"};
|
||||
int work_group_size = 512;
|
||||
int device_id_inused = 0; // deviced id used (default : 0)
|
||||
|
||||
/*
 * Reads the whole content of a (binary) kernel file into a heap buffer.
 *
 * filename : path of the file to read
 * data     : receives a malloc()-allocated buffer holding the file bytes;
 *            the caller releases it with free()
 * size     : receives the number of bytes actually read
 *
 * Returns 0 on success, -1 on any failure (NULL argument, file cannot be
 * opened, size cannot be determined, allocation failure, read error).
 * On failure *data and *size are left untouched.
 */
int read_kernel_file(const char* filename, uint8_t** data, size_t* size) {
  if (nullptr == filename || nullptr == data || nullptr == size)
    return -1;

  // Open in binary mode: the kernel is a binary blob, and text mode may
  // translate line endings or stop at an EOF marker on some platforms.
  FILE* fp = fopen(filename, "rb");
  if (NULL == fp) {
    fprintf(stderr, "Failed to load kernel.");
    return -1;
  }

  // Determine the file size by seeking to the end.
  long fsize = -1;
  if (0 == fseek(fp, 0, SEEK_END))
    fsize = ftell(fp);
  if (fsize < 0) {
    fprintf(stderr, "Failed to determine kernel file size.\n");
    fclose(fp);
    return -1;
  }
  rewind(fp);

  // Allocate at least one byte so an empty file still yields a valid pointer.
  uint8_t* buffer = (uint8_t*)malloc(fsize > 0 ? (size_t)fsize : 1);
  if (NULL == buffer) {
    fprintf(stderr, "Failed to allocate memory for kernel.\n");
    fclose(fp);
    return -1;
  }

  size_t count = fread(buffer, 1, (size_t)fsize, fp);
  int read_failed = ferror(fp);
  fclose(fp);

  if (read_failed) {
    fprintf(stderr, "Failed to read kernel file.\n");
    free(buffer);
    return -1;
  }

  // Publish the results only once everything has succeeded.
  *data = buffer;
  *size = count;
  return 0;
}
|
||||
|
||||
/*
|
||||
* Converts the contents of a file into a string
|
||||
*/
|
||||
@@ -222,14 +243,25 @@ free(allPlatforms);*/
|
||||
const char * source = source_str.c_str();
|
||||
size_t sourceSize[] = { source_str.length() };*/
|
||||
|
||||
oclHandles.program = clCreateProgramWithBuiltInKernels(
|
||||
oclHandles.context, 1, &oclHandles.devices[DEVICE_ID_INUSED],
|
||||
"BFS_1;BFS_2", &resultCL);
|
||||
//oclHandles.program = clCreateProgramWithBuiltInKernels(
|
||||
// oclHandles.context, 1, &oclHandles.devices[DEVICE_ID_INUSED],
|
||||
// "BFS_1;BFS_2", &resultCL);
|
||||
/*oclHandles.program = clCreateProgramWithSource(oclHandles.context,
|
||||
1,
|
||||
&source,
|
||||
sourceSize,
|
||||
&resultCL);*/
|
||||
// read kernel binary from file
|
||||
uint8_t *kernel_bin = NULL;
|
||||
size_t kernel_size;
|
||||
cl_int binary_status = 0;
|
||||
if (0 != read_kernel_file("kernel.pocl", &kernel_bin, &kernel_size))
|
||||
std::abort();
|
||||
|
||||
oclHandles.program = clCreateProgramWithBinary(
|
||||
oclHandles.context, 1, &oclHandles.devices[DEVICE_ID_INUSED], &kernel_size, &kernel_bin, &binary_status, &resultCL);
|
||||
free(kernel_bin);
|
||||
|
||||
if ((resultCL != CL_SUCCESS) || (oclHandles.program == NULL))
|
||||
throw(string("InitCL()::Error: Loading Binary into cl_program. "
|
||||
"(clCreateProgramWithBinary)"));
|
||||
|
||||
@@ -1,68 +1,47 @@
|
||||
RISCV_TOOL_PATH ?= $(wildcard ../../../../riscv-gnu-toolchain/drops)
|
||||
POCL_CC_PATH ?= $(wildcard ../../../../pocl/drops_riscv_cc)
|
||||
POCL_INC_PATH ?= $(wildcard ../include)
|
||||
POCL_LIB_PATH ?= $(wildcard ../lib)
|
||||
VX_RT_PATH ?= $(wildcard ../../../runtime)
|
||||
VX_SIMX_PATH ?= $(wildcard ../../../simX/obj_dir)
|
||||
LLVM_HOME ?= ~/dev/llvm-project/drops
|
||||
TOOLCHAIN_PATH ?= ~/dev/riscv-gnu-toolchain/drops
|
||||
SYSROOT ?= $(TOOLCHAIN_PATH)/riscv32-unknown-elf
|
||||
POCL_CC_PATH ?= $(realpath ../compiler)
|
||||
POCL_RT_PATH ?= $(realpath ../runtime)
|
||||
VORTEX_DRV_PATH ?= $(realpath ../../../driver/sw)
|
||||
VORTEX_RT_PATH ?= $(realpath ../../../runtime)
|
||||
|
||||
CC = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-gcc
|
||||
CXX = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-g++
|
||||
DMP = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-objdump
|
||||
HEX = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-objcopy
|
||||
GDB = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-gdb
|
||||
CXXFLAGS += -std=c++11 -O0 -g -fpermissive -Wall -Wextra -pedantic -Wfatal-errors
|
||||
|
||||
VX_SRCS = $(VX_RT_PATH)/newlib/newlib.c
|
||||
VX_SRCS += $(VX_RT_PATH)/startup/vx_start.S
|
||||
VX_SRCS += $(VX_RT_PATH)/intrinsics/vx_intrinsics.s
|
||||
VX_SRCS += $(VX_RT_PATH)/io/vx_io.s $(VX_RT_PATH)/io/vx_io.c
|
||||
VX_SRCS += $(VX_RT_PATH)/fileio/fileio.s
|
||||
VX_SRCS += $(VX_RT_PATH)/tests/tests.c
|
||||
VX_SRCS += $(VX_RT_PATH)/vx_api/vx_api.c
|
||||
VX_SRCS += $(VX_STR) $(VX_FIO) $(VX_NEWLIB) $(VX_INT) $(VX_IO) $(VX_API) $(VX_TEST)
|
||||
CXXFLAGS += -I$(POCL_RT_PATH)/include
|
||||
|
||||
VX_CFLAGS = -nostartfiles -Wl,-Bstatic,-T,$(VX_RT_PATH)/mains/vortex_link.ld
|
||||
|
||||
CXXFLAGS = -g -O0 -march=rv32im -mabi=ilp32
|
||||
CXXFLAGS += -ffreestanding # program may not begin at main()
|
||||
CXXFLAGS += -Wl,--gc-sections # enable garbage collection of unused input sections
|
||||
CXXFLAGS += -fno-rtti -fno-non-call-exceptions # disable RTTI and exceptions
|
||||
CXXFLAGS += -I$(POCL_INC_PATH)
|
||||
|
||||
VX_LIBS = -Wl,--whole-archive lib$(PROJECT).a -Wl,--no-whole-archive $(POCL_LIB_PATH)/libOpenCL.a
|
||||
QEMU_LIBS = $(VX_RT_PATH)/qemu/vx_api.c -Wl,--whole-archive lib$(PROJECT).a -Wl,--no-whole-archive $(POCL_LIB_PATH)/libOpenCL.a
|
||||
LDFLAGS += -L$(POCL_RT_PATH)/lib -L$(VORTEX_DRV_PATH)/stub -lOpenCL -lvortex
|
||||
|
||||
PROJECT = bfs
|
||||
|
||||
SRCS = main.cc
|
||||
|
||||
all: $(PROJECT).dump $(PROJECT).hex
|
||||
all: $(PROJECT)
|
||||
|
||||
lib$(PROJECT).a: kernel.cl
|
||||
POCL_DEBUG=all POCL_DEBUG_LLVM_PASSES=1 LD_LIBRARY_PATH=$(RISCV_TOOL_PATH)/lib:$(POCL_CC_PATH)/lib $(POCL_CC_PATH)/bin/poclcc -o lib$(PROJECT).a kernel.cl
|
||||
kernel.pocl: kernel.cl
|
||||
TOOLCHAIN_PATH=$(TOOLCHAIN_PATH) SYSROOT=$(SYSROOT) LLVM_HOME=$(LLVM_HOME) VORTEX_RUNTIME_PATH=$(VORTEX_RT_PATH) POCL_DEBUG=all LD_LIBRARY_PATH=$(LLVM_HOME)/lib:$(POCL_CC_PATH)/lib $(POCL_CC_PATH)/bin/poclcc -o kernel.pocl kernel.cl
|
||||
|
||||
$(PROJECT): $(SRCS)
|
||||
$(CXX) $(CXXFLAGS) $^ $(LDFLAGS) -o $@
|
||||
|
||||
$(PROJECT).elf: $(SRCS) lib$(PROJECT).a
|
||||
$(CXX) $(CXXFLAGS) $(VX_CFLAGS) $(VX_SRCS) $(SRCS) $(VX_LIBS) -o $(PROJECT).elf
|
||||
run-fpga: $(PROJECT) kernel.pocl
|
||||
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/opae:$(LD_LIBRARY_PATH) ./$(PROJECT)
|
||||
|
||||
$(PROJECT).qemu: $(SRCS) lib$(PROJECT).a
|
||||
$(CXX) $(CXXFLAGS) $(SRCS) $(QEMU_LIBS) -o $(PROJECT).qemu
|
||||
run-ase: $(PROJECT) kernel.pocl
|
||||
ASE_LOG=0 LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/opae/ase:$(LD_LIBRARY_PATH) ./$(PROJECT)
|
||||
|
||||
$(PROJECT).hex: $(PROJECT).elf
|
||||
$(HEX) -O ihex $(PROJECT).elf $(PROJECT).hex
|
||||
run-simx: $(PROJECT) kernel.pocl
|
||||
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/simx:$(LD_LIBRARY_PATH) ./$(PROJECT)
|
||||
|
||||
$(PROJECT).dump: $(PROJECT).elf
|
||||
$(DMP) -D $(PROJECT).elf > $(PROJECT).dump
|
||||
run-rtlsim: $(PROJECT) kernel.pocl
|
||||
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/rtlsim:$(LD_LIBRARY_PATH) ./$(PROJECT)
|
||||
|
||||
run: $(PROJECT).hex
|
||||
POCL_DEBUG=all $(VX_SIMX_PATH)/Vcache_simX -E -a rv32i --core $(PROJECT).hex -s -b 1> emulator.debug
|
||||
|
||||
qemu: $(PROJECT).qemu
|
||||
POCL_DEBUG=all $(RISCV_TOOL_PATH)/bin/qemu-riscv32 -strace -d in_asm -D debug.log $(PROJECT).qemu
|
||||
|
||||
gdb-s: $(PROJECT).qemu
|
||||
POCL_DEBUG=all $(RISCV_TOOL_PATH)/bin/qemu-riscv32 -g 1234 -d in_asm -D debug.log $(PROJECT).qemu
|
||||
|
||||
gdb-c: $(PROJECT).qemu
|
||||
$(GDB) $(PROJECT).qemu
|
||||
.depend: $(SRCS)
|
||||
$(CXX) $(CXXFLAGS) -MM $^ > .depend;
|
||||
|
||||
clean:
|
||||
rm -rf *.o *.elf *.dump *.hex *.qemu *.log *.debug
|
||||
rm -rf $(PROJECT) *.o *.dump .depend
|
||||
|
||||
ifneq ($(MAKECMDGOALS),clean)
|
||||
-include .depend
|
||||
endif
|
||||
BIN
benchmarks/opencl/bfs/kernel.pocl
Normal file
BIN
benchmarks/opencl/bfs/kernel.pocl
Normal file
Binary file not shown.
Binary file not shown.
@@ -187,7 +187,7 @@ int main(int argc, char *argv[]) {
|
||||
FILE *fp;
|
||||
Node *h_graph_nodes;
|
||||
char *h_graph_mask, *h_updating_graph_mask, *h_graph_visited;
|
||||
|
||||
|
||||
try {
|
||||
char *input_f = "graph4096.txt";
|
||||
printf("Reading File\n");
|
||||
|
||||
BIN
benchmarks/opencl/compiler/bin/poclcc
Executable file
BIN
benchmarks/opencl/compiler/bin/poclcc
Executable file
Binary file not shown.
1
benchmarks/opencl/compiler/lib/libOpenCL.so
Symbolic link
1
benchmarks/opencl/compiler/lib/libOpenCL.so
Symbolic link
@@ -0,0 +1 @@
|
||||
libOpenCL.so.2
|
||||
1
benchmarks/opencl/compiler/lib/libOpenCL.so.2
Symbolic link
1
benchmarks/opencl/compiler/lib/libOpenCL.so.2
Symbolic link
@@ -0,0 +1 @@
|
||||
libOpenCL.so.2.5.0
|
||||
BIN
benchmarks/opencl/compiler/lib/libOpenCL.so.2.5.0
Normal file
BIN
benchmarks/opencl/compiler/lib/libOpenCL.so.2.5.0
Normal file
Binary file not shown.
193
benchmarks/opencl/compiler/share/pocl/include/_builtin_renames.h
Normal file
193
benchmarks/opencl/compiler/share/pocl/include/_builtin_renames.h
Normal file
@@ -0,0 +1,193 @@
|
||||
/* pocl/_kernel_renames.h - Rename OpenCL builtin functions to avoid name
|
||||
clashes with libm functions which are called in implementation.
|
||||
|
||||
Copyright (c) 2011-2013 Erik Schnetter <eschnetter@perimeterinstitute.ca>
|
||||
Perimeter Institute for Theoretical Physics
|
||||
Copyright (c) 2011-2017 Pekka Jääskeläinen / TUT
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#ifndef _KERNEL_RENAMES_H
|
||||
#define _KERNEL_RENAMES_H
|
||||
|
||||
/* Move built-in declarations and libm functions out of the way.
|
||||
(There should be a better way of doing so. These functions are
|
||||
built-in math functions for OpenCL (see Clang's "Builtins.def").
|
||||
Functions defined in libc or libm may also
|
||||
interfere with OpenCL's functions, since their prototypes will be
|
||||
wrong.) */
|
||||
#define abs _cl_abs
|
||||
#define abs_diff _cl_abs_diff
|
||||
#define acos _cl_acos
|
||||
#define acosh _cl_acosh
|
||||
#define acospi _cl_acospi
|
||||
#define add_sat _cl_add_sat
|
||||
#define all _cl_all
|
||||
#define any _cl_any
|
||||
#define asin _cl_asin
|
||||
#define asinh _cl_asinh
|
||||
#define asinpi _cl_asinpi
|
||||
#define atan _cl_atan
|
||||
#define atan2 _cl_atan2
|
||||
#define atan2pi _cl_atan2pi
|
||||
#define atanh _cl_atanh
|
||||
#define atanpi _cl_atanpi
|
||||
#define bitselect _cl_bitselect
|
||||
#define cbrt _cl_cbrt
|
||||
#define ceil _cl_ceil
|
||||
#define clamp _cl_clamp
|
||||
#define clz _cl_clz
|
||||
#define copysign _cl_copysign
|
||||
#define cos _cl_cos
|
||||
#define cosh _cl_cosh
|
||||
#define cospi _cl_cospi
|
||||
#define cross _cl_cross
|
||||
#define degrees _cl_degrees
|
||||
#define distance _cl_distance
|
||||
#define dot _cl_dot
|
||||
#define erf _cl_erf
|
||||
#define erfc _cl_erfc
|
||||
#define exp _cl_exp
|
||||
#define exp10 _cl_exp10
|
||||
#define exp2 _cl_exp2
|
||||
#define expm1 _cl_expm1
|
||||
#define fabs _cl_fabs
|
||||
#define fast_distance _cl_fast_distance
|
||||
#define fast_length _cl_fast_length
|
||||
#define fast_normalize _cl_fast_normalize
|
||||
#define fdim _cl_fdim
|
||||
#define floor _cl_floor
|
||||
#define fma _cl_fma
|
||||
#define fmax _cl_fmax
|
||||
#define fmin _cl_fmin
|
||||
#define fmod _cl_fmod
|
||||
#define fract _cl_fract
|
||||
#define frexp _cl_frexp
|
||||
#define hadd _cl_hadd
|
||||
#define half_cos _cl_half_cos
|
||||
#define half_divide _cl_half_divide
|
||||
#define half_exp _cl_half_exp
|
||||
#define half_exp10 _cl_half_exp10
|
||||
#define half_exp2 _cl_half_exp2
|
||||
#define half_log _cl_half_log
|
||||
#define half_log10 _cl_half_log10
|
||||
#define half_log2 _cl_half_log2
|
||||
#define half_powr _cl_half_powr
|
||||
#define half_recip _cl_half_recip
|
||||
#define half_rsqrt _cl_half_rsqrt
|
||||
#define half_sin _cl_half_sin
|
||||
#define half_sqrt _cl_half_sqrt
|
||||
#define half_tan _cl_half_tan
|
||||
#define hypot _cl_hypot
|
||||
#define ilogb _cl_ilogb
|
||||
#define isequal _cl_isequal
|
||||
#define isfinite _cl_isfinite
|
||||
#define isgreater _cl_isgreater
|
||||
#define isgreaterequal _cl_isgreaterequal
|
||||
#define isinf _cl_isinf
|
||||
#define isless _cl_isless
|
||||
#define islessequal _cl_islessequal
|
||||
#define islessgreater _cl_islessgreater
|
||||
#define isnan _cl_isnan
|
||||
#define isnormal _cl_isnormal
|
||||
#define isnotequal _cl_isnotequal
|
||||
#define isordered _cl_isordered
|
||||
#define isunordered _cl_isunordered
|
||||
#define ldexp _cl_ldexp
|
||||
#define length _cl_length
|
||||
#define lgamma _cl_lgamma
|
||||
#define lgamma_r _cl_lgamma_r
|
||||
#define log _cl_log
|
||||
#define log10 _cl_log10
|
||||
#define log1p _cl_log1p
|
||||
#define log2 _cl_log2
|
||||
#define logb _cl_logb
|
||||
#define mad _cl_mad
|
||||
#define mad24 _cl_mad24
|
||||
#define mad_hi _cl_mad_hi
|
||||
#define mad_sat _cl_mad_sat
|
||||
#define max _cl_max
|
||||
#define maxmag _cl_maxmag
|
||||
#define min _cl_min
|
||||
#define minmag _cl_minmag
|
||||
#define mix _cl_mix
|
||||
#define modf _cl_modf
|
||||
#define mul24 _cl_mul24
|
||||
#define mul_hi _cl_mul_hi
|
||||
#define nan _cl_nan
|
||||
#define native_cos _cl_native_cos
|
||||
#define native_divide _cl_native_divide
|
||||
#define native_exp _cl_native_exp
|
||||
#define native_exp10 _cl_native_exp10
|
||||
#define native_exp2 _cl_native_exp2
|
||||
#define native_log _cl_native_log
|
||||
#define native_log10 _cl_native_log10
|
||||
#define native_log2 _cl_native_log2
|
||||
#define native_powr _cl_native_powr
|
||||
#define native_recip _cl_native_recip
|
||||
#define native_rsqrt _cl_native_rsqrt
|
||||
#define native_sin _cl_native_sin
|
||||
#define native_sqrt _cl_native_sqrt
|
||||
#define native_tan _cl_native_tan
|
||||
#define nextafter _cl_nextafter
|
||||
#define normalize _cl_normalize
|
||||
#define popcount _cl_popcount
|
||||
#define pow _cl_pow
|
||||
#define pown _cl_pown
|
||||
#define powr _cl_powr
|
||||
#define radians _cl_radians
|
||||
#define remainder _cl_remainder
|
||||
#define remquo _cl_remquo
|
||||
#define rhadd _cl_rhadd
|
||||
#define rint _cl_rint
|
||||
#define rootn _cl_rootn
|
||||
#define rotate _cl_rotate
|
||||
#define round _cl_round
|
||||
#define rsqrt _cl_rsqrt
|
||||
#define select _cl_select
|
||||
#define sign _cl_sign
|
||||
#define signbit _cl_signbit
|
||||
#define sin _cl_sin
|
||||
#define sincos _cl_sincos
|
||||
#define sinh _cl_sinh
|
||||
#define sinpi _cl_sinpi
|
||||
#define smoothstep _cl_smoothstep
|
||||
#define sqrt _cl_sqrt
|
||||
#define step _cl_step
|
||||
#define sub_sat _cl_sub_sat
|
||||
#define tan _cl_tan
|
||||
#define tanh _cl_tanh
|
||||
#define tanpi _cl_tanpi
|
||||
#define tgamma _cl_tgamma
|
||||
#define trunc _cl_trunc
|
||||
#define upsample _cl_upsample
|
||||
#define atom_add atomic_add
|
||||
#define atom_sub atomic_sub
|
||||
#define atom_xchg atomic_xchg
|
||||
#define atom_inc atomic_inc
|
||||
#define atom_dec atomic_dec
|
||||
#define atom_cmpxchg atomic_cmpxchg
|
||||
#define atom_min atomic_min
|
||||
#define atom_max atomic_max
|
||||
#define atom_and atomic_and
|
||||
#define atom_or atomic_or
|
||||
#define atom_xor atomic_xor
|
||||
|
||||
#endif
|
||||
@@ -0,0 +1,91 @@
|
||||
/* This file includes opencl-c.h from Clang and fixes a few pocl extras.
|
||||
|
||||
Copyright (c) 2011-2017 Pekka Jääskeläinen / TUT
|
||||
Copyright (c) 2017 Michal Babej / Tampere University of Technology
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#ifndef _OPENCL_H_
|
||||
/* Use the declarations shipped with Clang. */
|
||||
/* Check for _OPENCL_H already here because the kernel compiler loads the
|
||||
header beforehand, but cannot find the file due to include paths not
|
||||
set up. */
|
||||
#include <opencl-c.h>
|
||||
|
||||
/* Missing declarations from opencl-c.h. Some of the geometric builtins are
|
||||
defined only for vectors of up to 4 elements, but we implement them all: */
|
||||
#ifdef cl_khr_fp16
|
||||
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
|
||||
half _CL_OVERLOADABLE _CL_READNONE length (half8 p);
|
||||
half _CL_OVERLOADABLE _CL_READNONE length (half16 p);
|
||||
|
||||
half _CL_OVERLOADABLE _CL_READNONE fast_length (half8 p);
|
||||
half _CL_OVERLOADABLE _CL_READNONE fast_length (half16 p);
|
||||
|
||||
half8 _CL_OVERLOADABLE _CL_READNONE normalize (half8 p);
|
||||
half16 _CL_OVERLOADABLE _CL_READNONE normalize (half16 p);
|
||||
|
||||
half8 _CL_OVERLOADABLE _CL_READNONE fast_normalize (half8 p);
|
||||
half16 _CL_OVERLOADABLE _CL_READNONE fast_normalize (half16 p);
|
||||
|
||||
half _CL_OVERLOADABLE _CL_READNONE dot (half8 p0, half8 p1);
|
||||
half _CL_OVERLOADABLE _CL_READNONE dot (half16 p0, half16 p1);
|
||||
#endif
|
||||
|
||||
float _CL_OVERLOADABLE _CL_READNONE length (float8 p);
|
||||
float _CL_OVERLOADABLE _CL_READNONE length (float16 p);
|
||||
|
||||
float _CL_OVERLOADABLE _CL_READNONE fast_length (float8 p);
|
||||
float _CL_OVERLOADABLE _CL_READNONE fast_length (float16 p);
|
||||
|
||||
float8 _CL_OVERLOADABLE _CL_READNONE normalize (float8 p);
|
||||
float16 _CL_OVERLOADABLE _CL_READNONE normalize (float16 p);
|
||||
|
||||
float8 _CL_OVERLOADABLE _CL_READNONE fast_normalize (float8 p);
|
||||
float16 _CL_OVERLOADABLE _CL_READNONE fast_normalize (float16 p);
|
||||
|
||||
float _CL_OVERLOADABLE _CL_READNONE dot (float8 p0, float8 p1);
|
||||
float _CL_OVERLOADABLE _CL_READNONE dot (float16 p0, float16 p1);
|
||||
|
||||
#ifdef cl_khr_fp64
|
||||
#pragma OPENCL EXTENSION cl_khr_fp64 : enable
|
||||
|
||||
double _CL_OVERLOADABLE _CL_READNONE length (double8 p);
|
||||
double _CL_OVERLOADABLE _CL_READNONE length (double16 p);
|
||||
|
||||
double _CL_OVERLOADABLE _CL_READNONE fast_length (double p);
|
||||
double _CL_OVERLOADABLE _CL_READNONE fast_length (double2 p);
|
||||
double _CL_OVERLOADABLE _CL_READNONE fast_length (double3 p);
|
||||
double _CL_OVERLOADABLE _CL_READNONE fast_length (double4 p);
|
||||
double _CL_OVERLOADABLE _CL_READNONE fast_length (double8 p);
|
||||
double _CL_OVERLOADABLE _CL_READNONE fast_length (double16 p);
|
||||
|
||||
double8 _CL_OVERLOADABLE _CL_READNONE normalize (double8 p);
|
||||
double16 _CL_OVERLOADABLE _CL_READNONE normalize (double16 p);
|
||||
|
||||
double8 _CL_OVERLOADABLE _CL_READNONE fast_normalize (double8 p);
|
||||
double16 _CL_OVERLOADABLE _CL_READNONE fast_normalize (double16 p);
|
||||
|
||||
double _CL_OVERLOADABLE _CL_READNONE dot (double8 p0, double8 p1);
|
||||
double _CL_OVERLOADABLE _CL_READNONE dot (double16 p0, double16 p1);
|
||||
|
||||
#endif
|
||||
|
||||
#endif
|
||||
@@ -0,0 +1,58 @@
|
||||
/* Enable all extensions known to pocl, which a device supports.
|
||||
* This is required at the start of include/_kernel.h for prototypes,
|
||||
* then at kernel lib compilation phase (because _kernel.h disables
|
||||
* everything at the end).
|
||||
*/
|
||||
|
||||
/* OpenCL 1.0-only extensions */
|
||||
|
||||
#if (__OPENCL_C_VERSION__ < 110)
|
||||
|
||||
#ifdef cl_khr_global_int32_base_atomics
|
||||
# pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
|
||||
#endif
|
||||
|
||||
#ifdef cl_khr_global_int32_extended_atomics
|
||||
# pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics : enable
|
||||
#endif
|
||||
|
||||
#ifdef cl_khr_local_int32_base_atomics
|
||||
# pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable
|
||||
#endif
|
||||
|
||||
#ifdef cl_khr_local_int32_extended_atomics
|
||||
# pragma OPENCL EXTENSION cl_khr_local_int32_extended_atomics : enable
|
||||
#endif
|
||||
|
||||
#ifdef cl_khr_byte_addressable_store
|
||||
# pragma OPENCL EXTENSION cl_khr_byte_addressable_store : enable
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
/* all versions */
|
||||
#ifdef cl_khr_fp16
|
||||
# pragma OPENCL EXTENSION cl_khr_fp16: enable
|
||||
#endif
|
||||
|
||||
#ifdef cl_khr_fp64
|
||||
# pragma OPENCL EXTENSION cl_khr_fp64: enable
|
||||
#endif
|
||||
|
||||
#ifdef cl_khr_int64_base_atomics
|
||||
# pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable
|
||||
#endif
|
||||
|
||||
#ifdef cl_khr_int64_extended_atomics
|
||||
# pragma OPENCL EXTENSION cl_khr_int64_extended_atomics : enable
|
||||
#endif
|
||||
|
||||
#if (__clang_major__ > 4)
|
||||
|
||||
#ifdef cl_khr_3d_image_writes
|
||||
# pragma OPENCL EXTENSION cl_khr_3d_image_writes : enable
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
||||
233
benchmarks/opencl/compiler/share/pocl/include/_kernel.h
Normal file
233
benchmarks/opencl/compiler/share/pocl/include/_kernel.h
Normal file
@@ -0,0 +1,233 @@
|
||||
/* pocl/_kernel.h - OpenCL types and runtime library
|
||||
functions declarations. This should be included only from OpenCL C files.
|
||||
|
||||
Copyright (c) 2011 Universidad Rey Juan Carlos
|
||||
Copyright (c) 2011-2017 Pekka Jääskeläinen / TUT
|
||||
Copyright (c) 2011-2013 Erik Schnetter <eschnetter@perimeterinstitute.ca>
|
||||
Perimeter Institute for Theoretical Physics
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
/* If the -cl-std build option is not specified, the highest OpenCL C 1.x
|
||||
* language version supported by each device is used as the version of
|
||||
* OpenCL C when compiling the program for each device.
|
||||
*/
|
||||
#ifndef __OPENCL_C_VERSION__
|
||||
#define __OPENCL_C_VERSION__ 120
|
||||
#endif
|
||||
|
||||
#if (__OPENCL_C_VERSION__ > 99)
|
||||
#define CL_VERSION_1_0 100
|
||||
#endif
|
||||
|
||||
#if (__OPENCL_C_VERSION__ > 109)
|
||||
#define CL_VERSION_1_1 110
|
||||
#endif
|
||||
|
||||
#if (__OPENCL_C_VERSION__ > 119)
|
||||
#define CL_VERSION_1_2 120
|
||||
#endif
|
||||
|
||||
#if (__OPENCL_C_VERSION__ > 199)
|
||||
#define CL_VERSION_2_0 200
|
||||
#endif
|
||||
|
||||
#include "_enable_all_exts.h"
|
||||
|
||||
#include "_builtin_renames.h"
|
||||
|
||||
/* Define some feature test macros to help write generic code. These are used
|
||||
* mostly in _pocl_opencl.h header + some .cl files in kernel library */
|
||||
|
||||
#ifdef cl_khr_int64
|
||||
# define __IF_INT64(x) x
|
||||
#else
|
||||
# define __IF_INT64(x)
|
||||
#endif
|
||||
#ifdef cl_khr_fp16
|
||||
# define __IF_FP16(x) x
|
||||
#else
|
||||
# define __IF_FP16(x)
|
||||
#endif
|
||||
#ifdef cl_khr_fp64
|
||||
# define __IF_FP64(x) x
|
||||
#else
|
||||
# define __IF_FP64(x)
|
||||
#endif
|
||||
#ifdef cl_khr_int64_base_atomics
|
||||
#define __IF_BA64(x) x
|
||||
#else
|
||||
#define __IF_BA64(x)
|
||||
#endif
|
||||
#ifdef cl_khr_int64_extended_atomics
|
||||
#define __IF_EA64(x) x
|
||||
#else
|
||||
#define __IF_EA64(x)
|
||||
#endif
|
||||
|
||||
/****************************************************************************/
|
||||
|
||||
/* Function/type attributes supported by Clang/SPIR */
|
||||
#if __has_attribute(__always_inline__)
|
||||
# define _CL_ALWAYSINLINE __attribute__((__always_inline__))
|
||||
#else
|
||||
# define _CL_ALWAYSINLINE
|
||||
#endif
|
||||
#if __has_attribute(__noinline__)
|
||||
# define _CL_NOINLINE __attribute__((__noinline__))
|
||||
#else
|
||||
# define _CL_NOINLINE
|
||||
#endif
|
||||
#if __has_attribute(__overloadable__)
|
||||
# define _CL_OVERLOADABLE __attribute__((__overloadable__))
|
||||
#else
|
||||
# define _CL_OVERLOADABLE
|
||||
#endif
|
||||
#if __has_attribute(__pure__)
|
||||
# define _CL_READONLY __attribute__((__pure__))
|
||||
#else
|
||||
# define _CL_READONLY
|
||||
#endif
|
||||
#if __has_attribute(__const__)
|
||||
# define _CL_READNONE __attribute__((__const__))
|
||||
#else
|
||||
# define _CL_READNONE
|
||||
#endif
|
||||
#if __has_attribute(convergent)
|
||||
# define _CL_CONVERGENT __attribute__((convergent))
|
||||
#else
|
||||
# define _CL_CONVERGENT
|
||||
#endif
|
||||
|
||||
/************************ setup Clang version macros ******************/
|
||||
|
||||
#if (__clang_major__ == 6)
|
||||
|
||||
# undef LLVM_6_0
|
||||
# define LLVM_6_0
|
||||
|
||||
#elif (__clang_major__ == 7)
|
||||
|
||||
# undef LLVM_7_0
|
||||
# define LLVM_7_0
|
||||
|
||||
#elif (__clang_major__ == 8)
|
||||
|
||||
# undef LLVM_8_0
|
||||
# define LLVM_8_0
|
||||
|
||||
#elif (__clang_major__ == 9)
|
||||
|
||||
# undef LLVM_9_0
|
||||
# define LLVM_9_0
|
||||
|
||||
#elif (__clang_major__ == 10)
|
||||
|
||||
# undef LLVM_10_0
|
||||
# define LLVM_10_0
|
||||
|
||||
#else
|
||||
|
||||
#error Unsupported Clang/LLVM version.
|
||||
|
||||
#endif
|
||||
|
||||
#ifndef LLVM_10_0
|
||||
#define LLVM_OLDER_THAN_10_0 1
|
||||
|
||||
#ifndef LLVM_9_0
|
||||
#define LLVM_OLDER_THAN_9_0 1
|
||||
|
||||
#ifndef LLVM_8_0
|
||||
#define LLVM_OLDER_THAN_8_0 1
|
||||
|
||||
#ifndef LLVM_7_0
|
||||
#define LLVM_OLDER_THAN_7_0 1
|
||||
|
||||
#ifndef LLVM_6_0
|
||||
#define LLVM_OLDER_THAN_6_0 1
|
||||
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
|
||||
/****************************************************************************/
|
||||
|
||||
/* A static assert statement to catch inconsistencies at build time */
|
||||
#if __has_extension(__c_static_assert__)
|
||||
# define _CL_STATIC_ASSERT(_t, _x) _Static_assert(_x, #_t)
|
||||
#else
|
||||
/* Fallback when _Static_assert is unavailable: declares an array typedef
   whose size is -1 (a compile error) when the condition _x is false.
   _t is pasted into the typedef name so each assertion gets its own name.
   The expansion must reference the macro parameter _x, not a stray x. */
# define _CL_STATIC_ASSERT(_t, _x) typedef int __cl_ai##_t[(_x) ? 1 : -1];
|
||||
#endif
|
||||
|
||||
/****************************************************************************/
|
||||
|
||||
#define IMG_RO_AQ __read_only
|
||||
#define IMG_WO_AQ __write_only
|
||||
|
||||
#if (__OPENCL_C_VERSION__ > 199)
|
||||
#define CLANG_HAS_RW_IMAGES
|
||||
#define IMG_RW_AQ __read_write
|
||||
#else
|
||||
#undef CLANG_HAS_RW_IMAGES
|
||||
#define IMG_RW_AQ __RW_IMAGES_UNSUPPORTED_BEFORE_CL_20
|
||||
#endif
|
||||
|
||||
/****************************************************************************/
|
||||
/* use Clang opencl header for definitions. */
|
||||
|
||||
#ifdef POCL_DEVICE_ADDRESS_BITS
|
||||
|
||||
/* If we wish to override the Clang set __SIZE_TYPE__ for this target,
|
||||
let's do it here so the opencl-c.h sets size_t to the wanted type. */
|
||||
|
||||
#ifdef __SIZE_TYPE__
|
||||
#undef __SIZE_TYPE__
|
||||
#endif
|
||||
|
||||
#if POCL_DEVICE_ADDRESS_BITS == 32
|
||||
#define __SIZE_TYPE__ uint
|
||||
#elif POCL_DEVICE_ADDRESS_BITS == 64
|
||||
#define __SIZE_TYPE__ ulong
|
||||
#else
|
||||
#error Unsupported POCL_DEVICE_ADDRESS_BITS value.
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
||||
#include "_clang_opencl.h"
|
||||
|
||||
/****************************************************************************/
|
||||
|
||||
/* GNU's libm seems to use INT_MIN here while the Clang's header uses
|
||||
INT_MAX. Both are allowed by the OpenCL specs, but we want them to
|
||||
be unified to avoid failing tests. */
|
||||
#undef FP_ILOGBNAN
|
||||
#undef FP_ILOGB0
|
||||
#define FP_ILOGBNAN INT_MIN
|
||||
#define FP_ILOGB0 INT_MIN
|
||||
|
||||
/****************************************************************************/
|
||||
|
||||
#include "pocl_image_types.h"
|
||||
|
||||
#pragma OPENCL EXTENSION all : disable
|
||||
189
benchmarks/opencl/compiler/share/pocl/include/_kernel_c.h
Normal file
189
benchmarks/opencl/compiler/share/pocl/include/_kernel_c.h
Normal file
@@ -0,0 +1,189 @@
|
||||
/* pocl/_kernel_c.h - C compatible OpenCL types and runtime library
|
||||
functions declarations for kernel builtin implementations using C.
|
||||
|
||||
Copyright (c) 2011 Universidad Rey Juan Carlos
|
||||
Copyright (c) 2011-2017 Pekka Jääskeläinen / TUT
|
||||
Copyright (c) 2011-2013 Erik Schnetter <eschnetter@perimeterinstitute.ca>
|
||||
Perimeter Institute for Theoretical Physics
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
/**
|
||||
* Header that can be included in C-compiled implementations of
|
||||
* built-in functions to introduce the OpenCL C compatible types etc.
|
||||
*/
|
||||
#ifndef _KERNEL_C_H
|
||||
#define _KERNEL_C_H
|
||||
|
||||
#include "pocl_types.h"
|
||||
|
||||
#include "_kernel_constants.h"
|
||||
|
||||
/* Function/type attributes supported by Clang/SPIR */
|
||||
#if __has_attribute(__always_inline__)
|
||||
# define _CL_ALWAYSINLINE __attribute__((__always_inline__))
|
||||
#else
|
||||
# define _CL_ALWAYSINLINE
|
||||
#endif
|
||||
#if __has_attribute(__noinline__)
|
||||
# define _CL_NOINLINE __attribute__((__noinline__))
|
||||
#else
|
||||
# define _CL_NOINLINE
|
||||
#endif
|
||||
#if __has_attribute(__overloadable__)
|
||||
# define _CL_OVERLOADABLE __attribute__((__overloadable__))
|
||||
#else
|
||||
# define _CL_OVERLOADABLE
|
||||
#endif
|
||||
#if __has_attribute(__pure__)
|
||||
# define _CL_READONLY __attribute__((__pure__))
|
||||
#else
|
||||
# define _CL_READONLY
|
||||
#endif
|
||||
#if __has_attribute(__const__)
|
||||
# define _CL_READNONE __attribute__((__const__))
|
||||
#else
|
||||
# define _CL_READNONE
|
||||
#endif
|
||||
#if __has_attribute(convergent)
|
||||
# define _CL_CONVERGENT __attribute__((convergent))
|
||||
#else
|
||||
# define _CL_CONVERGENT
|
||||
#endif
|
||||
|
||||
|
||||
typedef char char2 __attribute__((__ext_vector_type__(2)));
|
||||
typedef char char3 __attribute__((__ext_vector_type__(3)));
|
||||
typedef char char4 __attribute__((__ext_vector_type__(4)));
|
||||
typedef char char8 __attribute__((__ext_vector_type__(8)));
|
||||
typedef char char16 __attribute__((__ext_vector_type__(16)));
|
||||
|
||||
typedef uchar uchar2 __attribute__((__ext_vector_type__(2)));
|
||||
typedef uchar uchar3 __attribute__((__ext_vector_type__(3)));
|
||||
typedef uchar uchar4 __attribute__((__ext_vector_type__(4)));
|
||||
typedef uchar uchar8 __attribute__((__ext_vector_type__(8)));
|
||||
typedef uchar uchar16 __attribute__((__ext_vector_type__(16)));
|
||||
|
||||
typedef short short2 __attribute__((__ext_vector_type__(2)));
|
||||
typedef short short3 __attribute__((__ext_vector_type__(3)));
|
||||
typedef short short4 __attribute__((__ext_vector_type__(4)));
|
||||
typedef short short8 __attribute__((__ext_vector_type__(8)));
|
||||
typedef short short16 __attribute__((__ext_vector_type__(16)));
|
||||
|
||||
typedef ushort ushort2 __attribute__((__ext_vector_type__(2)));
|
||||
typedef ushort ushort3 __attribute__((__ext_vector_type__(3)));
|
||||
typedef ushort ushort4 __attribute__((__ext_vector_type__(4)));
|
||||
typedef ushort ushort8 __attribute__((__ext_vector_type__(8)));
|
||||
typedef ushort ushort16 __attribute__((__ext_vector_type__(16)));
|
||||
|
||||
typedef int int2 __attribute__((__ext_vector_type__(2)));
|
||||
typedef int int3 __attribute__((__ext_vector_type__(3)));
|
||||
typedef int int4 __attribute__((__ext_vector_type__(4)));
|
||||
typedef int int8 __attribute__((__ext_vector_type__(8)));
|
||||
typedef int int16 __attribute__((__ext_vector_type__(16)));
|
||||
|
||||
typedef uint uint2 __attribute__((__ext_vector_type__(2)));
|
||||
typedef uint uint3 __attribute__((__ext_vector_type__(3)));
|
||||
typedef uint uint4 __attribute__((__ext_vector_type__(4)));
|
||||
typedef uint uint8 __attribute__((__ext_vector_type__(8)));
|
||||
typedef uint uint16 __attribute__((__ext_vector_type__(16)));
|
||||
|
||||
#if defined(__CBUILD__) && defined(cl_khr_fp16)
|
||||
/* NOTE: the Clang's __fp16 does not work robustly in C mode,
|
||||
it might produce invalid code at least with half vectors.
|
||||
Using the native 'half' type in OpenCL C mode works better. */
|
||||
typedef __fp16 half;
|
||||
#endif
|
||||
|
||||
typedef half half2 __attribute__((__ext_vector_type__(2)));
|
||||
typedef half half3 __attribute__((__ext_vector_type__(3)));
|
||||
typedef half half4 __attribute__((__ext_vector_type__(4)));
|
||||
typedef half half8 __attribute__((__ext_vector_type__(8)));
|
||||
typedef half half16 __attribute__((__ext_vector_type__(16)));
|
||||
|
||||
typedef float float2 __attribute__((__ext_vector_type__(2)));
|
||||
typedef float float3 __attribute__((__ext_vector_type__(3)));
|
||||
typedef float float4 __attribute__((__ext_vector_type__(4)));
|
||||
typedef float float8 __attribute__((__ext_vector_type__(8)));
|
||||
typedef float float16 __attribute__((__ext_vector_type__(16)));
|
||||
|
||||
#ifdef cl_khr_fp64
|
||||
# ifndef __CBUILD__
|
||||
# pragma OPENCL EXTENSION cl_khr_fp64 : enable
|
||||
# endif
|
||||
typedef double double2 __attribute__((__ext_vector_type__(2)));
|
||||
typedef double double3 __attribute__((__ext_vector_type__(3)));
|
||||
typedef double double4 __attribute__((__ext_vector_type__(4)));
|
||||
typedef double double8 __attribute__((__ext_vector_type__(8)));
|
||||
typedef double double16 __attribute__((__ext_vector_type__(16)));
|
||||
#endif
|
||||
|
||||
#ifdef cl_khr_int64
|
||||
typedef long long2 __attribute__((__ext_vector_type__(2)));
|
||||
typedef long long3 __attribute__((__ext_vector_type__(3)));
|
||||
typedef long long4 __attribute__((__ext_vector_type__(4)));
|
||||
typedef long long8 __attribute__((__ext_vector_type__(8)));
|
||||
typedef long long16 __attribute__((__ext_vector_type__(16)));
|
||||
|
||||
typedef ulong ulong2 __attribute__((__ext_vector_type__(2)));
|
||||
typedef ulong ulong3 __attribute__((__ext_vector_type__(3)));
|
||||
typedef ulong ulong4 __attribute__((__ext_vector_type__(4)));
|
||||
typedef ulong ulong8 __attribute__((__ext_vector_type__(8)));
|
||||
typedef ulong ulong16 __attribute__((__ext_vector_type__(16)));
|
||||
#endif
|
||||
|
||||
#if defined(__TCE__)
|
||||
|
||||
#define POCL_ADDRESS_SPACE_PRIVATE 0
|
||||
#define POCL_ADDRESS_SPACE_GLOBAL 1
|
||||
#define POCL_ADDRESS_SPACE_LOCAL 3
|
||||
#define POCL_ADDRESS_SPACE_CONSTANT 2
|
||||
#define POCL_ADDRESS_SPACE_GENERIC 6
|
||||
|
||||
#endif
|
||||
|
||||
typedef uint cl_mem_fence_flags;
|
||||
|
||||
/* Integer Constants */
|
||||
|
||||
#if defined(__CBUILD__)
|
||||
|
||||
#define CHAR_BIT 8
|
||||
#define CHAR_MAX SCHAR_MAX
|
||||
#define CHAR_MIN SCHAR_MIN
|
||||
#define INT_MAX 2147483647
|
||||
#define INT_MIN (-2147483647 - 1)
|
||||
#ifdef cl_khr_int64
|
||||
#define LONG_MAX 0x7fffffffffffffffL
|
||||
#define LONG_MIN (-0x7fffffffffffffffL - 1)
|
||||
#endif
|
||||
#define SCHAR_MAX 127
|
||||
#define SCHAR_MIN (-127 - 1)
|
||||
#define SHRT_MAX 32767
|
||||
#define SHRT_MIN (-32767 - 1)
|
||||
#define UCHAR_MAX 255
|
||||
#define USHRT_MAX 65535
|
||||
#define UINT_MAX 0xffffffff
|
||||
#ifdef cl_khr_int64
|
||||
#define ULONG_MAX 0xffffffffffffffffUL
|
||||
#endif
|
||||
|
||||
#endif /* __CBUILD__ */
|
||||
|
||||
#endif
|
||||
@@ -0,0 +1,93 @@
|
||||
/* pocl/_kernel_constants.h - C compatible OpenCL types and runtime library
|
||||
constants declarations.
|
||||
|
||||
Copyright (c) 2011 Universidad Rey Juan Carlos
|
||||
Copyright (c) 2011-2013 Pekka Jääskeläinen / TUT
|
||||
Copyright (c) 2011-2013 Erik Schnetter <eschnetter@perimeterinstitute.ca>
|
||||
Perimeter Institute for Theoretical Physics
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
/**
|
||||
* Header that can be included in C-compiled implementations of
|
||||
* built-in functions to introduce the OpenCL C compatible constants.
|
||||
*/
|
||||
#ifndef _KERNEL_CONSTANTS_H
|
||||
#define _KERNEL_CONSTANTS_H
|
||||
|
||||
/* clang's header defines these */
|
||||
#ifndef _OPENCL_H_
|
||||
|
||||
/* cl_channel_order */
|
||||
#define CLK_R 0x10B0
|
||||
#define CLK_A 0x10B1
|
||||
#define CLK_RG 0x10B2
|
||||
#define CLK_RA 0x10B3
|
||||
#define CLK_RGB 0x10B4
|
||||
#define CLK_RGBA 0x10B5
|
||||
#define CLK_BGRA 0x10B6
|
||||
#define CLK_ARGB 0x10B7
|
||||
#define CLK_INTENSITY 0x10B8
|
||||
#define CLK_LUMINANCE 0x10B9
|
||||
#define CLK_Rx 0x10BA
|
||||
#define CLK_RGx 0x10BB
|
||||
#define CLK_RGBx 0x10BC
|
||||
#define CLK_DEPTH 0x10BD
|
||||
#define CLK_DEPTH_STENCIL 0x10BE
|
||||
|
||||
/* cl_channel_type */
|
||||
#define CLK_SNORM_INT8 0x10D0
|
||||
#define CLK_SNORM_INT16 0x10D1
|
||||
#define CLK_UNORM_INT8 0x10D2
|
||||
#define CLK_UNORM_INT16 0x10D3
|
||||
#define CLK_UNORM_SHORT_565 0x10D4
|
||||
#define CLK_UNORM_SHORT_555 0x10D5
|
||||
#define CLK_UNORM_INT_101010 0x10D6
|
||||
#define CLK_SIGNED_INT8 0x10D7
|
||||
#define CLK_SIGNED_INT16 0x10D8
|
||||
#define CLK_SIGNED_INT32 0x10D9
|
||||
#define CLK_UNSIGNED_INT8 0x10DA
|
||||
#define CLK_UNSIGNED_INT16 0x10DB
|
||||
#define CLK_UNSIGNED_INT32 0x10DC
|
||||
#define CLK_HALF_FLOAT 0x10DD
|
||||
#define CLK_FLOAT 0x10DE
|
||||
#define CLK_UNORM_INT24 0x10DF
|
||||
|
||||
/* cl_addressing_mode */
|
||||
#define CLK_ADDRESS_NONE 0x00
|
||||
#define CLK_ADDRESS_CLAMP_TO_EDGE 0x02
|
||||
#define CLK_ADDRESS_CLAMP 0x04
|
||||
#define CLK_ADDRESS_REPEAT 0x06
|
||||
#define CLK_ADDRESS_MIRRORED_REPEAT 0x08
|
||||
|
||||
/* cl_sampler_info */
|
||||
#define CLK_NORMALIZED_COORDS_FALSE 0x00
|
||||
#define CLK_NORMALIZED_COORDS_TRUE 0x01
|
||||
|
||||
/* filter_mode */
|
||||
#define CLK_FILTER_NEAREST 0x10
|
||||
#define CLK_FILTER_LINEAR 0x20
|
||||
|
||||
/* barrier() flags */
|
||||
#define CLK_LOCAL_MEM_FENCE 0x01
|
||||
#define CLK_GLOBAL_MEM_FENCE 0x02
|
||||
|
||||
#endif
|
||||
|
||||
#endif
|
||||
571
benchmarks/opencl/compiler/share/pocl/include/opencl-c-base.h
Normal file
571
benchmarks/opencl/compiler/share/pocl/include/opencl-c-base.h
Normal file
@@ -0,0 +1,571 @@
|
||||
//===----- opencl-c-base.h - OpenCL C language base definitions -----------===//
|
||||
//
|
||||
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||
// See https://llvm.org/LICENSE.txt for license information.
|
||||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#ifndef _OPENCL_BASE_H_
|
||||
#define _OPENCL_BASE_H_
|
||||
|
||||
// built-in scalar data types:
|
||||
|
||||
/**
|
||||
* An unsigned 8-bit integer.
|
||||
*/
|
||||
typedef unsigned char uchar;
|
||||
|
||||
/**
|
||||
* An unsigned 16-bit integer.
|
||||
*/
|
||||
typedef unsigned short ushort;
|
||||
|
||||
/**
|
||||
* An unsigned 32-bit integer.
|
||||
*/
|
||||
typedef unsigned int uint;
|
||||
|
||||
/**
|
||||
* An unsigned 64-bit integer.
|
||||
*/
|
||||
typedef unsigned long ulong;
|
||||
|
||||
/**
|
||||
* The unsigned integer type of the result of the sizeof operator. This
|
||||
* is a 32-bit unsigned integer if CL_DEVICE_ADDRESS_BITS
|
||||
* defined in table 4.3 is 32-bits and is a 64-bit unsigned integer if
|
||||
* CL_DEVICE_ADDRESS_BITS is 64-bits.
|
||||
*/
|
||||
typedef __SIZE_TYPE__ size_t;
|
||||
|
||||
/**
|
||||
* A signed integer type that is the result of subtracting two pointers.
|
||||
* This is a 32-bit signed integer if CL_DEVICE_ADDRESS_BITS
|
||||
* defined in table 4.3 is 32-bits and is a 64-bit signed integer if
|
||||
* CL_DEVICE_ADDRESS_BITS is 64-bits.
|
||||
*/
|
||||
typedef __PTRDIFF_TYPE__ ptrdiff_t;
|
||||
|
||||
/**
|
||||
* A signed integer type with the property that any valid pointer to
|
||||
* void can be converted to this type, then converted back to pointer
|
||||
* to void, and the result will compare equal to the original pointer.
|
||||
*/
|
||||
typedef __INTPTR_TYPE__ intptr_t;
|
||||
|
||||
/**
|
||||
* An unsigned integer type with the property that any valid pointer to
|
||||
* void can be converted to this type, then converted back to pointer
|
||||
* to void, and the result will compare equal to the original pointer.
|
||||
*/
|
||||
typedef __UINTPTR_TYPE__ uintptr_t;
|
||||
|
||||
// built-in vector data types:
|
||||
typedef char char2 __attribute__((ext_vector_type(2)));
|
||||
typedef char char3 __attribute__((ext_vector_type(3)));
|
||||
typedef char char4 __attribute__((ext_vector_type(4)));
|
||||
typedef char char8 __attribute__((ext_vector_type(8)));
|
||||
typedef char char16 __attribute__((ext_vector_type(16)));
|
||||
typedef uchar uchar2 __attribute__((ext_vector_type(2)));
|
||||
typedef uchar uchar3 __attribute__((ext_vector_type(3)));
|
||||
typedef uchar uchar4 __attribute__((ext_vector_type(4)));
|
||||
typedef uchar uchar8 __attribute__((ext_vector_type(8)));
|
||||
typedef uchar uchar16 __attribute__((ext_vector_type(16)));
|
||||
typedef short short2 __attribute__((ext_vector_type(2)));
|
||||
typedef short short3 __attribute__((ext_vector_type(3)));
|
||||
typedef short short4 __attribute__((ext_vector_type(4)));
|
||||
typedef short short8 __attribute__((ext_vector_type(8)));
|
||||
typedef short short16 __attribute__((ext_vector_type(16)));
|
||||
typedef ushort ushort2 __attribute__((ext_vector_type(2)));
|
||||
typedef ushort ushort3 __attribute__((ext_vector_type(3)));
|
||||
typedef ushort ushort4 __attribute__((ext_vector_type(4)));
|
||||
typedef ushort ushort8 __attribute__((ext_vector_type(8)));
|
||||
typedef ushort ushort16 __attribute__((ext_vector_type(16)));
|
||||
typedef int int2 __attribute__((ext_vector_type(2)));
|
||||
typedef int int3 __attribute__((ext_vector_type(3)));
|
||||
typedef int int4 __attribute__((ext_vector_type(4)));
|
||||
typedef int int8 __attribute__((ext_vector_type(8)));
|
||||
typedef int int16 __attribute__((ext_vector_type(16)));
|
||||
typedef uint uint2 __attribute__((ext_vector_type(2)));
|
||||
typedef uint uint3 __attribute__((ext_vector_type(3)));
|
||||
typedef uint uint4 __attribute__((ext_vector_type(4)));
|
||||
typedef uint uint8 __attribute__((ext_vector_type(8)));
|
||||
typedef uint uint16 __attribute__((ext_vector_type(16)));
|
||||
typedef long long2 __attribute__((ext_vector_type(2)));
|
||||
typedef long long3 __attribute__((ext_vector_type(3)));
|
||||
typedef long long4 __attribute__((ext_vector_type(4)));
|
||||
typedef long long8 __attribute__((ext_vector_type(8)));
|
||||
typedef long long16 __attribute__((ext_vector_type(16)));
|
||||
typedef ulong ulong2 __attribute__((ext_vector_type(2)));
|
||||
typedef ulong ulong3 __attribute__((ext_vector_type(3)));
|
||||
typedef ulong ulong4 __attribute__((ext_vector_type(4)));
|
||||
typedef ulong ulong8 __attribute__((ext_vector_type(8)));
|
||||
typedef ulong ulong16 __attribute__((ext_vector_type(16)));
|
||||
typedef float float2 __attribute__((ext_vector_type(2)));
|
||||
typedef float float3 __attribute__((ext_vector_type(3)));
|
||||
typedef float float4 __attribute__((ext_vector_type(4)));
|
||||
typedef float float8 __attribute__((ext_vector_type(8)));
|
||||
typedef float float16 __attribute__((ext_vector_type(16)));
|
||||
#ifdef cl_khr_fp16
|
||||
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
|
||||
typedef half half2 __attribute__((ext_vector_type(2)));
|
||||
typedef half half3 __attribute__((ext_vector_type(3)));
|
||||
typedef half half4 __attribute__((ext_vector_type(4)));
|
||||
typedef half half8 __attribute__((ext_vector_type(8)));
|
||||
typedef half half16 __attribute__((ext_vector_type(16)));
|
||||
#endif
|
||||
#ifdef cl_khr_fp64
|
||||
#if __OPENCL_C_VERSION__ < CL_VERSION_1_2
|
||||
#pragma OPENCL EXTENSION cl_khr_fp64 : enable
|
||||
#endif
|
||||
typedef double double2 __attribute__((ext_vector_type(2)));
|
||||
typedef double double3 __attribute__((ext_vector_type(3)));
|
||||
typedef double double4 __attribute__((ext_vector_type(4)));
|
||||
typedef double double8 __attribute__((ext_vector_type(8)));
|
||||
typedef double double16 __attribute__((ext_vector_type(16)));
|
||||
#endif
|
||||
|
||||
#if defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0)
|
||||
#define NULL ((void*)0)
|
||||
#endif
|
||||
|
||||
/**
|
||||
* Value of maximum non-infinite single-precision floating-point
|
||||
* number.
|
||||
*/
|
||||
#define MAXFLOAT 0x1.fffffep127f
|
||||
|
||||
/**
|
||||
* A positive float constant expression. HUGE_VALF evaluates
|
||||
* to +infinity. Used as an error value returned by the built-in
|
||||
* math functions.
|
||||
*/
|
||||
#define HUGE_VALF (__builtin_huge_valf())
|
||||
|
||||
/**
|
||||
* A positive double constant expression. HUGE_VAL evaluates
|
||||
* to +infinity. Used as an error value returned by the built-in
|
||||
* math functions.
|
||||
*/
|
||||
#define HUGE_VAL (__builtin_huge_val())
|
||||
|
||||
/**
|
||||
* A constant expression of type float representing positive or
|
||||
* unsigned infinity.
|
||||
*/
|
||||
#define INFINITY (__builtin_inff())
|
||||
|
||||
/**
|
||||
* A constant expression of type float representing a quiet NaN.
|
||||
*/
|
||||
#define NAN as_float(INT_MAX)
|
||||
|
||||
#define FP_ILOGB0 INT_MIN
|
||||
#define FP_ILOGBNAN INT_MAX
|
||||
|
||||
#define FLT_DIG 6
|
||||
#define FLT_MANT_DIG 24
|
||||
#define FLT_MAX_10_EXP +38
|
||||
#define FLT_MAX_EXP +128
|
||||
#define FLT_MIN_10_EXP -37
|
||||
#define FLT_MIN_EXP -125
|
||||
#define FLT_RADIX 2
|
||||
#define FLT_MAX 0x1.fffffep127f
|
||||
#define FLT_MIN 0x1.0p-126f
|
||||
#define FLT_EPSILON 0x1.0p-23f
|
||||
|
||||
#define M_E_F 2.71828182845904523536028747135266250f
|
||||
#define M_LOG2E_F 1.44269504088896340735992468100189214f
|
||||
#define M_LOG10E_F 0.434294481903251827651128918916605082f
|
||||
#define M_LN2_F 0.693147180559945309417232121458176568f
|
||||
#define M_LN10_F 2.30258509299404568401799145468436421f
|
||||
#define M_PI_F 3.14159265358979323846264338327950288f
|
||||
#define M_PI_2_F 1.57079632679489661923132169163975144f
|
||||
#define M_PI_4_F 0.785398163397448309615660845819875721f
|
||||
#define M_1_PI_F 0.318309886183790671537767526745028724f
|
||||
#define M_2_PI_F 0.636619772367581343075535053490057448f
|
||||
#define M_2_SQRTPI_F 1.12837916709551257389615890312154517f
|
||||
#define M_SQRT2_F 1.41421356237309504880168872420969808f
|
||||
#define M_SQRT1_2_F 0.707106781186547524400844362104849039f
|
||||
|
||||
#define DBL_DIG 15
|
||||
#define DBL_MANT_DIG 53
|
||||
#define DBL_MAX_10_EXP +308
|
||||
#define DBL_MAX_EXP +1024
|
||||
#define DBL_MIN_10_EXP -307
|
||||
#define DBL_MIN_EXP -1021
|
||||
#define DBL_RADIX 2
|
||||
#define DBL_MAX 0x1.fffffffffffffp1023
|
||||
#define DBL_MIN 0x1.0p-1022
|
||||
#define DBL_EPSILON 0x1.0p-52
|
||||
|
||||
#define M_E 0x1.5bf0a8b145769p+1
|
||||
#define M_LOG2E 0x1.71547652b82fep+0
|
||||
#define M_LOG10E 0x1.bcb7b1526e50ep-2
|
||||
#define M_LN2 0x1.62e42fefa39efp-1
|
||||
#define M_LN10 0x1.26bb1bbb55516p+1
|
||||
#define M_PI 0x1.921fb54442d18p+1
|
||||
#define M_PI_2 0x1.921fb54442d18p+0
|
||||
#define M_PI_4 0x1.921fb54442d18p-1
|
||||
#define M_1_PI 0x1.45f306dc9c883p-2
|
||||
#define M_2_PI 0x1.45f306dc9c883p-1
|
||||
#define M_2_SQRTPI 0x1.20dd750429b6dp+0
|
||||
#define M_SQRT2 0x1.6a09e667f3bcdp+0
|
||||
#define M_SQRT1_2 0x1.6a09e667f3bcdp-1
|
||||
|
||||
#ifdef cl_khr_fp16
|
||||
|
||||
#define HALF_DIG 3
|
||||
#define HALF_MANT_DIG 11
|
||||
#define HALF_MAX_10_EXP +4
|
||||
#define HALF_MAX_EXP +16
|
||||
#define HALF_MIN_10_EXP -4
|
||||
#define HALF_MIN_EXP -13
|
||||
#define HALF_RADIX 2
|
||||
#define HALF_MAX ((0x1.ffcp15h))
|
||||
#define HALF_MIN ((0x1.0p-14h))
|
||||
#define HALF_EPSILON ((0x1.0p-10h))
|
||||
|
||||
#define M_E_H 2.71828182845904523536028747135266250h
|
||||
#define M_LOG2E_H 1.44269504088896340735992468100189214h
|
||||
#define M_LOG10E_H 0.434294481903251827651128918916605082h
|
||||
#define M_LN2_H 0.693147180559945309417232121458176568h
|
||||
#define M_LN10_H 2.30258509299404568401799145468436421h
|
||||
#define M_PI_H 3.14159265358979323846264338327950288h
|
||||
#define M_PI_2_H 1.57079632679489661923132169163975144h
|
||||
#define M_PI_4_H 0.785398163397448309615660845819875721h
|
||||
#define M_1_PI_H 0.318309886183790671537767526745028724h
|
||||
#define M_2_PI_H 0.636619772367581343075535053490057448h
|
||||
#define M_2_SQRTPI_H 1.12837916709551257389615890312154517h
|
||||
#define M_SQRT2_H 1.41421356237309504880168872420969808h
|
||||
#define M_SQRT1_2_H 0.707106781186547524400844362104849039h
|
||||
|
||||
#endif //cl_khr_fp16
|
||||
|
||||
#define CHAR_BIT 8
|
||||
#define SCHAR_MAX 127
|
||||
#define SCHAR_MIN (-128)
|
||||
#define UCHAR_MAX 255
|
||||
#define CHAR_MAX SCHAR_MAX
|
||||
#define CHAR_MIN SCHAR_MIN
|
||||
#define USHRT_MAX 65535
|
||||
#define SHRT_MAX 32767
|
||||
#define SHRT_MIN (-32768)
|
||||
#define UINT_MAX 0xffffffff
|
||||
#define INT_MAX 2147483647
|
||||
#define INT_MIN (-2147483647-1)
|
||||
#define ULONG_MAX 0xffffffffffffffffUL
|
||||
#define LONG_MAX 0x7fffffffffffffffL
|
||||
#define LONG_MIN (-0x7fffffffffffffffL-1)
|
||||
|
||||
// OpenCL v1.1 s6.11.8, v1.2 s6.12.8, v2.0 s6.13.8 - Synchronization Functions
|
||||
|
||||
// Flag type and values for barrier, mem_fence, read_mem_fence, write_mem_fence
|
||||
typedef uint cl_mem_fence_flags;
|
||||
|
||||
/**
|
||||
* Queue a memory fence to ensure correct
|
||||
* ordering of memory operations to local memory
|
||||
*/
|
||||
#define CLK_LOCAL_MEM_FENCE 0x01
|
||||
|
||||
/**
|
||||
* Queue a memory fence to ensure correct
|
||||
* ordering of memory operations to global memory
|
||||
*/
|
||||
#define CLK_GLOBAL_MEM_FENCE 0x02
|
||||
|
||||
#if defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0)
|
||||
|
||||
typedef enum memory_scope {
|
||||
memory_scope_work_item = __OPENCL_MEMORY_SCOPE_WORK_ITEM,
|
||||
memory_scope_work_group = __OPENCL_MEMORY_SCOPE_WORK_GROUP,
|
||||
memory_scope_device = __OPENCL_MEMORY_SCOPE_DEVICE,
|
||||
memory_scope_all_svm_devices = __OPENCL_MEMORY_SCOPE_ALL_SVM_DEVICES,
|
||||
#if defined(cl_intel_subgroups) || defined(cl_khr_subgroups)
|
||||
memory_scope_sub_group = __OPENCL_MEMORY_SCOPE_SUB_GROUP
|
||||
#endif
|
||||
} memory_scope;
|
||||
|
||||
/**
|
||||
* Queue a memory fence to ensure correct ordering of memory
|
||||
* operations between work-items of a work-group to
|
||||
* image memory.
|
||||
*/
|
||||
#define CLK_IMAGE_MEM_FENCE 0x04
|
||||
|
||||
#ifndef ATOMIC_VAR_INIT
|
||||
#define ATOMIC_VAR_INIT(x) (x)
|
||||
#endif //ATOMIC_VAR_INIT
|
||||
#define ATOMIC_FLAG_INIT 0
|
||||
|
||||
/*
 * Memory orderings.
 *
 * The enumerator values come from the __ATOMIC_* macros so that they stay
 * aligned with what clang uses in EmitAtomicExpr().
 */
typedef enum memory_order {
  memory_order_relaxed = __ATOMIC_RELAXED,
  memory_order_acquire = __ATOMIC_ACQUIRE,
  memory_order_release = __ATOMIC_RELEASE,
  memory_order_acq_rel = __ATOMIC_ACQ_REL,
  memory_order_seq_cst = __ATOMIC_SEQ_CST
} memory_order;
|
||||
|
||||
#endif // defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0)
|
||||
|
||||
// OpenCL v1.1 s6.11.3, v1.2 s6.12.14, v2.0 s6.13.14 - Image Read and Write Functions
|
||||
|
||||
// These values need to match the runtime equivalent
|
||||
//
|
||||
// Addressing Mode.
|
||||
//
|
||||
#define CLK_ADDRESS_NONE 0
|
||||
#define CLK_ADDRESS_CLAMP_TO_EDGE 2
|
||||
#define CLK_ADDRESS_CLAMP 4
|
||||
#define CLK_ADDRESS_REPEAT 6
|
||||
#define CLK_ADDRESS_MIRRORED_REPEAT 8
|
||||
|
||||
//
|
||||
// Coordinate Normalization
|
||||
//
|
||||
#define CLK_NORMALIZED_COORDS_FALSE 0
|
||||
#define CLK_NORMALIZED_COORDS_TRUE 1
|
||||
|
||||
//
|
||||
// Filtering Mode.
|
||||
//
|
||||
#define CLK_FILTER_NEAREST 0x10
|
||||
#define CLK_FILTER_LINEAR 0x20
|
||||
|
||||
#ifdef cl_khr_gl_msaa_sharing
|
||||
#pragma OPENCL EXTENSION cl_khr_gl_msaa_sharing : enable
|
||||
#endif //cl_khr_gl_msaa_sharing
|
||||
|
||||
//
|
||||
// Channel Datatype.
|
||||
//
|
||||
#define CLK_SNORM_INT8 0x10D0
|
||||
#define CLK_SNORM_INT16 0x10D1
|
||||
#define CLK_UNORM_INT8 0x10D2
|
||||
#define CLK_UNORM_INT16 0x10D3
|
||||
#define CLK_UNORM_SHORT_565 0x10D4
|
||||
#define CLK_UNORM_SHORT_555 0x10D5
|
||||
#define CLK_UNORM_INT_101010 0x10D6
|
||||
#define CLK_SIGNED_INT8 0x10D7
|
||||
#define CLK_SIGNED_INT16 0x10D8
|
||||
#define CLK_SIGNED_INT32 0x10D9
|
||||
#define CLK_UNSIGNED_INT8 0x10DA
|
||||
#define CLK_UNSIGNED_INT16 0x10DB
|
||||
#define CLK_UNSIGNED_INT32 0x10DC
|
||||
#define CLK_HALF_FLOAT 0x10DD
|
||||
#define CLK_FLOAT 0x10DE
|
||||
#define CLK_UNORM_INT24 0x10DF
|
||||
|
||||
// Channel order, numbering must be aligned with cl_channel_order in cl.h
|
||||
//
|
||||
#define CLK_R 0x10B0
|
||||
#define CLK_A 0x10B1
|
||||
#define CLK_RG 0x10B2
|
||||
#define CLK_RA 0x10B3
|
||||
#define CLK_RGB 0x10B4
|
||||
#define CLK_RGBA 0x10B5
|
||||
#define CLK_BGRA 0x10B6
|
||||
#define CLK_ARGB 0x10B7
|
||||
#define CLK_INTENSITY 0x10B8
|
||||
#define CLK_LUMINANCE 0x10B9
|
||||
#define CLK_Rx 0x10BA
|
||||
#define CLK_RGx 0x10BB
|
||||
#define CLK_RGBx 0x10BC
|
||||
#define CLK_DEPTH 0x10BD
|
||||
#define CLK_DEPTH_STENCIL 0x10BE
|
||||
#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0
|
||||
#define CLK_sRGB 0x10BF
|
||||
#define CLK_sRGBx 0x10C0
|
||||
#define CLK_sRGBA 0x10C1
|
||||
#define CLK_sBGRA 0x10C2
|
||||
#define CLK_ABGR 0x10C3
|
||||
#endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0
|
||||
|
||||
// OpenCL v2.0 s6.13.16 - Pipe Functions
|
||||
#if defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0)
|
||||
#define CLK_NULL_RESERVE_ID (__builtin_astype(((void*)(__SIZE_MAX__)), reserve_id_t))
|
||||
|
||||
// OpenCL v2.0 s6.13.17 - Enqueue Kernels
|
||||
#define CL_COMPLETE 0x0
|
||||
#define CL_RUNNING 0x1
|
||||
#define CL_SUBMITTED 0x2
|
||||
#define CL_QUEUED 0x3
|
||||
|
||||
#define CLK_SUCCESS 0
|
||||
#define CLK_ENQUEUE_FAILURE -101
|
||||
#define CLK_INVALID_QUEUE -102
|
||||
#define CLK_INVALID_NDRANGE -160
|
||||
#define CLK_INVALID_EVENT_WAIT_LIST -57
|
||||
#define CLK_DEVICE_QUEUE_FULL -161
|
||||
#define CLK_INVALID_ARG_SIZE -51
|
||||
#define CLK_EVENT_ALLOCATION_FAILURE -100
|
||||
#define CLK_OUT_OF_RESOURCES -5
|
||||
|
||||
#define CLK_NULL_QUEUE 0
|
||||
#define CLK_NULL_EVENT (__builtin_astype(((__SIZE_MAX__)), clk_event_t))
|
||||
|
||||
// execution model related definitions
|
||||
#define CLK_ENQUEUE_FLAGS_NO_WAIT 0x0
|
||||
#define CLK_ENQUEUE_FLAGS_WAIT_KERNEL 0x1
|
||||
#define CLK_ENQUEUE_FLAGS_WAIT_WORK_GROUP 0x2
|
||||
|
||||
typedef int kernel_enqueue_flags_t;
|
||||
typedef int clk_profiling_info;
|
||||
|
||||
// Profiling info name (see capture_event_profiling_info)
|
||||
#define CLK_PROFILING_COMMAND_EXEC_TIME 0x1
|
||||
|
||||
/* Number of entries in each per-dimension array of ndrange_t. */
#define MAX_WORK_DIM 3

/**
 * ND-range descriptor: a dimension count followed by three arrays of
 * MAX_WORK_DIM entries holding the global work offset, the global work
 * size and the local work size.
 */
typedef struct {
  unsigned int workDimension;
  size_t globalWorkOffset[MAX_WORK_DIM];
  size_t globalWorkSize[MAX_WORK_DIM];
  size_t localWorkSize[MAX_WORK_DIM];
} ndrange_t;
|
||||
|
||||
#endif // defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0)
|
||||
|
||||
#ifdef cl_intel_device_side_avc_motion_estimation
|
||||
#pragma OPENCL EXTENSION cl_intel_device_side_avc_motion_estimation : begin
|
||||
|
||||
#define CLK_AVC_ME_MAJOR_16x16_INTEL 0x0
|
||||
#define CLK_AVC_ME_MAJOR_16x8_INTEL 0x1
|
||||
#define CLK_AVC_ME_MAJOR_8x16_INTEL 0x2
|
||||
#define CLK_AVC_ME_MAJOR_8x8_INTEL 0x3
|
||||
|
||||
#define CLK_AVC_ME_MINOR_8x8_INTEL 0x0
|
||||
#define CLK_AVC_ME_MINOR_8x4_INTEL 0x1
|
||||
#define CLK_AVC_ME_MINOR_4x8_INTEL 0x2
|
||||
#define CLK_AVC_ME_MINOR_4x4_INTEL 0x3
|
||||
|
||||
#define CLK_AVC_ME_MAJOR_FORWARD_INTEL 0x0
|
||||
#define CLK_AVC_ME_MAJOR_BACKWARD_INTEL 0x1
|
||||
#define CLK_AVC_ME_MAJOR_BIDIRECTIONAL_INTEL 0x2
|
||||
|
||||
#define CLK_AVC_ME_PARTITION_MASK_ALL_INTEL 0x0
|
||||
#define CLK_AVC_ME_PARTITION_MASK_16x16_INTEL 0x7E
|
||||
#define CLK_AVC_ME_PARTITION_MASK_16x8_INTEL 0x7D
|
||||
#define CLK_AVC_ME_PARTITION_MASK_8x16_INTEL 0x7B
|
||||
#define CLK_AVC_ME_PARTITION_MASK_8x8_INTEL 0x77
|
||||
#define CLK_AVC_ME_PARTITION_MASK_8x4_INTEL 0x6F
|
||||
#define CLK_AVC_ME_PARTITION_MASK_4x8_INTEL 0x5F
|
||||
#define CLK_AVC_ME_PARTITION_MASK_4x4_INTEL 0x3F
|
||||
|
||||
#define CLK_AVC_ME_SLICE_TYPE_PRED_INTEL 0x0
|
||||
#define CLK_AVC_ME_SLICE_TYPE_BPRED_INTEL 0x1
|
||||
#define CLK_AVC_ME_SLICE_TYPE_INTRA_INTEL 0x2
|
||||
|
||||
#define CLK_AVC_ME_SEARCH_WINDOW_EXHAUSTIVE_INTEL 0x0
|
||||
#define CLK_AVC_ME_SEARCH_WINDOW_SMALL_INTEL 0x1
|
||||
#define CLK_AVC_ME_SEARCH_WINDOW_TINY_INTEL 0x2
|
||||
#define CLK_AVC_ME_SEARCH_WINDOW_EXTRA_TINY_INTEL 0x3
|
||||
#define CLK_AVC_ME_SEARCH_WINDOW_DIAMOND_INTEL 0x4
|
||||
#define CLK_AVC_ME_SEARCH_WINDOW_LARGE_DIAMOND_INTEL 0x5
|
||||
#define CLK_AVC_ME_SEARCH_WINDOW_RESERVED0_INTEL 0x6
|
||||
#define CLK_AVC_ME_SEARCH_WINDOW_RESERVED1_INTEL 0x7
|
||||
#define CLK_AVC_ME_SEARCH_WINDOW_CUSTOM_INTEL 0x8
|
||||
|
||||
#define CLK_AVC_ME_SAD_ADJUST_MODE_NONE_INTEL 0x0
|
||||
#define CLK_AVC_ME_SAD_ADJUST_MODE_HAAR_INTEL 0x2
|
||||
|
||||
#define CLK_AVC_ME_SUBPIXEL_MODE_INTEGER_INTEL 0x0
|
||||
#define CLK_AVC_ME_SUBPIXEL_MODE_HPEL_INTEL 0x1
|
||||
#define CLK_AVC_ME_SUBPIXEL_MODE_QPEL_INTEL 0x3
|
||||
|
||||
#define CLK_AVC_ME_COST_PRECISION_QPEL_INTEL 0x0
|
||||
#define CLK_AVC_ME_COST_PRECISION_HPEL_INTEL 0x1
|
||||
#define CLK_AVC_ME_COST_PRECISION_PEL_INTEL 0x2
|
||||
#define CLK_AVC_ME_COST_PRECISION_DPEL_INTEL 0x3
|
||||
|
||||
#define CLK_AVC_ME_BIDIR_WEIGHT_QUARTER_INTEL 0x10
|
||||
#define CLK_AVC_ME_BIDIR_WEIGHT_THIRD_INTEL 0x15
|
||||
#define CLK_AVC_ME_BIDIR_WEIGHT_HALF_INTEL 0x20
|
||||
#define CLK_AVC_ME_BIDIR_WEIGHT_TWO_THIRD_INTEL 0x2B
|
||||
#define CLK_AVC_ME_BIDIR_WEIGHT_THREE_QUARTER_INTEL 0x30
|
||||
|
||||
#define CLK_AVC_ME_BORDER_REACHED_LEFT_INTEL 0x0
|
||||
#define CLK_AVC_ME_BORDER_REACHED_RIGHT_INTEL 0x2
|
||||
#define CLK_AVC_ME_BORDER_REACHED_TOP_INTEL 0x4
|
||||
#define CLK_AVC_ME_BORDER_REACHED_BOTTOM_INTEL 0x8
|
||||
|
||||
#define CLK_AVC_ME_INTRA_16x16_INTEL 0x0
|
||||
#define CLK_AVC_ME_INTRA_8x8_INTEL 0x1
|
||||
#define CLK_AVC_ME_INTRA_4x4_INTEL 0x2
|
||||
|
||||
#define CLK_AVC_ME_SKIP_BLOCK_PARTITION_16x16_INTEL 0x0
|
||||
#define CLK_AVC_ME_SKIP_BLOCK_PARTITION_8x8_INTEL 0x4000
|
||||
|
||||
/*
 * Skip-block enable masks, all placed in bits 24..31.
 *
 * The 16x16 masks use bits 24..25.  The per-block 8x8 masks use one 2-bit
 * field per block (block 0 at bit 24, block 1 at bit 26, block 2 at bit 28,
 * block 3 at bit 30); the combined 8x8 masks set the same bit in all four
 * fields.
 *
 * The literals are unsigned on purpose: several of these shifts set bit 31
 * (0xAA << 24, 0xFF << 24, 0x2 << 30), which overflows a 32-bit signed int
 * and is undefined behaviour when the left operand is a plain int literal.
 * The resulting bit patterns are unchanged.
 */
#define CLK_AVC_ME_SKIP_BLOCK_16x16_FORWARD_ENABLE_INTEL (0x1u << 24)
#define CLK_AVC_ME_SKIP_BLOCK_16x16_BACKWARD_ENABLE_INTEL (0x2u << 24)
#define CLK_AVC_ME_SKIP_BLOCK_16x16_DUAL_ENABLE_INTEL (0x3u << 24)
#define CLK_AVC_ME_SKIP_BLOCK_8x8_FORWARD_ENABLE_INTEL (0x55u << 24)
#define CLK_AVC_ME_SKIP_BLOCK_8x8_BACKWARD_ENABLE_INTEL (0xAAu << 24)
#define CLK_AVC_ME_SKIP_BLOCK_8x8_DUAL_ENABLE_INTEL (0xFFu << 24)
#define CLK_AVC_ME_SKIP_BLOCK_8x8_0_FORWARD_ENABLE_INTEL (0x1u << 24)
#define CLK_AVC_ME_SKIP_BLOCK_8x8_0_BACKWARD_ENABLE_INTEL (0x2u << 24)
#define CLK_AVC_ME_SKIP_BLOCK_8x8_1_FORWARD_ENABLE_INTEL (0x1u << 26)
#define CLK_AVC_ME_SKIP_BLOCK_8x8_1_BACKWARD_ENABLE_INTEL (0x2u << 26)
#define CLK_AVC_ME_SKIP_BLOCK_8x8_2_FORWARD_ENABLE_INTEL (0x1u << 28)
#define CLK_AVC_ME_SKIP_BLOCK_8x8_2_BACKWARD_ENABLE_INTEL (0x2u << 28)
#define CLK_AVC_ME_SKIP_BLOCK_8x8_3_FORWARD_ENABLE_INTEL (0x1u << 30)
#define CLK_AVC_ME_SKIP_BLOCK_8x8_3_BACKWARD_ENABLE_INTEL (0x2u << 30)
|
||||
|
||||
#define CLK_AVC_ME_BLOCK_BASED_SKIP_4x4_INTEL 0x00
|
||||
#define CLK_AVC_ME_BLOCK_BASED_SKIP_8x8_INTEL 0x80
|
||||
|
||||
#define CLK_AVC_ME_INTRA_LUMA_PARTITION_MASK_ALL_INTEL 0x0
|
||||
#define CLK_AVC_ME_INTRA_LUMA_PARTITION_MASK_16x16_INTEL 0x6
|
||||
#define CLK_AVC_ME_INTRA_LUMA_PARTITION_MASK_8x8_INTEL 0x5
|
||||
#define CLK_AVC_ME_INTRA_LUMA_PARTITION_MASK_4x4_INTEL 0x3
|
||||
|
||||
#define CLK_AVC_ME_INTRA_NEIGHBOR_LEFT_MASK_ENABLE_INTEL 0x60
|
||||
#define CLK_AVC_ME_INTRA_NEIGHBOR_UPPER_MASK_ENABLE_INTEL 0x10
|
||||
#define CLK_AVC_ME_INTRA_NEIGHBOR_UPPER_RIGHT_MASK_ENABLE_INTEL 0x8
|
||||
#define CLK_AVC_ME_INTRA_NEIGHBOR_UPPER_LEFT_MASK_ENABLE_INTEL 0x4
|
||||
|
||||
#define CLK_AVC_ME_LUMA_PREDICTOR_MODE_VERTICAL_INTEL 0x0
|
||||
#define CLK_AVC_ME_LUMA_PREDICTOR_MODE_HORIZONTAL_INTEL 0x1
|
||||
#define CLK_AVC_ME_LUMA_PREDICTOR_MODE_DC_INTEL 0x2
|
||||
#define CLK_AVC_ME_LUMA_PREDICTOR_MODE_DIAGONAL_DOWN_LEFT_INTEL 0x3
|
||||
#define CLK_AVC_ME_LUMA_PREDICTOR_MODE_DIAGONAL_DOWN_RIGHT_INTEL 0x4
|
||||
#define CLK_AVC_ME_LUMA_PREDICTOR_MODE_PLANE_INTEL 0x4
|
||||
#define CLK_AVC_ME_LUMA_PREDICTOR_MODE_VERTICAL_RIGHT_INTEL 0x5
|
||||
#define CLK_AVC_ME_LUMA_PREDICTOR_MODE_HORIZONTAL_DOWN_INTEL 0x6
|
||||
#define CLK_AVC_ME_LUMA_PREDICTOR_MODE_VERTICAL_LEFT_INTEL 0x7
|
||||
#define CLK_AVC_ME_LUMA_PREDICTOR_MODE_HORIZONTAL_UP_INTEL 0x8
|
||||
#define CLK_AVC_ME_CHROMA_PREDICTOR_MODE_DC_INTEL 0x0
|
||||
#define CLK_AVC_ME_CHROMA_PREDICTOR_MODE_HORIZONTAL_INTEL 0x1
|
||||
#define CLK_AVC_ME_CHROMA_PREDICTOR_MODE_VERTICAL_INTEL 0x2
|
||||
#define CLK_AVC_ME_CHROMA_PREDICTOR_MODE_PLANE_INTEL 0x3
|
||||
|
||||
#define CLK_AVC_ME_FRAME_FORWARD_INTEL 0x1
|
||||
#define CLK_AVC_ME_FRAME_BACKWARD_INTEL 0x2
|
||||
#define CLK_AVC_ME_FRAME_DUAL_INTEL 0x3
|
||||
|
||||
#define CLK_AVC_ME_INTERLACED_SCAN_TOP_FIELD_INTEL 0x0
|
||||
#define CLK_AVC_ME_INTERLACED_SCAN_BOTTOM_FIELD_INTEL 0x1
|
||||
|
||||
#define CLK_AVC_ME_INITIALIZE_INTEL 0x0
|
||||
|
||||
#define CLK_AVC_IME_PAYLOAD_INITIALIZE_INTEL 0x0
|
||||
#define CLK_AVC_REF_PAYLOAD_INITIALIZE_INTEL 0x0
|
||||
#define CLK_AVC_SIC_PAYLOAD_INITIALIZE_INTEL 0x0
|
||||
|
||||
#define CLK_AVC_IME_RESULT_INITIALIZE_INTEL 0x0
|
||||
#define CLK_AVC_REF_RESULT_INITIALIZE_INTEL 0x0
|
||||
#define CLK_AVC_SIC_RESULT_INITIALIZE_INTEL 0x0
|
||||
|
||||
#define CLK_AVC_IME_RESULT_SINGLE_REFERENCE_STREAMOUT_INITIALIZE_INTEL 0x0
|
||||
#define CLK_AVC_IME_RESULT_SINGLE_REFERENCE_STREAMIN_INITIALIZE_INTEL 0x0
|
||||
#define CLK_AVC_IME_RESULT_DUAL_REFERENCE_STREAMOUT_INITIALIZE_INTEL 0x0
|
||||
#define CLK_AVC_IME_RESULT_DUAL_REFERENCE_STREAMIN_INITIALIZE_INTEL 0x0
|
||||
|
||||
#pragma OPENCL EXTENSION cl_intel_device_side_avc_motion_estimation : end
|
||||
#endif // cl_intel_device_side_avc_motion_estimation
|
||||
|
||||
#endif //_OPENCL_BASE_H_
|
||||
16502
benchmarks/opencl/compiler/share/pocl/include/opencl-c.h
Normal file
16502
benchmarks/opencl/compiler/share/pocl/include/opencl-c.h
Normal file
File diff suppressed because it is too large
Load Diff
395
benchmarks/opencl/compiler/share/pocl/include/pocl.h
Normal file
395
benchmarks/opencl/compiler/share/pocl/include/pocl.h
Normal file
@@ -0,0 +1,395 @@
|
||||
/* pocl.h - global pocl declarations for the host side runtime.
|
||||
|
||||
Copyright (c) 2011 Universidad Rey Juan Carlos
|
||||
2011-2019 Pekka Jääskeläinen
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
/**
|
||||
* @file pocl.h
|
||||
*
|
||||
* The declarations in this file are those used by both the libpocl CL
* implementation and the kernel compiler. Others should be
* moved to pocl_cl.h of lib/CL or under the kernel compiler dir.
|
||||
* @todo Check if there are extra declarations here that could be moved.
|
||||
*/
|
||||
#ifndef POCL_H
|
||||
#define POCL_H
|
||||
|
||||
#ifndef CL_TARGET_OPENCL_VERSION
|
||||
#define CL_TARGET_OPENCL_VERSION 220
|
||||
#endif
|
||||
#include <CL/opencl.h>
|
||||
|
||||
#include "config.h"
|
||||
|
||||
#include "pocl_context.h"
|
||||
|
||||
/* detects restrict, variadic macros etc */
|
||||
#include "pocl_compiler_features.h"
|
||||
|
||||
#define POCL_FILENAME_LENGTH 1024
|
||||
|
||||
#define WORKGROUP_STRING_LENGTH 1024
|
||||
|
||||
typedef struct _mem_mapping mem_mapping_t;
|
||||
/* represents a single buffer to host memory mapping */
|
||||
struct _mem_mapping {
|
||||
void *host_ptr; /* the location of the mapped buffer chunk in the host memory */
|
||||
size_t offset; /* offset to the beginning of the buffer */
|
||||
size_t size;
|
||||
mem_mapping_t *prev, *next;
|
||||
/* This is required, because two clEnqueueMap() with the same buffer+size+offset,
|
||||
will create two identical mappings in the buffer->mappings LL.
|
||||
Without this flag, both corresponding clEnqUnmap()s will find
|
||||
the same mapping (the first one in mappings LL), which will lead
|
||||
to memory double-free corruption later. */
|
||||
long unmap_requested;
|
||||
cl_map_flags map_flags;
|
||||
/* image mapping data */
|
||||
size_t origin[3];
|
||||
size_t region[3];
|
||||
size_t row_pitch;
|
||||
size_t slice_pitch;
|
||||
};
|
||||
|
||||
/* Memory identifier: an id pointing to the global memory where the memory
   resides, plus a pointer to the actual data. */
typedef struct _pocl_mem_identifier {
  /* ... in this mem object's context. */
  int available;
  int global_mem_id;
  void *mem_ptr;
  void *image_data;
} pocl_mem_identifier;
|
||||
|
||||
typedef struct _mem_destructor_callback mem_destructor_callback_t;
|
||||
/* represents a memory object destructor callback */
|
||||
struct _mem_destructor_callback
|
||||
{
|
||||
void (CL_CALLBACK * pfn_notify) (cl_mem, void*); /* callback function */
|
||||
void *user_data; /* user supplied data passed to callback function */
|
||||
mem_destructor_callback_t *next;
|
||||
};
|
||||
|
||||
typedef struct _build_program_callback build_program_callback_t;
|
||||
struct _build_program_callback
|
||||
{
|
||||
void (CL_CALLBACK * callback_function) (cl_program, void*); /* callback function */
|
||||
void *user_data; /* user supplied data passed to callback function */
|
||||
};
|
||||
|
||||
// Command Queue datatypes
|
||||
|
||||
/* Size of a kernel digest, in uint8_t elements. */
#define POCL_KERNEL_DIGEST_SIZE 16

/* Kernel hash: an array of POCL_KERNEL_DIGEST_SIZE uint8_t values. */
typedef uint8_t pocl_kernel_hash_t[POCL_KERNEL_DIGEST_SIZE];
|
||||
|
||||
// clEnqueueNDRangeKernel
|
||||
typedef struct
|
||||
{
|
||||
void *hash;
|
||||
void *wg; /* The work group function ptr. Device specific. */
|
||||
cl_kernel kernel;
|
||||
/* The launch data that can be passed to the kernel execution environment. */
|
||||
struct pocl_context pc;
|
||||
struct pocl_argument *arguments;
|
||||
/* Can be used to store/cache arbitrary device-specific data. */
|
||||
void *device_data;
|
||||
/* If set to 1, disallow any work-group function specialization. */
|
||||
int force_generic_wg_func;
|
||||
/* If set to 1, disallow "small grid" WG function specialization. */
|
||||
int force_large_grid_wg_func;
|
||||
unsigned device_i;
|
||||
} _cl_command_run;
|
||||
|
||||
/* Command payload for clEnqueueNativeKernel: an argument pointer, a size
   value (cb_args) and the user function, which takes a single void *. */
typedef struct {
  void *args;
  size_t cb_args;
  void (*user_func)(void *);
} _cl_command_native;
|
||||
|
||||
// clEnqueueReadBuffer
|
||||
typedef struct
|
||||
{
|
||||
void *__restrict__ dst_host_ptr;
|
||||
pocl_mem_identifier *src_mem_id;
|
||||
size_t offset;
|
||||
size_t size;
|
||||
} _cl_command_read;
|
||||
|
||||
// clEnqueueWriteBuffer
|
||||
typedef struct
|
||||
{
|
||||
const void *__restrict__ src_host_ptr;
|
||||
pocl_mem_identifier *dst_mem_id;
|
||||
size_t offset;
|
||||
size_t size;
|
||||
} _cl_command_write;
|
||||
|
||||
// clEnqueueCopyBuffer
|
||||
typedef struct
|
||||
{
|
||||
pocl_mem_identifier *src_mem_id;
|
||||
pocl_mem_identifier *dst_mem_id;
|
||||
size_t src_offset;
|
||||
size_t dst_offset;
|
||||
size_t size;
|
||||
} _cl_command_copy;
|
||||
|
||||
// clEnqueueReadBufferRect
|
||||
typedef struct
|
||||
{
|
||||
void *__restrict__ dst_host_ptr;
|
||||
pocl_mem_identifier *src_mem_id;
|
||||
size_t buffer_origin[3];
|
||||
size_t host_origin[3];
|
||||
size_t region[3];
|
||||
size_t buffer_row_pitch;
|
||||
size_t buffer_slice_pitch;
|
||||
size_t host_row_pitch;
|
||||
size_t host_slice_pitch;
|
||||
} _cl_command_read_rect;
|
||||
|
||||
// clEnqueueWriteBufferRect
|
||||
typedef struct
|
||||
{
|
||||
const void *__restrict__ src_host_ptr;
|
||||
pocl_mem_identifier *dst_mem_id;
|
||||
size_t buffer_origin[3];
|
||||
size_t host_origin[3];
|
||||
size_t region[3];
|
||||
size_t buffer_row_pitch;
|
||||
size_t buffer_slice_pitch;
|
||||
size_t host_row_pitch;
|
||||
size_t host_slice_pitch;
|
||||
} _cl_command_write_rect;
|
||||
|
||||
// clEnqueueCopyBufferRect
|
||||
typedef struct
|
||||
{
|
||||
pocl_mem_identifier *src_mem_id;
|
||||
pocl_mem_identifier *dst_mem_id;
|
||||
size_t dst_origin[3];
|
||||
size_t src_origin[3];
|
||||
size_t region[3];
|
||||
size_t src_row_pitch;
|
||||
size_t src_slice_pitch;
|
||||
size_t dst_row_pitch;
|
||||
size_t dst_slice_pitch;
|
||||
} _cl_command_copy_rect;
|
||||
|
||||
// clEnqueueMapBuffer
|
||||
typedef struct
|
||||
{
|
||||
pocl_mem_identifier *mem_id;
|
||||
mem_mapping_t *mapping;
|
||||
} _cl_command_map;
|
||||
|
||||
/* clEnqueueUnMapMemObject */
|
||||
typedef struct
|
||||
{
|
||||
pocl_mem_identifier *mem_id;
|
||||
mem_mapping_t *mapping;
|
||||
} _cl_command_unmap;
|
||||
|
||||
/* clEnqueueFillBuffer: destination buffer, the range to fill (offset, size)
   and the fill pattern with its length. Stored as the `memfill` member of
   the _cl_command_t union.
   NOTE(review): ownership of `pattern` (who allocates / frees it) is not
   visible in this header - confirm against the enqueue implementation. */
|
||||
typedef struct
|
||||
{
|
||||
pocl_mem_identifier *dst_mem_id;
|
||||
size_t size;
|
||||
size_t offset;
|
||||
void *__restrict__ pattern;
|
||||
size_t pattern_size;
|
||||
} _cl_command_fill_mem;
|
||||
|
||||
/* clEnqueue(Write/Read)Image - read side: arguments recorded for reading a
   region of a source image. Stored as the `read_image` member of the
   _cl_command_t union.
   The struct carries both a host destination (dst_host_ptr) and a memory
   object destination (dst_mem_id, dst_offset).
   NOTE(review): presumably only one of the two destinations is used per
   command - confirm against the drivers. */
|
||||
typedef struct
|
||||
{
|
||||
pocl_mem_identifier *src_mem_id;
|
||||
void *__restrict__ dst_host_ptr;
|
||||
pocl_mem_identifier *dst_mem_id;
|
||||
size_t dst_offset;
|
||||
size_t origin[3];
|
||||
size_t region[3];
|
||||
size_t dst_row_pitch;
|
||||
size_t dst_slice_pitch;
|
||||
} _cl_command_read_image;
|
||||
|
||||
/* Write side of clEnqueue(Write/Read)Image: arguments recorded for writing a
   region of a destination image. Stored as the `write_image` member of the
   _cl_command_t union.
   The struct carries both a host source (src_host_ptr, read-only) and a
   memory object source (src_mem_id, src_offset).
   NOTE(review): presumably only one of the two sources is used per command -
   confirm against the drivers. */
typedef struct
|
||||
{
|
||||
pocl_mem_identifier *dst_mem_id;
|
||||
const void *__restrict__ src_host_ptr;
|
||||
pocl_mem_identifier *src_mem_id;
|
||||
size_t src_offset;
|
||||
size_t origin[3];
|
||||
size_t region[3];
|
||||
size_t src_row_pitch;
|
||||
size_t src_slice_pitch;
|
||||
} _cl_command_write_image;
|
||||
|
||||
/* Image-to-image copy arguments: source and destination memory objects, one
   origin per side and the shared region. Stored as the `copy_image` member
   of the _cl_command_t union. dst_origin is declared before src_origin. */
typedef struct
|
||||
{
|
||||
pocl_mem_identifier *src_mem_id;
|
||||
pocl_mem_identifier *dst_mem_id;
|
||||
size_t dst_origin[3];
|
||||
size_t src_origin[3];
|
||||
size_t region[3];
|
||||
} _cl_command_copy_image;
|
||||
|
||||
/* clEnqueueFillImage: target image, the region to fill (origin, region) and
   the fill pixel value with its size. Stored as the `fill_image` member of
   the _cl_command_t union.
   NOTE(review): the encoding of fill_pixel (already converted to the image
   format or not) is not visible here - confirm against the drivers. */
|
||||
typedef struct
|
||||
{
|
||||
pocl_mem_identifier *mem_id;
|
||||
size_t origin[3];
|
||||
size_t region[3];
|
||||
void *__restrict__ fill_pixel;
|
||||
size_t pixel_size;
|
||||
} _cl_command_fill_image;
|
||||
|
||||
/* clEnqueueMarkerWithWaitlist: an opaque data pointer plus a flag recording
   whether the marker was enqueued with a wait list. Stored as the `marker`
   member of the _cl_command_t union; the same layout is reused for barriers
   through the _cl_command_barrier typedef. */
|
||||
typedef struct
|
||||
{
|
||||
void *data;
|
||||
int has_wait_list;
|
||||
} _cl_command_marker;
|
||||
|
||||
/* clEnqueueBarrierWithWaitlist: a barrier carries exactly the same payload
   as a marker, so the marker struct is simply aliased. */
|
||||
typedef _cl_command_marker _cl_command_barrier;
|
||||
|
||||
/* clEnqueueMigrateMemObjects: the list of memory objects to migrate
   (mem_objects, num_mem_objects entries) and a parallel-looking list of
   source devices. Stored as the `migrate` member of the _cl_command_t union.
   NOTE(review): presumably source_devices[i] is the device currently holding
   mem_objects[i] - confirm against the enqueue implementation. */
|
||||
typedef struct
|
||||
{
|
||||
void *data;
|
||||
size_t num_mem_objects;
|
||||
cl_mem *mem_objects;
|
||||
cl_device_id *source_devices;
|
||||
} _cl_command_migrate;
|
||||
|
||||
/* Payload of an SVM free command (the `svm_free` member of the
   _cl_command_t union): the SVM pointers to release and an optional
   user-supplied callback. The callback takes the queue, the pointer count,
   the pointer array and a user_data argument.
   NOTE(review): `data` is presumably what gets passed as user_data - confirm
   against the code that invokes pfn_free_func. */
typedef struct
|
||||
{
|
||||
void* data;
|
||||
void* queue;
|
||||
unsigned num_svm_pointers;
|
||||
void **svm_pointers;
|
||||
void (CL_CALLBACK *pfn_free_func) ( cl_command_queue queue,
|
||||
cl_uint num_svm_pointers,
|
||||
void *svm_pointers[],
|
||||
void *user_data);
|
||||
} _cl_command_svm_free;
|
||||
|
||||
/* Payload of an SVM map command (the `svm_map` member of the _cl_command_t
   union): the SVM pointer, the size of the region and the cl_map_flags
   requested for the mapping. */
typedef struct
|
||||
{
|
||||
void* svm_ptr;
|
||||
size_t size;
|
||||
cl_map_flags flags;
|
||||
} _cl_command_svm_map;
|
||||
|
||||
/* Payload of an SVM unmap command (the `svm_unmap` member of the
   _cl_command_t union): only the SVM pointer is recorded. */
typedef struct
|
||||
{
|
||||
void* svm_ptr;
|
||||
} _cl_command_svm_unmap;
|
||||
|
||||
/* Payload of an SVM memcpy command (the `svm_memcpy` member of the
   _cl_command_t union): read-only source, destination and the copy size.
   Both pointers are __restrict__-qualified, i.e. declared as non-aliasing. */
typedef struct
|
||||
{
|
||||
const void *__restrict__ src;
|
||||
void *__restrict__ dst;
|
||||
size_t size;
|
||||
} _cl_command_svm_cpy;
|
||||
|
||||
/* Payload of an SVM fill command (the `svm_fill` member of the
   _cl_command_t union): the SVM region (svm_ptr, size) and the fill pattern
   with its length. */
typedef struct
|
||||
{
|
||||
void *__restrict__ svm_ptr;
|
||||
size_t size;
|
||||
void *__restrict__ pattern;
|
||||
size_t pattern_size;
|
||||
} _cl_command_svm_fill;
|
||||
|
||||
/* Union of every per-command payload struct. A _cl_command_node embeds one
   of these next to a cl_command_type field.
   NOTE(review): presumably that `type` field selects which member is valid -
   confirm against the code that fills in command nodes. */
typedef union
|
||||
{
|
||||
_cl_command_run run;
|
||||
_cl_command_native native;
|
||||
|
||||
_cl_command_read read;
|
||||
_cl_command_write write;
|
||||
_cl_command_copy copy;
|
||||
_cl_command_read_rect read_rect;
|
||||
_cl_command_write_rect write_rect;
|
||||
_cl_command_copy_rect copy_rect;
|
||||
_cl_command_fill_mem memfill;
|
||||
|
||||
_cl_command_read_image read_image;
|
||||
_cl_command_write_image write_image;
|
||||
_cl_command_copy_image copy_image;
|
||||
_cl_command_fill_image fill_image;
|
||||
|
||||
_cl_command_map map;
|
||||
_cl_command_unmap unmap;
|
||||
|
||||
_cl_command_marker marker;
|
||||
_cl_command_barrier barrier;
|
||||
_cl_command_migrate migrate;
|
||||
|
||||
_cl_command_svm_free svm_free;
|
||||
_cl_command_svm_map svm_map;
|
||||
_cl_command_svm_unmap svm_unmap;
|
||||
_cl_command_svm_cpy svm_memcpy;
|
||||
_cl_command_svm_fill svm_fill;
|
||||
} _cl_command_t;
|
||||
|
||||
// One item in the command queue: the command payload and its type, the
// next/prev links of a doubly linked list, the associated event and wait
// list, and the device the command targets.
// NOTE(review): the exact meaning of `ready` (flag vs. counter) is not
// visible in this header - confirm against the scheduler code.
|
||||
typedef struct _cl_command_node _cl_command_node;
|
||||
struct _cl_command_node
|
||||
{
|
||||
_cl_command_t command;
|
||||
cl_command_type type;
|
||||
_cl_command_node *next; // for linked-list storage
|
||||
_cl_command_node *prev;
|
||||
cl_event event;
|
||||
const cl_event *event_wait_list;
|
||||
cl_device_id device;
|
||||
/* The index of the targeted device in the platform's device list. */
|
||||
unsigned device_i;
|
||||
cl_int ready;
|
||||
};
|
||||
|
||||
/* Derive the cumulative LLVM_OLDER_THAN_<ver> macros from the single
   LLVM_<ver> macro of the build. The checks are nested from newest to
   oldest: each level defines its OLDER_THAN macro only when that version's
   macro is absent, and only then descends to the next older version.
   NOTE(review): a build that defines none of LLVM_6_0 .. LLVM_10_0 (e.g. only
   a newer version macro) falls through the whole chain and gets every
   LLVM_OLDER_THAN_* defined - confirm that is handled elsewhere. */
#ifndef LLVM_10_0
|
||||
#define LLVM_OLDER_THAN_10_0 1
|
||||
|
||||
#ifndef LLVM_9_0
|
||||
#define LLVM_OLDER_THAN_9_0 1
|
||||
|
||||
#ifndef LLVM_8_0
|
||||
#define LLVM_OLDER_THAN_8_0 1
|
||||
|
||||
#ifndef LLVM_7_0
|
||||
#define LLVM_OLDER_THAN_7_0 1
|
||||
|
||||
#ifndef LLVM_6_0
|
||||
#define LLVM_OLDER_THAN_6_0 1
|
||||
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#endif /* POCL_H */
|
||||
80
benchmarks/opencl/compiler/share/pocl/include/pocl_device.h
Normal file
80
benchmarks/opencl/compiler/share/pocl/include/pocl_device.h
Normal file
@@ -0,0 +1,80 @@
|
||||
/* pocl_device.h - global pocl declarations to be used in the device binaries in
|
||||
case applicable by the target
|
||||
|
||||
Copyright (c) 2012-2018 Pekka Jääskeläinen / Tampere University of Technology
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#ifndef POCL_DEVICE_H
|
||||
#define POCL_DEVICE_H
|
||||
|
||||
#include "pocl_types.h"
|
||||
|
||||
#define MAX_KERNEL_ARGS 64
|
||||
#define MAX_KERNEL_NAME_LENGTH 64
|
||||
|
||||
/* Metadata of a single kernel stored in the device: a fixed-size name buffer
   of MAX_KERNEL_NAME_LENGTH bytes, the argument and local counts, and a
   pointer to the kernel's work-group function.
   NOTE(review): whether `name` is always NUL-terminated when the kernel name
   fills the whole buffer is not visible here - confirm with the producer. */
|
||||
typedef struct {
|
||||
const uchar name[MAX_KERNEL_NAME_LENGTH];
|
||||
ushort num_args;
|
||||
ushort num_locals;
|
||||
void *work_group_func;
|
||||
} __kernel_metadata;
|
||||
|
||||
/* ALIGN4(x) / ALIGN8(x) wrap a member or variable declaration `x` and force
   it to a 4- or 8-byte alignment, using the MSVC or the GCC/Clang spelling of
   the alignment attribute. Both branches must request the same alignment so
   that structs shared between host and device have the same layout with
   either compiler; the MSVC branch therefore uses align(8) for ALIGN8. */
#ifdef _MSC_VER
#define ALIGN4(x) __declspec(align(4)) x
#define ALIGN8(x) __declspec(align(8)) x
#else
#define ALIGN4(x) x __attribute__ ((aligned (4)))
#define ALIGN8(x) x __attribute__ ((aligned (8)))
#endif
|
||||
|
||||
/* A kernel invocation command: one queue slot describing a single kernel
   launch. `status` holds the slot's execution state (see the POCL_KST_*
   values in this header). Pointers are stored as 32-bit integers (uint)
   rather than as native pointers, and every field carries an explicit
   ALIGN4 / ALIGN8 alignment. */
|
||||
typedef struct {
|
||||
/* The execution status of this queue slot. */
|
||||
ALIGN8(uint status);
|
||||
/* The kernel to execute. Points to the metadata in the device global
|
||||
memory. It will be cast to a __kernel_metadata* */
|
||||
ALIGN8(uint kernel);
|
||||
/* Pointers to the kernel arguments in the global memory. Will be
|
||||
cast to 32 bit void* */
|
||||
ALIGN8(uint args[MAX_KERNEL_ARGS]);
|
||||
/* Sizes of the dynamically allocated local buffers. */
|
||||
/* uint32_t dynamic_local_arg_sizes[MAX_KERNEL_ARGS] ALIGN4; */
|
||||
/* Number of dimensions in the work space. */
|
||||
ALIGN4(uint work_dim);
|
||||
/* One entry per dimension. */
|
||||
ALIGN4(uint num_groups[3]);
|
||||
ALIGN4(uint global_offset[3]);
|
||||
} __kernel_exec_cmd;
|
||||
|
||||
/* Kernel execution statuses. */
|
||||
|
||||
/* The invocation entry is free to use. */
|
||||
#define POCL_KST_FREE 1
|
||||
/* The kernel structure has been populated and is waiting to be
|
||||
executed. */
|
||||
#define POCL_KST_READY 2
|
||||
/* The kernel is currently running in the device. */
|
||||
#define POCL_KST_RUNNING 3
|
||||
/* The kernel has finished execution. The results can be collected and the
|
||||
execution entry be freed (by writing POCL_KST_FREE to the status). */
|
||||
#define POCL_KST_FINISHED 4
|
||||
|
||||
#endif
|
||||
@@ -0,0 +1,52 @@
|
||||
/* pocl_image_types.h - image data structure used by device implementations
|
||||
|
||||
Copyright (c) 2013 Ville Korhonen
|
||||
Copyright (c) 2017 Michal Babej / Tampere University of Technology
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#ifndef __X86_IMAGE_H__
|
||||
#define __X86_IMAGE_H__
|
||||
|
||||
#ifdef __CBUILD__
|
||||
#define INTTYPE cl_int
|
||||
#else
|
||||
#define INTTYPE int
|
||||
#endif
|
||||
|
||||
typedef uintptr_t dev_sampler_t;
|
||||
|
||||
/* Device-side image descriptor: a pointer to the pixel data followed by the
   image geometry, pitches and format information. All integer fields use
   INTTYPE, which this header defines as cl_int when __CBUILD__ is set and as
   plain int otherwise.
   NOTE(review): units of _row_pitch / _slice_pitch / _elem_size are
   presumably bytes - confirm against the image builtin implementations. */
typedef struct dev_image_t {
|
||||
void *_data;
|
||||
INTTYPE _width;
|
||||
INTTYPE _height;
|
||||
INTTYPE _depth;
|
||||
INTTYPE _image_array_size;
|
||||
INTTYPE _row_pitch;
|
||||
INTTYPE _slice_pitch;
|
||||
INTTYPE _num_mip_levels; /* maybe not needed */
|
||||
INTTYPE _num_samples; /* maybe not needed */
|
||||
INTTYPE _order;
|
||||
INTTYPE _data_type;
|
||||
INTTYPE _num_channels;
|
||||
INTTYPE _elem_size;
|
||||
} dev_image_t;
|
||||
|
||||
#endif
|
||||
33
benchmarks/opencl/compiler/share/pocl/include/pocl_spir.h
Normal file
33
benchmarks/opencl/compiler/share/pocl/include/pocl_spir.h
Normal file
@@ -0,0 +1,33 @@
|
||||
/* pocl-spir.h - global pocl declarations for the SPIR support.
|
||||
|
||||
Copyright (c) 2018-2019 Pekka Jääskeläinen
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#ifndef POCL_SPIR_H
|
||||
#define POCL_SPIR_H
|
||||
|
||||
#define SPIR_ADDRESS_SPACE_PRIVATE 0
|
||||
#define SPIR_ADDRESS_SPACE_GLOBAL 1
|
||||
#define SPIR_ADDRESS_SPACE_CONSTANT 2
|
||||
#define SPIR_ADDRESS_SPACE_LOCAL 3
|
||||
#define SPIR_ADDRESS_SPACE_GENERIC 4
|
||||
|
||||
#endif
|
||||
171
benchmarks/opencl/compiler/share/pocl/include/pocl_types.h
Normal file
171
benchmarks/opencl/compiler/share/pocl/include/pocl_types.h
Normal file
@@ -0,0 +1,171 @@
|
||||
/* pocl_types.h - The basic OpenCL C device side scalar data types.
|
||||
|
||||
Copyright (c) 2018 Pekka Jääskeläinen / Tampere University of Technology
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
/* This header is designed to be included both from the device and the host.
|
||||
In case compiling OpenCL C sources, __OPENCL_VERSION__ should be set.
|
||||
In case compiling in the host, all but the device-specific types are
|
||||
defined (size_t and others). Devices should avoid including the C
|
||||
stdint.h instead of this one as OpenCL C size_t et al. is allowed to
|
||||
be of different width than when targeting C.
|
||||
|
||||
TODO: replace this header (partially) with Clang's opencl-c.h
|
||||
*/
|
||||
|
||||
#ifndef POCL_DEVICE_TYPES_H
|
||||
#define POCL_DEVICE_TYPES_H
|
||||
|
||||
#ifdef __OPENCL_VERSION__
|
||||
|
||||
#ifdef __USE_CLANG_OPENCL_C_H
|
||||
|
||||
/* Minimal definitions, only the target specific macro overrides,
|
||||
just in case Clang export the C ones which might differ for
|
||||
OpenCL C. */
|
||||
|
||||
#ifdef __INTPTR_TYPE__
|
||||
#undef __INTPTR_TYPE__
|
||||
#endif
|
||||
|
||||
#ifdef __UINTPTR_TYPE__
|
||||
#undef __UINTPTR_TYPE__
|
||||
#endif
|
||||
|
||||
#ifdef __SIZE_TYPE__
|
||||
#undef __SIZE_TYPE__
|
||||
#endif
|
||||
|
||||
#ifdef __SIZE_MAX__
|
||||
#undef __SIZE_MAX__
|
||||
#endif
|
||||
|
||||
/* Re-define the size macros for the device: a 32-bit device address space
   selects uint / UINT_MAX, anything else (including an undefined
   POCL_DEVICE_ADDRESS_BITS) selects ulong / ULONG_MAX. */
#if defined(POCL_DEVICE_ADDRESS_BITS) && POCL_DEVICE_ADDRESS_BITS == 32
|
||||
#define __SIZE_TYPE__ uint
|
||||
#define __SIZE_MAX__ UINT_MAX
|
||||
#else
|
||||
#define __SIZE_TYPE__ ulong
|
||||
#define __SIZE_MAX__ ULONG_MAX
|
||||
#endif
|
||||
|
||||
/* NOTE(review): both pointer-sized macros expand to __SIZE_TYPE__, so
   __INTPTR_TYPE__ ends up being the same (unsigned) type as
   __UINTPTR_TYPE__ - confirm that no consumer expects a signed type here. */
#define __INTPTR_TYPE__ __SIZE_TYPE__
|
||||
#define __UINTPTR_TYPE__ __INTPTR_TYPE__
|
||||
|
||||
#else
|
||||
|
||||
/* Compiling Device-specific OpenCL C or builtin library C. */
|
||||
|
||||
#if defined cl_khr_fp64 && !defined cl_khr_int64
|
||||
#error "cl_khr_fp64 requires cl_khr_int64"
|
||||
#endif
|
||||
|
||||
/* TODO FIXME We should not use these in OpenCL library's C code at all.
|
||||
* The problem is that 1) these are predefined by glibc, 2) while we can
|
||||
* re-define "ulong", we cannot control the size of "long" at all.
|
||||
* which can lead to "ulong" being 64bit and "long" 32bit, resulting in
|
||||
* mysterious errors and bugs. Therefore OpenCL library's C code should
|
||||
* use the fixed size C types where integer size matters. */
|
||||
|
||||
#ifdef __CBUILD__
|
||||
|
||||
/* Builtin library C code definitions. */
|
||||
|
||||
#define size_t csize_t
|
||||
#define uintptr_t cuintptr_t
|
||||
|
||||
#include <stdint.h>
|
||||
|
||||
#undef size_t
|
||||
#undef uintptr_t
|
||||
|
||||
typedef uint8_t uchar;
|
||||
typedef uint16_t ushort;
|
||||
typedef uint32_t uint;
|
||||
|
||||
#ifdef cl_khr_int64
|
||||
typedef uint64_t ulong;
|
||||
#else
|
||||
typedef uint32_t ulong;
|
||||
#endif
|
||||
|
||||
#ifndef cl_khr_fp16
|
||||
typedef short half;
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
||||
/* The definitions below intentionally lead to errors if these types
|
||||
are used when they are not available in the language. This prevents
|
||||
accidentally using them if the compiler does not disable these
|
||||
types, but only e.g. defines them with an incorrect size.*/
|
||||
|
||||
#ifndef cl_khr_fp64
|
||||
typedef struct error_undefined_type_double error_undefined_type_double;
|
||||
#define double error_undefined_type_double
|
||||
#endif
|
||||
|
||||
#ifdef __SIZE_TYPE__
|
||||
#undef __SIZE_TYPE__
|
||||
#endif
|
||||
|
||||
#ifdef __SIZE_MAX__
|
||||
#undef __SIZE_MAX__
|
||||
#endif
|
||||
|
||||
#if defined(POCL_DEVICE_ADDRESS_BITS) && POCL_DEVICE_ADDRESS_BITS == 32
|
||||
#define __SIZE_TYPE__ uint
|
||||
#define __SIZE_MAX__ UINT_MAX
|
||||
#else
|
||||
#define __SIZE_TYPE__ ulong
|
||||
#define __SIZE_MAX__ ULONG_MAX
|
||||
#endif
|
||||
|
||||
typedef __SIZE_TYPE__ size_t;
|
||||
typedef __PTRDIFF_TYPE__ ptrdiff_t;
|
||||
typedef ptrdiff_t intptr_t;
|
||||
typedef size_t uintptr_t;
|
||||
|
||||
#endif /* #ifdef __USE_CLANG_OPENCL_C_H */
|
||||
|
||||
#else /* #ifdef __OPENCL_VERSION__ */
|
||||
|
||||
/* Including from a host source (runtime API implementation). Introduce
|
||||
the fixed width datatypes, but do not override C's size_t and other
|
||||
target specific datatypes. */
|
||||
|
||||
typedef unsigned char uchar;
|
||||
|
||||
/* FIXME see the above TODO about these types. */
|
||||
|
||||
#if !(defined(_SYS_TYPES_H) && defined(__USE_MISC))
|
||||
/* glibc, when including sys/types.h, typedefs these. */
|
||||
|
||||
typedef unsigned long int ulong;
|
||||
typedef unsigned short int ushort;
|
||||
typedef unsigned int uint;
|
||||
|
||||
#endif
|
||||
|
||||
#include <stdint.h>
|
||||
|
||||
#endif
|
||||
|
||||
#endif
|
||||
Binary file not shown.
1
benchmarks/opencl/convolution/.gitignore
vendored
Normal file
1
benchmarks/opencl/convolution/.gitignore
vendored
Normal file
@@ -0,0 +1 @@
|
||||
convolution
|
||||
@@ -1,68 +1,47 @@
|
||||
RISCV_TOOL_PATH ?= $(wildcard ../../../../riscv-gnu-toolchain/drops)
|
||||
POCL_CC_PATH ?= $(wildcard ../../../../pocl/drops_riscv_cc)
|
||||
POCL_INC_PATH ?= $(wildcard ../include)
|
||||
POCL_LIB_PATH ?= $(wildcard ../lib)
|
||||
VX_RT_PATH ?= $(wildcard ../../../runtime)
|
||||
VX_SIMX_PATH ?= $(wildcard ../../../simX/obj_dir)
|
||||
# Tool and library locations; every one of them can be overridden from the
# environment or the make command line.
LLVM_HOME ?= ~/dev/llvm-project/drops
TOOLCHAIN_PATH ?= ~/dev/riscv-gnu-toolchain/drops
SYSROOT ?= $(TOOLCHAIN_PATH)/riscv32-unknown-elf
POCL_CC_PATH ?= $(realpath ../compiler)
POCL_RT_PATH ?= $(realpath ../runtime)
VORTEX_DRV_PATH ?= $(realpath ../../../driver/sw)
VORTEX_RT_PATH ?= $(realpath ../../../runtime)

# The compile/link flags and the run-* targets of this Makefile refer to
# $(POCLRT_PATH) and $(DRIVER_PATH). Without these aliases both names are
# undefined and expand to nothing, producing -I/include, -L/lib and broken
# LD_LIBRARY_PATH entries.
POCLRT_PATH ?= $(POCL_RT_PATH)
DRIVER_PATH ?= $(VORTEX_DRV_PATH)
|
||||
|
||||
CC = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-gcc
|
||||
CXX = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-g++
|
||||
DMP = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-objdump
|
||||
HEX = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-objcopy
|
||||
GDB = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-gdb
|
||||
CXXFLAGS += -std=c++11 -O0 -g -fpermissive -Wall -Wextra -pedantic -Wfatal-errors
|
||||
|
||||
VX_SRCS = $(VX_RT_PATH)/newlib/newlib.c
|
||||
VX_SRCS += $(VX_RT_PATH)/startup/vx_start.S
|
||||
VX_SRCS += $(VX_RT_PATH)/intrinsics/vx_intrinsics.s
|
||||
VX_SRCS += $(VX_RT_PATH)/io/vx_io.s $(VX_RT_PATH)/io/vx_io.c
|
||||
VX_SRCS += $(VX_RT_PATH)/fileio/fileio.s
|
||||
VX_SRCS += $(VX_RT_PATH)/tests/tests.c
|
||||
VX_SRCS += $(VX_RT_PATH)/vx_api/vx_api.c
|
||||
VX_SRCS += $(VX_STR) $(VX_FIO) $(VX_NEWLIB) $(VX_INT) $(VX_IO) $(VX_API) $(VX_TEST)
|
||||
CXXFLAGS += -I$(POCLRT_PATH)/include
|
||||
|
||||
VX_CFLAGS = -nostartfiles -Wl,-Bstatic,-T,$(VX_RT_PATH)/mains/vortex_link.ld
|
||||
|
||||
CXXFLAGS = -g -O0 -march=rv32im -mabi=ilp32
|
||||
CXXFLAGS += -ffreestanding # program may not begin at main()
|
||||
CXXFLAGS += -Wl,--gc-sections # enable garbage collection of unused input sections
|
||||
CXXFLAGS += -fno-rtti -fno-non-call-exceptions # disable RTTI and exceptions
|
||||
CXXFLAGS += -I$(POCL_INC_PATH)
|
||||
|
||||
VX_LIBS = -Wl,--whole-archive lib$(PROJECT).a -Wl,--no-whole-archive $(POCL_LIB_PATH)/libOpenCL.a
|
||||
QEMU_LIBS = $(VX_RT_PATH)/qemu/vx_api.c -Wl,--whole-archive lib$(PROJECT).a -Wl,--no-whole-archive $(POCL_LIB_PATH)/libOpenCL.a
|
||||
LDFLAGS += -L$(POCLRT_PATH)/lib -L$(DRIVER_PATH)/simx -lOpenCL -lvortex
|
||||
|
||||
PROJECT = convolution
|
||||
|
||||
SRCS = main.cpp utils.cpp
|
||||
|
||||
all: $(PROJECT).dump $(PROJECT).hex
|
||||
all: $(PROJECT)
|
||||
|
||||
lib$(PROJECT).a: kernel.cl
|
||||
POCL_DEBUG=all POCL_DEBUG_LLVM_PASSES=1 LD_LIBRARY_PATH=$(RISCV_TOOL_PATH)/lib:$(POCL_CC_PATH)/lib $(POCL_CC_PATH)/bin/poclcc -o lib$(PROJECT).a kernel.cl
|
||||
kernel.pocl: kernel.cl
|
||||
TOOLCHAIN_PATH=$(TOOLCHAIN_PATH) SYSROOT=$(SYSROOT) LLVM_HOME=$(LLVM_HOME) VORTEX_RUNTIME_PATH=$(VORTEX_RT_PATH) POCL_DEBUG=all LD_LIBRARY_PATH=$(LLVM_HOME)/lib:$(POCL_CC_PATH)/lib $(POCL_CC_PATH)/bin/poclcc -o kernel.pocl kernel.cl
|
||||
|
||||
$(PROJECT): $(SRCS)
|
||||
$(CXX) $(CXXFLAGS) $^ $(LDFLAGS) -o $@
|
||||
|
||||
$(PROJECT).elf: $(SRCS) lib$(PROJECT).a
|
||||
$(CXX) $(CXXFLAGS) $(VX_CFLAGS) $(VX_SRCS) $(SRCS) $(VX_LIBS) -o $(PROJECT).elf
|
||||
run-fpga: $(PROJECT) kernel.pocl
|
||||
LD_LIBRARY_PATH=$(POCLRT_PATH)/lib:$(DRIVER_PATH)/opae:$(LD_LIBRARY_PATH) ./$(PROJECT)
|
||||
|
||||
$(PROJECT).qemu: $(SRCS) lib$(PROJECT).a
|
||||
$(CXX) $(CXXFLAGS) $(SRCS) $(QEMU_LIBS) -o $(PROJECT).qemu
|
||||
run-ase: $(PROJECT) kernel.pocl
|
||||
LD_LIBRARY_PATH=$(POCLRT_PATH)/lib:$(DRIVER_PATH)/opae/ase:$(LD_LIBRARY_PATH) ./$(PROJECT)
|
||||
|
||||
$(PROJECT).hex: $(PROJECT).elf
|
||||
$(HEX) -O ihex $(PROJECT).elf $(PROJECT).hex
|
||||
run-simx: $(PROJECT) kernel.pocl
|
||||
LD_LIBRARY_PATH=$(POCLRT_PATH)/lib:$(DRIVER_PATH)/simx:$(LD_LIBRARY_PATH) ./$(PROJECT)
|
||||
|
||||
$(PROJECT).dump: $(PROJECT).elf
|
||||
$(DMP) -D $(PROJECT).elf > $(PROJECT).dump
|
||||
run-rtlsim: $(PROJECT) kernel.pocl
|
||||
LD_LIBRARY_PATH=$(POCLRT_PATH)/lib:$(DRIVER_PATH)/rtlsim:$(LD_LIBRARY_PATH) ./$(PROJECT)
|
||||
|
||||
run: $(PROJECT).hex
|
||||
POCL_DEBUG=all $(VX_SIMX_PATH)/Vcache_simX -E -a rv32i --core $(PROJECT).hex -s -b 1> emulator.debug
|
||||
|
||||
qemu: $(PROJECT).qemu
|
||||
POCL_DEBUG=all $(RISCV_TOOL_PATH)/bin/qemu-riscv32 -d in_asm -D debug.log $(PROJECT).qemu
|
||||
|
||||
gdb-s: $(PROJECT).qemu
|
||||
POCL_DEBUG=all $(RISCV_TOOL_PATH)/bin/qemu-riscv32 -g 1234 -d in_asm -D debug.log $(PROJECT).qemu
|
||||
|
||||
gdb-c: $(PROJECT).qemu
|
||||
$(GDB) $(PROJECT).qemu
|
||||
.depend: $(SRCS)
|
||||
$(CXX) $(CXXFLAGS) -MM $^ > .depend;
|
||||
|
||||
clean:
|
||||
rm -rf *.o *.elf *.dump *.hex *.qemu *.log *.debug
|
||||
rm -rf $(PROJECT) *.o *.dump .depend
|
||||
|
||||
ifneq ($(MAKECMDGOALS),clean)
|
||||
-include .depend
|
||||
endif
|
||||
|
||||
@@ -1,54 +1,54 @@
|
||||
__kernel
|
||||
void convolution(
|
||||
__read_only image2d_t sourceImage,
|
||||
__write_only image2d_t outputImage,
|
||||
int rows,
|
||||
int cols,
|
||||
__constant float* filter,
|
||||
int filterWidth,
|
||||
sampler_t sampler)
|
||||
{
|
||||
// Store each work-item’s unique row and column
|
||||
int column = get_global_id(0);
|
||||
int row = get_global_id(1);
|
||||
|
||||
// Half the width of the filter is needed for indexing
|
||||
// memory later
|
||||
int halfWidth = (int)(filterWidth/2);
|
||||
|
||||
// All accesses to images return data as four-element vector
|
||||
// (i.e., float4), although only the 'x' component will contain
|
||||
// meaningful data in this code
|
||||
float4 sum = {0.0f, 0.0f, 0.0f, 0.0f};
|
||||
|
||||
// Iterator for the filter
|
||||
int filterIdx = 0;
|
||||
|
||||
// Each work-item iterates around its local area based on the
|
||||
// size of the filter
|
||||
int2 coords; // Coordinates for accessing the image
|
||||
// Iterate the filter rows
|
||||
for(int i = -halfWidth; i <= halfWidth; i++) {
|
||||
coords.y = row + i;
|
||||
|
||||
// Iterate over the filter columns
|
||||
for(int j = -halfWidth; j <= halfWidth; j++) {
|
||||
coords.x = column + j;
|
||||
|
||||
float4 pixel;
|
||||
// Read a pixel from the image. A single channel image
|
||||
// stores the pixel in the 'x' coordinate of the returned
|
||||
// vector.
|
||||
pixel = read_imagef(sourceImage, sampler, coords);
|
||||
sum.x += pixel.x * filter[filterIdx++];
|
||||
}
|
||||
}
|
||||
|
||||
// Copy the data to the output image if the
|
||||
// work-item is in bounds
|
||||
if(row < rows && column < cols) {
|
||||
coords.x = column;
|
||||
coords.y = row;
|
||||
write_imagef(outputImage, coords, sum);
|
||||
}
|
||||
/* 2-D convolution of sourceImage with a square filterWidth x filterWidth
 * filter. Each work-item handles the pixel at (get_global_id(0),
 * get_global_id(1)): it accumulates the filter-weighted 'x' channel of the
 * surrounding pixels and, when the position lies inside the rows x cols
 * image, writes the result to outputImage. */
__kernel
void convolution(
    __read_only image2d_t sourceImage,
    __write_only image2d_t outputImage,
    int rows,
    int cols,
    __constant float* filter,
    int filterWidth,
    sampler_t sampler)
{
    /* Position of this work-item in the image. */
    const int column = get_global_id(0);
    const int row = get_global_id(1);

    /* The neighbourhood extends this many pixels to each side. */
    const int radius = filterWidth / 2;

    /* Image accesses work on float4 values; only the 'x' component carries
       meaningful data here, the other components stay at zero. */
    float4 acc = (float4)(0.0f, 0.0f, 0.0f, 0.0f);

    /* Walks linearly through the filter, one tap per neighbourhood pixel. */
    int tap = 0;

    /* Outer loop: filter rows; inner loop: filter columns. */
    for (int dy = -radius; dy <= radius; dy++) {
        for (int dx = -radius; dx <= radius; dx++) {
            int2 pos;
            pos.x = column + dx;
            pos.y = row + dy;

            /* A single-channel image delivers its value in the 'x'
               component of the returned vector. */
            const float4 texel = read_imagef(sourceImage, sampler, pos);
            acc.x += texel.x * filter[tap++];
        }
    }

    /* Only work-items that fall inside the image store a result. */
    if (row < rows && column < cols) {
        int2 target;
        target.x = column;
        target.y = row;
        write_imagef(outputImage, target, acc);
    }
}
|
||||
@@ -1,261 +1,261 @@
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <CL/cl.h>
|
||||
|
||||
#include "utils.h"
|
||||
|
||||
// Rounds `value` up to the nearest multiple of `multiple`.
// A value that already is a multiple is returned unchanged. A `multiple` of
// zero has no meaningful answer, so `value` is returned as-is instead of
// performing a modulo by zero.
// NOTE: the result wraps around if the rounded value does not fit into an
// unsigned int.
unsigned int roundUp(unsigned int value, unsigned int multiple) {

   // Guard against the undefined `value % 0`
   if(multiple == 0) {
      return value;
   }

   // Determine how far past the previous multiple the value is
   unsigned int remainder = value % multiple;

   // Add the difference to make the value a multiple
   if(remainder != 0) {
      value += (multiple-remainder);
   }

   return value;
}
|
||||
|
||||
// Reads the whole file at `kernelPath` into a freshly allocated,
// NUL-terminated buffer and returns it. The caller owns the buffer and
// releases it with free().
// Any failure (open, seek, tell, allocation, short read) prints a message
// and terminates the process with exit(-1); the function never returns NULL.
char* readSource(char* kernelPath) {

   FILE *fp;
   char *source;
   long int size;

   printf("Program file is: %s\n", kernelPath);

   fp = fopen(kernelPath, "rb");
   if(!fp) {
      printf("Could not open kernel file\n");
      exit(-1);
   }
   if(fseek(fp, 0, SEEK_END) != 0) {
      printf("Error seeking to end of file\n");
      exit(-1);
   }
   size = ftell(fp);
   if(size < 0) {
      printf("Error getting file position\n");
      exit(-1);
   }

   rewind(fp);

   // calloc zero-fills the buffer, so the terminator at source[size] is
   // already in place. The result is checked BEFORE the buffer is touched.
   source = (char *)calloc((size_t)size + 1, 1);
   if(source == NULL) {
      printf("Error allocating space for the kernel source\n");
      exit(-1);
   }

   // A short read would silently hand a truncated program to the compiler
   if(fread(source, 1, (size_t)size, fp) != (size_t)size) {
      printf("Error reading kernel file\n");
      exit(-1);
   }

   // The file is no longer needed once its content is in memory
   fclose(fp);

   return source;
}
|
||||
|
||||
// Verifies the status code returned by an OpenCL call. On anything other
// than CL_SUCCESS it prints the name of the failing command together with
// the numeric status and terminates the process with exit(-1).
void chk(cl_int status, const char* cmd) {

   // Nothing to report for a successful call
   if(status == CL_SUCCESS) {
      return;
   }

   printf("%s failed (%d)\n", cmd, status);
   exit(-1);
}
|
||||
|
||||
int main() {
|
||||
|
||||
int i, j, k, l;
|
||||
|
||||
// Rows and columns in the input image
|
||||
int imageHeight;
|
||||
int imageWidth;
|
||||
|
||||
const char* inputFile = "input.bmp";
|
||||
const char* outputFile = "output.bmp";
|
||||
|
||||
// Homegrown function to read a BMP from file
|
||||
float* inputImage = readImage(inputFile, &imageWidth,
|
||||
&imageHeight);
|
||||
|
||||
// Size of the input and output images on the host
|
||||
int dataSize = imageHeight*imageWidth*sizeof(float);
|
||||
|
||||
// Output image on the host
|
||||
float* outputImage = NULL;
|
||||
outputImage = (float*)malloc(dataSize);
|
||||
float* refImage = NULL;
|
||||
refImage = (float*)malloc(dataSize);
|
||||
|
||||
// 45 degree motion blur
|
||||
float filter[49] =
|
||||
{0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, -1, 0, 1, 0, 0,
|
||||
0, 0, -2, 0, 2, 0, 0,
|
||||
0, 0, -1, 0, 1, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0};
|
||||
|
||||
// The convolution filter is 7x7
|
||||
int filterWidth = 7;
|
||||
int filterSize = filterWidth*filterWidth; // Assume a square kernel
|
||||
|
||||
// Set up the OpenCL environment
|
||||
cl_int status;
|
||||
|
||||
// Discovery platform
|
||||
cl_platform_id platform;
|
||||
status = clGetPlatformIDs(1, &platform, NULL);
|
||||
chk(status, "clGetPlatformIDs");
|
||||
|
||||
// Discover device
|
||||
cl_device_id device;
|
||||
clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 1, &device, NULL);
|
||||
chk(status, "clGetDeviceIDs");
|
||||
|
||||
// Create context
|
||||
cl_context_properties props[3] = {CL_CONTEXT_PLATFORM,
|
||||
(cl_context_properties)(platform), 0};
|
||||
cl_context context;
|
||||
context = clCreateContext(props, 1, &device, NULL, NULL, &status);
|
||||
chk(status, "clCreateContext");
|
||||
|
||||
// Create command queue
|
||||
cl_command_queue queue;
|
||||
queue = clCreateCommandQueue(context, device, 0, &status);
|
||||
chk(status, "clCreateCommandQueue");
|
||||
|
||||
// The image format describes how the data will be stored in memory
|
||||
cl_image_format format;
|
||||
format.image_channel_order = CL_R; // single channel
|
||||
format.image_channel_data_type = CL_FLOAT; // float data type
|
||||
|
||||
// Create space for the source image on the device
|
||||
cl_mem d_inputImage = clCreateImage2D(context, 0, &format, imageWidth,
|
||||
imageHeight, 0, NULL, &status);
|
||||
chk(status, "clCreateImage2D");
|
||||
|
||||
// Create space for the output image on the device
|
||||
cl_mem d_outputImage = clCreateImage2D(context, 0, &format, imageWidth,
|
||||
imageHeight, 0, NULL, &status);
|
||||
chk(status, "clCreateImage2D");
|
||||
|
||||
// Create space for the 7x7 filter on the device
|
||||
cl_mem d_filter = clCreateBuffer(context, 0, filterSize*sizeof(float),
|
||||
NULL, &status);
|
||||
chk(status, "clCreateBuffer");
|
||||
|
||||
// Copy the source image to the device
|
||||
size_t origin[3] = {0, 0, 0}; // Offset within the image to copy from
|
||||
size_t region[3] = {imageWidth, imageHeight, 1}; // Elements to per dimension
|
||||
status = clEnqueueWriteImage(queue, d_inputImage, CL_FALSE, origin, region,
|
||||
0, 0, inputImage, 0, NULL, NULL);
|
||||
chk(status, "clEnqueueWriteImage");
|
||||
|
||||
// Copy the 7x7 filter to the device
|
||||
status = clEnqueueWriteBuffer(queue, d_filter, CL_FALSE, 0,
|
||||
filterSize*sizeof(float), filter, 0, NULL, NULL);
|
||||
chk(status, "clEnqueueWriteBuffer");
|
||||
|
||||
// Create the image sampler
|
||||
cl_sampler sampler = clCreateSampler(context, CL_FALSE,
|
||||
CL_ADDRESS_CLAMP_TO_EDGE, CL_FILTER_NEAREST, &status);
|
||||
chk(status, "clCreateSampler");
|
||||
|
||||
const char* source = readSource("kernel.cl");
|
||||
|
||||
// Create a program object with source and build it
|
||||
cl_program program;
|
||||
program = clCreateProgramWithSource(context, 1, &source, NULL, NULL);
|
||||
chk(status, "clCreateProgramWithSource");
|
||||
status = clBuildProgram(program, 1, &device, NULL, NULL, NULL);
|
||||
chk(status, "clBuildProgram");
|
||||
|
||||
// Create the kernel object
|
||||
cl_kernel kernel;
|
||||
kernel = clCreateKernel(program, "convolution", &status);
|
||||
chk(status, "clCreateKernel");
|
||||
|
||||
// Set the kernel arguments
|
||||
status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &d_inputImage);
|
||||
status |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &d_outputImage);
|
||||
status |= clSetKernelArg(kernel, 2, sizeof(int), &imageHeight);
|
||||
status |= clSetKernelArg(kernel, 3, sizeof(int), &imageWidth);
|
||||
status |= clSetKernelArg(kernel, 4, sizeof(cl_mem), &d_filter);
|
||||
status |= clSetKernelArg(kernel, 5, sizeof(int), &filterWidth);
|
||||
status |= clSetKernelArg(kernel, 6, sizeof(cl_sampler), &sampler);
|
||||
chk(status, "clSetKernelArg");
|
||||
|
||||
// Set the work item dimensions
|
||||
size_t globalSize[2] = {imageWidth, imageHeight};
|
||||
status = clEnqueueNDRangeKernel(queue, kernel, 2, NULL, globalSize, NULL, 0,
|
||||
NULL, NULL);
|
||||
chk(status, "clEnqueueNDRange");
|
||||
|
||||
// Read the image back to the host
|
||||
status = clEnqueueReadImage(queue, d_outputImage, CL_TRUE, origin,
|
||||
region, 0, 0, outputImage, 0, NULL, NULL);
|
||||
chk(status, "clEnqueueReadImage");
|
||||
|
||||
// Write the output image to file
|
||||
storeImage(outputImage, outputFile, imageHeight, imageWidth, inputFile);
|
||||
|
||||
// Compute the reference image
|
||||
for(i = 0; i < imageHeight; i++) {
|
||||
for(j = 0; j < imageWidth; j++) {
|
||||
refImage[i*imageWidth+j] = 0;
|
||||
}
|
||||
}
|
||||
|
||||
// Iterate over the rows of the source image
|
||||
int halfFilterWidth = filterWidth/2;
|
||||
float sum;
|
||||
for(i = 0; i < imageHeight; i++) {
|
||||
// Iterate over the columns of the source image
|
||||
for(j = 0; j < imageWidth; j++) {
|
||||
sum = 0; // Reset sum for new source pixel
|
||||
// Apply the filter to the neighborhood
|
||||
for(k = - halfFilterWidth; k <= halfFilterWidth; k++) {
|
||||
for(l = - halfFilterWidth; l <= halfFilterWidth; l++) {
|
||||
if(i+k >= 0 && i+k < imageHeight &&
|
||||
j+l >= 0 && j+l < imageWidth) {
|
||||
sum += inputImage[(i+k)*imageWidth + j+l] *
|
||||
filter[(k+halfFilterWidth)*filterWidth +
|
||||
l+halfFilterWidth];
|
||||
}
|
||||
}
|
||||
}
|
||||
refImage[i*imageWidth+j] = sum;
|
||||
}
|
||||
}
|
||||
|
||||
int failed = 0;
|
||||
for(i = 0; i < imageHeight; i++) {
|
||||
for(j = 0; j < imageWidth; j++) {
|
||||
if(abs(outputImage[i*imageWidth+j]-refImage[i*imageWidth+j]) > 0.01) {
|
||||
printf("Results are INCORRECT\n");
|
||||
printf("Pixel mismatch at <%d,%d> (%f vs. %f)\n", i, j,
|
||||
outputImage[i*imageWidth+j], refImage[i*imageWidth+j]);
|
||||
failed = 1;
|
||||
}
|
||||
if(failed) break;
|
||||
}
|
||||
if(failed) break;
|
||||
}
|
||||
if(!failed) {
|
||||
printf("Results are correct\n");
|
||||
}
|
||||
|
||||
return 0;
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <CL/cl.h>
|
||||
|
||||
#include "utils.h"
|
||||
|
||||
// Rounds 'value' up to the nearest multiple of 'multiple'.
// A value that is already a multiple is returned unchanged.
// A 'multiple' of zero has no meaningful answer, so 'value' is returned
// as-is instead of dividing by zero.
unsigned int roundUp(unsigned int value, unsigned int multiple) {

    unsigned int remainder;

    if(multiple == 0) {
        return value;
    }

    // Determine how far past the previous multiple the value is
    remainder = value % multiple;

    // Add the difference to make the value a multiple
    if(remainder != 0) {
        value += (multiple-remainder);
    }

    return value;
}
|
||||
|
||||
// Reads the whole file at 'kernelPath' into a newly allocated,
// NUL-terminated buffer and returns it. The caller owns the buffer and
// releases it with free().
// Any failure (open, seek, tell, allocation, short read) prints a message
// and terminates the program with exit(-1).
char* readSource(char* kernelPath) {

    FILE *fp;
    char *source;
    long int size;
    size_t bytesRead;

    printf("Program file is: %s\n", kernelPath);

    fp = fopen(kernelPath, "rb");
    if(!fp) {
        printf("Could not open kernel file\n");
        exit(-1);
    }

    // Determine the file size by seeking to the end
    if(fseek(fp, 0, SEEK_END) != 0) {
        printf("Error seeking to end of file\n");
        exit(-1);
    }
    size = ftell(fp);
    if(size < 0) {
        printf("Error getting file position\n");
        exit(-1);
    }

    rewind(fp);

    // calloc zero-fills, so the buffer is NUL-terminated from the start;
    // the result is checked before anything is written to it
    source = (char *)calloc((size_t)size + 1, 1);
    if(source == NULL) {
        printf("Error allocating space for the kernel source\n");
        exit(-1);
    }

    bytesRead = fread(source, 1, (size_t)size, fp);
    if(bytesRead != (size_t)size) {
        printf("Error reading the kernel source\n");
        exit(-1);
    }
    source[size] = '\0';

    fclose(fp);

    return source;
}
|
||||
|
||||
// Verifies the status code of an OpenCL call. On anything other than
// CL_SUCCESS it prints "<cmd> failed (<status>)" and exits with -1.
void chk(cl_int status, const char* cmd) {

    if(status == CL_SUCCESS) {
        return;
    }

    printf("%s failed (%d)\n", cmd, status);
    exit(-1);
}
|
||||
|
||||
int main() {
|
||||
|
||||
int i, j, k, l;
|
||||
|
||||
// Rows and columns in the input image
|
||||
int imageHeight;
|
||||
int imageWidth;
|
||||
|
||||
const char* inputFile = "input.bmp";
|
||||
const char* outputFile = "output.bmp";
|
||||
|
||||
// Homegrown function to read a BMP from file
|
||||
float* inputImage = readImage(inputFile, &imageWidth,
|
||||
&imageHeight);
|
||||
|
||||
// Size of the input and output images on the host
|
||||
int dataSize = imageHeight*imageWidth*sizeof(float);
|
||||
|
||||
// Output image on the host
|
||||
float* outputImage = NULL;
|
||||
outputImage = (float*)malloc(dataSize);
|
||||
float* refImage = NULL;
|
||||
refImage = (float*)malloc(dataSize);
|
||||
|
||||
// 45 degree motion blur
|
||||
float filter[49] =
|
||||
{0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, -1, 0, 1, 0, 0,
|
||||
0, 0, -2, 0, 2, 0, 0,
|
||||
0, 0, -1, 0, 1, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0,
|
||||
0, 0, 0, 0, 0, 0, 0};
|
||||
|
||||
// The convolution filter is 7x7
|
||||
int filterWidth = 7;
|
||||
int filterSize = filterWidth*filterWidth; // Assume a square kernel
|
||||
|
||||
// Set up the OpenCL environment
|
||||
cl_int status;
|
||||
|
||||
// Discovery platform
|
||||
cl_platform_id platform;
|
||||
status = clGetPlatformIDs(1, &platform, NULL);
|
||||
chk(status, "clGetPlatformIDs");
|
||||
|
||||
// Discover device
|
||||
cl_device_id device;
|
||||
clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 1, &device, NULL);
|
||||
chk(status, "clGetDeviceIDs");
|
||||
|
||||
// Create context
|
||||
cl_context_properties props[3] = {CL_CONTEXT_PLATFORM,
|
||||
(cl_context_properties)(platform), 0};
|
||||
cl_context context;
|
||||
context = clCreateContext(props, 1, &device, NULL, NULL, &status);
|
||||
chk(status, "clCreateContext");
|
||||
|
||||
// Create command queue
|
||||
cl_command_queue queue;
|
||||
queue = clCreateCommandQueue(context, device, 0, &status);
|
||||
chk(status, "clCreateCommandQueue");
|
||||
|
||||
// The image format describes how the data will be stored in memory
|
||||
cl_image_format format;
|
||||
format.image_channel_order = CL_R; // single channel
|
||||
format.image_channel_data_type = CL_FLOAT; // float data type
|
||||
|
||||
// Create space for the source image on the device
|
||||
cl_mem d_inputImage = clCreateImage2D(context, 0, &format, imageWidth,
|
||||
imageHeight, 0, NULL, &status);
|
||||
chk(status, "clCreateImage2D");
|
||||
|
||||
// Create space for the output image on the device
|
||||
cl_mem d_outputImage = clCreateImage2D(context, 0, &format, imageWidth,
|
||||
imageHeight, 0, NULL, &status);
|
||||
chk(status, "clCreateImage2D");
|
||||
|
||||
// Create space for the 7x7 filter on the device
|
||||
cl_mem d_filter = clCreateBuffer(context, 0, filterSize*sizeof(float),
|
||||
NULL, &status);
|
||||
chk(status, "clCreateBuffer");
|
||||
|
||||
// Copy the source image to the device
|
||||
size_t origin[3] = {0, 0, 0}; // Offset within the image to copy from
|
||||
size_t region[3] = {imageWidth, imageHeight, 1}; // Elements to per dimension
|
||||
status = clEnqueueWriteImage(queue, d_inputImage, CL_FALSE, origin, region,
|
||||
0, 0, inputImage, 0, NULL, NULL);
|
||||
chk(status, "clEnqueueWriteImage");
|
||||
|
||||
// Copy the 7x7 filter to the device
|
||||
status = clEnqueueWriteBuffer(queue, d_filter, CL_FALSE, 0,
|
||||
filterSize*sizeof(float), filter, 0, NULL, NULL);
|
||||
chk(status, "clEnqueueWriteBuffer");
|
||||
|
||||
// Create the image sampler
|
||||
cl_sampler sampler = clCreateSampler(context, CL_FALSE,
|
||||
CL_ADDRESS_CLAMP_TO_EDGE, CL_FILTER_NEAREST, &status);
|
||||
chk(status, "clCreateSampler");
|
||||
|
||||
const char* source = readSource("kernel.cl");
|
||||
|
||||
// Create a program object with source and build it
|
||||
cl_program program;
|
||||
program = clCreateProgramWithSource(context, 1, &source, NULL, NULL);
|
||||
chk(status, "clCreateProgramWithSource");
|
||||
status = clBuildProgram(program, 1, &device, NULL, NULL, NULL);
|
||||
chk(status, "clBuildProgram");
|
||||
|
||||
// Create the kernel object
|
||||
cl_kernel kernel;
|
||||
kernel = clCreateKernel(program, "convolution", &status);
|
||||
chk(status, "clCreateKernel");
|
||||
|
||||
// Set the kernel arguments
|
||||
status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &d_inputImage);
|
||||
status |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &d_outputImage);
|
||||
status |= clSetKernelArg(kernel, 2, sizeof(int), &imageHeight);
|
||||
status |= clSetKernelArg(kernel, 3, sizeof(int), &imageWidth);
|
||||
status |= clSetKernelArg(kernel, 4, sizeof(cl_mem), &d_filter);
|
||||
status |= clSetKernelArg(kernel, 5, sizeof(int), &filterWidth);
|
||||
status |= clSetKernelArg(kernel, 6, sizeof(cl_sampler), &sampler);
|
||||
chk(status, "clSetKernelArg");
|
||||
|
||||
// Set the work item dimensions
|
||||
size_t globalSize[2] = {imageWidth, imageHeight};
|
||||
status = clEnqueueNDRangeKernel(queue, kernel, 2, NULL, globalSize, NULL, 0,
|
||||
NULL, NULL);
|
||||
chk(status, "clEnqueueNDRange");
|
||||
|
||||
// Read the image back to the host
|
||||
status = clEnqueueReadImage(queue, d_outputImage, CL_TRUE, origin,
|
||||
region, 0, 0, outputImage, 0, NULL, NULL);
|
||||
chk(status, "clEnqueueReadImage");
|
||||
|
||||
// Write the output image to file
|
||||
storeImage(outputImage, outputFile, imageHeight, imageWidth, inputFile);
|
||||
|
||||
// Compute the reference image
|
||||
for(i = 0; i < imageHeight; i++) {
|
||||
for(j = 0; j < imageWidth; j++) {
|
||||
refImage[i*imageWidth+j] = 0;
|
||||
}
|
||||
}
|
||||
|
||||
// Iterate over the rows of the source image
|
||||
int halfFilterWidth = filterWidth/2;
|
||||
float sum;
|
||||
for(i = 0; i < imageHeight; i++) {
|
||||
// Iterate over the columns of the source image
|
||||
for(j = 0; j < imageWidth; j++) {
|
||||
sum = 0; // Reset sum for new source pixel
|
||||
// Apply the filter to the neighborhood
|
||||
for(k = - halfFilterWidth; k <= halfFilterWidth; k++) {
|
||||
for(l = - halfFilterWidth; l <= halfFilterWidth; l++) {
|
||||
if(i+k >= 0 && i+k < imageHeight &&
|
||||
j+l >= 0 && j+l < imageWidth) {
|
||||
sum += inputImage[(i+k)*imageWidth + j+l] *
|
||||
filter[(k+halfFilterWidth)*filterWidth +
|
||||
l+halfFilterWidth];
|
||||
}
|
||||
}
|
||||
}
|
||||
refImage[i*imageWidth+j] = sum;
|
||||
}
|
||||
}
|
||||
|
||||
int failed = 0;
|
||||
for(i = 0; i < imageHeight; i++) {
|
||||
for(j = 0; j < imageWidth; j++) {
|
||||
if(abs(outputImage[i*imageWidth+j]-refImage[i*imageWidth+j]) > 0.01) {
|
||||
printf("Results are INCORRECT\n");
|
||||
printf("Pixel mismatch at <%d,%d> (%f vs. %f)\n", i, j,
|
||||
outputImage[i*imageWidth+j], refImage[i*imageWidth+j]);
|
||||
failed = 1;
|
||||
}
|
||||
if(failed) break;
|
||||
}
|
||||
if(failed) break;
|
||||
}
|
||||
if(!failed) {
|
||||
printf("Results are correct\n");
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
@@ -1,68 +0,0 @@
|
||||
# Locations of the RISC-V toolchain, POCL and the Vortex runtime/simulator.
# Each can be overridden from the environment or the command line.
RISCV_TOOL_PATH ?= $(wildcard ../../../../riscv-gnu-toolchain/drops)
POCL_CC_PATH    ?= $(wildcard ../../../../pocl/drops_riscv_cc)
POCL_INC_PATH   ?= $(wildcard ../include)
POCL_LIB_PATH   ?= $(wildcard ../lib)
VX_RT_PATH      ?= $(wildcard ../../../runtime)
VX_SIMX_PATH    ?= $(wildcard ../../../simX/obj_dir)

CC  = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-gcc
CXX = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-g++
DMP = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-objdump
HEX = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-objcopy
GDB = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-gdb

# Vortex runtime sources linked into the bare-metal image
VX_SRCS  = $(VX_RT_PATH)/newlib/newlib.c
VX_SRCS += $(VX_RT_PATH)/startup/vx_start.S
VX_SRCS += $(VX_RT_PATH)/intrinsics/vx_intrinsics.s
VX_SRCS += $(VX_RT_PATH)/io/vx_io.s $(VX_RT_PATH)/io/vx_io.c
VX_SRCS += $(VX_RT_PATH)/fileio/fileio.s
VX_SRCS += $(VX_RT_PATH)/tests/tests.c
VX_SRCS += $(VX_RT_PATH)/vx_api/vx_api.c
VX_SRCS += $(VX_STR) $(VX_FIO) $(VX_NEWLIB) $(VX_INT) $(VX_IO) $(VX_API) $(VX_TEST)

VX_CFLAGS = -nostartfiles -Wl,-Bstatic,-T,$(VX_RT_PATH)/mains/vortex_link.ld

CXXFLAGS  = -g -O0 -march=rv32im -mabi=ilp32
CXXFLAGS += -ffreestanding # program may not begin at main()
CXXFLAGS += -Wl,--gc-sections # enable garbage collection of unused input sections
CXXFLAGS += -fno-rtti -fno-non-call-exceptions # disable RTTI and exceptions
CXXFLAGS += -I$(POCL_INC_PATH) -I.

# $(PROJECT) is expanded lazily here, so it may be defined further down
VX_LIBS   = -Wl,--whole-archive lib$(PROJECT).a -Wl,--no-whole-archive $(POCL_LIB_PATH)/libOpenCL.a
QEMU_LIBS = $(VX_RT_PATH)/qemu/vx_api.c -Wl,--whole-archive lib$(PROJECT).a -Wl,--no-whole-archive $(POCL_LIB_PATH)/libOpenCL.a

PROJECT = cutcp

SRCS = main.cc args.c parboil_opencl.c ocl.c gpu_info.c cutoff.c cutcpu.c output.c readatom.c excl.c

# These targets do not produce files of the same name
.PHONY: all run qemu gdb-s gdb-c clean

all: $(PROJECT).dump $(PROJECT).hex

# Compile the OpenCL kernel into a static library with poclcc
lib$(PROJECT).a: kernel.cl
	POCL_DEBUG=all POCL_DEBUG_LLVM_PASSES=1 LD_LIBRARY_PATH=$(RISCV_TOOL_PATH)/lib:$(POCL_CC_PATH)/lib $(POCL_CC_PATH)/bin/poclcc -o lib$(PROJECT).a kernel.cl

# Bare-metal image for the Vortex simulator
$(PROJECT).elf: $(SRCS) lib$(PROJECT).a
	$(CXX) $(CXXFLAGS) $(VX_CFLAGS) $(VX_SRCS) $(SRCS) $(VX_LIBS) -o $(PROJECT).elf

# User-mode image for qemu-riscv32
$(PROJECT).qemu: $(SRCS) lib$(PROJECT).a
	$(CXX) $(CXXFLAGS) $(SRCS) $(QEMU_LIBS) -o $(PROJECT).qemu

$(PROJECT).hex: $(PROJECT).elf
	$(HEX) -O ihex $(PROJECT).elf $(PROJECT).hex

$(PROJECT).dump: $(PROJECT).elf
	$(DMP) -D $(PROJECT).elf > $(PROJECT).dump

run: $(PROJECT).hex
	POCL_DEBUG=all $(VX_SIMX_PATH)/Vcache_simX -E -a rv32i --core $(PROJECT).hex -s -b 1> emulator.debug

qemu: $(PROJECT).qemu
	POCL_DEBUG=all $(RISCV_TOOL_PATH)/bin/qemu-riscv32 -d in_asm -D debug.log $(PROJECT).qemu

# gdb-s starts qemu waiting for a debugger on port 1234; gdb-c starts gdb
gdb-s: $(PROJECT).qemu
	POCL_DEBUG=all $(RISCV_TOOL_PATH)/bin/qemu-riscv32 -g 1234 -d in_asm -D debug.log $(PROJECT).qemu

gdb-c: $(PROJECT).qemu
	$(GDB) $(PROJECT).qemu

clean:
	rm -rf *.o *.elf *.dump *.hex *.qemu *.log *.debug
|
||||
@@ -1,617 +0,0 @@
|
||||
|
||||
#include <parboil.h>
|
||||
#include <errno.h>
|
||||
#include <limits.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <stdio.h>
|
||||
|
||||
/*****************************************************************************/
|
||||
/* Memory management routines */
|
||||
|
||||
/* Release a NULL-terminated array of heap-allocated strings: every string
 * up to the terminating NULL entry, then the array itself.
 * A NULL array is accepted and ignored. */
void
pb_FreeStringArray(char **string_array)
{
  size_t i;

  if (string_array == NULL)
    return;

  for (i = 0; string_array[i] != NULL; i++)
    free(string_array[i]);

  free(string_array);
}
|
||||
|
||||
struct pb_PlatformParam *
|
||||
pb_PlatformParam(char *name, char *version)
|
||||
{
|
||||
if (name == NULL) {
|
||||
fprintf(stderr, "pb_PlatformParam: Invalid argument\n");
|
||||
exit(-1);
|
||||
}
|
||||
|
||||
struct pb_PlatformParam *ret =
|
||||
(struct pb_PlatformParam *)malloc(sizeof (struct pb_PlatformParam));
|
||||
|
||||
ret->name = name;
|
||||
ret->version = version;
|
||||
return ret;
|
||||
}
|
||||
|
||||
void
|
||||
pb_FreePlatformParam(struct pb_PlatformParam *p)
|
||||
{
|
||||
if (p == NULL) return;
|
||||
|
||||
free(p->name);
|
||||
free(p->version);
|
||||
free(p);
|
||||
}
|
||||
|
||||
struct pb_DeviceParam *
|
||||
pb_DeviceParam_index(int index)
|
||||
{
|
||||
struct pb_DeviceParam *ret =
|
||||
(struct pb_DeviceParam *)malloc(sizeof (struct pb_DeviceParam));
|
||||
ret->criterion = pb_Device_INDEX;
|
||||
ret->index = index;
|
||||
return ret;
|
||||
}
|
||||
|
||||
struct pb_DeviceParam *
|
||||
pb_DeviceParam_cpu(void)
|
||||
{
|
||||
struct pb_DeviceParam *ret =
|
||||
(struct pb_DeviceParam *)malloc(sizeof (struct pb_DeviceParam));
|
||||
ret->criterion = pb_Device_CPU;
|
||||
return ret;
|
||||
}
|
||||
|
||||
struct pb_DeviceParam *
|
||||
pb_DeviceParam_gpu(void)
|
||||
{
|
||||
struct pb_DeviceParam *ret =
|
||||
(struct pb_DeviceParam *)malloc(sizeof (struct pb_DeviceParam));
|
||||
ret->criterion = pb_Device_GPU;
|
||||
return ret;
|
||||
}
|
||||
|
||||
struct pb_DeviceParam *
|
||||
pb_DeviceParam_accelerator(void)
|
||||
{
|
||||
struct pb_DeviceParam *ret =
|
||||
(struct pb_DeviceParam *)malloc(sizeof (struct pb_DeviceParam));
|
||||
ret->criterion = pb_Device_ACCELERATOR;
|
||||
return ret;
|
||||
}
|
||||
|
||||
struct pb_DeviceParam *
|
||||
pb_DeviceParam_name(char *name)
|
||||
{
|
||||
struct pb_DeviceParam *ret =
|
||||
(struct pb_DeviceParam *)malloc(sizeof (struct pb_DeviceParam));
|
||||
ret->criterion = pb_Device_NAME;
|
||||
ret->name = name;
|
||||
return ret;
|
||||
}
|
||||
|
||||
void
|
||||
pb_FreeDeviceParam(struct pb_DeviceParam *p)
|
||||
{
|
||||
if (p == NULL) return;
|
||||
|
||||
switch(p->criterion) {
|
||||
case pb_Device_NAME:
|
||||
free(p->name);
|
||||
break;
|
||||
case pb_Device_INDEX:
|
||||
case pb_Device_CPU:
|
||||
case pb_Device_ACCELERATOR:
|
||||
break;
|
||||
default:
|
||||
fprintf(stderr, "pb_FreeDeviceParam: Invalid argument\n");
|
||||
exit(-1);
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
pb_FreeParameters(struct pb_Parameters *p)
|
||||
{
|
||||
free(p->outFile);
|
||||
pb_FreeStringArray(p->inpFiles);
|
||||
pb_FreePlatformParam(p->platform);
|
||||
pb_FreeDeviceParam(p->device);
|
||||
free(p);
|
||||
}
|
||||
|
||||
/*****************************************************************************/
|
||||
|
||||
/* Parse a comma-delimited list of strings into a NULL-terminated array of
 * newly allocated strings.
 *
 * An input with N commas yields N+1 entries; empty fields become empty
 * strings (so "" yields one empty string). The input is not modified.
 * The result is released with pb_FreeStringArray().
 * Returns NULL if memory cannot be allocated. */
static char **
read_string_array(char *in)
{
  char **ret;
  size_t i;
  size_t count;             /* Number of items in the input */
  const char *substring;    /* Current substring within 'in' */

  /* Count the number of items in the string */
  count = 1;
  for (substring = in; *substring; substring++)
    if (*substring == ',') count++;

  /* Allocate storage, including room for the sentinel */
  ret = (char **)malloc((count + 1) * sizeof(char *));
  if (ret == NULL) return NULL;

  /* Create copies of the strings from the list */
  substring = in;
  for (i = 0; i < count; i++) {
    /* Length up to the next ',' or the end of the input */
    size_t substring_length = strcspn(substring, ",");

    ret[i] = (char *)malloc(substring_length + 1);
    if (ret[i] == NULL) {
      /* Undo the partial result */
      while (i > 0) free(ret[--i]);
      free(ret);
      return NULL;
    }
    memcpy(ret[i], substring, substring_length);
    ret[i][substring_length] = 0;

    /* Skip the separator (or terminator) to reach the next substring */
    substring += substring_length + 1;
  }
  ret[count] = NULL;            /* Write the sentinel value */

  return ret;
}
|
||||
|
||||
/* Write a parse error message to stderr exactly as given; the caller
 * supplies any trailing newline. */
static void
report_parse_error(const char *str)
{
  fprintf(stderr, "%s", str);
}
|
||||
|
||||
/* Interpret a string as a 'pb_DeviceParam' value.
 * Return a pointer to a new value, or NULL on failure.
 *
 * Interpretations are tried in this order:
 *   1. a decimal integer      -> device index (negative values are rejected)
 *   2. "CPU", "GPU", "ACCELERATOR" -> device type
 *   3. anything else          -> device name (the string is copied)
 */
static struct pb_DeviceParam *
read_device_param(char *str)
{
  char *end;
  long device_int;
  char *name;
  struct pb_DeviceParam *ret;

  /* If the whole argument is an integer, interpret it as a device index.
   * strtol() does not report non-numeric input through errno, so the end
   * pointer must show that at least one digit was read and that nothing
   * follows the number. */
  errno = 0;
  device_int = strtol(str, &end, 10);
  if (end != str && *end == 0 && !errno) {
    /* Negative numbers are not valid */
    if (device_int < 0 || device_int > INT_MAX) return NULL;

    return pb_DeviceParam_index((int)device_int);
  }

  /* Match against predefined strings */
  if (strcmp(str, "CPU") == 0)
    return pb_DeviceParam_cpu();
  if (strcmp(str, "GPU") == 0)
    return pb_DeviceParam_gpu();
  if (strcmp(str, "ACCELERATOR") == 0)
    return pb_DeviceParam_accelerator();

  /* Assume any other string is a device name */
  name = strdup(str);
  if (name == NULL) return NULL;

  ret = pb_DeviceParam_name(name);
  if (ret == NULL) free(name);  /* the copy was not adopted */
  return ret;
}
|
||||
|
||||
/* Interpret a string as a 'pb_PlatformParam' value.
 * Return a pointer to a new value, or NULL on failure.
 *
 * The string has the form NAME or NAME-VERSION. It is split at the last
 * '-' character; without a '-' the whole string is the name and the
 * version is NULL. Both parts are copied.
 */
static struct pb_PlatformParam *
read_platform_param(char *str)
{
  /* Last occurrence of '-' in 'str', or NULL if there is none */
  char *separator = strrchr(str, '-');
  size_t name_length;
  char *name_str;
  char *version_str = NULL;
  struct pb_PlatformParam *ret;

  /* The platform name is either the entire string, or all characters before
   * the separator */
  name_length = separator ? (size_t)(separator - str) : strlen(str);
  name_str = (char *)malloc(name_length + 1);
  if (name_str == NULL) return NULL;
  memcpy(name_str, str, name_length);
  name_str[name_length] = 0;

  /* The version is either NULL, or all characters after the separator */
  if (separator != NULL) {
    const char *version_input_str = separator + 1;
    size_t version_length = strlen(version_input_str);

    version_str = (char *)malloc(version_length + 1);
    if (version_str == NULL) {
      free(name_str);
      return NULL;
    }
    memcpy(version_str, version_input_str, version_length);
    version_str[version_length] = 0;
  }

  /* Create output structure; it takes over both strings on success */
  ret = pb_PlatformParam(name_str, version_str);
  if (ret == NULL) {
    free(name_str);
    free(version_str);
  }
  return ret;
}
|
||||
|
||||
/****************************************************************************/
|
||||
/* Argument parsing state */
|
||||
|
||||
/* Cursor over an argv array that is compacted in place while it is parsed.
 *
 * Arguments recognized by the parser are deleted; every other argument is
 * copied down to the next free slot. 'argc' and 'argn' only count
 * arguments that have not been deleted.
 *
 * 'argv_get' is the slot the next argument is read from and 'argv_put' is
 * the slot the next surviving argument is written to.
 */
struct argparse {
  int argc;                     /* Surviving argument count; shrinks on
                                 * every deletion. */
  int argn;                     /* Index of the current argument among the
                                 * survivors. */
  char **argv_get;              /* Read position. */
  char **argv_put;              /* Write position; never ahead of
                                 * argv_get. */
};

/* Point a cursor at the first element of 'argv'. */
static void
initialize_argparse(struct argparse *ap, int argc, char **argv)
{
  ap->argv_put = argv;
  ap->argv_get = argv;
  ap->argn = 0;
  ap->argc = argc;
}

/* Stop parsing: copy the arguments that were not visited down to their
 * final slots, store the surviving count in *_argc, and terminate 'argv'
 * with NULL. */
static void
finalize_argparse(struct argparse *ap, int *_argc, char **argv)
{
  while (ap->argn < ap->argc) {
    *ap->argv_put = *ap->argv_get;
    ap->argv_put++;
    ap->argv_get++;
    ap->argn++;
  }

  *_argc = ap->argc;
  argv[ap->argc] = NULL;
}

/* Drop the current argument so that it is absent from the final argv.
 * Calling this with no current argument prints a diagnostic to stderr
 * and still proceeds. */
static void
delete_argument(struct argparse *ap)
{
  if (!(ap->argn < ap->argc)) {
    fprintf(stderr, "delete_argument\n");
  }
  ap->argv_get++;
  ap->argc--;
}

/* Keep the current argument (copying it to its final slot) and advance to
 * the next one. Calling this with no current argument prints a diagnostic
 * to stderr and still proceeds. */
static void
next_argument(struct argparse *ap)
{
  if (!(ap->argn < ap->argc)) {
    fprintf(stderr, "next_argument\n");
  }
  *ap->argv_put = *ap->argv_get;
  ap->argv_put++;
  ap->argv_get++;
  ap->argn++;
}

/* Nonzero once every surviving argument has been visited. */
static int
is_end_of_arguments(struct argparse *ap)
{
  return ap->argc == ap->argn;
}

/* Current argument, without consuming it. */
static char *
get_argument(struct argparse *ap)
{
  return ap->argv_get[0];
}

/* Return the current argument and delete it from the argument list. */
static char *
consume_argument(struct argparse *ap)
{
  char *current = get_argument(ap);

  delete_argument(ap);
  return current;
}
|
||||
|
||||
/****************************************************************************/
|
||||
|
||||
/* The result of parsing a command-line argument */
|
||||
/* Outcome reported by an option handler; pb_ParseParameters switches on it
 * after every argument. */
typedef enum {
|
||||
ARGPARSE_OK, /* Success: keep parsing the remaining arguments */
|
||||
ARGPARSE_ERROR, /* Error: pb_ParseParameters returns 0 */
|
||||
ARGPARSE_DONE /* Success, and do not continue parsing */
|
||||
} result;
|
||||
|
||||
/* Signature of an option handler: receives the parser cursor (already past
 * the option itself) and the parameter set to update. */
typedef result parse_action(struct argparse *ap, struct pb_Parameters *params);
|
||||
|
||||
|
||||
/* A command-line option */
|
||||
/* One row of the option table: an option is matched either by its
 * one-character name ("-x") or by its long name ("--name"). */
struct option {
|
||||
char short_name; /* If not 0, the one-character
|
||||
* name of this option */
|
||||
const char *long_name; /* If not NULL, the long name of this option */
|
||||
parse_action *action; /* What to do when this option occurs.
|
||||
* Sentinel value is NULL: an entry with a NULL
|
||||
* action marks the end of the table.
|
||||
*/
|
||||
};
|
||||
|
||||
/* Output file
|
||||
*
|
||||
* -o FILE
|
||||
*/
|
||||
static result
|
||||
parse_output_file(struct argparse *ap, struct pb_Parameters *params)
|
||||
{
|
||||
if (is_end_of_arguments(ap))
|
||||
{
|
||||
report_parse_error("Expecting file name after '-o'\n");
|
||||
return ARGPARSE_ERROR;
|
||||
}
|
||||
|
||||
/* Replace the output file name */
|
||||
free(params->outFile);
|
||||
params->outFile = strdup(consume_argument(ap));
|
||||
|
||||
return ARGPARSE_OK;
|
||||
}
|
||||
|
||||
/* Input files
|
||||
*
|
||||
* -i FILE,FILE,...
|
||||
*/
|
||||
static result
|
||||
parse_input_files(struct argparse *ap, struct pb_Parameters *params)
|
||||
{
|
||||
if (is_end_of_arguments(ap))
|
||||
{
|
||||
report_parse_error("Expecting file name after '-i'\n");
|
||||
return ARGPARSE_ERROR;
|
||||
}
|
||||
|
||||
/* Replace the input file list */
|
||||
pb_FreeStringArray(params->inpFiles);
|
||||
params->inpFiles = read_string_array(consume_argument(ap));
|
||||
return ARGPARSE_OK;
|
||||
}
|
||||
|
||||
/* End of options
|
||||
*
|
||||
* --
|
||||
*/
|
||||
|
||||
static result
|
||||
parse_end_options(struct argparse *ap, struct pb_Parameters *params)
|
||||
{
|
||||
return ARGPARSE_DONE;
|
||||
}
|
||||
|
||||
/* OpenCL device
|
||||
*
|
||||
* --device X
|
||||
*/
|
||||
|
||||
static result
|
||||
parse_device(struct argparse *ap, struct pb_Parameters *params)
|
||||
{
|
||||
/* Read the next argument, which specifies a device */
|
||||
|
||||
if (is_end_of_arguments(ap))
|
||||
{
|
||||
report_parse_error("Expecting device specification after '--device'\n");
|
||||
return ARGPARSE_ERROR;
|
||||
}
|
||||
|
||||
char *device_string = consume_argument(ap);
|
||||
struct pb_DeviceParam *device_param = read_device_param(device_string);
|
||||
|
||||
if (!device_param) {
|
||||
report_parse_error("Unrecognized device specification format on command line\n");
|
||||
return ARGPARSE_ERROR;
|
||||
}
|
||||
|
||||
/* Save the result */
|
||||
pb_FreeDeviceParam(params->device);
|
||||
params->device = device_param;
|
||||
|
||||
return ARGPARSE_OK;
|
||||
}
|
||||
|
||||
static result
|
||||
parse_platform(struct argparse *ap, struct pb_Parameters *params)
|
||||
{
|
||||
/* Read the next argument, which specifies a platform */
|
||||
|
||||
if (is_end_of_arguments(ap))
|
||||
{
|
||||
report_parse_error("Expecting device specification after '--platform'\n");
|
||||
return ARGPARSE_ERROR;
|
||||
}
|
||||
|
||||
char *platform_string = consume_argument(ap);
|
||||
struct pb_PlatformParam *platform_param = read_platform_param(platform_string);
|
||||
|
||||
if (!platform_param) {
|
||||
report_parse_error("Unrecognized platform specification format on command line\n");
|
||||
return ARGPARSE_ERROR;
|
||||
}
|
||||
|
||||
/* Save the result */
|
||||
pb_FreePlatformParam(params->platform);
|
||||
params->platform = platform_param;
|
||||
|
||||
return ARGPARSE_OK;
|
||||
}
|
||||
|
||||
|
||||
/* Table of recognized options, searched in order by pb_ParseParameters.
 * Each row is { short name, long name, handler }. */
static struct option options[] = {
|
||||
{ 'o', NULL, &parse_output_file }, /* -o FILE */
|
||||
{ 'i', NULL, &parse_input_files }, /* -i FILE,FILE,... */
|
||||
{ '-', NULL, &parse_end_options }, /* "--" is matched as the short option '-' */
|
||||
{ 0, "device", &parse_device }, /* --device X */
|
||||
{ 0, "platform", &parse_platform }, /* --platform X */
|
||||
{ 0, NULL, NULL } /* sentinel: NULL action ends the table */
|
||||
};
|
||||
|
||||
static int
|
||||
is_last_option(struct option *op)
|
||||
{
|
||||
return op->action == NULL;
|
||||
}
|
||||
|
||||
/****************************************************************************/
|
||||
|
||||
/* Parse command-line parameters.
|
||||
* Return zero on error, nonzero otherwise.
|
||||
* On error, the other outputs may be invalid.
|
||||
*
|
||||
* The information collected from parameters is used to update
|
||||
* 'ret'. 'ret' should be initialized.
|
||||
*
|
||||
* '_argc' and 'argv' are updated to contain only the unprocessed arguments.
|
||||
*/
|
||||
static int
|
||||
pb_ParseParameters (struct pb_Parameters *ret, int *_argc, char **argv)
|
||||
{
|
||||
char *err_message;
|
||||
struct argparse ap;
|
||||
|
||||
/* Each argument */
|
||||
initialize_argparse(&ap, *_argc, argv);
|
||||
while(!is_end_of_arguments(&ap)) {
|
||||
result arg_result; /* Result of parsing this option */
|
||||
char *arg = get_argument(&ap);
|
||||
|
||||
/* Process this argument */
|
||||
if (arg[0] == '-') {
|
||||
/* Single-character flag */
|
||||
if ((arg[1] != 0) && (arg[2] == 0)) {
|
||||
delete_argument(&ap); /* This argument is consumed here */
|
||||
|
||||
/* Find a matching short option */
|
||||
struct option *op;
|
||||
for (op = options; !is_last_option(op); op++) {
|
||||
if (op->short_name == arg[1]) {
|
||||
arg_result = (*op->action)(&ap, ret);
|
||||
goto option_was_processed;
|
||||
}
|
||||
}
|
||||
|
||||
/* No option matches */
|
||||
report_parse_error("Unexpected command-line parameter\n");
|
||||
arg_result = ARGPARSE_ERROR;
|
||||
goto option_was_processed;
|
||||
}
|
||||
|
||||
/* Long flag */
|
||||
if (arg[1] == '-') {
|
||||
delete_argument(&ap); /* This argument is consumed here */
|
||||
|
||||
/* Find a matching long option */
|
||||
struct option *op;
|
||||
for (op = options; !is_last_option(op); op++) {
|
||||
if (op->long_name && strcmp(&arg[2], op->long_name) == 0) {
|
||||
arg_result = (*op->action)(&ap, ret);
|
||||
goto option_was_processed;
|
||||
}
|
||||
}
|
||||
|
||||
/* No option matches */
|
||||
report_parse_error("Unexpected command-line parameter\n");
|
||||
arg_result = ARGPARSE_ERROR;
|
||||
goto option_was_processed;
|
||||
}
|
||||
}
|
||||
else {
|
||||
/* Other arguments are ignored */
|
||||
next_argument(&ap);
|
||||
arg_result = ARGPARSE_OK;
|
||||
goto option_was_processed;
|
||||
}
|
||||
|
||||
option_was_processed:
|
||||
/* Decide what to do next based on 'arg_result' */
|
||||
switch(arg_result) {
|
||||
case ARGPARSE_OK:
|
||||
/* Continue processing */
|
||||
break;
|
||||
|
||||
case ARGPARSE_ERROR:
|
||||
/* Error exit from the function */
|
||||
return 0;
|
||||
|
||||
case ARGPARSE_DONE:
|
||||
/* Normal exit from the argument parsing loop */
|
||||
goto end_of_options;
|
||||
}
|
||||
} /* end for each argument */
|
||||
|
||||
/* If all arguments were processed, then normal exit from the loop */
|
||||
|
||||
end_of_options:
|
||||
finalize_argparse(&ap, _argc, argv);
|
||||
return 1;
|
||||
}
|
||||
|
||||
/*****************************************************************************/
|
||||
/* Other exported functions */
|
||||
|
||||
struct pb_Parameters *
|
||||
pb_ReadParameters(int *_argc, char **argv)
|
||||
{
|
||||
struct pb_Parameters *ret =
|
||||
(struct pb_Parameters *)malloc(sizeof(struct pb_Parameters));
|
||||
|
||||
/* Initialize the parameters structure */
|
||||
ret->outFile = NULL;
|
||||
ret->inpFiles = (char **)malloc(sizeof(char *));
|
||||
ret->inpFiles[0] = NULL;
|
||||
ret->platform = NULL;
|
||||
ret->device = NULL;
|
||||
|
||||
/* Read parameters and update _argc, argv */
|
||||
if (!pb_ParseParameters(ret, _argc, argv)) {
|
||||
/* Parse error */
|
||||
pb_FreeParameters(ret);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
int
|
||||
pb_Parameters_CountInputs(struct pb_Parameters *p)
|
||||
{
|
||||
int n;
|
||||
|
||||
for (n = 0; p->inpFiles[n]; n++);
|
||||
return n;
|
||||
}
|
||||
|
||||
@@ -1,37 +0,0 @@
|
||||
/***************************************************************************
|
||||
*cr
|
||||
*cr (C) Copyright 2008-2010 The Board of Trustees of the
|
||||
*cr University of Illinois
|
||||
*cr All Rights Reserved
|
||||
*cr
|
||||
***************************************************************************/
|
||||
|
||||
#ifndef ATOM_H
#define ATOM_H

#ifdef __cplusplus
extern "C" {
#endif

  /* One atom: a position and the value 'q'.  The cutoff code skips
   * atoms whose 'q' is zero as non-contributing. */
  typedef struct Atom_t {
    float x;
    float y;
    float z;
    float q;
  } Atom;

  /* An array of 'size' atoms starting at 'atoms'. */
  typedef struct Atoms_t {
    Atom *atoms;
    int size;
  } Atoms;

  /* A triple of floats, used for positions and bounding-box corners. */
  typedef struct Vec3_t {
    float x;
    float y;
    float z;
  } Vec3;

  /* Defined elsewhere; presumably builds an Atoms set from the file
   * named 'fname' (implementation not visible from this header). */
  Atoms *read_atom_file(const char *fname);

  /* Releases an Atoms set. */
  void free_atom(Atoms *atom);

  /* Stores the minimum and maximum extent of 'atom' in '*lo' and '*hi'. */
  void get_atom_extent(Vec3 *lo, Vec3 *hi, Atoms *atom);

#ifdef __cplusplus
}
#endif

#endif /* ATOM_H */
|
||||
@@ -1,195 +0,0 @@
|
||||
/***************************************************************************
|
||||
*cr
|
||||
*cr (C) Copyright 2008-2010 The Board of Trustees of the
|
||||
*cr University of Illinois
|
||||
*cr All Rights Reserved
|
||||
*cr
|
||||
***************************************************************************/
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <math.h>
|
||||
#include <parboil.h>
|
||||
#include "atom.h"
|
||||
#include "cutoff.h"
|
||||
|
||||
#undef DEBUG_PASS_RATE
|
||||
#define CHECK_CYLINDER_CPU
|
||||
|
||||
#define CELLEN 4.f
|
||||
#define INV_CELLEN (1.f/CELLEN)
|
||||
|
||||
extern int cpu_compute_cutoff_potential_lattice(
|
||||
Lattice *lattice, /* the lattice */
|
||||
float cutoff, /* cutoff distance */
|
||||
Atoms *atoms /* array of atoms */
|
||||
)
|
||||
{
|
||||
int nx = lattice->dim.nx;
|
||||
int ny = lattice->dim.ny;
|
||||
int nz = lattice->dim.nz;
|
||||
float xlo = lattice->dim.lo.x;
|
||||
float ylo = lattice->dim.lo.y;
|
||||
float zlo = lattice->dim.lo.z;
|
||||
float gridspacing = lattice->dim.h;
|
||||
int natoms = atoms->size;
|
||||
Atom *atom = atoms->atoms;
|
||||
|
||||
const float a2 = cutoff * cutoff;
|
||||
const float inv_a2 = 1.f / a2;
|
||||
float s;
|
||||
const float inv_gridspacing = 1.f / gridspacing;
|
||||
const int radius = (int) ceilf(cutoff * inv_gridspacing) - 1;
|
||||
/* lattice point radius about each atom */
|
||||
|
||||
int n;
|
||||
int i, j, k;
|
||||
int ia, ib, ic;
|
||||
int ja, jb, jc;
|
||||
int ka, kb, kc;
|
||||
int index;
|
||||
int koff, jkoff;
|
||||
|
||||
float x, y, z, q;
|
||||
float dx, dy, dz;
|
||||
float dz2, dydz2, r2;
|
||||
float e;
|
||||
float xstart, ystart;
|
||||
|
||||
float *pg;
|
||||
|
||||
int gindex;
|
||||
int ncell, nxcell, nycell, nzcell;
|
||||
int *first, *next;
|
||||
float inv_cellen = INV_CELLEN;
|
||||
Vec3 minext, maxext; /* Extent of atom bounding box */
|
||||
float xmin, ymin, zmin;
|
||||
float xmax, ymax, zmax;
|
||||
|
||||
#if DEBUG_PASS_RATE
|
||||
unsigned long long pass_count = 0;
|
||||
unsigned long long fail_count = 0;
|
||||
#endif
|
||||
|
||||
/* find min and max extent */
|
||||
get_atom_extent(&minext, &maxext, atoms);
|
||||
|
||||
/* number of cells in each dimension */
|
||||
nxcell = (int) floorf((maxext.x-minext.x) * inv_cellen) + 1;
|
||||
nycell = (int) floorf((maxext.y-minext.y) * inv_cellen) + 1;
|
||||
nzcell = (int) floorf((maxext.z-minext.z) * inv_cellen) + 1;
|
||||
ncell = nxcell * nycell * nzcell;
|
||||
|
||||
/* allocate for cursor link list implementation */
|
||||
first = (int *) malloc(ncell * sizeof(int));
|
||||
for (gindex = 0; gindex < ncell; gindex++) {
|
||||
first[gindex] = -1;
|
||||
}
|
||||
next = (int *) malloc(natoms * sizeof(int));
|
||||
for (n = 0; n < natoms; n++) {
|
||||
next[n] = -1;
|
||||
}
|
||||
|
||||
/* geometric hashing */
|
||||
for (n = 0; n < natoms; n++) {
|
||||
if (0==atom[n].q) continue; /* skip any non-contributing atoms */
|
||||
i = (int) floorf((atom[n].x - minext.x) * inv_cellen);
|
||||
j = (int) floorf((atom[n].y - minext.y) * inv_cellen);
|
||||
k = (int) floorf((atom[n].z - minext.z) * inv_cellen);
|
||||
gindex = (k*nycell + j)*nxcell + i;
|
||||
next[n] = first[gindex];
|
||||
first[gindex] = n;
|
||||
}
|
||||
|
||||
/* traverse the grid cells */
|
||||
for (gindex = 0; gindex < ncell; gindex++) {
|
||||
for (n = first[gindex]; n != -1; n = next[n]) {
|
||||
x = atom[n].x - xlo;
|
||||
y = atom[n].y - ylo;
|
||||
z = atom[n].z - zlo;
|
||||
q = atom[n].q;
|
||||
|
||||
/* find closest grid point with position less than or equal to atom */
|
||||
ic = (int) (x * inv_gridspacing);
|
||||
jc = (int) (y * inv_gridspacing);
|
||||
kc = (int) (z * inv_gridspacing);
|
||||
|
||||
/* find extent of surrounding box of grid points */
|
||||
ia = ic - radius;
|
||||
ib = ic + radius + 1;
|
||||
ja = jc - radius;
|
||||
jb = jc + radius + 1;
|
||||
ka = kc - radius;
|
||||
kb = kc + radius + 1;
|
||||
|
||||
/* trim box edges so that they are within grid point lattice */
|
||||
if (ia < 0) ia = 0;
|
||||
if (ib >= nx) ib = nx-1;
|
||||
if (ja < 0) ja = 0;
|
||||
if (jb >= ny) jb = ny-1;
|
||||
if (ka < 0) ka = 0;
|
||||
if (kb >= nz) kb = nz-1;
|
||||
|
||||
/* loop over surrounding grid points */
|
||||
xstart = ia*gridspacing - x;
|
||||
ystart = ja*gridspacing - y;
|
||||
dz = ka*gridspacing - z;
|
||||
for (k = ka; k <= kb; k++, dz += gridspacing) {
|
||||
koff = k*ny;
|
||||
dz2 = dz*dz;
|
||||
dy = ystart;
|
||||
for (j = ja; j <= jb; j++, dy += gridspacing) {
|
||||
jkoff = (koff + j)*nx;
|
||||
dydz2 = dy*dy + dz2;
|
||||
#ifdef CHECK_CYLINDER_CPU
|
||||
if (dydz2 >= a2) continue;
|
||||
#endif
|
||||
|
||||
dx = xstart;
|
||||
index = jkoff + ia;
|
||||
pg = lattice->lattice + index;
|
||||
|
||||
#if defined(__INTEL_COMPILER)
|
||||
for (i = ia; i <= ib; i++, pg++, dx += gridspacing) {
|
||||
r2 = dx*dx + dydz2;
|
||||
s = (1.f - r2 * inv_a2) * (1.f - r2 * inv_a2);
|
||||
e = q * (1/sqrtf(r2)) * s;
|
||||
*pg += (r2 < a2 ? e : 0); /* LOOP VECTORIZED!! */
|
||||
}
|
||||
#else
|
||||
for (i = ia; i <= ib; i++, pg++, dx += gridspacing) {
|
||||
r2 = dx*dx + dydz2;
|
||||
if (r2 >= a2)
|
||||
{
|
||||
#ifdef DEBUG_PASS_RATE
|
||||
fail_count++;
|
||||
#endif
|
||||
continue;
|
||||
}
|
||||
#ifdef DEBUG_PASS_RATE
|
||||
pass_count++;
|
||||
#endif
|
||||
s = (1.f - r2 * inv_a2);
|
||||
e = q * (1/sqrtf(r2)) * s * s;
|
||||
*pg += e;
|
||||
}
|
||||
#endif
|
||||
}
|
||||
} /* end loop over surrounding grid points */
|
||||
|
||||
} /* end loop over atoms in a gridcell */
|
||||
} /* end loop over gridcells */
|
||||
|
||||
/* free memory */
|
||||
free(next);
|
||||
free(first);
|
||||
|
||||
/* For debugging: print the number of times that the test passed/failed */
|
||||
#ifdef DEBUG_PASS_RATE
|
||||
printf ("Pass :%lld\n", pass_count);
|
||||
printf ("Fail :%lld\n", fail_count);
|
||||
#endif
|
||||
|
||||
return 0;
|
||||
}
|
||||
@@ -1,499 +0,0 @@
|
||||
/***************************************************************************
|
||||
*cr
|
||||
*cr (C) Copyright 2008-2010 The Board of Trustees of the
|
||||
*cr University of Illinois
|
||||
*cr All Rights Reserved
|
||||
*cr
|
||||
***************************************************************************/
|
||||
|
||||
#include <CL/cl.h>
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <math.h>
|
||||
#include <parboil.h>
|
||||
|
||||
#include "atom.h"
|
||||
#include "cutoff.h"
|
||||
#include "macros.h"
|
||||
#include "ocl.h"
|
||||
|
||||
// OpenCL 1.1 support for int3 is not uniform on all implementations, so
|
||||
// we use int4 instead. Only the 'x', 'y', and 'z' fields of xyz are used.
|
||||
typedef cl_int4 xyz;
|
||||
|
||||
//extern "C" int gpu_compute_cutoff_potential_lattice(
|
||||
int gpu_compute_cutoff_potential_lattice(
|
||||
struct pb_TimerSet *timers,
|
||||
Lattice *lattice, /* the lattice */
|
||||
float cutoff, /* cutoff distance */
|
||||
Atoms *atoms, /* array of atoms */
|
||||
int verbose, /* print info/debug messages */
|
||||
struct pb_Parameters *parameters
|
||||
)
|
||||
{
|
||||
int nx = lattice->dim.nx;
|
||||
int ny = lattice->dim.ny;
|
||||
int nz = lattice->dim.nz;
|
||||
float xlo = lattice->dim.lo.x;
|
||||
float ylo = lattice->dim.lo.y;
|
||||
float zlo = lattice->dim.lo.z;
|
||||
float h = lattice->dim.h;
|
||||
int natoms = atoms->size;
|
||||
Atom *atom = atoms->atoms;
|
||||
|
||||
xyz nbrlist[NBRLIST_MAXLEN];
|
||||
int nbrlistlen = 0;
|
||||
|
||||
int binHistoFull[BIN_DEPTH+1] = { 0 }; /* clear every array element */
|
||||
int binHistoCover[BIN_DEPTH+1] = { 0 }; /* clear every array element */
|
||||
int num_excluded = 0;
|
||||
|
||||
int xRegionDim, yRegionDim, zRegionDim;
|
||||
int xRegionIndex, yRegionIndex, zRegionIndex;
|
||||
int xOffset, yOffset, zOffset;
|
||||
int lnx, lny, lnz, lnall;
|
||||
float *regionZeroAddr, *thisRegion;
|
||||
cl_mem regionZeroCl;
|
||||
int index, indexRegion;
|
||||
|
||||
int c;
|
||||
xyz binDim;
|
||||
int nbins;
|
||||
cl_float4 *binBaseAddr, *binZeroAddr;
|
||||
cl_mem binBaseCl, binZeroCl;
|
||||
int *bincntBaseAddr, *bincntZeroAddr;
|
||||
Atoms *extra = NULL;
|
||||
|
||||
cl_mem NbrListLen;
|
||||
cl_mem NbrList;
|
||||
|
||||
int i, j, k, n;
|
||||
int sum, total;
|
||||
|
||||
float avgFillFull, avgFillCover;
|
||||
const float cutoff2 = cutoff * cutoff;
|
||||
const float inv_cutoff2 = 1.f / cutoff2;
|
||||
|
||||
size_t gridDim[3], blockDim[3];
|
||||
|
||||
// The "compute" timer should be active upon entry to this function
|
||||
|
||||
/* pad lattice to be factor of 8 in each dimension */
|
||||
xRegionDim = (int) ceilf(nx/8.f);
|
||||
yRegionDim = (int) ceilf(ny/8.f);
|
||||
zRegionDim = (int) ceilf(nz/8.f);
|
||||
|
||||
lnx = 8 * xRegionDim;
|
||||
lny = 8 * yRegionDim;
|
||||
lnz = 8 * zRegionDim;
|
||||
lnall = lnx * lny * lnz;
|
||||
|
||||
/* will receive energies from OpenCL */
|
||||
regionZeroAddr = (float *) malloc(lnall * sizeof(float));
|
||||
|
||||
/* create bins */
|
||||
c = (int) ceil(cutoff * BIN_INVLEN); /* count extra bins around lattice */
|
||||
binDim.x = (int) ceil(lnx * h * BIN_INVLEN) + 2*c;
|
||||
binDim.y = (int) ceil(lny * h * BIN_INVLEN) + 2*c;
|
||||
binDim.z = (int) ceil(lnz * h * BIN_INVLEN) + 2*c;
|
||||
nbins = binDim.x * binDim.y * binDim.z;
|
||||
binBaseAddr = (cl_float4 *) calloc(nbins * BIN_DEPTH, sizeof(cl_float4));
|
||||
binZeroAddr = binBaseAddr + ((c * binDim.y + c) * binDim.x + c) * BIN_DEPTH;
|
||||
|
||||
bincntBaseAddr = (int *) calloc(nbins, sizeof(int));
|
||||
bincntZeroAddr = bincntBaseAddr + (c * binDim.y + c) * binDim.x + c;
|
||||
|
||||
/* create neighbor list */
|
||||
if (ceilf(BIN_LENGTH / (8*h)) == floorf(BIN_LENGTH / (8*h))) {
|
||||
float s = sqrtf(3);
|
||||
float r2 = (cutoff + s*BIN_LENGTH) * (cutoff + s*BIN_LENGTH);
|
||||
int cnt = 0;
|
||||
/* develop neighbor list around 1 cell */
|
||||
if (2*c + 1 > NBRLIST_DIM) {
|
||||
fprintf(stderr, "must have cutoff <= %f\n",
|
||||
(NBRLIST_DIM-1)/2 * BIN_LENGTH);
|
||||
return -1;
|
||||
}
|
||||
for (k = -c; k <= c; k++) {
|
||||
for (j = -c; j <= c; j++) {
|
||||
for (i = -c; i <= c; i++) {
|
||||
if ((i*i + j*j + k*k)*BIN_LENGTH*BIN_LENGTH >= r2) continue;
|
||||
nbrlist[cnt].x = i;
|
||||
nbrlist[cnt].y = j;
|
||||
nbrlist[cnt].z = k;
|
||||
cnt++;
|
||||
}
|
||||
}
|
||||
}
|
||||
nbrlistlen = cnt;
|
||||
}
|
||||
else if (8*h <= 2*BIN_LENGTH) {
|
||||
float s = 2.f*sqrtf(3);
|
||||
float r2 = (cutoff + s*BIN_LENGTH) * (cutoff + s*BIN_LENGTH);
|
||||
int cnt = 0;
|
||||
/* develop neighbor list around 3-cube of cells */
|
||||
if (2*c + 3 > NBRLIST_DIM) {
|
||||
fprintf(stderr, "must have cutoff <= %f\n",
|
||||
(NBRLIST_DIM-3)/2 * BIN_LENGTH);
|
||||
return -1;
|
||||
}
|
||||
for (k = -c; k <= c; k++) {
|
||||
for (j = -c; j <= c; j++) {
|
||||
for (i = -c; i <= c; i++) {
|
||||
if ((i*i + j*j + k*k)*BIN_LENGTH*BIN_LENGTH >= r2) continue;
|
||||
nbrlist[cnt].x = i;
|
||||
nbrlist[cnt].y = j;
|
||||
nbrlist[cnt].z = k;
|
||||
cnt++;
|
||||
}
|
||||
}
|
||||
}
|
||||
nbrlistlen = cnt;
|
||||
}
|
||||
else {
|
||||
fprintf(stderr, "must have h <= %f\n", 0.25 * BIN_LENGTH);
|
||||
return -1;
|
||||
}
|
||||
|
||||
/* perform geometric hashing of atoms into bins */
|
||||
{
|
||||
/* array of extra atoms, permit average of one extra per bin */
|
||||
Atom *extra_atoms = (Atom *) calloc(nbins, sizeof(Atom));
|
||||
int extra_len = 0;
|
||||
|
||||
for (n = 0; n < natoms; n++) {
|
||||
cl_float4 p;
|
||||
p.x = atom[n].x - xlo;
|
||||
p.y = atom[n].y - ylo;
|
||||
p.z = atom[n].z - zlo;
|
||||
p.w = atom[n].q;
|
||||
i = (int) floorf(p.x * BIN_INVLEN);
|
||||
j = (int) floorf(p.y * BIN_INVLEN);
|
||||
k = (int) floorf(p.z * BIN_INVLEN);
|
||||
if (i >= -c && i < binDim.x - c &&
|
||||
j >= -c && j < binDim.y - c &&
|
||||
k >= -c && k < binDim.z - c &&
|
||||
atom[n].q != 0) {
|
||||
int index = (k * binDim.y + j) * binDim.x + i;
|
||||
cl_float4 *bin = binZeroAddr + index * BIN_DEPTH;
|
||||
int bindex = bincntZeroAddr[index];
|
||||
if (bindex < BIN_DEPTH) {
|
||||
/* copy atom into bin and increase counter for this bin */
|
||||
bin[bindex] = p;
|
||||
bincntZeroAddr[index]++;
|
||||
}
|
||||
else {
|
||||
/* add index to array of extra atoms to be computed with CPU */
|
||||
if (extra_len >= nbins) {
|
||||
fprintf(stderr, "exceeded space for storing extra atoms\n");
|
||||
return -1;
|
||||
}
|
||||
extra_atoms[extra_len] = atom[n];
|
||||
extra_len++;
|
||||
}
|
||||
}
|
||||
else {
|
||||
/* excluded atoms are either outside bins or neutrally charged */
|
||||
num_excluded++;
|
||||
}
|
||||
}
|
||||
|
||||
/* Save result */
|
||||
extra = (Atoms *)malloc(sizeof(Atoms));
|
||||
extra->atoms = extra_atoms;
|
||||
extra->size = extra_len;
|
||||
}
|
||||
|
||||
/* bin stats */
|
||||
sum = total = 0;
|
||||
for (n = 0; n < nbins; n++) {
|
||||
binHistoFull[ bincntBaseAddr[n] ]++;
|
||||
sum += bincntBaseAddr[n];
|
||||
total += BIN_DEPTH;
|
||||
}
|
||||
avgFillFull = sum / (float) total;
|
||||
sum = total = 0;
|
||||
for (k = 0; k < binDim.z - 2*c; k++) {
|
||||
for (j = 0; j < binDim.y - 2*c; j++) {
|
||||
for (i = 0; i < binDim.x - 2*c; i++) {
|
||||
int index = (k * binDim.y + j) * binDim.x + i;
|
||||
binHistoCover[ bincntZeroAddr[index] ]++;
|
||||
sum += bincntZeroAddr[index];
|
||||
total += BIN_DEPTH;
|
||||
}
|
||||
}
|
||||
}
|
||||
avgFillCover = sum / (float) total;
|
||||
|
||||
if (verbose) {
|
||||
/* report */
|
||||
printf("number of atoms = %d\n", natoms);
|
||||
printf("lattice spacing = %g\n", h);
|
||||
printf("cutoff distance = %g\n", cutoff);
|
||||
printf("\n");
|
||||
printf("requested lattice dimensions = %d %d %d\n", nx, ny, nz);
|
||||
printf("requested space dimensions = %g %g %g\n", nx*h, ny*h, nz*h);
|
||||
printf("expanded lattice dimensions = %d %d %d\n", lnx, lny, lnz);
|
||||
printf("expanded space dimensions = %g %g %g\n", lnx*h, lny*h, lnz*h);
|
||||
printf("number of bytes for lattice data = %u\n", (unsigned int) (lnall*sizeof(float)));
|
||||
printf("\n");
|
||||
printf("bin padding thickness = %d\n", c);
|
||||
printf("bin cover dimensions = %d %d %d\n",
|
||||
binDim.x - 2*c, binDim.y - 2*c, binDim.z - 2*c);
|
||||
printf("bin full dimensions = %d %d %d\n", binDim.x, binDim.y, binDim.z);
|
||||
printf("number of bins = %d\n", nbins);
|
||||
printf("total number of atom slots = %d\n", nbins * BIN_DEPTH);
|
||||
printf("%% overhead space = %g\n",
|
||||
(natoms / (double) (nbins * BIN_DEPTH)) * 100);
|
||||
printf("number of bytes for bin data = %u\n",
|
||||
(unsigned int)(nbins * BIN_DEPTH * sizeof(cl_float4)));
|
||||
printf("\n");
|
||||
printf("bin histogram with padding:\n");
|
||||
sum = 0;
|
||||
for (n = 0; n <= BIN_DEPTH; n++) {
|
||||
printf(" number of bins with %d atoms: %d\n", n, binHistoFull[n]);
|
||||
sum += binHistoFull[n];
|
||||
}
|
||||
printf(" total number of bins: %d\n", sum);
|
||||
printf(" %% average fill: %g\n", avgFillFull * 100);
|
||||
printf("\n");
|
||||
printf("bin histogram excluding padding:\n");
|
||||
sum = 0;
|
||||
for (n = 0; n <= BIN_DEPTH; n++) {
|
||||
printf(" number of bins with %d atoms: %d\n", n, binHistoCover[n]);
|
||||
sum += binHistoCover[n];
|
||||
}
|
||||
printf(" total number of bins: %d\n", sum);
|
||||
printf(" %% average fill: %g\n", avgFillCover * 100);
|
||||
printf("\n");
|
||||
printf("number of extra atoms = %d\n", extra->size);
|
||||
printf("%% atoms that are extra = %g\n", (extra->size / (double) natoms) * 100);
|
||||
printf("\n");
|
||||
|
||||
/* sanity check on bins */
|
||||
sum = 0;
|
||||
for (n = 0; n <= BIN_DEPTH; n++) {
|
||||
sum += n * binHistoFull[n];
|
||||
}
|
||||
sum += extra->size + num_excluded;
|
||||
printf("sanity check on bin histogram with edges: "
|
||||
"sum + others = %d\n", sum);
|
||||
sum = 0;
|
||||
for (n = 0; n <= BIN_DEPTH; n++) {
|
||||
sum += n * binHistoCover[n];
|
||||
}
|
||||
sum += extra->size + num_excluded;
|
||||
printf("sanity check on bin histogram excluding edges: "
|
||||
"sum + others = %d\n", sum);
|
||||
printf("\n");
|
||||
|
||||
/* neighbor list */
|
||||
printf("neighbor list length = %d\n", nbrlistlen);
|
||||
printf("\n");
|
||||
}
|
||||
|
||||
printf("Ok!\n");
|
||||
|
||||
pb_Context* pb_context;
|
||||
pb_context = pb_InitOpenCLContext(parameters);
|
||||
if (pb_context == NULL) {
|
||||
fprintf (stderr, "Error: No OpenCL platform/device can be found.");
|
||||
return -1;
|
||||
}
|
||||
|
||||
printf("Ok!\n");
|
||||
|
||||
cl_int clStatus;
|
||||
cl_device_id clDevice = (cl_device_id) pb_context->clDeviceId;
|
||||
cl_platform_id clPlatform = (cl_platform_id) pb_context->clPlatformId;
|
||||
cl_context clContext = (cl_context) pb_context->clContext;
|
||||
|
||||
cl_command_queue clCommandQueue = clCreateCommandQueue(clContext,clDevice,CL_QUEUE_PROFILING_ENABLE,&clStatus);
|
||||
CHECK_ERROR("clCreateCommandQueue")
|
||||
|
||||
pb_SetOpenCL(&clContext, &clCommandQueue);
|
||||
|
||||
//const char* clSource[] = {readFile("src/opencl_base/kernel.cl")};
|
||||
//cl_program clProgram = clCreateProgramWithSource(clContext,1,clSource,NULL,&clStatus);
|
||||
cl_program clProgram = clCreateProgramWithBuiltInKernels(
|
||||
clContext, 1, &clDevice, "opencl_cutoff_potential_lattice", &clStatus);
|
||||
CHECK_ERROR("clCreateProgramWithSource")
|
||||
|
||||
char clOptions[50];
|
||||
sprintf(clOptions,"-I src/opencl_base"); //-cl-nv-verbose
|
||||
|
||||
clStatus = clBuildProgram(clProgram,1,&clDevice,clOptions,NULL,NULL);
|
||||
if (clStatus != CL_SUCCESS) {
|
||||
size_t string_size = 0;
|
||||
clGetProgramBuildInfo(clProgram, clDevice, CL_PROGRAM_BUILD_LOG,
|
||||
0, NULL, &string_size);
|
||||
char* string = (char*)malloc(string_size*sizeof(char));
|
||||
clGetProgramBuildInfo(clProgram, clDevice, CL_PROGRAM_BUILD_LOG,
|
||||
string_size, string, NULL);
|
||||
puts(string);
|
||||
}
|
||||
|
||||
CHECK_ERROR("clBuildProgram")
|
||||
|
||||
cl_kernel clKernel = clCreateKernel(clProgram,"opencl_cutoff_potential_lattice",&clStatus);
|
||||
CHECK_ERROR("clCreateKernel")
|
||||
|
||||
/* setup OpenCL kernel parameters */
|
||||
blockDim[0] = 8;
|
||||
blockDim[1] = 8;
|
||||
blockDim[2] = 2;
|
||||
gridDim[0] = 4 * xRegionDim * blockDim[0];
|
||||
gridDim[1] = yRegionDim * blockDim[1];
|
||||
gridDim[2] = 1 * blockDim[2];
|
||||
|
||||
/* allocate and initialize memory on OpenCL device */
|
||||
pb_SwitchToTimer(timers, pb_TimerID_COPY);
|
||||
if (verbose) {
|
||||
printf("Allocating %.2fMB on OpenCL device for potentials\n",
|
||||
lnall * sizeof(float) / (double) (1024*1024));
|
||||
}
|
||||
|
||||
regionZeroCl = clCreateBuffer(clContext,CL_MEM_WRITE_ONLY,lnall*sizeof(float),NULL,&clStatus);
|
||||
CHECK_ERROR("clCreateBuffer")
|
||||
|
||||
// clMemSet(clCommandQueue,regionZeroCl,0,lnall*sizeof(float));
|
||||
|
||||
if (verbose) {
|
||||
printf("Allocating %.2fMB on OpenCL device for atom bins\n",
|
||||
nbins * BIN_DEPTH * sizeof(cl_float4) / (double) (1024*1024));
|
||||
}
|
||||
|
||||
binBaseCl = clCreateBuffer(clContext,CL_MEM_READ_ONLY,nbins*BIN_DEPTH*sizeof(cl_float4),NULL,&clStatus);
|
||||
CHECK_ERROR("clCreateBuffer")
|
||||
|
||||
clStatus = clEnqueueWriteBuffer(clCommandQueue,binBaseCl,CL_TRUE,0,nbins*BIN_DEPTH*sizeof(cl_float4),binBaseAddr,0,NULL,NULL);
|
||||
CHECK_ERROR("clEnqueueWriteBuffer")
|
||||
|
||||
//Sub buffers are not supported in OpenCL v1.0
|
||||
int offset = ((c * binDim.y + c) * binDim.x + c) * BIN_DEPTH;
|
||||
|
||||
NbrListLen = clCreateBuffer(clContext,CL_MEM_READ_ONLY,sizeof(int),NULL,&clStatus);
|
||||
CHECK_ERROR("clCreateBuffer")
|
||||
clStatus = clEnqueueWriteBuffer(clCommandQueue,NbrListLen,CL_TRUE,0,sizeof(int),&nbrlistlen,0,NULL,NULL);
|
||||
CHECK_ERROR("clEnqueueWriteBuffer")
|
||||
|
||||
NbrList = clCreateBuffer(clContext,CL_MEM_READ_ONLY,NBRLIST_MAXLEN*sizeof(xyz),NULL,&clStatus);
|
||||
CHECK_ERROR("clCreateBuffer")
|
||||
clStatus = clEnqueueWriteBuffer(clCommandQueue,NbrList,CL_TRUE,0,nbrlistlen*sizeof(xyz),nbrlist,0,NULL,NULL);
|
||||
CHECK_ERROR("clEnqueueWriteBuffer")
|
||||
|
||||
if (verbose)
|
||||
printf("\n");
|
||||
|
||||
clStatus = clSetKernelArg(clKernel,0,sizeof(int),&(binDim.x));
|
||||
clStatus = clSetKernelArg(clKernel,1,sizeof(int),&(binDim.y));
|
||||
clStatus = clSetKernelArg(clKernel,2,sizeof(cl_mem),&binBaseCl);
|
||||
clStatus = clSetKernelArg(clKernel,3,sizeof(int),&offset);
|
||||
clStatus = clSetKernelArg(clKernel,4,sizeof(float),&h);
|
||||
clStatus = clSetKernelArg(clKernel,5,sizeof(float),&cutoff2);
|
||||
clStatus = clSetKernelArg(clKernel,6,sizeof(float),&inv_cutoff2);
|
||||
clStatus = clSetKernelArg(clKernel,7,sizeof(cl_mem),®ionZeroCl);
|
||||
clStatus = clSetKernelArg(clKernel,9,sizeof(cl_mem),&NbrListLen);
|
||||
clStatus = clSetKernelArg(clKernel,10,sizeof(cl_mem),&NbrList);
|
||||
CHECK_ERROR("clSetKernelArg")
|
||||
|
||||
printf("Ok!!\n");
|
||||
|
||||
|
||||
/* loop over z-dimension, invoke OpenCL kernel for each x-y plane */
|
||||
pb_SwitchToTimer(timers, pb_TimerID_KERNEL);
|
||||
printf("Invoking OpenCL kernel on %d region planes...\n", zRegionDim);
|
||||
for (zRegionIndex = 0; zRegionIndex < zRegionDim; zRegionIndex++) {
|
||||
printf(" computing plane %d\r", zRegionIndex);
|
||||
fflush(stdout);
|
||||
|
||||
clStatus = clSetKernelArg(clKernel,8,sizeof(int),&zRegionIndex);
|
||||
CHECK_ERROR("clSetKernelArg")
|
||||
|
||||
printf("Ok**!2\n");
|
||||
|
||||
clStatus = clEnqueueNDRangeKernel(clCommandQueue,clKernel,3,NULL,gridDim,blockDim,0,NULL,NULL);
|
||||
|
||||
printf("Ok**!2\n");
|
||||
|
||||
CHECK_ERROR("clEnqueueNDRangeKernel")
|
||||
|
||||
printf("Ok**!2\n");
|
||||
|
||||
clStatus = clFinish(clCommandQueue);
|
||||
|
||||
printf("Ok**!2\n");
|
||||
|
||||
CHECK_ERROR("clFinish")
|
||||
}
|
||||
|
||||
printf("Ok++!\n");
|
||||
|
||||
printf("Finished OpenCL kernel calls \n");
|
||||
|
||||
/* copy result regions from OpenCL device */
|
||||
pb_SwitchToTimer(timers, pb_TimerID_COPY);
|
||||
clStatus = clEnqueueReadBuffer(clCommandQueue,regionZeroCl,CL_TRUE,0,lnall*sizeof(float),regionZeroAddr,0,NULL,NULL);
|
||||
CHECK_ERROR("clEnqueueReadBuffer")
|
||||
|
||||
/* free OpenCL memory allocations */
|
||||
clStatus = clReleaseMemObject(regionZeroCl);
|
||||
clStatus = clReleaseMemObject(binBaseCl);
|
||||
clStatus = clReleaseMemObject(NbrListLen);
|
||||
clStatus = clReleaseMemObject(NbrList);
|
||||
CHECK_ERROR("clReleaseMemObject")
|
||||
|
||||
clStatus = clReleaseKernel(clKernel);
|
||||
clStatus = clReleaseProgram(clProgram);
|
||||
clStatus = clReleaseCommandQueue(clCommandQueue);
|
||||
clStatus = clReleaseContext(clContext);
|
||||
|
||||
//free((void*)clSource[0]);
|
||||
|
||||
/* transpose regions back into lattice */
|
||||
pb_SwitchToTimer(timers, pb_TimerID_COMPUTE);
|
||||
for (k = 0; k < nz; k++) {
|
||||
zRegionIndex = (k >> 3);
|
||||
zOffset = (k & 7);
|
||||
|
||||
for (j = 0; j < ny; j++) {
|
||||
yRegionIndex = (j >> 3);
|
||||
yOffset = (j & 7);
|
||||
|
||||
for (i = 0; i < nx; i++) {
|
||||
xRegionIndex = (i >> 3);
|
||||
xOffset = (i & 7);
|
||||
|
||||
thisRegion = regionZeroAddr
|
||||
+ ((zRegionIndex * yRegionDim + yRegionIndex) * xRegionDim
|
||||
+ xRegionIndex) * REGION_SIZE;
|
||||
|
||||
indexRegion = (zOffset * 8 + yOffset) * 8 + xOffset;
|
||||
index = (k * ny + j) * nx + i;
|
||||
|
||||
lattice->lattice[index] = thisRegion[indexRegion];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* handle extra atoms */
|
||||
if (extra->size > 0) {
|
||||
printf("computing extra atoms on CPU\n");
|
||||
if (cpu_compute_cutoff_potential_lattice(lattice, cutoff, extra)) {
|
||||
fprintf(stderr, "cpu_compute_cutoff_potential_lattice() failed "
|
||||
"for extra atoms\n");
|
||||
return -1;
|
||||
}
|
||||
printf("\n");
|
||||
}
|
||||
|
||||
/* cleanup memory allocations */
|
||||
free(regionZeroAddr);
|
||||
free(binBaseAddr);
|
||||
free(bincntBaseAddr);
|
||||
free_atom(extra);
|
||||
|
||||
return 0;
|
||||
}
|
||||
@@ -1,72 +0,0 @@
|
||||
/***************************************************************************
|
||||
*cr
|
||||
*cr (C) Copyright 2008-2010 The Board of Trustees of the
|
||||
*cr University of Illinois
|
||||
*cr All Rights Reserved
|
||||
*cr
|
||||
***************************************************************************/
|
||||
|
||||
#ifndef CUTOFF_H
|
||||
#define CUTOFF_H
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
#define SHIFTED
|
||||
|
||||
/* A structure to record how points in 3D space map to array
|
||||
elements. Array element (z, y, x)
|
||||
where 0 <= x < nx, 0 <= y < ny, 0 <= z < nz
|
||||
maps to coordinate (xlo, ylo, zlo) + h * (x, y, z).
|
||||
*/
|
||||
typedef struct LatticeDim_t {
|
||||
/* Number of lattice points in x, y, z dimensions */
|
||||
int nx, ny, nz;
|
||||
|
||||
/* Lowest corner of lattice */
|
||||
Vec3 lo;
|
||||
|
||||
/* Lattice spacing */
|
||||
float h;
|
||||
} LatticeDim;
|
||||
|
||||
/* An electric potential field sampled on a regular grid. The
|
||||
lattice size and grid point positions are specified by 'dim'.
|
||||
*/
|
||||
typedef struct Lattice_t {
|
||||
LatticeDim dim;
|
||||
float *lattice;
|
||||
} Lattice;
|
||||
|
||||
LatticeDim lattice_from_bounding_box(Vec3 lo, Vec3 hi, float h);
|
||||
|
||||
Lattice *create_lattice(LatticeDim dim);
|
||||
void destroy_lattice(Lattice *);
|
||||
|
||||
int gpu_compute_cutoff_potential_lattice(
|
||||
struct pb_TimerSet *timers,
|
||||
Lattice *lattice,
|
||||
float cutoff, /* cutoff distance */
|
||||
Atoms *atom, /* array of atoms */
|
||||
int verbose, /* print info/debug messages */
|
||||
struct pb_Parameters *parameters
|
||||
);
|
||||
|
||||
int cpu_compute_cutoff_potential_lattice(
|
||||
Lattice *lattice, /* the lattice */
|
||||
float cutoff, /* cutoff distance */
|
||||
Atoms *atoms /* array of atoms */
|
||||
);
|
||||
|
||||
int remove_exclusions(
|
||||
Lattice *lattice, /* the lattice */
|
||||
float exclcutoff, /* exclusion cutoff distance */
|
||||
Atoms *atom /* array of atoms */
|
||||
);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif /* CUTOFF_H */
|
||||
@@ -1,157 +0,0 @@
|
||||
/***************************************************************************
|
||||
*cr
|
||||
*cr (C) Copyright 2008-2010 The Board of Trustees of the
|
||||
*cr University of Illinois
|
||||
*cr All Rights Reserved
|
||||
*cr
|
||||
***************************************************************************/
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <math.h>
|
||||
#include <parboil.h>
|
||||
|
||||
#include "atom.h"
|
||||
#include "cutoff.h"
|
||||
|
||||
#define CELLEN 4.f
|
||||
#define INV_CELLEN (1.f/CELLEN)
|
||||
|
||||
extern int remove_exclusions(
|
||||
Lattice *lattice, /* the lattice */
|
||||
float cutoff, /* exclusion cutoff distance */
|
||||
Atoms *atoms /* array of atoms */
|
||||
)
|
||||
{
|
||||
int nx = lattice->dim.nx;
|
||||
int ny = lattice->dim.ny;
|
||||
int nz = lattice->dim.nz;
|
||||
float xlo = lattice->dim.lo.x;
|
||||
float ylo = lattice->dim.lo.y;
|
||||
float zlo = lattice->dim.lo.z;
|
||||
float gridspacing = lattice->dim.h;
|
||||
Atom *atom = atoms->atoms;
|
||||
|
||||
const float a2 = cutoff * cutoff;
|
||||
const float inv_gridspacing = 1.f / gridspacing;
|
||||
const int radius = (int) ceilf(cutoff * inv_gridspacing) - 1;
|
||||
/* lattice point radius about each atom */
|
||||
|
||||
int n;
|
||||
int i, j, k;
|
||||
int ia, ib, ic;
|
||||
int ja, jb, jc;
|
||||
int ka, kb, kc;
|
||||
int index;
|
||||
int koff, jkoff;
|
||||
|
||||
float x, y, z, q;
|
||||
float dx, dy, dz;
|
||||
float dz2, dydz2, r2;
|
||||
float e;
|
||||
float xstart, ystart;
|
||||
|
||||
float *pg;
|
||||
|
||||
int gindex;
|
||||
int ncell, nxcell, nycell, nzcell;
|
||||
int *first, *next;
|
||||
float inv_cellen = INV_CELLEN;
|
||||
Vec3 minext, maxext;
|
||||
|
||||
/* find min and max extent */
|
||||
get_atom_extent(&minext, &maxext, atoms);
|
||||
|
||||
/* number of cells in each dimension */
|
||||
nxcell = (int) floorf((maxext.x-minext.x) * inv_cellen) + 1;
|
||||
nycell = (int) floorf((maxext.y-minext.y) * inv_cellen) + 1;
|
||||
nzcell = (int) floorf((maxext.z-minext.z) * inv_cellen) + 1;
|
||||
ncell = nxcell * nycell * nzcell;
|
||||
|
||||
/* allocate for cursor link list implementation */
|
||||
first = (int *) malloc(ncell * sizeof(int));
|
||||
for (gindex = 0; gindex < ncell; gindex++) {
|
||||
first[gindex] = -1;
|
||||
}
|
||||
next = (int *) malloc(atoms->size * sizeof(int));
|
||||
for (n = 0; n < atoms->size; n++) {
|
||||
next[n] = -1;
|
||||
}
|
||||
|
||||
/* geometric hashing */
|
||||
for (n = 0; n < atoms->size; n++) {
|
||||
if (0==atom[n].q) continue; /* skip any non-contributing atoms */
|
||||
i = (int) floorf((atom[n].x - minext.x) * inv_cellen);
|
||||
j = (int) floorf((atom[n].y - minext.y) * inv_cellen);
|
||||
k = (int) floorf((atom[n].z - minext.z) * inv_cellen);
|
||||
gindex = (k*nycell + j)*nxcell + i;
|
||||
next[n] = first[gindex];
|
||||
first[gindex] = n;
|
||||
}
|
||||
|
||||
/* traverse the grid cells */
|
||||
for (gindex = 0; gindex < ncell; gindex++) {
|
||||
for (n = first[gindex]; n != -1; n = next[n]) {
|
||||
x = atom[n].x - xlo;
|
||||
y = atom[n].y - ylo;
|
||||
z = atom[n].z - zlo;
|
||||
q = atom[n].q;
|
||||
|
||||
/* find closest grid point with position less than or equal to atom */
|
||||
ic = (int) (x * inv_gridspacing);
|
||||
jc = (int) (y * inv_gridspacing);
|
||||
kc = (int) (z * inv_gridspacing);
|
||||
|
||||
/* find extent of surrounding box of grid points */
|
||||
ia = ic - radius;
|
||||
ib = ic + radius + 1;
|
||||
ja = jc - radius;
|
||||
jb = jc + radius + 1;
|
||||
ka = kc - radius;
|
||||
kb = kc + radius + 1;
|
||||
|
||||
/* trim box edges so that they are within grid point lattice */
|
||||
if (ia < 0) ia = 0;
|
||||
if (ib >= nx) ib = nx-1;
|
||||
if (ja < 0) ja = 0;
|
||||
if (jb >= ny) jb = ny-1;
|
||||
if (ka < 0) ka = 0;
|
||||
if (kb >= nz) kb = nz-1;
|
||||
|
||||
/* loop over surrounding grid points */
|
||||
xstart = ia*gridspacing - x;
|
||||
ystart = ja*gridspacing - y;
|
||||
dz = ka*gridspacing - z;
|
||||
for (k = ka; k <= kb; k++, dz += gridspacing) {
|
||||
koff = k*ny;
|
||||
dz2 = dz*dz;
|
||||
|
||||
dy = ystart;
|
||||
for (j = ja; j <= jb; j++, dy += gridspacing) {
|
||||
jkoff = (koff + j)*nx;
|
||||
dydz2 = dy*dy + dz2;
|
||||
|
||||
dx = xstart;
|
||||
index = jkoff + ia;
|
||||
pg = lattice->lattice + index;
|
||||
|
||||
for (i = ia; i <= ib; i++, pg++, dx += gridspacing) {
|
||||
r2 = dx*dx + dydz2;
|
||||
|
||||
/* If atom and lattice point are too close, set the lattice value
|
||||
* to zero */
|
||||
if (r2 < a2) *pg = 0;
|
||||
}
|
||||
}
|
||||
} /* end loop over surrounding grid points */
|
||||
|
||||
} /* end loop over atoms in a gridcell */
|
||||
} /* end loop over gridcells */
|
||||
|
||||
/* free memory */
|
||||
free(next);
|
||||
free(first);
|
||||
|
||||
return 0;
|
||||
}
|
||||
@@ -1,55 +0,0 @@
|
||||
/***************************************************************************
|
||||
*cr
|
||||
*cr (C) Copyright 2010 The Board of Trustees of the
|
||||
*cr University of Illinois
|
||||
*cr All Rights Reserved
|
||||
*cr
|
||||
***************************************************************************/
|
||||
//#include <endian.h>
|
||||
#include <stdlib.h>
|
||||
#include <malloc.h>
|
||||
#include <stdio.h>
|
||||
#include <inttypes.h>
|
||||
|
||||
#include "gpu_info.h"
|
||||
|
||||
/*
 * Choose a 1-D launch configuration for `task` work units of `pad` items each.
 *
 *   thread[0]  receives the work-group (local) size
 *   grid[0]    receives the total (global) size
 *
 * major/minor select a per-SM thread limit (presumably the device's compute
 * capability -- confirm with the caller): 768 for 1.0/1.1, 1024 for 1.2+,
 * and 1536 for 2.x and anything newer.  `sm` is multiplied by that limit to
 * obtain the number of items the device can hold at once.
 *
 * If the request exceeds that capacity, the local size becomes limit/8 and
 * the global size is task*pad rounded up to a multiple of it; otherwise one
 * work-group of `pad` items is used per task.
 */
void compute_active_thread(size_t *thread,
                           size_t *grid,
                           int task,
                           int pad,
                           int major,
                           int minor,
                           int sm)
{
  const int blocks_per_sm = 8;
  const int work_items = task * pad;
  int threads_per_sm;
  int local_size;
  int global_size;

  if (major == 1)
    threads_per_sm = (minor >= 2) ? 1024 : 768;
  else
    threads_per_sm = 1536;   /* 2.x, and newer GPUs keep using the 2.0 limit */

  if (work_items > sm * threads_per_sm) {
    /* oversubscribed: fixed-size groups, global size rounded up to fit */
    local_size = threads_per_sm / blocks_per_sm;
    global_size = ((work_items + local_size - 1) / local_size) * local_size;
  } else {
    local_size = pad;
    global_size = work_items;
  }

  thread[0] = local_size;
  grid[0] = global_size;
}
|
||||
@@ -1,20 +0,0 @@
|
||||
/***************************************************************************
|
||||
*cr
|
||||
*cr (C) Copyright 2010 The Board of Trustees of the
|
||||
*cr University of Illinois
|
||||
*cr All Rights Reserved
|
||||
*cr
|
||||
***************************************************************************/
|
||||
|
||||
#ifndef __GPUINFOH__
|
||||
#define __GPUINFOH__
|
||||
|
||||
/*
 * Compute a 1-D launch configuration.
 * Writes the local (work-group) size to thread[0] and the global size to
 * grid[0] for `task` work units of `pad` items each; major, minor and sm
 * describe the target device.
 */
void compute_active_thread(size_t *thread, size_t *grid,
                           int task, int pad,
                           int major, int minor, int sm);
|
||||
|
||||
#endif
|
||||
@@ -1,104 +0,0 @@
|
||||
/*
|
||||
* potential lattice is decomposed into size 8^3 lattice point "regions"
|
||||
*
|
||||
* THIS IMPLEMENTATION: one thread per lattice point
|
||||
* thread block size 128 gives 4 thread blocks per region
|
||||
* kernel is invoked for each x-y plane of regions,
|
||||
* where gridDim.x is 4*(x region dimension) so that blockIdx.x
|
||||
* can absorb the z sub-region index in its 2 lowest order bits
|
||||
*
|
||||
* Regions are stored contiguously in memory in row-major order
|
||||
*
|
||||
* The bins have to not only cover the region, but they need to surround
|
||||
* the outer edges so that region sides and corners can still use
|
||||
* neighbor list stencil. The binZeroAddr is actually a shifted pointer into
|
||||
* the bin array (binZeroAddr = binBaseAddr + (c*binDim_y + c)*binDim_x + c)
|
||||
* where c = ceil(cutoff / binsize). This allows for negative offsets to
|
||||
* be added to myBinIndex.
|
||||
*
|
||||
* The (0,0,0) spatial origin corresponds to lower left corner of both
|
||||
* regionZeroAddr and binZeroAddr. The atom coordinates are translated
|
||||
* during binning to enforce this assumption.
|
||||
*/
|
||||
|
||||
#include "macros.h"
|
||||
|
||||
// OpenCL 1.1 support for int3 is not uniform on all implementations, so
|
||||
// we use int4 instead. Only the 'x', 'y', and 'z' fields of xyz are used.
|
||||
typedef int4 xyz;
|
||||
|
||||
/*
 * Cutoff-potential kernel: one work-item per lattice point.
 *
 * Each work-item sums, over every atom found in the bins listed by NbrList
 * (offsets relative to the bin containing the centre of its region), the term
 *     q * rsqrt(r2) * (1 - r2 * inv_cutoff2)^2
 * for atoms with r2 < cutoff2, and stores the sum in its slot of the
 * region-ordered lattice.  Bin slots whose charge (.w) is exactly zero are
 * skipped.
 *
 * The two low bits of get_group_id(0) select the z sub-region inside a
 * region; the remaining bits are the region's x index.
 */
__kernel void opencl_cutoff_potential_lattice(
    int binDim_x,
    int binDim_y,
    __global float4 *binBaseAddr,
    int offset,
    float h,                          /* lattice spacing */
    float cutoff2,                    /* square of cutoff distance */
    float inv_cutoff2,
    __global float *regionZeroAddr,   /* address of lattice regions starting at origin */
    int zRegionIndex,
    __constant int *NbrListLen,
    __constant xyz *NbrList
    )
{
  /* bin array shifted so that negative neighbour offsets stay addressable */
  __global float4 *binOrigin = binBaseAddr + offset;

  const size_t groupX = get_group_id(0);
  const size_t groupY = get_group_id(1);
  const size_t regionX = groupX >> 2;     /* x index of the region */
  const size_t subRegion = groupX & 3;    /* z sub-region within the region */

  /* flattened work-item index inside the work-group */
  const int tid = (get_local_id(2)*get_local_size(1)
      + get_local_id(1))*get_local_size(0) + get_local_id(0);

  /* start of the sub-region this work-group writes */
  const size_t regionIndex = (zRegionIndex*get_num_groups(1) + groupY)
      * (get_num_groups(0)>>2) + regionX;
  __global float *subRegionAddr = regionZeroAddr
      + regionIndex*REGION_SIZE
      + subRegion*SUB_REGION_SIZE;

  /* spatial coordinate of this lattice point */
  const float x = (8 * regionX + get_local_id(0)) * h;
  const float y = (8 * groupY + get_local_id(1)) * h;
  const float z = (8 * zRegionIndex + 2*subRegion + get_local_id(2)) * h;

  /* bin number determined by center of region */
  const int Bx = (int) floor((8 * regionX + 4) * h * BIN_INVLEN);
  const int By = (int) floor((8 * groupY + 4) * h * BIN_INVLEN);
  const int Bz = (int) floor((8 * zRegionIndex + 4) * h * BIN_INVLEN);

  const int nbrCount = *NbrListLen;
  float energy = 0.f;

  for (int nbr = 0; nbr < nbrCount; nbr++) {
    const int i = Bx + NbrList[nbr].x;
    const int j = By + NbrList[nbr].y;
    const int k = Bz + NbrList[nbr].z;

    __global float4 *bin = binOrigin +
        (((k*binDim_y + j)*binDim_x + i) * BIN_DEPTH);

    for (int m = 0; m < BIN_DEPTH; m++) {
      const float aq = bin[m].w;
      if (0.f == aq) {
        continue;                     /* zero charge: nothing to add */
      }

      const float dx = bin[m].x - x;
      const float dy = bin[m].y - y;
      const float dz = bin[m].z - z;
      const float r2 = dx*dx + dy*dy + dz*dz;

      if (r2 < cutoff2) {
        const float s = (1.f - r2 * inv_cutoff2);
        energy += aq * rsqrt(r2) * s * s;
      }
    } /* end loop over atoms in bin */
  } /* end loop over neighbor list */

  /* store into global memory */
  subRegionAddr[tid] = energy;
}
|
||||
Binary file not shown.
@@ -1,69 +0,0 @@
|
||||
#ifndef __MACROSH__
|
||||
#define __MACROSH__
|
||||
|
||||
#ifdef __DEVICE_EMULATION__
|
||||
#define DEBUG
|
||||
/* define which grid block and which thread to examine */
|
||||
#define BX 0
|
||||
#define BY 0
|
||||
#define TX 0
|
||||
#define TY 0
|
||||
#define TZ 0
|
||||
#define EMU(code) do { \
|
||||
if (blockIdx.x==BX && blockIdx.y==BY && \
|
||||
threadIdx.x==TX && threadIdx.y==TY && threadIdx.z==TZ) { \
|
||||
code; \
|
||||
} \
|
||||
} while (0)
|
||||
#define INT(n) printf("%s = %d\n", #n, n)
|
||||
#define FLOAT(f) printf("%s = %g\n", #f, (double)(f))
|
||||
#define INT3(n) printf("%s = %d %d %d\n", #n, (n).x, (n).y, (n).z)
|
||||
#define FLOAT4(f) printf("%s = %g %g %g %g\n", #f, (double)(f).x, \
|
||||
(double)(f).y, (double)(f).z, (double)(f).w)
|
||||
#else
|
||||
#define EMU(code)
|
||||
#define INT(n)
|
||||
#define FLOAT(f)
|
||||
#define INT3(n)
|
||||
#define FLOAT4(f)
|
||||
#endif
|
||||
|
||||
/* report error from OpenCL */
|
||||
#define CHECK_ERROR(errorMessage) \
|
||||
if(clStatus != CL_SUCCESS) \
|
||||
{ \
|
||||
printf("Error: %s!\n",errorMessage); \
|
||||
printf("Line: %d\n",__LINE__); \
|
||||
exit(1); \
|
||||
}
|
||||
|
||||
/*
|
||||
* neighbor list:
|
||||
* stored in constant memory as table of offsets
|
||||
* flat index addressing is computed by kernel
|
||||
*
|
||||
* reserve enough memory for 11^3 stencil of grid cells
|
||||
* this fits within 16K of memory
|
||||
*/
|
||||
#define NBRLIST_DIM 11
|
||||
#define NBRLIST_MAXLEN (NBRLIST_DIM * NBRLIST_DIM * NBRLIST_DIM)
|
||||
|
||||
/*
|
||||
* atom bins cached into shared memory for processing
|
||||
*
|
||||
* this reserves 4K of shared memory for 32 atom bins each containing 8 atoms,
|
||||
* should permit scheduling of up to 3 thread blocks per SM
|
||||
*/
|
||||
#define BIN_DEPTH 8 /* max number of atoms per bin */
|
||||
#define BIN_SIZE 32 /* size of bin in floats */
|
||||
#define BIN_CACHE_MAXLEN 32 /* max number of atom bins to cache */
|
||||
|
||||
#define BIN_LENGTH 4.f /* spatial length in Angstroms */
|
||||
#define BIN_INVLEN (1.f / BIN_LENGTH)
|
||||
/* assuming density of 1 atom / 10 A^3, expectation is 6.4 atoms per bin
|
||||
* so that bin fill should be 80% (for non-empty regions of space) */
|
||||
|
||||
#define REGION_SIZE 512 /* number of floats in lattice region */
|
||||
#define SUB_REGION_SIZE 128 /* number of floats in lattice sub-region */
|
||||
|
||||
#endif
|
||||
@@ -1,194 +0,0 @@
|
||||
/***************************************************************************
|
||||
*cr
|
||||
*cr (C) Copyright 2008-2010 The Board of Trustees of the
|
||||
*cr University of Illinois
|
||||
*cr All Rights Reserved
|
||||
*cr
|
||||
***************************************************************************/
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <math.h>
|
||||
#include <parboil.h>
|
||||
|
||||
#include "atom.h"
|
||||
#include "cutoff.h"
|
||||
#include "output.h"
|
||||
|
||||
#define ERRTOL 1e-4f
|
||||
|
||||
#define NOKERNELS 0
|
||||
#define CUTOFF1 1
|
||||
#define CUTOFF6 32
|
||||
#define CUTOFF6OVERLAP 64
|
||||
#define CUTOFFCPU 16384
|
||||
|
||||
|
||||
/*
 * Append one "<size> <time>" record (time printed with three decimals) to
 * `filename`, creating the file if necessary.
 *
 * Returns 0 on success.  Returns -1, after printing a message, if the file
 * cannot be opened or if writing/flushing the record fails.
 */
int appenddata(const char *filename, int size, double time) {
  FILE *fp = fopen(filename, "a");
  int rc = 0;

  if (fp == NULL) {
    printf("error appending to file %s..\n", filename);
    return -1;
  }

  if (fprintf(fp, "%d %.3f\n", size, time) < 0) {
    rc = -1;
  }
  /* fclose flushes the buffered record, so its result matters too */
  if (fclose(fp) != 0) {
    rc = -1;
  }

  if (rc != 0) {
    printf("error appending to file %s..\n", filename);
  }
  return rc;
}
|
||||
|
||||
LatticeDim
|
||||
lattice_from_bounding_box(Vec3 lo, Vec3 hi, float h)
|
||||
{
|
||||
LatticeDim ret;
|
||||
|
||||
ret.nx = (int) floorf((hi.x-lo.x)/h) + 1;
|
||||
ret.ny = (int) floorf((hi.y-lo.y)/h) + 1;
|
||||
ret.nz = (int) floorf((hi.z-lo.z)/h) + 1;
|
||||
ret.lo = lo;
|
||||
ret.h = h;
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
Lattice *
|
||||
create_lattice(LatticeDim dim)
|
||||
{
|
||||
int size;
|
||||
Lattice *lat = (Lattice *)malloc(sizeof(Lattice));
|
||||
|
||||
if (lat == NULL) {
|
||||
fprintf(stderr, "Out of memory\n");
|
||||
exit(1);
|
||||
}
|
||||
|
||||
lat->dim = dim;
|
||||
|
||||
/* Round up the allocated size to a multiple of 8 */
|
||||
size = ((dim.nx * dim.ny * dim.nz) + 7) & ~7;
|
||||
lat->lattice = (float *)calloc(size, sizeof(float));
|
||||
|
||||
if (lat->lattice == NULL) {
|
||||
fprintf(stderr, "Out of memory\n");
|
||||
exit(1);
|
||||
}
|
||||
|
||||
return lat;
|
||||
}
|
||||
|
||||
|
||||
void
|
||||
destroy_lattice(Lattice *lat)
|
||||
{
|
||||
if (lat) {
|
||||
free(lat->lattice);
|
||||
free(lat);
|
||||
}
|
||||
}
|
||||
|
||||
int main(int argc, char *argv[]) {
|
||||
Atoms *atom;
|
||||
|
||||
LatticeDim lattice_dim;
|
||||
Lattice *gpu_lattice;
|
||||
Vec3 min_ext, max_ext; /* Bounding box of atoms */
|
||||
Vec3 lo, hi; /* Bounding box with padding */
|
||||
|
||||
float h = 0.5f; /* Lattice spacing */
|
||||
float cutoff = 12.f; /* Cutoff radius */
|
||||
float exclcutoff = 1.f; /* Radius for exclusion */
|
||||
float padding = 0.5f; /* Bounding box padding distance */
|
||||
|
||||
int n;
|
||||
|
||||
struct pb_Parameters *parameters;
|
||||
struct pb_TimerSet timers;
|
||||
|
||||
/* Read input parameters */
|
||||
parameters = pb_ReadParameters(&argc, argv);
|
||||
if (parameters == NULL) {
|
||||
exit(1);
|
||||
}
|
||||
|
||||
parameters->inpFiles = (char **)malloc(sizeof(char *) * 2);
|
||||
parameters->inpFiles[0] = (char *)malloc(100);
|
||||
parameters->inpFiles[1] = NULL;
|
||||
strncpy(parameters->inpFiles[0], "watbox.sl40.pqr", 100);
|
||||
|
||||
/* Expect one input file */
|
||||
if (pb_Parameters_CountInputs(parameters) != 1) {
|
||||
fprintf(stderr, "Expecting one input file\n");
|
||||
exit(1);
|
||||
}
|
||||
|
||||
pb_InitializeTimerSet(&timers);
|
||||
pb_SwitchToTimer(&timers, pb_TimerID_IO);
|
||||
|
||||
printf("OK\n");
|
||||
|
||||
{
|
||||
const char *pqrfilename = parameters->inpFiles[0];
|
||||
|
||||
if (!(atom = read_atom_file(pqrfilename))) {
|
||||
fprintf(stderr, "read_atom_file() failed\n");
|
||||
exit(1);
|
||||
}
|
||||
printf("read %d atoms from file '%s'\n", atom->size, pqrfilename);
|
||||
}
|
||||
|
||||
printf("OK\n");
|
||||
|
||||
/* find extent of domain */
|
||||
pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
|
||||
get_atom_extent(&min_ext, &max_ext, atom);
|
||||
printf("extent of domain is:\n");
|
||||
printf(" minimum %g %g %g\n", min_ext.x, min_ext.y, min_ext.z);
|
||||
printf(" maximum %g %g %g\n", max_ext.x, max_ext.y, max_ext.z);
|
||||
|
||||
printf("padding domain by %g Angstroms\n", padding);
|
||||
lo = (Vec3) {min_ext.x - padding, min_ext.y - padding, min_ext.z - padding};
|
||||
hi = (Vec3) {max_ext.x + padding, max_ext.y + padding, max_ext.z + padding};
|
||||
printf("domain lengths are %g by %g by %g\n", hi.x-lo.x, hi.y-lo.y, hi.z-lo.z);
|
||||
|
||||
lattice_dim = lattice_from_bounding_box(lo, hi, h);
|
||||
gpu_lattice = create_lattice(lattice_dim);
|
||||
printf("\n");
|
||||
|
||||
/*
|
||||
* Run OpenCL kernel
|
||||
* (Begin and end with COMPUTE timer active)
|
||||
*/
|
||||
if (gpu_compute_cutoff_potential_lattice(&timers, gpu_lattice, cutoff, atom, 0, parameters)) {
|
||||
fprintf(stderr, "Computation failed\n");
|
||||
exit(1);
|
||||
}
|
||||
|
||||
/*
|
||||
* Zero the lattice points that are too close to an atom. This is
|
||||
* necessary for numerical stability.
|
||||
*/
|
||||
if (remove_exclusions(gpu_lattice, exclcutoff, atom)) {
|
||||
fprintf(stderr, "remove_exclusions() failed for gpu lattice\n");
|
||||
exit(1);
|
||||
}
|
||||
|
||||
printf("\n");
|
||||
|
||||
pb_SwitchToTimer(&timers, pb_TimerID_IO);
|
||||
|
||||
/* Print output */
|
||||
if (parameters->outFile) {
|
||||
//write_lattice_summary(parameters->outFile, gpu_lattice);
|
||||
}
|
||||
pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
|
||||
|
||||
/* Cleanup */
|
||||
destroy_lattice(gpu_lattice);
|
||||
free_atom(atom);
|
||||
|
||||
pb_SwitchToTimer(&timers, pb_TimerID_NONE);
|
||||
pb_PrintTimerSet(&timers);
|
||||
pb_FreeParameters(parameters);
|
||||
|
||||
return 0;
|
||||
}
|
||||
@@ -1,49 +0,0 @@
|
||||
#include <CL/cl.h>
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
#include "ocl.h"
|
||||
|
||||
/*
 * Read an entire file into a freshly allocated, NUL-terminated buffer.
 *
 * The caller owns the returned buffer and must free() it.  Any failure
 * (open, size query, allocation, short read) prints a message and exits
 * with status 1, so the function never returns NULL.
 */
char* readFile(const char* fileName)
{
    FILE* fp;
    long size;
    char* buffer;
    size_t res;

    fp = fopen(fileName,"r");
    if(fp == NULL)
    {
        printf("Error 1!\n");
        exit(1);
    }

    /* Determine the file size.  ftell() returns -1 on failure, which must
     * not reach malloc()/fread() as a length. */
    if(fseek(fp,0,SEEK_END) != 0)
    {
        printf("Error: cannot determine file size!\n");
        fclose(fp);
        exit(1);
    }
    size = ftell(fp);
    if(size < 0)
    {
        printf("Error: cannot determine file size!\n");
        fclose(fp);
        exit(1);
    }
    rewind(fp);

    buffer = (char*)malloc(sizeof(char)*((size_t)size+1));
    if(buffer == NULL)
    {
        printf("Error 2!\n");
        fclose(fp);
        exit(1);
    }

    res = fread(buffer,1,(size_t)size,fp);
    if(res != (size_t)size)
    {
        printf("Error 3!\n");
        fclose(fp);
        exit(1);
    }

    buffer[size] = 0;
    fclose(fp);
    return buffer;
}
|
||||
|
||||
/*
 * Fill the first `size` bytes of device buffer `buf` with the byte `val`.
 *
 * A host staging buffer is filled with memset() and copied with a blocking
 * clEnqueueWriteBuffer().  On allocation failure or an OpenCL error a
 * message is printed and the process exits with status 1.
 */
void clMemSet(cl_command_queue clCommandQueue, cl_mem buf, int val, size_t size)
{
    cl_int clStatus;
    char* temp = (char*)malloc(size);

    /* malloc(0) may return NULL; only a non-empty request counts as failure */
    if(temp == NULL && size > 0)
    {
        printf("Error: %s!\n","malloc");
        printf("Line: %d\n",__LINE__);
        exit(1);
    }
    if(temp != NULL)
    {
        memset(temp,val,size);
    }

    clStatus = clEnqueueWriteBuffer(clCommandQueue,buf,CL_TRUE,0,size,temp,0,NULL,NULL);
    CHECK_ERROR("clEnqueueWriteBuffer")
    free(temp);
}
|
||||
@@ -1,17 +0,0 @@
|
||||
#ifndef __OCLH__
|
||||
#define __OCLH__
|
||||
|
||||
#include <stdlib.h>
|
||||
|
||||
void clMemSet(cl_command_queue, cl_mem, int, size_t);
|
||||
char* readFile(const char*);
|
||||
|
||||
#define CHECK_ERROR(errorMessage) \
|
||||
if(clStatus != CL_SUCCESS) \
|
||||
{ \
|
||||
printf("Error: %s!\n",errorMessage); \
|
||||
printf("Line: %d\n",__LINE__); \
|
||||
exit(1); \
|
||||
}
|
||||
|
||||
#endif
|
||||
@@ -1,67 +0,0 @@
|
||||
/***************************************************************************
|
||||
*cr
|
||||
*cr (C) Copyright 2008-2010 The Board of Trustees of the
|
||||
*cr University of Illinois
|
||||
*cr All Rights Reserved
|
||||
*cr
|
||||
***************************************************************************/
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <inttypes.h>
|
||||
#include <math.h>
|
||||
#include <parboil.h>
|
||||
|
||||
#include "atom.h"
|
||||
#include "cutoff.h"
|
||||
|
||||
void
|
||||
write_lattice_summary(const char *filename, Lattice *lattice)
|
||||
{
|
||||
float *lattice_data = lattice->lattice;
|
||||
int nx = lattice->dim.nx;
|
||||
int ny = lattice->dim.ny;
|
||||
int nz = lattice->dim.nz;
|
||||
|
||||
/* Open output file */
|
||||
FILE *outfile = fopen(filename, "w");
|
||||
|
||||
if (outfile == NULL) {
|
||||
fprintf(stderr, "Cannot open output file\n");
|
||||
exit(1);
|
||||
}
|
||||
|
||||
/* Write the sum of the the absolute values of all lattice potentials */
|
||||
{
|
||||
double abspotential = 0.0;
|
||||
float tmp;
|
||||
int i;
|
||||
|
||||
for (i = 0; i < nx * ny * nz; i++)
|
||||
abspotential += fabs((double) lattice_data[i]);
|
||||
|
||||
tmp = (float) abspotential;
|
||||
|
||||
fwrite(&tmp, 1, sizeof(float), outfile);
|
||||
}
|
||||
|
||||
/* Write the size of a lattice plane */
|
||||
{
|
||||
uint32_t tmp;
|
||||
|
||||
tmp = (uint32_t) (lattice->dim.nx * lattice->dim.ny);
|
||||
fwrite(&tmp, 1, sizeof(uint32_t), outfile);
|
||||
}
|
||||
|
||||
/* Write the plane of lattice data at z=0 and z = nz-1 */
|
||||
{
|
||||
int plane_size = nx * ny;
|
||||
|
||||
fwrite(lattice_data, plane_size, sizeof(float), outfile);
|
||||
fwrite(lattice_data + (nz-1) * plane_size, plane_size, sizeof(float),
|
||||
outfile);
|
||||
}
|
||||
|
||||
/* Cleanup */
|
||||
fclose(outfile);
|
||||
}
|
||||
@@ -1,25 +0,0 @@
|
||||
/***************************************************************************
|
||||
*cr
|
||||
*cr (C) Copyright 2008-2010 The Board of Trustees of the
|
||||
*cr University of Illinois
|
||||
*cr All Rights Reserved
|
||||
*cr
|
||||
***************************************************************************/
|
||||
|
||||
#ifndef OUTPUT_H
|
||||
#define OUTPUT_H
|
||||
|
||||
#include "cutoff.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
void
|
||||
write_lattice_summary(const char *filename, Lattice *lattice);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif
|
||||
@@ -1,348 +0,0 @@
|
||||
/*
|
||||
* (c) 2010 The Board of Trustees of the University of Illinois.
|
||||
*/
|
||||
#ifndef PARBOIL_HEADER
|
||||
#define PARBOIL_HEADER
|
||||
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
#include <unistd.h>
|
||||
|
||||
/* A platform as specified by the user on the command line */
|
||||
struct pb_PlatformParam {
|
||||
char *name; /* The platform name. This string is owned. */
|
||||
char *version; /* The platform version; may be NULL.
|
||||
* This string is owned. */
|
||||
};
|
||||
|
||||
/* Create a PlatformParam from the given strings.
|
||||
* 'name' must not be NULL. 'version' may be NULL.
|
||||
* If not NULL, the strings should have been allocated by malloc(),
|
||||
* and they will be owned by the returned object.
|
||||
*/
|
||||
struct pb_PlatformParam *
|
||||
pb_PlatformParam(char *name, char *version);
|
||||
|
||||
void
|
||||
pb_FreePlatformParam(struct pb_PlatformParam *);
|
||||
|
||||
/* A criterion for how to select a device */
|
||||
enum pb_DeviceSelectionCriterion {
|
||||
pb_Device_INDEX, /* Enumerate the devices and select one
|
||||
* by its number */
|
||||
pb_Device_CPU, /* Select a CPU device */
|
||||
pb_Device_GPU, /* Select a GPU device */
|
||||
pb_Device_ACCELERATOR, /* Select an accelerator device */
|
||||
pb_Device_NAME /* Select a device by name */
|
||||
};
|
||||
|
||||
/* A device as specified by the user on the command line */
|
||||
struct pb_DeviceParam {
|
||||
enum pb_DeviceSelectionCriterion criterion;
|
||||
union {
|
||||
int index; /* If criterion == pb_Device_INDEX,
|
||||
* the index of the device */
|
||||
char *name; /* If criterion == pb_Device_NAME,
|
||||
* the name of the device.
|
||||
* This string is owned. */
|
||||
};
|
||||
};
|
||||
|
||||
struct pb_DeviceParam *
|
||||
pb_DeviceParam_index(int index);
|
||||
|
||||
struct pb_DeviceParam *
|
||||
pb_DeviceParam_cpu(void);
|
||||
|
||||
struct pb_DeviceParam *
|
||||
pb_DeviceParam_gpu(void);
|
||||
|
||||
struct pb_DeviceParam *
|
||||
pb_DeviceParam_accelerator(void);
|
||||
|
||||
/* Create a by-name device selection criterion.
|
||||
* The string should have been allocated by malloc(), and it will be
|
||||
* owned by the returned object.
|
||||
*/
|
||||
struct pb_DeviceParam *
|
||||
pb_DeviceParam_name(char *name);
|
||||
|
||||
void
|
||||
pb_FreeDeviceParam(struct pb_DeviceParam *);
|
||||
|
||||
/* Command line parameters for benchmarks */
|
||||
struct pb_Parameters {
|
||||
char *outFile; /* If not NULL, the raw output of the
|
||||
* computation should be saved to this
|
||||
* file. The string is owned. */
|
||||
char **inpFiles; /* A NULL-terminated array of strings
|
||||
* holding the input file(s) for the
|
||||
* computation. The array and strings
|
||||
* are owned. */
|
||||
struct pb_PlatformParam *platform; /* If not NULL, the platform
|
||||
* specified on the command line. */
|
||||
struct pb_DeviceParam *device; /* If not NULL, the device
|
||||
* specified on the command line. */
|
||||
};
|
||||
|
||||
/* Read command-line parameters.
|
||||
*
|
||||
* The argc and argv parameters to main are read, and any parameters
|
||||
* interpreted by this function are removed from the argument list.
|
||||
*
|
||||
* A new instance of struct pb_Parameters is returned.
|
||||
* If there is an error, then an error message is printed on stderr
|
||||
* and NULL is returned.
|
||||
*/
|
||||
struct pb_Parameters *
|
||||
pb_ReadParameters(int *_argc, char **argv);
|
||||
|
||||
/* Free an instance of struct pb_Parameters.
|
||||
*/
|
||||
void
|
||||
pb_FreeParameters(struct pb_Parameters *p);
|
||||
|
||||
void
|
||||
pb_FreeStringArray(char **);
|
||||
|
||||
/* Count the number of input files in a pb_Parameters instance.
|
||||
*/
|
||||
int
|
||||
pb_Parameters_CountInputs(struct pb_Parameters *p);
|
||||
|
||||
/* A time or duration. */
|
||||
//#if _POSIX_VERSION >= 200112L
|
||||
typedef unsigned long long pb_Timestamp; /* time in microseconds */
|
||||
//#else
|
||||
//# error "Timestamps not implemented"
|
||||
//#endif
|
||||
|
||||
enum pb_TimerState {
|
||||
pb_Timer_STOPPED,
|
||||
pb_Timer_RUNNING,
|
||||
};
|
||||
|
||||
struct pb_Timer {
|
||||
enum pb_TimerState state;
|
||||
pb_Timestamp elapsed; /* Amount of time elapsed so far */
|
||||
pb_Timestamp init; /* Beginning of the current time interval,
|
||||
* if state is RUNNING. End of the last
|
||||
* recorded time interval otherwise. */
|
||||
};
|
||||
|
||||
/* Reset a timer.
|
||||
* Use this to initialize a timer or to clear
|
||||
* its elapsed time. The reset timer is stopped.
|
||||
*/
|
||||
void
|
||||
pb_ResetTimer(struct pb_Timer *timer);
|
||||
|
||||
/* Start a timer. The timer is set to RUNNING mode and
|
||||
* time elapsed while the timer is running is added to
|
||||
* the timer.
|
||||
* The timer should not already be running.
|
||||
*/
|
||||
void
|
||||
pb_StartTimer(struct pb_Timer *timer);
|
||||
|
||||
/* Stop a timer.
|
||||
* This stops adding elapsed time to the timer.
|
||||
* The timer should not already be stopped.
|
||||
*/
|
||||
void
|
||||
pb_StopTimer(struct pb_Timer *timer);
|
||||
|
||||
/* Get the elapsed time in seconds. */
|
||||
double
|
||||
pb_GetElapsedTime(struct pb_Timer *timer);
|
||||
|
||||
/* Execution time is assigned to one of these categories. */
|
||||
enum pb_TimerID {
|
||||
pb_TimerID_NONE = 0,
|
||||
pb_TimerID_IO, /* Time spent in input/output */
|
||||
pb_TimerID_KERNEL, /* Time spent computing on the device,
|
||||
* recorded asynchronously */
|
||||
pb_TimerID_COPY, /* Time spent synchronously moving data
|
||||
* to/from device and allocating/freeing
|
||||
* memory on the device */
|
||||
pb_TimerID_DRIVER, /* Time spent in the host interacting with the
|
||||
* driver, primarily for recording the time
|
||||
* spent queueing asynchronous operations */
|
||||
pb_TimerID_COPY_ASYNC, /* Time spent in asynchronous transfers */
|
||||
pb_TimerID_COMPUTE, /* Time for all program execution other
|
||||
* than parsing command line arguments,
|
||||
* I/O, kernel, and copy */
|
||||
pb_TimerID_OVERLAP, /* Time double-counted in asynchronous and
|
||||
* host activity: automatically filled in,
|
||||
* not intended for direct usage */
|
||||
pb_TimerID_LAST /* Number of timer IDs */
|
||||
};
|
||||
|
||||
/* Dynamic list of asynchronously tracked times between events */
|
||||
struct pb_async_time_marker_list {
|
||||
char *label; // actually just a pointer to a string
|
||||
enum pb_TimerID timerID; /* The ID to which the interval beginning
|
||||
* with this marker should be attributed */
|
||||
void * marker;
|
||||
//cudaEvent_t marker; /* The driver event for this marker */
|
||||
struct pb_async_time_marker_list *next;
|
||||
};
|
||||
|
||||
struct pb_SubTimer {
|
||||
char *label;
|
||||
struct pb_Timer timer;
|
||||
struct pb_SubTimer *next;
|
||||
};
|
||||
|
||||
struct pb_SubTimerList {
|
||||
struct pb_SubTimer *current;
|
||||
struct pb_SubTimer *subtimer_list;
|
||||
};
|
||||
|
||||
/* A set of timers for recording execution times. */
|
||||
struct pb_TimerSet {
|
||||
enum pb_TimerID current;
|
||||
struct pb_async_time_marker_list* async_markers;
|
||||
pb_Timestamp async_begin;
|
||||
pb_Timestamp wall_begin;
|
||||
struct pb_Timer timers[pb_TimerID_LAST];
|
||||
struct pb_SubTimerList *sub_timer_list[pb_TimerID_LAST];
|
||||
};
|
||||
|
||||
/* Reset all timers in the set. */
|
||||
void
|
||||
pb_InitializeTimerSet(struct pb_TimerSet *timers);
|
||||
|
||||
void
|
||||
pb_AddSubTimer(struct pb_TimerSet *timers, char *label, enum pb_TimerID pb_Category);
|
||||
|
||||
/* Select which timer the next interval of time should be accounted
|
||||
* to. The selected timer is started and other timers are stopped.
|
||||
* Using pb_TimerID_NONE stops all timers. */
|
||||
void
|
||||
pb_SwitchToTimer(struct pb_TimerSet *timers, enum pb_TimerID timer);
|
||||
|
||||
void
|
||||
pb_SwitchToSubTimer(struct pb_TimerSet *timers, char *label, enum pb_TimerID category);
|
||||
|
||||
/* Print timer values to standard output. */
|
||||
void
|
||||
pb_PrintTimerSet(struct pb_TimerSet *timers);
|
||||
|
||||
/* Release timer resources */
|
||||
void
|
||||
pb_DestroyTimerSet(struct pb_TimerSet * timers);
|
||||
|
||||
void
|
||||
pb_SetOpenCL(void *clContextPtr, void *clCommandQueuePtr);
|
||||
|
||||
|
||||
typedef struct pb_Device_tag {
|
||||
char* name;
|
||||
void* clDevice;
|
||||
int id;
|
||||
unsigned int in_use;
|
||||
unsigned int available;
|
||||
} pb_Device;
|
||||
|
||||
struct pb_Context_tag;
|
||||
typedef struct pb_Context_tag pb_Context;
|
||||
|
||||
typedef struct pb_Platform_tag {
|
||||
char* name;
|
||||
char* version;
|
||||
void* clPlatform;
|
||||
unsigned int in_use;
|
||||
pb_Context** contexts;
|
||||
pb_Device** devices;
|
||||
} pb_Platform;
|
||||
|
||||
struct pb_Context_tag {
|
||||
void* clPlatformId;
|
||||
void* clContext;
|
||||
void* clDeviceId;
|
||||
pb_Platform* pb_platform;
|
||||
pb_Device* pb_device;
|
||||
};
|
||||
|
||||
// verbosely print out list of platforms and their devices to the console.
|
||||
pb_Platform**
|
||||
pb_GetPlatforms();
|
||||
|
||||
// Choose a platform according to the given platform specification
|
||||
pb_Platform*
|
||||
pb_GetPlatform(struct pb_PlatformParam *platform);
|
||||
|
||||
// choose a platform: by name, name & version
|
||||
pb_Platform*
|
||||
pb_GetPlatformByName(const char* name);
|
||||
|
||||
pb_Platform*
|
||||
pb_GetPlatformByNameAndVersion(const char* name, const char* version);
|
||||
|
||||
// Choose a device according to the given device specification
|
||||
pb_Device*
|
||||
pb_GetDevice(pb_Platform* pb_platform, struct pb_DeviceParam *device);
|
||||
|
||||
pb_Device**
|
||||
pb_GetDevices(pb_Platform* pb_platform);
|
||||
|
||||
// choose a device by name.
|
||||
pb_Device*
|
||||
pb_GetDeviceByName(pb_Platform* pb_platform, const char* name);
|
||||
|
||||
pb_Platform*
|
||||
pb_GetPlatformByEnvVars();
|
||||
|
||||
pb_Context*
|
||||
pb_InitOpenCLContext(struct pb_Parameters* parameters);
|
||||
|
||||
void
|
||||
pb_ReleasePlatforms();
|
||||
|
||||
void
|
||||
pb_ReleaseContext(pb_Context* c);
|
||||
|
||||
void
|
||||
pb_PrintPlatformInfo(pb_Context* c);
|
||||
|
||||
void
|
||||
perf_init();
|
||||
|
||||
//#define MEASURE_KERNEL_TIME
|
||||
|
||||
#include <CL/cl.h>
|
||||
|
||||
#ifdef MEASURE_KERNEL_TIME
// When MEASURE_KERNEL_TIME is defined, every clEnqueueNDRangeKernel call that
// sees this header is redirected to the pb_ wrapper declared below.
#define clEnqueueNDRangeKernel(q,k,d,o,dg,db,a,b,c) pb_clEnqueueNDRangeKernel((q), (k), (d), (o), (dg), (db), (a), (b), (c))

// Same parameter list, in the same order, as the call it stands in for.
cl_int pb_clEnqueueNDRangeKernel(cl_command_queue command_queue,
                                 cl_kernel kernel,
                                 cl_uint work_dim,
                                 const size_t *global_work_offset,
                                 const size_t *global_work_size,
                                 const size_t *local_work_size,
                                 cl_uint num_events_in_wait_list,
                                 const cl_event *event_wait_list,
                                 cl_event *event);
#endif
|
||||
|
||||
// Element-type tags; the values are 0..4 in declaration order.
enum {
  T_FLOAT  = 0,
  T_DOUBLE = 1,
  T_SHORT  = 2,
  T_INT    = 3,
  T_UCHAR  = 4
};
|
||||
void pb_sig_float(char*, float*, int);
|
||||
void pb_sig_double(char*, double*, int);
|
||||
void pb_sig_short(char*, short*, int);
|
||||
void pb_sig_int(char*, int*, int);
|
||||
void pb_sig_uchar(char*, unsigned char*, unsigned int);
|
||||
void pb_sig_clmem(char*, cl_command_queue, cl_mem, int);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif //PARBOIL_HEADER
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -1,139 +0,0 @@
|
||||
/***************************************************************************
|
||||
*cr
|
||||
*cr (C) Copyright 2008-2010 The Board of Trustees of the
|
||||
*cr University of Illinois
|
||||
*cr All Rights Reserved
|
||||
*cr
|
||||
***************************************************************************/
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <math.h>
|
||||
#include "atom.h"
|
||||
|
||||
|
||||
#define LINELEN 96
|
||||
#define INITLEN 20
|
||||
|
||||
|
||||
Atoms *read_atom_file(const char *fname)
|
||||
{
|
||||
FILE *file;
|
||||
char line[LINELEN];
|
||||
|
||||
Atom *atom; /* Atom array */
|
||||
int len = INITLEN; /* Size of atom array */
|
||||
int cnt = 0; /* Number of atoms read */
|
||||
|
||||
/* allocate initial atom array */
|
||||
atom = (Atom *) malloc(len * sizeof(Atom));
|
||||
if (NULL==atom) {
|
||||
fprintf(stderr, "can't allocate memory\n");
|
||||
return NULL;
|
||||
}
|
||||
|
||||
int i;
|
||||
for (i = 0; i < len; ++i) {
|
||||
atom[i].x = i+0;
|
||||
atom[i].y = i+1;
|
||||
atom[i].z = i+2;
|
||||
atom[i].q = 1;
|
||||
}
|
||||
|
||||
#if 0
|
||||
/* open atom "pqr" file */
|
||||
file = fopen(fname, "r");
|
||||
if (NULL==file) {
|
||||
fprintf(stderr, "can't open file \"%s\" for reading\n", fname);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/* loop to read pqr file line by line */
|
||||
while (fgets(line, LINELEN, file) != NULL) {
|
||||
|
||||
if (strncmp(line, "ATOM ", 6) != 0 && strncmp(line, "HETATM", 6) != 0) {
|
||||
continue; /* skip anything that isn't an atom record */
|
||||
}
|
||||
|
||||
if (cnt==len) { /* extend atom array */
|
||||
void *tmp = realloc(atom, 2*len*sizeof(Atom));
|
||||
if (NULL==tmp) {
|
||||
fprintf(stderr, "can't allocate more memory\n");
|
||||
return NULL;
|
||||
}
|
||||
atom = (Atom *) tmp;
|
||||
len *= 2;
|
||||
}
|
||||
|
||||
/* read position coordinates and charge from atom record */
|
||||
if (sscanf(line, "%*s %*d %*s %*s %*d %f %f %f %f", &(atom[cnt].x),
|
||||
&(atom[cnt].y), &(atom[cnt].z), &(atom[cnt].q)) != 4) {
|
||||
fprintf(stderr, "atom record %d does not have expected format\n", cnt+1);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
cnt++; /* count atoms as we store them */
|
||||
}
|
||||
|
||||
/* verify EOF and close file */
|
||||
if ( !feof(file) ) {
|
||||
fprintf(stderr, "did not find EOF\n");
|
||||
return NULL;
|
||||
}
|
||||
if (fclose(file)) {
|
||||
fprintf(stderr, "can't close file\n");
|
||||
return NULL;
|
||||
}
|
||||
#endif
|
||||
|
||||
/* Build the output data structure */
|
||||
{
|
||||
Atoms *out = (Atoms *)malloc(sizeof(Atoms));
|
||||
|
||||
if (NULL == out) {
|
||||
fprintf(stderr, "can't allocate memory\n");
|
||||
return NULL;
|
||||
}
|
||||
|
||||
out->size = cnt;
|
||||
out->atoms = atom;
|
||||
|
||||
return out;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/*
 * Release an Atoms set: first the atom array it holds, then the container
 * itself.  A NULL argument is accepted and ignored.
 */
void free_atom(Atoms *atom)
{
  if (!atom) {
    return;
  }

  free(atom->atoms);
  free(atom);
}
|
||||
|
||||
void
|
||||
get_atom_extent(Vec3 *out_lo, Vec3 *out_hi, Atoms *atom)
|
||||
{
|
||||
Atom *atoms = atom->atoms;
|
||||
int natoms = atom->size;
|
||||
Vec3 lo;
|
||||
Vec3 hi;
|
||||
int n;
|
||||
|
||||
hi.x = lo.x = atoms[0].x;
|
||||
hi.y = lo.y = atoms[0].y;
|
||||
hi.z = lo.z = atoms[0].z;
|
||||
|
||||
for (n = 1; n < natoms; n++) {
|
||||
lo.x = fminf(lo.x, atoms[n].x);
|
||||
hi.x = fmaxf(hi.x, atoms[n].x);
|
||||
lo.y = fminf(lo.y, atoms[n].y);
|
||||
hi.y = fmaxf(hi.y, atoms[n].y);
|
||||
lo.z = fminf(lo.z, atoms[n].z);
|
||||
hi.z = fmaxf(hi.z, atoms[n].z);
|
||||
}
|
||||
|
||||
*out_lo = lo;
|
||||
*out_hi = hi;
|
||||
}
|
||||
File diff suppressed because it is too large
Load Diff
@@ -1,68 +1,47 @@
|
||||
RISCV_TOOL_PATH ?= $(wildcard ../../../../riscv-gnu-toolchain/drops)
|
||||
POCL_CC_PATH ?= $(wildcard ../../../../pocl/drops_riscv_cc)
|
||||
POCL_INC_PATH ?= $(wildcard ../include)
|
||||
POCL_LIB_PATH ?= $(wildcard ../lib)
|
||||
VX_RT_PATH ?= $(wildcard ../../../runtime)
|
||||
VX_SIMX_PATH ?= $(wildcard ../../../simX/obj_dir)
|
||||
LLVM_HOME ?= ~/dev/llvm-project/drops
|
||||
TOOLCHAIN_PATH ?= ~/dev/riscv-gnu-toolchain/drops
|
||||
SYSROOT ?= $(TOOLCHAIN_PATH)/riscv32-unknown-elf
|
||||
POCL_CC_PATH ?= $(realpath ../compiler)
|
||||
POCL_RT_PATH ?= $(realpath ../runtime)
|
||||
VORTEX_DRV_PATH ?= $(realpath ../../../driver/sw)
|
||||
VORTEX_RT_PATH ?= $(realpath ../../../runtime)
|
||||
|
||||
CC = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-gcc
|
||||
CXX = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-g++
|
||||
DMP = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-objdump
|
||||
HEX = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-objcopy
|
||||
GDB = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-gdb
|
||||
CXXFLAGS += -std=c++11 -O0 -g -fpermissive -Wall -Wextra -pedantic -Wfatal-errors
|
||||
|
||||
VX_SRCS = $(VX_RT_PATH)/newlib/newlib.c
|
||||
VX_SRCS += $(VX_RT_PATH)/startup/vx_start.S
|
||||
VX_SRCS += $(VX_RT_PATH)/intrinsics/vx_intrinsics.s
|
||||
VX_SRCS += $(VX_RT_PATH)/io/vx_io.s $(VX_RT_PATH)/io/vx_io.c
|
||||
VX_SRCS += $(VX_RT_PATH)/fileio/fileio.s
|
||||
VX_SRCS += $(VX_RT_PATH)/tests/tests.c
|
||||
VX_SRCS += $(VX_RT_PATH)/vx_api/vx_api.c
|
||||
VX_SRCS += $(VX_STR) $(VX_FIO) $(VX_NEWLIB) $(VX_INT) $(VX_IO) $(VX_API) $(VX_TEST)
|
||||
CXXFLAGS += -I$(POCL_RT_PATH)/include
|
||||
|
||||
VX_CFLAGS = -nostartfiles -Wl,-Bstatic,-T,$(VX_RT_PATH)/mains/vortex_link.ld
|
||||
LDFLAGS += -L$(POCL_RT_PATH)/lib -L$(VORTEX_DRV_PATH)/stub -lOpenCL -lvortex
|
||||
|
||||
CXXFLAGS = -g -O0 -march=rv32im -mabi=ilp32
|
||||
CXXFLAGS += -ffreestanding # program may not begin at main()
|
||||
CXXFLAGS += -Wl,--gc-sections # enable garbage collection of unused input sections
|
||||
CXXFLAGS += -fno-rtti -fno-non-call-exceptions # disable RTTI and exceptions
|
||||
CXXFLAGS += -I$(POCL_INC_PATH)
|
||||
|
||||
VX_LIBS = -Wl,--whole-archive lib$(PROJECT).a -Wl,--no-whole-archive $(POCL_LIB_PATH)/libOpenCL.a
|
||||
QEMU_LIBS = $(VX_RT_PATH)/qemu/vx_api.c -Wl,--whole-archive lib$(PROJECT).a -Wl,--no-whole-archive $(POCL_LIB_PATH)/libOpenCL.a
|
||||
|
||||
PROJECT = gaussian
|
||||
PROJECT = guassian
|
||||
|
||||
SRCS = main.cc clutils.cpp utils.cpp
|
||||
|
||||
all: $(PROJECT).dump $(PROJECT).hex
|
||||
all: $(PROJECT)
|
||||
|
||||
lib$(PROJECT).a: kernel.cl
|
||||
POCL_DEBUG=all POCL_DEBUG_LLVM_PASSES=1 LD_LIBRARY_PATH=$(RISCV_TOOL_PATH)/lib:$(POCL_CC_PATH)/lib $(POCL_CC_PATH)/bin/poclcc -o lib$(PROJECT).a kernel.cl
|
||||
kernel.pocl: kernel.cl
|
||||
TOOLCHAIN_PATH=$(TOOLCHAIN_PATH) SYSROOT=$(SYSROOT) LLVM_HOME=$(LLVM_HOME) VORTEX_RUNTIME_PATH=$(VORTEX_RT_PATH) POCL_DEBUG=all LD_LIBRARY_PATH=$(LLVM_HOME)/lib:$(POCL_CC_PATH)/lib $(POCL_CC_PATH)/bin/poclcc -o kernel.pocl kernel.cl
|
||||
|
||||
$(PROJECT): $(SRCS)
|
||||
$(CXX) $(CXXFLAGS) $^ $(LDFLAGS) -o $@
|
||||
|
||||
$(PROJECT).elf: $(SRCS) lib$(PROJECT).a
|
||||
$(CXX) $(CXXFLAGS) $(VX_CFLAGS) $(VX_SRCS) $(SRCS) $(VX_LIBS) -o $(PROJECT).elf
|
||||
run-fpga: $(PROJECT) kernel.pocl
|
||||
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/opae:$(LD_LIBRARY_PATH) ./$(PROJECT)
|
||||
|
||||
$(PROJECT).qemu: $(SRCS) lib$(PROJECT).a
|
||||
$(CXX) $(CXXFLAGS) $(SRCS) $(QEMU_LIBS) -o $(PROJECT).qemu
|
||||
run-ase: $(PROJECT) kernel.pocl
|
||||
ASE_LOG=0 LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/opae/ase:$(LD_LIBRARY_PATH) ./$(PROJECT)
|
||||
|
||||
$(PROJECT).hex: $(PROJECT).elf
|
||||
$(HEX) -O ihex $(PROJECT).elf $(PROJECT).hex
|
||||
run-simx: $(PROJECT) kernel.pocl
|
||||
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/simx:$(LD_LIBRARY_PATH) ./$(PROJECT)
|
||||
|
||||
$(PROJECT).dump: $(PROJECT).elf
|
||||
$(DMP) -D $(PROJECT).elf > $(PROJECT).dump
|
||||
run-rtlsim: $(PROJECT) kernel.pocl
|
||||
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/rtlsim:$(LD_LIBRARY_PATH) ./$(PROJECT)
|
||||
|
||||
run: $(PROJECT).hex
|
||||
POCL_DEBUG=all $(VX_SIMX_PATH)/Vcache_simX -E -a rv32i --core $(PROJECT).hex -s -b 1> emulator.debug
|
||||
|
||||
qemu: $(PROJECT).qemu
|
||||
POCL_DEBUG=all $(RISCV_TOOL_PATH)/bin/qemu-riscv32 -d in_asm -D debug.log $(PROJECT).qemu
|
||||
|
||||
gdb-s: $(PROJECT).qemu
|
||||
POCL_DEBUG=all $(RISCV_TOOL_PATH)/bin/qemu-riscv32 -g 1234 -d in_asm -D debug.log $(PROJECT).qemu
|
||||
|
||||
gdb-c: $(PROJECT).qemu
|
||||
$(GDB) $(PROJECT).qemu
|
||||
.depend: $(SRCS)
|
||||
$(CXX) $(CXXFLAGS) -MM $^ > .depend;
|
||||
|
||||
clean:
|
||||
rm -rf *.o *.elf *.dump *.hex *.qemu *.log *.debug
|
||||
rm -rf $(PROJECT) *.o *.dump .depend
|
||||
|
||||
ifneq ($(MAKECMDGOALS),clean)
|
||||
-include .depend
|
||||
endif
|
||||
@@ -782,6 +782,27 @@ void cl_writeToZCBuffer(cl_mem mem, void* data, size_t size)
|
||||
cl_unmapBuffer(mem, ptr);
|
||||
}
|
||||
|
||||
// Load a whole file into a freshly malloc'd buffer.
//
//   filename  path of the file to read
//   data      out: receives the buffer; the caller releases it with free()
//   size      out: receives the number of bytes stored in *data
//
// Returns 0 on success and -1 on any failure (NULL argument, file cannot be
// opened, size cannot be determined, out of memory, short read).  On failure
// *data is NULL and *size is 0, so an unconditional free(*data) by the caller
// is safe.
static int read_kernel_file(const char* filename, uint8_t** data, size_t* size) {
  if (NULL == filename || NULL == data || NULL == size)
    return -1;

  // Give the outputs defined values for every failure path.
  *data = NULL;
  *size = 0;

  // The kernel image is binary: "rb" prevents newline translation on
  // platforms that distinguish text and binary streams.
  FILE* fp = fopen(filename, "rb");
  if (NULL == fp) {
    fprintf(stderr, "Failed to load kernel.");
    return -1;
  }

  // Determine the file length; ftell reports failure as -1.
  long fsize = -1;
  if (0 == fseek(fp, 0, SEEK_END))
    fsize = ftell(fp);
  if (fsize < 0 || 0 != fseek(fp, 0, SEEK_SET)) {
    fprintf(stderr, "Failed to determine kernel file size.\n");
    fclose(fp);
    return -1;
  }

  // Request at least one byte so an empty file still yields a valid pointer.
  uint8_t* buffer = (uint8_t*)malloc(fsize > 0 ? (size_t)fsize : 1);
  if (NULL == buffer) {
    fprintf(stderr, "Failed to allocate memory for kernel.\n");
    fclose(fp);
    return -1;
  }

  size_t nread = fread(buffer, 1, (size_t)fsize, fp);
  fclose(fp);

  // In binary mode a short read means the file could not be read in full.
  if (nread != (size_t)fsize) {
    fprintf(stderr, "Failed to read kernel file.\n");
    free(buffer);
    return -1;
  }

  *data = buffer;
  *size = nread;
  return 0;
}
|
||||
|
||||
//-------------------------------------------------------
|
||||
// Program and kernels
|
||||
//-------------------------------------------------------
|
||||
@@ -839,11 +860,20 @@ cl_program cl_compileProgram(char* kernelPath, char* compileoptions, bool verbos
|
||||
|
||||
// Create the program object
|
||||
//cl_program clProgramReturn = clCreateProgramWithSource(context, 1, (const char **)&source, NULL, &status);
|
||||
cl_program clProgramReturn = clCreateProgramWithBuiltInKernels(context, 1, &device, "Fan1;Fan2", &status);
|
||||
//cl_program clProgramReturn = clCreateProgramWithBuiltInKernels(context, 1, &device, "Fan1;Fan2", &status);
|
||||
// read kernel binary from file
|
||||
uint8_t *kernel_bin = NULL;
|
||||
size_t kernel_size;
|
||||
cl_int binary_status = 0;
|
||||
status = read_kernel_file("kernel.pocl", &kernel_bin, &kernel_size);
|
||||
cl_errChk(status, "read_kernel_file", true);
|
||||
cl_program clProgramReturn = clCreateProgramWithBinary(
|
||||
context, 1, &device, &kernel_size, &kernel_bin, &binary_status, &status);
|
||||
free(kernel_bin);
|
||||
cl_errChk(status, "Creating program", true);
|
||||
|
||||
free(source);
|
||||
fclose(fp);
|
||||
//free(source);
|
||||
//fclose(fp);
|
||||
|
||||
// Try to compile the program
|
||||
status = clBuildProgram(clProgramReturn, 0, NULL, compileoptions, NULL, NULL);
|
||||
|
||||
BIN
benchmarks/opencl/guassian/kernel.pocl
Normal file
BIN
benchmarks/opencl/guassian/kernel.pocl
Normal file
Binary file not shown.
Binary file not shown.
@@ -94,10 +94,9 @@ void ForwardSub(cl_context context, float *a, float *b, float *m, int size,
|
||||
cl_event writeEvent, kernelEvent, readEvent;
|
||||
float writeTime = 0, readTime = 0, kernelTime = 0;
|
||||
float writeMB = 0, readMB = 0;
|
||||
|
||||
gaussianElim_program =
|
||||
cl_compileProgram((char *)"gaussianElim_kernels.cl", NULL);
|
||||
|
||||
|
||||
gaussianElim_program = cl_compileProgram((char *)"gaussianElim_kernels.cl", NULL);
|
||||
|
||||
fan1_kernel = clCreateKernel(gaussianElim_program, "Fan1", &status);
|
||||
status = cl_errChk(status, (char *)"Error Creating Fan1 kernel", true);
|
||||
if (status)
|
||||
|
||||
2
benchmarks/opencl/kmeans/.gitignore
vendored
Normal file
2
benchmarks/opencl/kmeans/.gitignore
vendored
Normal file
@@ -0,0 +1,2 @@
|
||||
kmeans
|
||||
|
||||
@@ -1,79 +1,47 @@
|
||||
RISCV_TOOL_PATH ?= $(wildcard ../../../../riscv-gnu-toolchain/drops)
|
||||
POCL_CC_PATH ?= $(wildcard ../../../../pocl/drops_riscv_cc)
|
||||
POCL_INC_PATH ?= $(wildcard ../include)
|
||||
POCL_LIB_PATH ?= $(wildcard ../lib)
|
||||
VX_RT_PATH ?= $(wildcard ../../../runtime)
|
||||
VX_SIMX_PATH ?= $(wildcard ../../../simX/obj_dir)
|
||||
LLVM_HOME ?= ~/dev/llvm-project/drops
|
||||
TOOLCHAIN_PATH ?= ~/dev/riscv-gnu-toolchain/drops
|
||||
SYSROOT ?= $(TOOLCHAIN_PATH)/riscv32-unknown-elf
|
||||
POCL_CC_PATH ?= $(realpath ../compiler)
|
||||
POCL_RT_PATH ?= $(realpath ../runtime)
|
||||
VORTEX_DRV_PATH ?= $(realpath ../../../driver/sw)
|
||||
VORTEX_RT_PATH ?= $(realpath ../../../runtime)
|
||||
|
||||
CC = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-gcc
|
||||
CXX = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-g++
|
||||
DMP = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-objdump
|
||||
HEX = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-objcopy
|
||||
GDB = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-gdb
|
||||
CXXFLAGS += -std=c++11 -O0 -g -fpermissive -Wall -Wextra -pedantic -Wfatal-errors
|
||||
|
||||
VX_SRCS = $(VX_RT_PATH)/newlib/newlib.c
|
||||
VX_SRCS += $(VX_RT_PATH)/startup/vx_start.S
|
||||
VX_SRCS += $(VX_RT_PATH)/intrinsics/vx_intrinsics.s
|
||||
VX_SRCS += $(VX_RT_PATH)/io/vx_io.s $(VX_RT_PATH)/io/vx_io.c
|
||||
VX_SRCS += $(VX_RT_PATH)/fileio/fileio.s
|
||||
VX_SRCS += $(VX_RT_PATH)/tests/tests.c
|
||||
VX_SRCS += $(VX_RT_PATH)/vx_api/vx_api.c
|
||||
VX_SRCS += $(VX_STR) $(VX_FIO) $(VX_NEWLIB) $(VX_INT) $(VX_IO) $(VX_API) $(VX_TEST)
|
||||
CXXFLAGS += -I$(POCL_RT_PATH)/include
|
||||
|
||||
VX_CFLAGS = -nostartfiles -Wl,-Bstatic,-T,$(VX_RT_PATH)/mains/vortex_link.ld
|
||||
|
||||
CXXFLAGS = -g -O0 -march=rv32im -mabi=ilp32
|
||||
CXXFLAGS += -ffreestanding # program may not begin at main()
|
||||
CXXFLAGS += -Wl,--gc-sections # enable garbage collection of unused input sections
|
||||
CXXFLAGS += -fno-rtti -fno-non-call-exceptions # disable RTTI and exceptions
|
||||
CXXFLAGS += -I$(POCL_INC_PATH)
|
||||
|
||||
VX_LIBS = -Wl,--whole-archive lib$(PROJECT).a -Wl,--no-whole-archive $(POCL_LIB_PATH)/libOpenCL.a
|
||||
QEMU_LIBS = $(VX_RT_PATH)/qemu/vx_api.c -Wl,--whole-archive lib$(PROJECT).a -Wl,--no-whole-archive $(POCL_LIB_PATH)/libOpenCL.a
|
||||
LDFLAGS += -L$(POCL_RT_PATH)/lib -L$(VORTEX_DRV_PATH)/stub -lOpenCL -lvortex
|
||||
|
||||
PROJECT = kmeans
|
||||
SRCS = main.cc read_input.c rmse.c cluster.c kmeans_clustering.c
|
||||
|
||||
all: $(PROJECT).dump $(PROJECT).hex
|
||||
SRCS = main.cc read_input.c rmse.c kmeans_clustering.c cluster.c getopt.c
|
||||
|
||||
lib$(PROJECT).a: kernel.cl
|
||||
POCL_DEBUG=all POCL_DEBUG_LLVM_PASSES=1 LD_LIBRARY_PATH=$(RISCV_TOOL_PATH)/lib:$(POCL_CC_PATH)/lib $(POCL_CC_PATH)/bin/poclcc -o lib$(PROJECT).a kernel.cl
|
||||
all: $(PROJECT)
|
||||
|
||||
kmeans_clustering.o: kmeans_clustering.c
|
||||
$(CC) $(CXXFLAGS) -c kmeans_clustering.c
|
||||
kernel.pocl: kernel.cl
|
||||
TOOLCHAIN_PATH=$(TOOLCHAIN_PATH) SYSROOT=$(SYSROOT) LLVM_HOME=$(LLVM_HOME) VORTEX_RUNTIME_PATH=$(VORTEX_RT_PATH) POCL_DEBUG=all LD_LIBRARY_PATH=$(LLVM_HOME)/lib:$(POCL_CC_PATH)/lib $(POCL_CC_PATH)/bin/poclcc -o kernel.pocl kernel.cl
|
||||
|
||||
$(PROJECT): $(SRCS)
|
||||
$(CXX) $(CXXFLAGS) $^ $(LDFLAGS) -o $@
|
||||
|
||||
cluster.o: cluster.c
|
||||
$(CC) $(CXXFLAGS) -c cluster.c
|
||||
run-fpga: $(PROJECT) kernel.pocl
|
||||
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/opae:$(LD_LIBRARY_PATH) ./$(PROJECT)
|
||||
|
||||
read_input.o: read_input.c
|
||||
$(CC) $(CXXFLAGS) -c read_input.c
|
||||
run-ase: $(PROJECT) kernel.pocl
|
||||
ASE_LOG=0 LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/opae/ase:$(LD_LIBRARY_PATH) ./$(PROJECT)
|
||||
|
||||
rmse.o: rmse.c
|
||||
$(CC) $(CXXFLAGS) -c rmse.c
|
||||
run-simx: $(PROJECT) kernel.pocl
|
||||
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/simx:$(LD_LIBRARY_PATH) ./$(PROJECT)
|
||||
|
||||
$(PROJECT).elf: $(SRCS) lib$(PROJECT).a
|
||||
$(CXX) $(CXXFLAGS) $(VX_CFLAGS) $(VX_SRCS) $(SRCS) $(VX_LIBS) -o $(PROJECT).elf
|
||||
run-rtlsim: $(PROJECT) kernel.pocl
|
||||
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/rtlsim:$(LD_LIBRARY_PATH) ./$(PROJECT)
|
||||
|
||||
$(PROJECT).qemu: $(SRCS) lib$(PROJECT).a
|
||||
$(CXX) $(CXXFLAGS) $(SRCS) $(QEMU_LIBS) -o $(PROJECT).qemu
|
||||
|
||||
$(PROJECT).hex: $(PROJECT).elf
|
||||
$(HEX) -O ihex $(PROJECT).elf $(PROJECT).hex
|
||||
|
||||
$(PROJECT).dump: $(PROJECT).elf
|
||||
$(DMP) -D $(PROJECT).elf > $(PROJECT).dump
|
||||
|
||||
run: $(PROJECT).hex
|
||||
POCL_DEBUG=all $(VX_SIMX_PATH)/Vcache_simX -E -a rv32i --core $(PROJECT).hex -s -b 1> emulator.debug
|
||||
|
||||
qemu: $(PROJECT).qemu
|
||||
POCL_DEBUG=all $(RISCV_TOOL_PATH)/bin/qemu-riscv32 -strace -d in_asm -D debug.log $(PROJECT).qemu
|
||||
|
||||
gdb-s: $(PROJECT).qemu
|
||||
POCL_DEBUG=all $(RISCV_TOOL_PATH)/bin/qemu-riscv32 -g 1234 -d in_asm -D debug.log $(PROJECT).qemu
|
||||
|
||||
gdb-c: $(PROJECT).qemu
|
||||
$(GDB) $(PROJECT).qemu
|
||||
.depend: $(SRCS)
|
||||
$(CXX) $(CXXFLAGS) -MM $^ > .depend;
|
||||
|
||||
clean:
|
||||
rm -rf *.o *.elf *.dump *.hex *.qemu *.log *.debug
|
||||
rm -rf $(PROJECT) *.o *.dump .depend
|
||||
|
||||
ifneq ($(MAKECMDGOALS),clean)
|
||||
-include .depend
|
||||
endif
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -1,191 +1,191 @@
|
||||
|
||||
|
||||
/* getopt.h */
|
||||
/* Declarations for getopt.
|
||||
Copyright (C) 1989-1994, 1996-1999, 2001 Free Software
|
||||
Foundation, Inc. This file is part of the GNU C Library.
|
||||
|
||||
The GNU C Library is free software; you can redistribute
|
||||
it and/or modify it under the terms of the GNU Lesser
|
||||
General Public License as published by the Free Software
|
||||
Foundation; either version 2.1 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
The GNU C Library is distributed in the hope that it will
|
||||
be useful, but WITHOUT ANY WARRANTY; without even the
|
||||
implied warranty of MERCHANTABILITY or FITNESS FOR A
|
||||
PARTICULAR PURPOSE. See the GNU Lesser General Public
|
||||
License for more details.
|
||||
|
||||
You should have received a copy of the GNU Lesser General
|
||||
Public License along with the GNU C Library; if not, write
|
||||
to the Free Software Foundation, Inc., 59 Temple Place,
|
||||
Suite 330, Boston, MA 02111-1307 USA. */
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
#ifndef _GETOPT_H
|
||||
|
||||
#ifndef __need_getopt
|
||||
# define _GETOPT_H 1
|
||||
#endif
|
||||
|
||||
/* If __GNU_LIBRARY__ is not already defined, either we are being used
|
||||
standalone, or this is the first header included in the source file.
|
||||
If we are being used with glibc, we need to include <features.h>, but
|
||||
that does not exist if we are standalone. So: if __GNU_LIBRARY__ is
|
||||
not defined, include <ctype.h>, which will pull in <features.h> for us
|
||||
if it's from glibc. (Why ctype.h? It's guaranteed to exist and it
|
||||
doesn't flood the namespace with stuff the way some other headers do.) */
|
||||
#if !defined __GNU_LIBRARY__
|
||||
# include <ctype.h>
|
||||
#endif
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
/* For communication from `getopt' to the caller.
|
||||
When `getopt' finds an option that takes an argument,
|
||||
the argument value is returned here.
|
||||
Also, when `ordering' is RETURN_IN_ORDER,
|
||||
each non-option ARGV-element is returned here. */
|
||||
|
||||
extern char *optarg;
|
||||
|
||||
/* Index in ARGV of the next element to be scanned.
|
||||
This is used for communication to and from the caller
|
||||
and for communication between successive calls to `getopt'.
|
||||
|
||||
On entry to `getopt', zero means this is the first call; initialize.
|
||||
|
||||
When `getopt' returns -1, this is the index of the first of the
|
||||
non-option elements that the caller should itself scan.
|
||||
|
||||
Otherwise, `optind' communicates from one call to the next
|
||||
how much of ARGV has been scanned so far. */
|
||||
|
||||
extern int optind;
|
||||
|
||||
/* Callers store zero here to inhibit the error message `getopt' prints
|
||||
for unrecognized options. */
|
||||
|
||||
extern int opterr;
|
||||
|
||||
/* Set to an option character which was unrecognized. */
|
||||
|
||||
extern int optopt;
|
||||
|
||||
#ifndef __need_getopt
|
||||
/* Describe the long-named options requested by the application.
|
||||
The LONG_OPTIONS argument to getopt_long or getopt_long_only is a vector
|
||||
of `struct option' terminated by an element containing a name which is
|
||||
zero.
|
||||
|
||||
The field `has_arg' is:
|
||||
no_argument (or 0) if the option does not take an argument,
|
||||
required_argument (or 1) if the option requires an argument,
|
||||
optional_argument (or 2) if the option takes an optional argument.
|
||||
|
||||
If the field `flag' is not NULL, it points to a variable that is set
|
||||
to the value given in the field `val' when the option is found, but
|
||||
left unchanged if the option is not found.
|
||||
|
||||
To have a long-named option do something other than set an `int' to
|
||||
a compiled-in constant, such as set a value from `optarg', set the
|
||||
option's `flag' field to zero and its `val' field to a nonzero
|
||||
value (the equivalent single-letter option character, if there is
|
||||
one). For long options that have a zero `flag' field, `getopt'
|
||||
returns the contents of the `val' field. */
|
||||
|
||||
struct option
|
||||
{
|
||||
# if (defined __STDC__ && __STDC__) || defined __cplusplus
|
||||
const char *name;
|
||||
# else
|
||||
char *name;
|
||||
# endif
|
||||
/* has_arg can't be an enum because some compilers complain about
|
||||
type mismatches in all the code that assumes it is an int. */
|
||||
int has_arg;
|
||||
int *flag;
|
||||
int val;
|
||||
};
|
||||
|
||||
/* Names for the values of the `has_arg' field of `struct option'. */
|
||||
|
||||
# define no_argument 0
|
||||
# define required_argument 1
|
||||
# define optional_argument 2
|
||||
#endif /* need getopt */
|
||||
|
||||
|
||||
/* Get definitions and prototypes for functions to process the
|
||||
arguments in ARGV (ARGC of them, minus the program name) for
|
||||
options given in OPTS.
|
||||
|
||||
Return the option character from OPTS just read. Return -1 when
|
||||
there are no more options. For unrecognized options, or options
|
||||
missing arguments, `optopt' is set to the option letter, and '?' is
|
||||
returned.
|
||||
|
||||
The OPTS string is a list of characters which are recognized option
|
||||
letters, optionally followed by colons, specifying that that letter
|
||||
takes an argument, to be placed in `optarg'.
|
||||
|
||||
If a letter in OPTS is followed by two colons, its argument is
|
||||
optional. This behavior is specific to the GNU `getopt'.
|
||||
|
||||
The argument `--' causes premature termination of argument
|
||||
scanning, explicitly telling `getopt' that there are no more
|
||||
options.
|
||||
|
||||
If OPTS begins with `--', then non-option arguments are treated as
|
||||
arguments to the option '\0'. This behavior is specific to the GNU
|
||||
`getopt'. */
|
||||
|
||||
#if (defined __STDC__ && __STDC__) || defined __cplusplus
|
||||
# ifdef __GNU_LIBRARY__
|
||||
/* Many other libraries have conflicting prototypes for getopt, with
|
||||
differences in the consts, in stdlib.h. To avoid compilation
|
||||
errors, only prototype getopt for the GNU C library. */
|
||||
extern int getopt (int ___argc, char *const *___argv, const char *__shortopts);
|
||||
# else /* not __GNU_LIBRARY__ */
|
||||
extern int getopt ();
|
||||
# endif /* __GNU_LIBRARY__ */
|
||||
|
||||
# ifndef __need_getopt
|
||||
extern int getopt_long (int ___argc, char *const *___argv,
|
||||
const char *__shortopts,
|
||||
const struct option *__longopts, int *__longind);
|
||||
extern int getopt_long_only (int ___argc, char *const *___argv,
|
||||
const char *__shortopts,
|
||||
const struct option *__longopts, int *__longind);
|
||||
|
||||
/* Internal only. Users should not call this directly. */
|
||||
extern int _getopt_internal (int ___argc, char *const *___argv,
|
||||
const char *__shortopts,
|
||||
const struct option *__longopts, int *__longind,
|
||||
int __long_only);
|
||||
# endif
|
||||
#else /* not __STDC__ */
|
||||
extern int getopt ();
|
||||
# ifndef __need_getopt
|
||||
extern int getopt_long ();
|
||||
extern int getopt_long_only ();
|
||||
|
||||
extern int _getopt_internal ();
|
||||
# endif
|
||||
#endif /* __STDC__ */
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
/* Make sure we later can get all the definitions and declarations. */
|
||||
#undef __need_getopt
|
||||
|
||||
#endif /* getopt.h */
|
||||
|
||||
|
||||
|
||||
/* getopt.h */
|
||||
/* Declarations for getopt.
|
||||
Copyright (C) 1989-1994, 1996-1999, 2001 Free Software
|
||||
Foundation, Inc. This file is part of the GNU C Library.
|
||||
|
||||
The GNU C Library is free software; you can redistribute
|
||||
it and/or modify it under the terms of the GNU Lesser
|
||||
General Public License as published by the Free Software
|
||||
Foundation; either version 2.1 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
The GNU C Library is distributed in the hope that it will
|
||||
be useful, but WITHOUT ANY WARRANTY; without even the
|
||||
implied warranty of MERCHANTABILITY or FITNESS FOR A
|
||||
PARTICULAR PURPOSE. See the GNU Lesser General Public
|
||||
License for more details.
|
||||
|
||||
You should have received a copy of the GNU Lesser General
|
||||
Public License along with the GNU C Library; if not, write
|
||||
to the Free Software Foundation, Inc., 59 Temple Place,
|
||||
Suite 330, Boston, MA 02111-1307 USA. */
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
#ifndef _GETOPT_H
|
||||
|
||||
#ifndef __need_getopt
|
||||
# define _GETOPT_H 1
|
||||
#endif
|
||||
|
||||
/* If __GNU_LIBRARY__ is not already defined, either we are being used
|
||||
standalone, or this is the first header included in the source file.
|
||||
If we are being used with glibc, we need to include <features.h>, but
|
||||
that does not exist if we are standalone. So: if __GNU_LIBRARY__ is
|
||||
not defined, include <ctype.h>, which will pull in <features.h> for us
|
||||
if it's from glibc. (Why ctype.h? It's guaranteed to exist and it
|
||||
doesn't flood the namespace with stuff the way some other headers do.) */
|
||||
#if !defined __GNU_LIBRARY__
|
||||
# include <ctype.h>
|
||||
#endif
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
/* For communication from `getopt' to the caller.
|
||||
When `getopt' finds an option that takes an argument,
|
||||
the argument value is returned here.
|
||||
Also, when `ordering' is RETURN_IN_ORDER,
|
||||
each non-option ARGV-element is returned here. */
|
||||
|
||||
extern char *optarg;
|
||||
|
||||
/* Index in ARGV of the next element to be scanned.
|
||||
This is used for communication to and from the caller
|
||||
and for communication between successive calls to `getopt'.
|
||||
|
||||
On entry to `getopt', zero means this is the first call; initialize.
|
||||
|
||||
When `getopt' returns -1, this is the index of the first of the
|
||||
non-option elements that the caller should itself scan.
|
||||
|
||||
Otherwise, `optind' communicates from one call to the next
|
||||
how much of ARGV has been scanned so far. */
|
||||
|
||||
extern int optind;
|
||||
|
||||
/* Callers store zero here to inhibit the error message `getopt' prints
|
||||
for unrecognized options. */
|
||||
|
||||
extern int opterr;
|
||||
|
||||
/* Set to an option character which was unrecognized. */
|
||||
|
||||
extern int optopt;
|
||||
|
||||
#ifndef __need_getopt
|
||||
/* Describe the long-named options requested by the application.
|
||||
The LONG_OPTIONS argument to getopt_long or getopt_long_only is a vector
|
||||
of `struct option' terminated by an element containing a name which is
|
||||
zero.
|
||||
|
||||
The field `has_arg' is:
|
||||
no_argument (or 0) if the option does not take an argument,
|
||||
required_argument (or 1) if the option requires an argument,
|
||||
optional_argument (or 2) if the option takes an optional argument.
|
||||
|
||||
If the field `flag' is not NULL, it points to a variable that is set
|
||||
to the value given in the field `val' when the option is found, but
|
||||
left unchanged if the option is not found.
|
||||
|
||||
To have a long-named option do something other than set an `int' to
|
||||
a compiled-in constant, such as set a value from `optarg', set the
|
||||
option's `flag' field to zero and its `val' field to a nonzero
|
||||
value (the equivalent single-letter option character, if there is
|
||||
one). For long options that have a zero `flag' field, `getopt'
|
||||
returns the contents of the `val' field. */
|
||||
|
||||
struct option
|
||||
{
|
||||
# if (defined __STDC__ && __STDC__) || defined __cplusplus
|
||||
const char *name;
|
||||
# else
|
||||
char *name;
|
||||
# endif
|
||||
/* has_arg can't be an enum because some compilers complain about
|
||||
type mismatches in all the code that assumes it is an int. */
|
||||
int has_arg;
|
||||
int *flag;
|
||||
int val;
|
||||
};
|
||||
|
||||
/* Names for the values of the `has_arg' field of `struct option'. */
|
||||
|
||||
# define no_argument 0
|
||||
# define required_argument 1
|
||||
# define optional_argument 2
|
||||
#endif /* need getopt */
|
||||
|
||||
|
||||
/* Get definitions and prototypes for functions to process the
|
||||
arguments in ARGV (ARGC of them, minus the program name) for
|
||||
options given in OPTS.
|
||||
|
||||
Return the option character from OPTS just read. Return -1 when
|
||||
there are no more options. For unrecognized options, or options
|
||||
missing arguments, `optopt' is set to the option letter, and '?' is
|
||||
returned.
|
||||
|
||||
The OPTS string is a list of characters which are recognized option
|
||||
letters, optionally followed by colons, specifying that that letter
|
||||
takes an argument, to be placed in `optarg'.
|
||||
|
||||
If a letter in OPTS is followed by two colons, its argument is
|
||||
optional. This behavior is specific to the GNU `getopt'.
|
||||
|
||||
The argument `--' causes premature termination of argument
|
||||
scanning, explicitly telling `getopt' that there are no more
|
||||
options.
|
||||
|
||||
If OPTS begins with `--', then non-option arguments are treated as
|
||||
arguments to the option '\0'. This behavior is specific to the GNU
|
||||
`getopt'. */
|
||||
|
||||
#if (defined __STDC__ && __STDC__) || defined __cplusplus
|
||||
# ifdef __GNU_LIBRARY__
|
||||
/* Many other libraries have conflicting prototypes for getopt, with
|
||||
differences in the consts, in stdlib.h. To avoid compilation
|
||||
errors, only prototype getopt for the GNU C library. */
|
||||
extern int getopt (int ___argc, char *const *___argv, const char *__shortopts);
|
||||
# else /* not __GNU_LIBRARY__ */
|
||||
extern int getopt ();
|
||||
# endif /* __GNU_LIBRARY__ */
|
||||
|
||||
# ifndef __need_getopt
|
||||
extern int getopt_long (int ___argc, char *const *___argv,
|
||||
const char *__shortopts,
|
||||
const struct option *__longopts, int *__longind);
|
||||
extern int getopt_long_only (int ___argc, char *const *___argv,
|
||||
const char *__shortopts,
|
||||
const struct option *__longopts, int *__longind);
|
||||
|
||||
/* Internal only. Users should not call this directly. */
|
||||
extern int _getopt_internal (int ___argc, char *const *___argv,
|
||||
const char *__shortopts,
|
||||
const struct option *__longopts, int *__longind,
|
||||
int __long_only);
|
||||
# endif
|
||||
#else /* not __STDC__ */
|
||||
extern int getopt ();
|
||||
# ifndef __need_getopt
|
||||
extern int getopt_long ();
|
||||
extern int getopt_long_only ();
|
||||
|
||||
extern int _getopt_internal ();
|
||||
# endif
|
||||
#endif /* __STDC__ */
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
/* Make sure we later can get all the definitions and declarations. */
|
||||
#undef __need_getopt
|
||||
|
||||
#endif /* getopt.h */
|
||||
|
||||
|
||||
@@ -1,61 +1,61 @@
|
||||
#ifndef FLT_MAX
|
||||
#define FLT_MAX 3.40282347e+38
|
||||
#endif
|
||||
|
||||
__kernel void
|
||||
kmeans_kernel_c(__global float *feature,
|
||||
__global float *clusters,
|
||||
__global int *membership,
|
||||
int npoints,
|
||||
int nclusters,
|
||||
int nfeatures,
|
||||
int offset,
|
||||
int size
|
||||
)
|
||||
{
|
||||
unsigned int point_id = get_global_id(0);
|
||||
int index = 0;
|
||||
//const unsigned int point_id = get_global_id(0);
|
||||
if (point_id < npoints)
|
||||
{
|
||||
float min_dist=FLT_MAX;
|
||||
for (int i=0; i < nclusters; i++) {
|
||||
|
||||
float dist = 0;
|
||||
float ans = 0;
|
||||
for (int l=0; l<nfeatures; l++){
|
||||
ans += (feature[l * npoints + point_id]-clusters[i*nfeatures+l])*
|
||||
(feature[l * npoints + point_id]-clusters[i*nfeatures+l]);
|
||||
}
|
||||
|
||||
dist = ans;
|
||||
if (dist < min_dist) {
|
||||
min_dist = dist;
|
||||
index = i;
|
||||
|
||||
}
|
||||
}
|
||||
//printf("%d\n", index);
|
||||
membership[point_id] = index;
|
||||
}
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
__kernel void
|
||||
kmeans_swap(__global float *feature,
|
||||
__global float *feature_swap,
|
||||
int npoints,
|
||||
int nfeatures
|
||||
){
|
||||
|
||||
unsigned int tid = get_global_id(0);
|
||||
//for(int i = 0; i < nfeatures; i++)
|
||||
// feature_swap[i * npoints + tid] = feature[tid * nfeatures + i];
|
||||
//Lingjie Zhang modificated at 11/05/2015
|
||||
if (tid < npoints){
|
||||
for(int i = 0; i < nfeatures; i++)
|
||||
feature_swap[i * npoints + tid] = feature[tid * nfeatures + i];
|
||||
}
|
||||
// end of Lingjie Zhang's modification
|
||||
}
|
||||
#ifndef FLT_MAX
|
||||
#define FLT_MAX 3.40282347e+38
|
||||
#endif
|
||||
|
||||
__kernel void
|
||||
kmeans_kernel_c(__global float *feature,
|
||||
__global float *clusters,
|
||||
__global int *membership,
|
||||
int npoints,
|
||||
int nclusters,
|
||||
int nfeatures,
|
||||
int offset,
|
||||
int size
|
||||
)
|
||||
{
|
||||
unsigned int point_id = get_global_id(0);
|
||||
int index = 0;
|
||||
//const unsigned int point_id = get_global_id(0);
|
||||
if (point_id < npoints)
|
||||
{
|
||||
float min_dist=FLT_MAX;
|
||||
for (int i=0; i < nclusters; i++) {
|
||||
|
||||
float dist = 0;
|
||||
float ans = 0;
|
||||
for (int l=0; l<nfeatures; l++){
|
||||
ans += (feature[l * npoints + point_id]-clusters[i*nfeatures+l])*
|
||||
(feature[l * npoints + point_id]-clusters[i*nfeatures+l]);
|
||||
}
|
||||
|
||||
dist = ans;
|
||||
if (dist < min_dist) {
|
||||
min_dist = dist;
|
||||
index = i;
|
||||
|
||||
}
|
||||
}
|
||||
//printf("%d\n", index);
|
||||
membership[point_id] = index;
|
||||
}
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
__kernel void
|
||||
kmeans_swap(__global float *feature,
|
||||
__global float *feature_swap,
|
||||
int npoints,
|
||||
int nfeatures
|
||||
){
|
||||
|
||||
unsigned int tid = get_global_id(0);
|
||||
//for(int i = 0; i < nfeatures; i++)
|
||||
// feature_swap[i * npoints + tid] = feature[tid * nfeatures + i];
|
||||
//Lingjie Zhang modificated at 11/05/2015
|
||||
if (tid < npoints){
|
||||
for(int i = 0; i < nfeatures; i++)
|
||||
feature_swap[i * npoints + tid] = feature[tid * nfeatures + i];
|
||||
}
|
||||
// end of Lingjie Zhang's modification
|
||||
}
|
||||
|
||||
BIN
benchmarks/opencl/kmeans/kernel.pocl
Normal file
BIN
benchmarks/opencl/kmeans/kernel.pocl
Normal file
Binary file not shown.
Binary file not shown.
@@ -1,176 +1,176 @@
|
||||
/*****************************************************************************/
|
||||
/*IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. */
|
||||
/*By downloading, copying, installing or using the software you agree */
|
||||
/*to this license. If you do not agree to this license, do not download, */
|
||||
/*install, copy or use the software. */
|
||||
/* */
|
||||
/* */
|
||||
/*Copyright (c) 2005 Northwestern University */
|
||||
/*All rights reserved. */
|
||||
|
||||
/*Redistribution of the software in source and binary forms, */
|
||||
/*with or without modification, is permitted provided that the */
|
||||
/*following conditions are met: */
|
||||
/* */
|
||||
/*1 Redistributions of source code must retain the above copyright */
|
||||
/* notice, this list of conditions and the following disclaimer. */
|
||||
/* */
|
||||
/*2 Redistributions in binary form must reproduce the above copyright */
|
||||
/* notice, this list of conditions and the following disclaimer in the */
|
||||
/* documentation and/or other materials provided with the distribution.*/
|
||||
/* */
|
||||
/*3 Neither the name of Northwestern University nor the names of its */
|
||||
/* contributors may be used to endorse or promote products derived */
|
||||
/* from this software without specific prior written permission. */
|
||||
/* */
|
||||
/*THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ``AS */
|
||||
/*IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED */
|
||||
/*TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY, NON-INFRINGEMENT AND */
|
||||
/*FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL */
|
||||
/*NORTHWESTERN UNIVERSITY OR ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, */
|
||||
/*INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
||||
/*(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR */
|
||||
/*SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) */
|
||||
/*HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, */
|
||||
/*STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN */
|
||||
/*ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
||||
/*POSSIBILITY OF SUCH DAMAGE. */
|
||||
/******************************************************************************/
|
||||
|
||||
/*************************************************************************/
|
||||
/** File: kmeans_clustering.c **/
|
||||
/** Description: Implementation of regular k-means clustering **/
|
||||
/** algorithm **/
|
||||
/** Author: Wei-keng Liao **/
|
||||
/** ECE Department, Northwestern University **/
|
||||
/** email: wkliao@ece.northwestern.edu **/
|
||||
/** **/
|
||||
/** Edited by: Jay Pisharath **/
|
||||
/** Northwestern University. **/
|
||||
/** **/
|
||||
/** ================================================================ **/
|
||||
/** **/
|
||||
/** Edited by: Shuai Che, David Tarjan, Sang-Ha Lee **/
|
||||
/** University of Virginia **/
|
||||
/** **/
|
||||
/** Description: No longer supports fuzzy c-means clustering; **/
|
||||
/** only regular k-means clustering. **/
|
||||
/** No longer performs "validity" function to analyze **/
|
||||
/** compactness and separation criteria; instead **/
|
||||
/** calculate root mean squared error. **/
|
||||
/** **/
|
||||
/*************************************************************************/
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <float.h>
|
||||
#include <math.h>
|
||||
#include "kmeans.h"
|
||||
|
||||
#define RANDOM_MAX 2147483647
|
||||
|
||||
extern double wtime(void);
|
||||
|
||||
/*----< kmeans_clustering() >---------------------------------------------*/
|
||||
/*
 * Run k-means on `feature` and return the final cluster centers.
 *
 * feature    in:  [npoints][nfeatures] point data
 * nfeatures  number of attributes per point
 * npoints    number of points
 * nclusters  requested number of clusters (clamped to npoints)
 * threshold  iteration stops once the number of points that changed
 *            cluster in a pass is not greater than this value
 * membership out: [npoints] cluster index of every point
 *
 * Returns a newly allocated [nclusters][nfeatures] table: a row-pointer
 * array whose rows all live in one contiguous block starting at result[0].
 * The caller owns it and releases it with free(result[0]); free(result);.
 * Returns NULL when there is nothing to cluster (nclusters or nfeatures
 * not positive after clamping) or when memory cannot be allocated.
 */
float** kmeans_clustering(float **feature,    /* in: [npoints][nfeatures] */
                          int     nfeatures,
                          int     npoints,
                          int     nclusters,
                          float   threshold,
                          int    *membership) /* out: [npoints] */
{
    int     i, j;                     /* counters */
    int     loop = 0;                 /* caps the number of refinement passes */
    int     c = 0;                    /* passes actually executed (reported below) */
    int    *new_centers_len = NULL;   /* [nclusters]: no. of points in each cluster */
    float   delta;                    /* no. of points that changed cluster in a pass */
    float **clusters = NULL;          /* out: [nclusters][nfeatures] */
    float **new_centers = NULL;       /* [nclusters][nfeatures]: per-cluster feature sums */

    /* nclusters should never be > npoints
       that would guarantee a cluster without points */
    if (nclusters > npoints)
        nclusters = npoints;

    /* nothing to allocate or compute without clusters or features */
    if (nclusters <= 0 || nfeatures <= 0)
        return NULL;

    /* allocate space for the returned clusters[]: one contiguous block of
       values plus a table of row pointers into it */
    clusters = (float**) malloc((size_t)nclusters * sizeof(float*));
    if (clusters == NULL)
        goto fail;
    clusters[0] = (float*) malloc((size_t)nclusters * (size_t)nfeatures * sizeof(float));
    if (clusters[0] == NULL)
        goto fail;
    for (i = 1; i < nclusters; i++)
        clusters[i] = clusters[i-1] + nfeatures;

    /* Seed center i with point i.  The random pick is disabled in this
       code base; the former sequential pick combined with a swap-to-end
       bookkeeping array selected the same point twice once nclusters
       exceeded npoints/2 and never released that array.  Taking the first
       nclusters points keeps the result for smaller nclusters and always
       yields distinct seeds (nclusters <= npoints is guaranteed above). */
    for (i = 0; i < nclusters; i++)
        for (j = 0; j < nfeatures; j++)
            clusters[i][j] = feature[i][j];

    /* initialize the membership to -1 for all */
    for (i = 0; i < npoints; i++)
        membership[i] = -1;

    /* allocate space for and zero new_centers_len and new_centers */
    new_centers_len = (int*) calloc((size_t)nclusters, sizeof(int));
    if (new_centers_len == NULL)
        goto fail;

    new_centers = (float**) malloc((size_t)nclusters * sizeof(float*));
    if (new_centers == NULL)
        goto fail;
    new_centers[0] = (float*) calloc((size_t)nclusters * (size_t)nfeatures, sizeof(float));
    if (new_centers[0] == NULL)
        goto fail;
    for (i = 1; i < nclusters; i++)
        new_centers[i] = new_centers[i-1] + nfeatures;

    /* iterate until convergence */
    do {
        /* OpenCL assignment step: kmeansOCL updates membership[], adds each
           point into new_centers/new_centers_len and returns the number of
           points whose cluster changed (-1 on an OpenCL error, which also
           ends the loop because it is never greater than a non-negative
           threshold) */
        delta = (float) kmeansOCL(feature,          /* in: [npoints][nfeatures] */
                                  nfeatures,        /* number of attributes for each point */
                                  npoints,          /* number of data points */
                                  nclusters,        /* number of clusters */
                                  membership,       /* which cluster the point belongs to */
                                  clusters,         /* out: [nclusters][nfeatures] */
                                  new_centers_len,  /* out: number of points in each cluster */
                                  new_centers       /* sum of points in each cluster */
                                  );

        /* replace old cluster centers with new_centers */
        /* CPU side of reduction */
        for (i = 0; i < nclusters; i++) {
            for (j = 0; j < nfeatures; j++) {
                if (new_centers_len[i] > 0)
                    clusters[i][j] = new_centers[i][j] / new_centers_len[i]; /* take average i.e. sum/n */
                new_centers[i][j] = 0.0;   /* set back to 0 */
            }
            new_centers_len[i] = 0;        /* set back to 0 */
        }
        c++;
    } while ((delta > threshold) && (loop++ < 500)); /* makes sure loop terminates */

    printf("iterated %d times\n", c);

    free(new_centers[0]);
    free(new_centers);
    free(new_centers_len);

    return clusters;

fail:
    /* allocation failure: release whatever was obtained so far */
    fprintf(stderr, "ERROR: kmeans_clustering: out of memory\n");
    if (new_centers != NULL)
        free(new_centers[0]);
    free(new_centers);
    free(new_centers_len);
    if (clusters != NULL)
        free(clusters[0]);
    free(clusters);
    return NULL;
}
|
||||
|
||||
/*****************************************************************************/
|
||||
/*IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. */
|
||||
/*By downloading, copying, installing or using the software you agree */
|
||||
/*to this license. If you do not agree to this license, do not download, */
|
||||
/*install, copy or use the software. */
|
||||
/* */
|
||||
/* */
|
||||
/*Copyright (c) 2005 Northwestern University */
|
||||
/*All rights reserved. */
|
||||
|
||||
/*Redistribution of the software in source and binary forms, */
|
||||
/*with or without modification, is permitted provided that the */
|
||||
/*following conditions are met: */
|
||||
/* */
|
||||
/*1 Redistributions of source code must retain the above copyright */
|
||||
/* notice, this list of conditions and the following disclaimer. */
|
||||
/* */
|
||||
/*2 Redistributions in binary form must reproduce the above copyright */
|
||||
/* notice, this list of conditions and the following disclaimer in the */
|
||||
/* documentation and/or other materials provided with the distribution.*/
|
||||
/* */
|
||||
/*3 Neither the name of Northwestern University nor the names of its */
|
||||
/* contributors may be used to endorse or promote products derived */
|
||||
/* from this software without specific prior written permission. */
|
||||
/* */
|
||||
/*THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ``AS */
|
||||
/*IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED */
|
||||
/*TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY, NON-INFRINGEMENT AND */
|
||||
/*FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL */
|
||||
/*NORTHWESTERN UNIVERSITY OR ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, */
|
||||
/*INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
||||
/*(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR */
|
||||
/*SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) */
|
||||
/*HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, */
|
||||
/*STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN */
|
||||
/*ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
||||
/*POSSIBILITY OF SUCH DAMAGE. */
|
||||
/******************************************************************************/
|
||||
|
||||
/*************************************************************************/
|
||||
/** File: kmeans_clustering.c **/
|
||||
/** Description: Implementation of regular k-means clustering **/
|
||||
/** algorithm **/
|
||||
/** Author: Wei-keng Liao **/
|
||||
/** ECE Department, Northwestern University **/
|
||||
/** email: wkliao@ece.northwestern.edu **/
|
||||
/** **/
|
||||
/** Edited by: Jay Pisharath **/
|
||||
/** Northwestern University. **/
|
||||
/** **/
|
||||
/** ================================================================ **/
|
||||
/** **/
|
||||
/** Edited by: Shuai Che, David Tarjan, Sang-Ha Lee **/
|
||||
/** University of Virginia **/
|
||||
/** **/
|
||||
/** Description: No longer supports fuzzy c-means clustering; **/
|
||||
/** only regular k-means clustering. **/
|
||||
/** No longer performs "validity" function to analyze **/
|
||||
/** compactness and separation criteria; instead **/
|
||||
/** calculate root mean squared error. **/
|
||||
/** **/
|
||||
/*************************************************************************/
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <float.h>
|
||||
#include <math.h>
|
||||
#include "kmeans.h"
|
||||
|
||||
#define RANDOM_MAX 2147483647
|
||||
|
||||
extern double wtime(void);
|
||||
|
||||
/*----< kmeans_clustering() >---------------------------------------------*/
|
||||
/*
 * Run k-means on `feature` and return the final cluster centers.
 *
 * feature    in:  [npoints][nfeatures] point data
 * nfeatures  number of attributes per point
 * npoints    number of points
 * nclusters  requested number of clusters (clamped to npoints)
 * threshold  iteration stops once the number of points that changed
 *            cluster in a pass is not greater than this value
 * membership out: [npoints] cluster index of every point
 *
 * Returns a newly allocated [nclusters][nfeatures] table: a row-pointer
 * array whose rows all live in one contiguous block starting at result[0].
 * The caller owns it and releases it with free(result[0]); free(result);.
 * Returns NULL when there is nothing to cluster (nclusters or nfeatures
 * not positive after clamping) or when memory cannot be allocated.
 */
float** kmeans_clustering(float **feature,    /* in: [npoints][nfeatures] */
                          int     nfeatures,
                          int     npoints,
                          int     nclusters,
                          float   threshold,
                          int    *membership) /* out: [npoints] */
{
    int     i, j;                     /* counters */
    int     loop = 0;                 /* caps the number of refinement passes */
    int     c = 0;                    /* passes actually executed (reported below) */
    int    *new_centers_len = NULL;   /* [nclusters]: no. of points in each cluster */
    float   delta;                    /* no. of points that changed cluster in a pass */
    float **clusters = NULL;          /* out: [nclusters][nfeatures] */
    float **new_centers = NULL;       /* [nclusters][nfeatures]: per-cluster feature sums */

    /* nclusters should never be > npoints
       that would guarantee a cluster without points */
    if (nclusters > npoints)
        nclusters = npoints;

    /* nothing to allocate or compute without clusters or features */
    if (nclusters <= 0 || nfeatures <= 0)
        return NULL;

    /* allocate space for the returned clusters[]: one contiguous block of
       values plus a table of row pointers into it */
    clusters = (float**) malloc((size_t)nclusters * sizeof(float*));
    if (clusters == NULL)
        goto fail;
    clusters[0] = (float*) malloc((size_t)nclusters * (size_t)nfeatures * sizeof(float));
    if (clusters[0] == NULL)
        goto fail;
    for (i = 1; i < nclusters; i++)
        clusters[i] = clusters[i-1] + nfeatures;

    /* Seed center i with point i.  The random pick is disabled in this
       code base; the former sequential pick combined with a swap-to-end
       bookkeeping array selected the same point twice once nclusters
       exceeded npoints/2 and never released that array.  Taking the first
       nclusters points keeps the result for smaller nclusters and always
       yields distinct seeds (nclusters <= npoints is guaranteed above). */
    for (i = 0; i < nclusters; i++)
        for (j = 0; j < nfeatures; j++)
            clusters[i][j] = feature[i][j];

    /* initialize the membership to -1 for all */
    for (i = 0; i < npoints; i++)
        membership[i] = -1;

    /* allocate space for and zero new_centers_len and new_centers */
    new_centers_len = (int*) calloc((size_t)nclusters, sizeof(int));
    if (new_centers_len == NULL)
        goto fail;

    new_centers = (float**) malloc((size_t)nclusters * sizeof(float*));
    if (new_centers == NULL)
        goto fail;
    new_centers[0] = (float*) calloc((size_t)nclusters * (size_t)nfeatures, sizeof(float));
    if (new_centers[0] == NULL)
        goto fail;
    for (i = 1; i < nclusters; i++)
        new_centers[i] = new_centers[i-1] + nfeatures;

    /* iterate until convergence */
    do {
        /* OpenCL assignment step: kmeansOCL updates membership[], adds each
           point into new_centers/new_centers_len and returns the number of
           points whose cluster changed (-1 on an OpenCL error, which also
           ends the loop because it is never greater than a non-negative
           threshold) */
        delta = (float) kmeansOCL(feature,          /* in: [npoints][nfeatures] */
                                  nfeatures,        /* number of attributes for each point */
                                  npoints,          /* number of data points */
                                  nclusters,        /* number of clusters */
                                  membership,       /* which cluster the point belongs to */
                                  clusters,         /* out: [nclusters][nfeatures] */
                                  new_centers_len,  /* out: number of points in each cluster */
                                  new_centers       /* sum of points in each cluster */
                                  );

        /* replace old cluster centers with new_centers */
        /* CPU side of reduction */
        for (i = 0; i < nclusters; i++) {
            for (j = 0; j < nfeatures; j++) {
                if (new_centers_len[i] > 0)
                    clusters[i][j] = new_centers[i][j] / new_centers_len[i]; /* take average i.e. sum/n */
                new_centers[i][j] = 0.0;   /* set back to 0 */
            }
            new_centers_len[i] = 0;        /* set back to 0 */
        }
        c++;
    } while ((delta > threshold) && (loop++ < 500)); /* makes sure loop terminates */

    printf("iterated %d times\n", c);

    free(new_centers[0]);
    free(new_centers);
    free(new_centers_len);

    return clusters;

fail:
    /* allocation failure: release whatever was obtained so far */
    fprintf(stderr, "ERROR: kmeans_clustering: out of memory\n");
    if (new_centers != NULL)
        free(new_centers[0]);
    free(new_centers);
    free(new_centers_len);
    if (clusters != NULL)
        free(clusters[0]);
    free(clusters);
    return NULL;
}
|
||||
|
||||
|
||||
Binary file not shown.
@@ -1,394 +1,382 @@
|
||||
#include "kmeans.h"
|
||||
#include <iostream>
|
||||
#include <math.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <string>
|
||||
|
||||
#ifdef WIN
|
||||
#include <windows.h>
|
||||
#else
|
||||
#include <pthread.h>
|
||||
#include <sys/time.h>
|
||||
// Returns the current wall-clock time reported by gettimeofday(), expressed
// in seconds as a double (whole seconds plus the microsecond fraction).
double gettime() {
  struct timeval now;
  gettimeofday(&now, NULL);

  const double whole_seconds = now.tv_sec;
  const double micro_seconds = now.tv_usec;
  return whole_seconds + micro_seconds * 1e-6;
}
|
||||
#endif
|
||||
|
||||
#ifdef NV
|
||||
#include <oclUtils.h>
|
||||
#else
|
||||
#include <CL/cl.h>
|
||||
#endif
|
||||
|
||||
#ifndef FLT_MAX
|
||||
#define FLT_MAX 3.40282347e+38
|
||||
#endif
|
||||
|
||||
#ifdef RD_WG_SIZE_0_0
|
||||
#define BLOCK_SIZE RD_WG_SIZE_0_0
|
||||
#elif defined(RD_WG_SIZE_0)
|
||||
#define BLOCK_SIZE RD_WG_SIZE_0
|
||||
#elif defined(RD_WG_SIZE)
|
||||
#define BLOCK_SIZE RD_WG_SIZE
|
||||
#else
|
||||
#define BLOCK_SIZE 256
|
||||
#endif
|
||||
|
||||
#ifdef RD_WG_SIZE_1_0
|
||||
#define BLOCK_SIZE2 RD_WG_SIZE_1_0
|
||||
#elif defined(RD_WG_SIZE_1)
|
||||
#define BLOCK_SIZE2 RD_WG_SIZE_1
|
||||
#elif defined(RD_WG_SIZE)
|
||||
#define BLOCK_SIZE2 RD_WG_SIZE
|
||||
#else
|
||||
#define BLOCK_SIZE2 256
|
||||
#endif
|
||||
|
||||
// local variables
|
||||
// OpenCL context; created with clCreateContext() in initialize() and
// released in shutdown().
static cl_context context;
|
||||
// Command queue on device_list[0]; created in initialize(), used for every
// buffer transfer and kernel launch in this file, released in shutdown().
static cl_command_queue cmd_queue;
|
||||
// Only reset to 0 in shutdown(); the code that assigned it is commented out
// in initialize().
static cl_device_type device_type;
|
||||
// Array of num_devices device ids allocated with new[] in initialize().
static cl_device_id *device_list;
|
||||
// Number of entries in device_list; initialize() sets it to 1.
static cl_int num_devices;
|
||||
|
||||
|
||||
// Reads the whole file `filename` into a freshly malloc()'ed buffer.
//
//   filename  path of the file to load
//   data      out: receives the buffer; the caller releases it with free()
//   size      out: receives the number of bytes in the buffer
//
// Returns 0 on success. Returns -1 (leaving *data and *size untouched) when
// an argument is NULL, the file cannot be opened or measured, memory cannot
// be allocated, or fewer bytes than the file length could be read.
static int read_kernel_file(const char* filename, uint8_t** data, size_t* size) {
  if (NULL == filename || NULL == data || NULL == size)
    return -1;

  // "rb": the kernel is a binary image, so text-mode newline translation
  // (on platforms that have it) must not alter its bytes or length.
  FILE* fp = fopen(filename, "rb");
  if (NULL == fp) {
    fprintf(stderr, "Failed to load kernel.");
    return -1;
  }

  // Determine the file length by seeking to the end.
  long fsize = -1;
  if (fseek(fp, 0, SEEK_END) == 0)
    fsize = ftell(fp);
  if (fsize < 0) {
    fprintf(stderr, "Failed to determine kernel file size.");
    fclose(fp);
    return -1;
  }
  rewind(fp);

  // Request at least one byte so an empty file still yields a valid,
  // free()-able pointer instead of an implementation-defined malloc(0).
  uint8_t* buffer = (uint8_t*)malloc(fsize > 0 ? (size_t)fsize : 1);
  if (NULL == buffer) {
    fprintf(stderr, "Failed to allocate memory for kernel.");
    fclose(fp);
    return -1;
  }

  size_t nread = fread(buffer, 1, (size_t)fsize, fp);
  fclose(fp);

  // A truncated image would only fail later inside the OpenCL runtime;
  // report it here instead.
  if (nread != (size_t)fsize) {
    fprintf(stderr, "Failed to read kernel.");
    free(buffer);
    return -1;
  }

  *data = buffer;
  *size = nread;
  return 0;
}
|
||||
|
||||
static int initialize(int use_gpu) {
|
||||
cl_int result;
|
||||
size_t size;
|
||||
|
||||
/*// create OpenCL context
|
||||
cl_platform_id platform_id;
|
||||
if (clGetPlatformIDs(1, &platform_id, NULL) != CL_SUCCESS) {
|
||||
printf("ERROR: clGetPlatformIDs(1,*,0) failed\n");
|
||||
return -1;
|
||||
}
|
||||
cl_context_properties ctxprop[] = {CL_CONTEXT_PLATFORM,
|
||||
(cl_context_properties)platform_id, 0};
|
||||
device_type = use_gpu ? CL_DEVICE_TYPE_GPU : CL_DEVICE_TYPE_CPU;
|
||||
context = clCreateContextFromType(ctxprop, device_type, NULL, NULL, NULL);
|
||||
if (!context) {
|
||||
printf("ERROR: clCreateContextFromType(%s) failed\n",
|
||||
use_gpu ? "GPU" : "CPU");
|
||||
return -1;
|
||||
}
|
||||
|
||||
// get the list of GPUs
|
||||
result = clGetContextInfo(context, CL_CONTEXT_DEVICES, 0, NULL, &size);
|
||||
num_devices = (int)(size / sizeof(cl_device_id));
|
||||
|
||||
if (result != CL_SUCCESS || num_devices < 1) {
|
||||
printf("ERROR: clGetContextInfo() failed\n");
|
||||
return -1;
|
||||
}
|
||||
device_list = new cl_device_id[num_devices];
|
||||
if (!device_list) {
|
||||
printf("ERROR: new cl_device_id[] failed\n");
|
||||
return -1;
|
||||
}
|
||||
result =
|
||||
clGetContextInfo(context, CL_CONTEXT_DEVICES, size, device_list, NULL);
|
||||
if (result != CL_SUCCESS) {
|
||||
printf("ERROR: clGetContextInfo() failed\n");
|
||||
return -1;
|
||||
}*/
|
||||
|
||||
cl_platform_id platform_id;
|
||||
num_devices = 1;
|
||||
device_list = new cl_device_id[num_devices];
|
||||
|
||||
result = clGetPlatformIDs(1, &platform_id, NULL);
|
||||
result = clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_DEFAULT, 1, device_list, NULL);
|
||||
context = clCreateContext(NULL, 1, device_list, NULL, NULL, &result);
|
||||
|
||||
// create command queue for the first device
|
||||
cmd_queue = clCreateCommandQueue(context, device_list[0], 0, NULL);
|
||||
if (!cmd_queue) {
|
||||
printf("ERROR: clCreateCommandQueue() failed\n");
|
||||
return -1;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int shutdown() {
|
||||
// release resources
|
||||
if (cmd_queue)
|
||||
clReleaseCommandQueue(cmd_queue);
|
||||
if (context)
|
||||
clReleaseContext(context);
|
||||
if (device_list)
|
||||
delete device_list;
|
||||
|
||||
// reset all variables
|
||||
cmd_queue = 0;
|
||||
context = 0;
|
||||
device_list = 0;
|
||||
num_devices = 0;
|
||||
device_type = 0;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
// Device buffer holding the input features as uploaded from feature[0] in
// allocate(); input of the kmeans_swap kernel.
cl_mem d_feature;
|
||||
// Device buffer written by the kmeans_swap kernel and read by the
// kmeans_kernel_c kernel.
cl_mem d_feature_swap;
|
||||
// Device buffer for the cluster centers; refreshed from clusters[0] on every
// kmeansOCL() call.
cl_mem d_cluster;
|
||||
// Device buffer receiving the per-point cluster index; read back into
// membership_OCL by kmeansOCL().
cl_mem d_membership;
|
||||
|
||||
// Not referenced by the code visible in this file section.
cl_kernel kernel;
|
||||
// Kernel object for "kmeans_kernel_c" (created in allocate()).
cl_kernel kernel_s;
|
||||
// Kernel object for "kmeans_swap" (created in allocate()).
cl_kernel kernel2;
|
||||
|
||||
// Host staging buffer of n_points ints, malloc()'ed in allocate() and freed
// in deallocateMemory().
int *membership_OCL;
|
||||
// The four pointers below are not referenced by the code visible in this
// file section.
int *membership_d;
|
||||
float *feature_d;
|
||||
float *clusters_d;
|
||||
float *center_d;
|
||||
|
||||
// Contents of the kernel binary file loaded by read_kernel_file(); freed in
// deallocateMemory().
uint8_t* kernel_bin = NULL;
|
||||
// Size in bytes of kernel_bin.
size_t kernel_size = 0;
|
||||
// Per-device load status filled in by clCreateProgramWithBinary().
cl_int binary_status = 0;
|
||||
|
||||
|
||||
int allocate(int n_points, int n_features, int n_clusters, float **feature) {
|
||||
/*int sourcesize = 1024 * 1024;
|
||||
char *source = (char *)calloc(sourcesize, sizeof(char));
|
||||
if (!source) {
|
||||
printf("ERROR: calloc(%d) failed\n", sourcesize);
|
||||
return -1;
|
||||
}
|
||||
|
||||
// read the kernel core source
|
||||
char *tempchar = "./kmeans.cl";
|
||||
FILE *fp = fopen(tempchar, "rb");
|
||||
if (!fp) {
|
||||
printf("ERROR: unable to open '%s'\n", tempchar);
|
||||
return -1;
|
||||
}
|
||||
fread(source + strlen(source), sourcesize, 1, fp);
|
||||
fclose(fp);*/
|
||||
|
||||
// OpenCL initialization
|
||||
int use_gpu = 1;
|
||||
if (initialize(use_gpu))
|
||||
return -1;
|
||||
|
||||
// Load Kernel
|
||||
if (read_kernel_file("kernel.pocl", &kernel_bin, &kernel_size)) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
// compile kernel
|
||||
cl_int err = 0;
|
||||
//const char *slist[2] = {source, 0};
|
||||
//cl_program prog = clCreateProgramWithSource(context, 1, slist, NULL, &err);
|
||||
cl_program prog = clCreateProgramWithBinary(
|
||||
context, 1, device_list, &kernel_size, &kernel_bin, &binary_status, &err);
|
||||
// cl_program prog = clCreateProgramWithBuiltInKernels(context, 1, device_list, "kmeans_kernel_c;kmeans_swap", &err);
|
||||
if (err != CL_SUCCESS) {
|
||||
printf("ERROR: clCreateProgramWithSource() => %d\n", err);
|
||||
return -1;
|
||||
}
|
||||
err = clBuildProgram(prog, 0, NULL, NULL, NULL, NULL);
|
||||
{ // show warnings/errors
|
||||
// static char log[65536]; memset(log, 0, sizeof(log));
|
||||
// cl_device_id device_id = 0;
|
||||
// err = clGetContextInfo(context, CL_CONTEXT_DEVICES, sizeof(device_id),
|
||||
//&device_id, NULL);
|
||||
// clGetProgramBuildInfo(prog, device_id, CL_PROGRAM_BUILD_LOG,
|
||||
// sizeof(log)-1, log, NULL);
|
||||
// if(err || strstr(log,"warning:") || strstr(log, "error:"))
|
||||
// printf("<<<<\n%s\n>>>>\n", log);
|
||||
}
|
||||
if (err != CL_SUCCESS) {
|
||||
printf("ERROR: clBuildProgram() => %d\n", err);
|
||||
return -1;
|
||||
}
|
||||
|
||||
char *kernel_kmeans_c = "kmeans_kernel_c";
|
||||
char *kernel_swap = "kmeans_swap";
|
||||
|
||||
kernel_s = clCreateKernel(prog, kernel_kmeans_c, &err);
|
||||
if (err != CL_SUCCESS) {
|
||||
printf("ERROR: clCreateKernel() 0 => %d\n", err);
|
||||
return -1;
|
||||
}
|
||||
kernel2 = clCreateKernel(prog, kernel_swap, &err);
|
||||
if (err != CL_SUCCESS) {
|
||||
printf("ERROR: clCreateKernel() 0 => %d\n", err);
|
||||
return -1;
|
||||
}
|
||||
|
||||
clReleaseProgram(prog);
|
||||
|
||||
d_feature = clCreateBuffer(context, CL_MEM_READ_WRITE,
|
||||
n_points * n_features * sizeof(float), NULL, &err);
|
||||
if (err != CL_SUCCESS) {
|
||||
printf("ERROR: clCreateBuffer d_feature (size:%d) => %d\n",
|
||||
n_points * n_features, err);
|
||||
return -1;
|
||||
}
|
||||
d_feature_swap =
|
||||
clCreateBuffer(context, CL_MEM_READ_WRITE,
|
||||
n_points * n_features * sizeof(float), NULL, &err);
|
||||
if (err != CL_SUCCESS) {
|
||||
printf("ERROR: clCreateBuffer d_feature_swap (size:%d) => %d\n",
|
||||
n_points * n_features, err);
|
||||
return -1;
|
||||
}
|
||||
d_cluster =
|
||||
clCreateBuffer(context, CL_MEM_READ_WRITE,
|
||||
n_clusters * n_features * sizeof(float), NULL, &err);
|
||||
if (err != CL_SUCCESS) {
|
||||
printf("ERROR: clCreateBuffer d_cluster (size:%d) => %d\n",
|
||||
n_clusters * n_features, err);
|
||||
return -1;
|
||||
}
|
||||
d_membership = clCreateBuffer(context, CL_MEM_READ_WRITE,
|
||||
n_points * sizeof(int), NULL, &err);
|
||||
if (err != CL_SUCCESS) {
|
||||
printf("ERROR: clCreateBuffer d_membership (size:%d) => %d\n", n_points,
|
||||
err);
|
||||
return -1;
|
||||
}
|
||||
|
||||
// write buffers
|
||||
err = clEnqueueWriteBuffer(cmd_queue, d_feature, 1, 0,
|
||||
n_points * n_features * sizeof(float), feature[0],
|
||||
0, 0, 0);
|
||||
if (err != CL_SUCCESS) {
|
||||
printf("ERROR: clEnqueueWriteBuffer d_feature (size:%d) => %d\n",
|
||||
n_points * n_features, err);
|
||||
return -1;
|
||||
}
|
||||
|
||||
clSetKernelArg(kernel2, 0, sizeof(void *), (void *)&d_feature);
|
||||
clSetKernelArg(kernel2, 1, sizeof(void *), (void *)&d_feature_swap);
|
||||
clSetKernelArg(kernel2, 2, sizeof(cl_int), (void *)&n_points);
|
||||
clSetKernelArg(kernel2, 3, sizeof(cl_int), (void *)&n_features);
|
||||
|
||||
size_t global_work[3] = {n_points, 1, 1};
|
||||
/// Ke Wang adjustable local group size 2013/08/07 10:37:33
|
||||
size_t local_work_size = BLOCK_SIZE; // work group size is defined by
|
||||
// RD_WG_SIZE_0 or RD_WG_SIZE_0_0
|
||||
// 2014/06/10 17:00:51
|
||||
if (global_work[0] % local_work_size != 0)
|
||||
global_work[0] = (global_work[0] / local_work_size + 1) * local_work_size;
|
||||
|
||||
err = clEnqueueNDRangeKernel(cmd_queue, kernel2, 1, NULL, global_work,
|
||||
&local_work_size, 0, 0, 0);
|
||||
if (err != CL_SUCCESS) {
|
||||
printf("ERROR: clEnqueueNDRangeKernel()=>%d failed\n", err);
|
||||
return -1;
|
||||
}
|
||||
|
||||
membership_OCL = (int *)malloc(n_points * sizeof(int));
|
||||
}
|
||||
|
||||
void deallocateMemory() {
|
||||
clReleaseMemObject(d_feature);
|
||||
clReleaseMemObject(d_feature_swap);
|
||||
clReleaseMemObject(d_cluster);
|
||||
clReleaseMemObject(d_membership);
|
||||
if (kernel_bin) free(kernel_bin);
|
||||
free(membership_OCL);
|
||||
}
|
||||
|
||||
// Program entry point: reports the configured work-group sizes, hands the
// command line to setup(), then tears the OpenCL state down with shutdown().
int main(int argc, char **argv) {
  // BLOCK_SIZE drives the kmeans_swap launch, BLOCK_SIZE2 the
  // kmeans_kernel_c launch.
  printf("WG size of kernel_swap = %d, WG size of kernel_kmeans = %d \n",
         BLOCK_SIZE, BLOCK_SIZE2);

  setup(argc, argv);

  shutdown();

  return 0;
}
|
||||
|
||||
int kmeansOCL(float **feature, /* in: [npoints][nfeatures] */
|
||||
int n_features, int n_points, int n_clusters, int *membership,
|
||||
float **clusters, int *new_centers_len, float **new_centers) {
|
||||
|
||||
int delta = 0;
|
||||
int i, j, k;
|
||||
cl_int err = 0;
|
||||
|
||||
size_t global_work[3] = {n_points, 1, 1};
|
||||
|
||||
/// Ke Wang adjustable local group size 2013/08/07 10:37:33
|
||||
size_t local_work_size = BLOCK_SIZE2; // work group size is defined by
|
||||
// RD_WG_SIZE_1 or RD_WG_SIZE_1_0
|
||||
// 2014/06/10 17:00:41
|
||||
if (global_work[0] % local_work_size != 0)
|
||||
global_work[0] = (global_work[0] / local_work_size + 1) * local_work_size;
|
||||
|
||||
err = clEnqueueWriteBuffer(cmd_queue, d_cluster, 1, 0,
|
||||
n_clusters * n_features * sizeof(float),
|
||||
clusters[0], 0, 0, 0);
|
||||
if (err != CL_SUCCESS) {
|
||||
printf("ERROR: clEnqueueWriteBuffer d_cluster (size:%d) => %d\n", n_points,
|
||||
err);
|
||||
return -1;
|
||||
}
|
||||
|
||||
int size = 0;
|
||||
int offset = 0;
|
||||
|
||||
clSetKernelArg(kernel_s, 0, sizeof(void *), (void *)&d_feature_swap);
|
||||
clSetKernelArg(kernel_s, 1, sizeof(void *), (void *)&d_cluster);
|
||||
clSetKernelArg(kernel_s, 2, sizeof(void *), (void *)&d_membership);
|
||||
clSetKernelArg(kernel_s, 3, sizeof(cl_int), (void *)&n_points);
|
||||
clSetKernelArg(kernel_s, 4, sizeof(cl_int), (void *)&n_clusters);
|
||||
clSetKernelArg(kernel_s, 5, sizeof(cl_int), (void *)&n_features);
|
||||
clSetKernelArg(kernel_s, 6, sizeof(cl_int), (void *)&offset);
|
||||
clSetKernelArg(kernel_s, 7, sizeof(cl_int), (void *)&size);
|
||||
|
||||
err = clEnqueueNDRangeKernel(cmd_queue, kernel_s, 1, NULL, global_work,
|
||||
&local_work_size, 0, 0, 0);
|
||||
if (err != CL_SUCCESS) {
|
||||
printf("ERROR: clEnqueueNDRangeKernel()=>%d failed\n", err);
|
||||
return -1;
|
||||
}
|
||||
clFinish(cmd_queue);
|
||||
err = clEnqueueReadBuffer(cmd_queue, d_membership, 1, 0,
|
||||
n_points * sizeof(int), membership_OCL, 0, 0, 0);
|
||||
if (err != CL_SUCCESS) {
|
||||
printf("ERROR: Memcopy Out\n");
|
||||
return -1;
|
||||
}
|
||||
|
||||
delta = 0;
|
||||
for (i = 0; i < n_points; i++) {
|
||||
int cluster_id = membership_OCL[i];
|
||||
new_centers_len[cluster_id]++;
|
||||
if (membership_OCL[i] != membership[i]) {
|
||||
delta++;
|
||||
membership[i] = membership_OCL[i];
|
||||
}
|
||||
for (j = 0; j < n_features; j++) {
|
||||
new_centers[cluster_id][j] += feature[i][j];
|
||||
}
|
||||
}
|
||||
|
||||
return delta;
|
||||
}
|
||||
#include "kmeans.h"
|
||||
#include <iostream>
|
||||
#include <math.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <string>
|
||||
|
||||
#ifdef WIN
|
||||
#include <windows.h>
|
||||
#else
|
||||
#include <pthread.h>
|
||||
#include <sys/time.h>
|
||||
/* Return the current wall-clock time, in seconds, as reported by
 * gettimeofday(); the microsecond field supplies the fractional part. */
double gettime() {
  struct timeval now;
  gettimeofday(&now, NULL);

  const double whole_seconds = now.tv_sec;
  const double fraction = now.tv_usec * 1e-6;
  return whole_seconds + fraction;
}
|
||||
#endif
|
||||
|
||||
#ifdef NV
|
||||
#include <oclUtils.h>
|
||||
#else
|
||||
#include <CL/cl.h>
|
||||
#endif
|
||||
|
||||
#ifndef FLT_MAX
|
||||
#define FLT_MAX 3.40282347e+38
|
||||
#endif
|
||||
|
||||
#ifdef RD_WG_SIZE_0_0
|
||||
#define BLOCK_SIZE RD_WG_SIZE_0_0
|
||||
#elif defined(RD_WG_SIZE_0)
|
||||
#define BLOCK_SIZE RD_WG_SIZE_0
|
||||
#elif defined(RD_WG_SIZE)
|
||||
#define BLOCK_SIZE RD_WG_SIZE
|
||||
#else
|
||||
#define BLOCK_SIZE 256
|
||||
#endif
|
||||
|
||||
#ifdef RD_WG_SIZE_1_0
|
||||
#define BLOCK_SIZE2 RD_WG_SIZE_1_0
|
||||
#elif defined(RD_WG_SIZE_1)
|
||||
#define BLOCK_SIZE2 RD_WG_SIZE_1
|
||||
#elif defined(RD_WG_SIZE)
|
||||
#define BLOCK_SIZE2 RD_WG_SIZE
|
||||
#else
|
||||
#define BLOCK_SIZE2 256
|
||||
#endif
|
||||
|
||||
// local variables
|
||||
static cl_context context;
|
||||
static cl_command_queue cmd_queue;
|
||||
static cl_device_type device_type;
|
||||
static cl_device_id *device_list;
|
||||
static cl_int num_devices;
|
||||
|
||||
static int initialize(int use_gpu) {
|
||||
cl_int result;
|
||||
size_t size;
|
||||
|
||||
/*// create OpenCL context
|
||||
cl_platform_id platform_id;
|
||||
if (clGetPlatformIDs(1, &platform_id, NULL) != CL_SUCCESS) {
|
||||
printf("ERROR: clGetPlatformIDs(1,*,0) failed\n");
|
||||
return -1;
|
||||
}
|
||||
cl_context_properties ctxprop[] = {CL_CONTEXT_PLATFORM,
|
||||
(cl_context_properties)platform_id, 0};
|
||||
device_type = use_gpu ? CL_DEVICE_TYPE_GPU : CL_DEVICE_TYPE_CPU;
|
||||
context = clCreateContextFromType(ctxprop, device_type, NULL, NULL, NULL);
|
||||
if (!context) {
|
||||
printf("ERROR: clCreateContextFromType(%s) failed\n",
|
||||
use_gpu ? "GPU" : "CPU");
|
||||
return -1;
|
||||
}
|
||||
|
||||
// get the list of GPUs
|
||||
result = clGetContextInfo(context, CL_CONTEXT_DEVICES, 0, NULL, &size);
|
||||
num_devices = (int)(size / sizeof(cl_device_id));
|
||||
|
||||
if (result != CL_SUCCESS || num_devices < 1) {
|
||||
printf("ERROR: clGetContextInfo() failed\n");
|
||||
return -1;
|
||||
}
|
||||
device_list = new cl_device_id[num_devices];
|
||||
if (!device_list) {
|
||||
printf("ERROR: new cl_device_id[] failed\n");
|
||||
return -1;
|
||||
}
|
||||
result =
|
||||
clGetContextInfo(context, CL_CONTEXT_DEVICES, size, device_list, NULL);
|
||||
if (result != CL_SUCCESS) {
|
||||
printf("ERROR: clGetContextInfo() failed\n");
|
||||
return -1;
|
||||
}*/
|
||||
|
||||
cl_platform_id platform_id;
|
||||
num_devices = 1;
|
||||
device_list = new cl_device_id[num_devices];
|
||||
|
||||
result = clGetPlatformIDs(1, &platform_id, NULL);
|
||||
result = clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_DEFAULT, 1, device_list, NULL);
|
||||
context = clCreateContext(NULL, 1, device_list, NULL, NULL, &result);
|
||||
|
||||
// create command queue for the first device
|
||||
cmd_queue = clCreateCommandQueue(context, device_list[0], 0, NULL);
|
||||
if (!cmd_queue) {
|
||||
printf("ERROR: clCreateCommandQueue() failed\n");
|
||||
return -1;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int shutdown() {
|
||||
// release resources
|
||||
if (cmd_queue)
|
||||
clReleaseCommandQueue(cmd_queue);
|
||||
if (context)
|
||||
clReleaseContext(context);
|
||||
if (device_list)
|
||||
delete device_list;
|
||||
|
||||
// reset all variables
|
||||
cmd_queue = 0;
|
||||
context = 0;
|
||||
device_list = 0;
|
||||
num_devices = 0;
|
||||
device_type = 0;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
cl_mem d_feature;
|
||||
cl_mem d_feature_swap;
|
||||
cl_mem d_cluster;
|
||||
cl_mem d_membership;
|
||||
|
||||
cl_kernel kernel;
|
||||
cl_kernel kernel_s;
|
||||
cl_kernel kernel2;
|
||||
|
||||
int *membership_OCL;
|
||||
int *membership_d;
|
||||
float *feature_d;
|
||||
float *clusters_d;
|
||||
float *center_d;
|
||||
|
||||
|
||||
/*
 * Load the whole file `filename` into a freshly malloc'ed buffer.
 *
 * filename: path of the file to read.
 * data:     receives the buffer; the caller releases it with free().  One
 *           extra zero byte is stored after the contents so the buffer can
 *           also be used as a C string.
 * size:     receives the number of bytes actually read (the terminator is
 *           not counted).
 *
 * Returns 0 on success.  Returns -1, leaving *data and *size untouched, when
 * an argument is NULL, the file cannot be opened, its length cannot be
 * determined, memory runs out, or a read error occurs.
 */
static int read_kernel_file(const char* filename, uint8_t** data, size_t* size) {
  if (NULL == filename || NULL == data || NULL == size)
    return -1;

  // Binary mode: the byte count from ftell() must match what fread() returns.
  FILE* fp = fopen(filename, "rb");
  if (NULL == fp) {
    fprintf(stderr, "Failed to load kernel.");
    return -1;
  }

  long fsize = -1;
  if (fseek(fp, 0, SEEK_END) == 0)
    fsize = ftell(fp);
  if (fsize < 0 || fseek(fp, 0, SEEK_SET) != 0) {
    fclose(fp);
    return -1;
  }

  // +1 keeps the request non-zero for empty files and leaves room for the
  // terminating zero byte.
  uint8_t* buffer = (uint8_t*)malloc((size_t)fsize + 1);
  if (NULL == buffer) {
    fclose(fp);
    return -1;
  }

  size_t nread = fread(buffer, 1, (size_t)fsize, fp);
  int read_failed = ferror(fp);
  fclose(fp);

  if (read_failed) {
    free(buffer);
    return -1;
  }

  buffer[nread] = 0;
  *data = buffer;
  *size = nread;
  return 0;
}
|
||||
|
||||
|
||||
int allocate(int n_points, int n_features, int n_clusters, float **feature) {
|
||||
/*int sourcesize = 1024 * 1024;
|
||||
char *source = (char *)calloc(sourcesize, sizeof(char));
|
||||
if (!source) {
|
||||
printf("ERROR: calloc(%d) failed\n", sourcesize);
|
||||
return -1;
|
||||
}
|
||||
|
||||
// read the kernel core source
|
||||
char *tempchar = "./kmeans.cl";
|
||||
FILE *fp = fopen(tempchar, "rb");
|
||||
if (!fp) {
|
||||
printf("ERROR: unable to open '%s'\n", tempchar);
|
||||
return -1;
|
||||
}
|
||||
fread(source + strlen(source), sourcesize, 1, fp);
|
||||
fclose(fp);*/
|
||||
|
||||
// OpenCL initialization
|
||||
int use_gpu = 1;
|
||||
if (initialize(use_gpu))
|
||||
return -1;
|
||||
|
||||
// compile kernel
|
||||
cl_int err = 0;
|
||||
//const char *slist[2] = {source, 0};
|
||||
//cl_program prog = clCreateProgramWithSource(context, 1, slist, NULL, &err);
|
||||
cl_program prog = clCreateProgramWithBuiltInKernels(context, 1, device_list, "kmeans_kernel_c;kmeans_swap", &err);
|
||||
if (err != CL_SUCCESS) {
|
||||
printf("ERROR: clCreateProgramWithSource() => %d\n", err);
|
||||
return -1;
|
||||
}
|
||||
err = clBuildProgram(prog, 0, NULL, NULL, NULL, NULL);
|
||||
{ // show warnings/errors
|
||||
// static char log[65536]; memset(log, 0, sizeof(log));
|
||||
// cl_device_id device_id = 0;
|
||||
// err = clGetContextInfo(context, CL_CONTEXT_DEVICES, sizeof(device_id),
|
||||
//&device_id, NULL);
|
||||
// clGetProgramBuildInfo(prog, device_id, CL_PROGRAM_BUILD_LOG,
|
||||
// sizeof(log)-1, log, NULL);
|
||||
// if(err || strstr(log,"warning:") || strstr(log, "error:"))
|
||||
// printf("<<<<\n%s\n>>>>\n", log);
|
||||
}
|
||||
if (err != CL_SUCCESS) {
|
||||
printf("ERROR: clBuildProgram() => %d\n", err);
|
||||
return -1;
|
||||
}
|
||||
|
||||
char *kernel_kmeans_c = "kmeans_kernel_c";
|
||||
char *kernel_swap = "kmeans_swap";
|
||||
|
||||
kernel_s = clCreateKernel(prog, kernel_kmeans_c, &err);
|
||||
if (err != CL_SUCCESS) {
|
||||
printf("ERROR: clCreateKernel() 0 => %d\n", err);
|
||||
return -1;
|
||||
}
|
||||
kernel2 = clCreateKernel(prog, kernel_swap, &err);
|
||||
if (err != CL_SUCCESS) {
|
||||
printf("ERROR: clCreateKernel() 0 => %d\n", err);
|
||||
return -1;
|
||||
}
|
||||
|
||||
clReleaseProgram(prog);
|
||||
|
||||
d_feature = clCreateBuffer(context, CL_MEM_READ_WRITE,
|
||||
n_points * n_features * sizeof(float), NULL, &err);
|
||||
if (err != CL_SUCCESS) {
|
||||
printf("ERROR: clCreateBuffer d_feature (size:%d) => %d\n",
|
||||
n_points * n_features, err);
|
||||
return -1;
|
||||
}
|
||||
d_feature_swap =
|
||||
clCreateBuffer(context, CL_MEM_READ_WRITE,
|
||||
n_points * n_features * sizeof(float), NULL, &err);
|
||||
if (err != CL_SUCCESS) {
|
||||
printf("ERROR: clCreateBuffer d_feature_swap (size:%d) => %d\n",
|
||||
n_points * n_features, err);
|
||||
return -1;
|
||||
}
|
||||
d_cluster =
|
||||
clCreateBuffer(context, CL_MEM_READ_WRITE,
|
||||
n_clusters * n_features * sizeof(float), NULL, &err);
|
||||
if (err != CL_SUCCESS) {
|
||||
printf("ERROR: clCreateBuffer d_cluster (size:%d) => %d\n",
|
||||
n_clusters * n_features, err);
|
||||
return -1;
|
||||
}
|
||||
d_membership = clCreateBuffer(context, CL_MEM_READ_WRITE,
|
||||
n_points * sizeof(int), NULL, &err);
|
||||
if (err != CL_SUCCESS) {
|
||||
printf("ERROR: clCreateBuffer d_membership (size:%d) => %d\n", n_points,
|
||||
err);
|
||||
return -1;
|
||||
}
|
||||
|
||||
// write buffers
|
||||
err = clEnqueueWriteBuffer(cmd_queue, d_feature, 1, 0,
|
||||
n_points * n_features * sizeof(float), feature[0],
|
||||
0, 0, 0);
|
||||
if (err != CL_SUCCESS) {
|
||||
printf("ERROR: clEnqueueWriteBuffer d_feature (size:%d) => %d\n",
|
||||
n_points * n_features, err);
|
||||
return -1;
|
||||
}
|
||||
|
||||
clSetKernelArg(kernel2, 0, sizeof(void *), (void *)&d_feature);
|
||||
clSetKernelArg(kernel2, 1, sizeof(void *), (void *)&d_feature_swap);
|
||||
clSetKernelArg(kernel2, 2, sizeof(cl_int), (void *)&n_points);
|
||||
clSetKernelArg(kernel2, 3, sizeof(cl_int), (void *)&n_features);
|
||||
|
||||
size_t global_work[3] = {n_points, 1, 1};
|
||||
/// Ke Wang adjustable local group size 2013/08/07 10:37:33
|
||||
size_t local_work_size = BLOCK_SIZE; // work group size is defined by
|
||||
// RD_WG_SIZE_0 or RD_WG_SIZE_0_0
|
||||
// 2014/06/10 17:00:51
|
||||
if (global_work[0] % local_work_size != 0)
|
||||
global_work[0] = (global_work[0] / local_work_size + 1) * local_work_size;
|
||||
|
||||
err = clEnqueueNDRangeKernel(cmd_queue, kernel2, 1, NULL, global_work,
|
||||
&local_work_size, 0, 0, 0);
|
||||
if (err != CL_SUCCESS) {
|
||||
printf("ERROR: clEnqueueNDRangeKernel()=>%d failed\n", err);
|
||||
return -1;
|
||||
}
|
||||
|
||||
membership_OCL = (int *)malloc(n_points * sizeof(int));
|
||||
}
|
||||
|
||||
void deallocateMemory() {
|
||||
clReleaseMemObject(d_feature);
|
||||
clReleaseMemObject(d_feature_swap);
|
||||
clReleaseMemObject(d_cluster);
|
||||
clReleaseMemObject(d_membership);
|
||||
free(membership_OCL);
|
||||
}
|
||||
|
||||
/* Program entry point: report the configured work-group sizes, hand the
 * command line to setup(), then release the OpenCL state. */
int main(int argc, char **argv) {
  printf("WG size of kernel_swap = %d, WG size of kernel_kmeans = %d \n",
         BLOCK_SIZE,
         BLOCK_SIZE2);

  setup(argc, argv);
  shutdown();

  return 0;
}
|
||||
|
||||
int kmeansOCL(float **feature, /* in: [npoints][nfeatures] */
|
||||
int n_features, int n_points, int n_clusters, int *membership,
|
||||
float **clusters, int *new_centers_len, float **new_centers) {
|
||||
|
||||
int delta = 0;
|
||||
int i, j, k;
|
||||
cl_int err = 0;
|
||||
|
||||
size_t global_work[3] = {n_points, 1, 1};
|
||||
|
||||
/// Ke Wang adjustable local group size 2013/08/07 10:37:33
|
||||
size_t local_work_size = BLOCK_SIZE2; // work group size is defined by
|
||||
// RD_WG_SIZE_1 or RD_WG_SIZE_1_0
|
||||
// 2014/06/10 17:00:41
|
||||
if (global_work[0] % local_work_size != 0)
|
||||
global_work[0] = (global_work[0] / local_work_size + 1) * local_work_size;
|
||||
|
||||
err = clEnqueueWriteBuffer(cmd_queue, d_cluster, 1, 0,
|
||||
n_clusters * n_features * sizeof(float),
|
||||
clusters[0], 0, 0, 0);
|
||||
if (err != CL_SUCCESS) {
|
||||
printf("ERROR: clEnqueueWriteBuffer d_cluster (size:%d) => %d\n", n_points,
|
||||
err);
|
||||
return -1;
|
||||
}
|
||||
|
||||
int size = 0;
|
||||
int offset = 0;
|
||||
|
||||
clSetKernelArg(kernel_s, 0, sizeof(void *), (void *)&d_feature_swap);
|
||||
clSetKernelArg(kernel_s, 1, sizeof(void *), (void *)&d_cluster);
|
||||
clSetKernelArg(kernel_s, 2, sizeof(void *), (void *)&d_membership);
|
||||
clSetKernelArg(kernel_s, 3, sizeof(cl_int), (void *)&n_points);
|
||||
clSetKernelArg(kernel_s, 4, sizeof(cl_int), (void *)&n_clusters);
|
||||
clSetKernelArg(kernel_s, 5, sizeof(cl_int), (void *)&n_features);
|
||||
clSetKernelArg(kernel_s, 6, sizeof(cl_int), (void *)&offset);
|
||||
clSetKernelArg(kernel_s, 7, sizeof(cl_int), (void *)&size);
|
||||
|
||||
err = clEnqueueNDRangeKernel(cmd_queue, kernel_s, 1, NULL, global_work,
|
||||
&local_work_size, 0, 0, 0);
|
||||
if (err != CL_SUCCESS) {
|
||||
printf("ERROR: clEnqueueNDRangeKernel()=>%d failed\n", err);
|
||||
return -1;
|
||||
}
|
||||
clFinish(cmd_queue);
|
||||
err = clEnqueueReadBuffer(cmd_queue, d_membership, 1, 0,
|
||||
n_points * sizeof(int), membership_OCL, 0, 0, 0);
|
||||
if (err != CL_SUCCESS) {
|
||||
printf("ERROR: Memcopy Out\n");
|
||||
return -1;
|
||||
}
|
||||
|
||||
delta = 0;
|
||||
for (i = 0; i < n_points; i++) {
|
||||
int cluster_id = membership_OCL[i];
|
||||
new_centers_len[cluster_id]++;
|
||||
if (membership_OCL[i] != membership[i]) {
|
||||
delta++;
|
||||
membership[i] = membership_OCL[i];
|
||||
}
|
||||
for (j = 0; j < n_features; j++) {
|
||||
new_centers[cluster_id][j] += feature[i][j];
|
||||
}
|
||||
}
|
||||
|
||||
return delta;
|
||||
}
|
||||
|
||||
@@ -1 +0,0 @@
|
||||
./kmeans -o -i ../../data/kmeans/kdd_cup
|
||||
File diff suppressed because it is too large
Load Diff
@@ -1,68 +0,0 @@
|
||||
# Build the `lbm` OpenCL benchmark for the Vortex RISC-V target.
# Tool and library locations can be overridden from the command line or the
# environment.
RISCV_TOOL_PATH ?= $(wildcard ../../../../riscv-gnu-toolchain/drops)
POCL_CC_PATH    ?= $(wildcard ../../../../pocl/drops_riscv_cc)
POCL_INC_PATH   ?= $(wildcard ../include)
POCL_LIB_PATH   ?= $(wildcard ../lib)
VX_RT_PATH      ?= $(wildcard ../../../runtime)
VX_SIMX_PATH    ?= $(wildcard ../../../simX/obj_dir)

CC  = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-gcc
CXX = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-g++
DMP = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-objdump
HEX = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-objcopy
GDB = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-gdb

VX_SRCS  = $(VX_RT_PATH)/newlib/newlib.c
VX_SRCS += $(VX_RT_PATH)/startup/vx_start.S
VX_SRCS += $(VX_RT_PATH)/intrinsics/vx_intrinsics.s
VX_SRCS += $(VX_RT_PATH)/io/vx_io.s $(VX_RT_PATH)/io/vx_io.c
VX_SRCS += $(VX_RT_PATH)/fileio/fileio.s
VX_SRCS += $(VX_RT_PATH)/tests/tests.c
VX_SRCS += $(VX_RT_PATH)/vx_api/vx_api.c
# These variables are not set in this Makefile; they expand to nothing unless
# supplied from outside.
VX_SRCS += $(VX_STR) $(VX_FIO) $(VX_NEWLIB) $(VX_INT) $(VX_IO) $(VX_API) $(VX_TEST)

VX_CFLAGS = -nostartfiles -Wl,-Bstatic,-T,$(VX_RT_PATH)/mains/vortex_link.ld

CXXFLAGS  = -g -O0 -march=rv32im -mabi=ilp32
CXXFLAGS += -ffreestanding # program may not begin at main()
CXXFLAGS += -Wl,--gc-sections # enable garbage collection of unused input sections
CXXFLAGS += -fno-rtti -fno-non-call-exceptions # disable RTTI and exceptions
CXXFLAGS += -I$(POCL_INC_PATH) -I.

VX_LIBS   = -Wl,--whole-archive lib$(PROJECT).a -Wl,--no-whole-archive $(POCL_LIB_PATH)/libOpenCL.a
QEMU_LIBS = $(VX_RT_PATH)/qemu/vx_api.c -Wl,--whole-archive lib$(PROJECT).a -Wl,--no-whole-archive $(POCL_LIB_PATH)/libOpenCL.a

PROJECT = lbm

SRCS = main.cc args.c parboil_opencl.c gpu_info.c lbm.c ocl.c

# Command targets that never produce a file of the same name.
.PHONY: all run qemu gdb-s gdb-c clean

all: $(PROJECT).dump $(PROJECT).hex

lib$(PROJECT).a: kernel.cl
	POCL_DEBUG=all POCL_DEBUG_LLVM_PASSES=1 LD_LIBRARY_PATH=$(RISCV_TOOL_PATH)/lib:$(POCL_CC_PATH)/lib $(POCL_CC_PATH)/bin/poclcc -o lib$(PROJECT).a kernel.cl

$(PROJECT).elf: $(SRCS) lib$(PROJECT).a
	$(CXX) $(CXXFLAGS) $(VX_CFLAGS) $(VX_SRCS) $(SRCS) $(VX_LIBS) -o $(PROJECT).elf

$(PROJECT).qemu: $(SRCS) lib$(PROJECT).a
	$(CXX) $(CXXFLAGS) $(SRCS) $(QEMU_LIBS) -o $(PROJECT).qemu

$(PROJECT).hex: $(PROJECT).elf
	$(HEX) -O ihex $(PROJECT).elf $(PROJECT).hex

$(PROJECT).dump: $(PROJECT).elf
	$(DMP) -D $(PROJECT).elf > $(PROJECT).dump

run: $(PROJECT).hex
	POCL_DEBUG=all $(VX_SIMX_PATH)/Vcache_simX -E -a rv32i --core $(PROJECT).hex -s -b 1> emulator.debug

qemu: $(PROJECT).qemu
	POCL_DEBUG=all $(RISCV_TOOL_PATH)/bin/qemu-riscv32 -d in_asm -D debug.log $(PROJECT).qemu

gdb-s: $(PROJECT).qemu
	POCL_DEBUG=all $(RISCV_TOOL_PATH)/bin/qemu-riscv32 -g 1234 -d in_asm -D debug.log $(PROJECT).qemu

gdb-c: $(PROJECT).qemu
	$(GDB) $(PROJECT).qemu

clean:
	rm -rf *.o *.elf *.dump *.hex *.qemu *.log *.debug
|
||||
@@ -1,617 +0,0 @@
|
||||
|
||||
#include <parboil.h>
|
||||
#include <errno.h>
|
||||
#include <limits.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <stdio.h>
|
||||
|
||||
/*****************************************************************************/
|
||||
/* Memory management routines */
|
||||
|
||||
/* Release a NULL-terminated array of heap-allocated strings: every element
 * is freed, then the array itself.  A NULL array is ignored. */
void
pb_FreeStringArray(char **string_array)
{
  size_t idx;

  if (string_array == NULL)
    return;

  idx = 0;
  while (string_array[idx] != NULL) {
    free(string_array[idx]);
    idx++;
  }

  free(string_array);
}
|
||||
|
||||
struct pb_PlatformParam *
|
||||
pb_PlatformParam(char *name, char *version)
|
||||
{
|
||||
if (name == NULL) {
|
||||
fprintf(stderr, "pb_PlatformParam: Invalid argument\n");
|
||||
exit(-1);
|
||||
}
|
||||
|
||||
struct pb_PlatformParam *ret =
|
||||
(struct pb_PlatformParam *)malloc(sizeof (struct pb_PlatformParam));
|
||||
|
||||
ret->name = name;
|
||||
ret->version = version;
|
||||
return ret;
|
||||
}
|
||||
|
||||
void
|
||||
pb_FreePlatformParam(struct pb_PlatformParam *p)
|
||||
{
|
||||
if (p == NULL) return;
|
||||
|
||||
free(p->name);
|
||||
free(p->version);
|
||||
free(p);
|
||||
}
|
||||
|
||||
struct pb_DeviceParam *
|
||||
pb_DeviceParam_index(int index)
|
||||
{
|
||||
struct pb_DeviceParam *ret =
|
||||
(struct pb_DeviceParam *)malloc(sizeof (struct pb_DeviceParam));
|
||||
ret->criterion = pb_Device_INDEX;
|
||||
ret->index = index;
|
||||
return ret;
|
||||
}
|
||||
|
||||
struct pb_DeviceParam *
|
||||
pb_DeviceParam_cpu(void)
|
||||
{
|
||||
struct pb_DeviceParam *ret =
|
||||
(struct pb_DeviceParam *)malloc(sizeof (struct pb_DeviceParam));
|
||||
ret->criterion = pb_Device_CPU;
|
||||
return ret;
|
||||
}
|
||||
|
||||
struct pb_DeviceParam *
|
||||
pb_DeviceParam_gpu(void)
|
||||
{
|
||||
struct pb_DeviceParam *ret =
|
||||
(struct pb_DeviceParam *)malloc(sizeof (struct pb_DeviceParam));
|
||||
ret->criterion = pb_Device_GPU;
|
||||
return ret;
|
||||
}
|
||||
|
||||
struct pb_DeviceParam *
|
||||
pb_DeviceParam_accelerator(void)
|
||||
{
|
||||
struct pb_DeviceParam *ret =
|
||||
(struct pb_DeviceParam *)malloc(sizeof (struct pb_DeviceParam));
|
||||
ret->criterion = pb_Device_ACCELERATOR;
|
||||
return ret;
|
||||
}
|
||||
|
||||
struct pb_DeviceParam *
|
||||
pb_DeviceParam_name(char *name)
|
||||
{
|
||||
struct pb_DeviceParam *ret =
|
||||
(struct pb_DeviceParam *)malloc(sizeof (struct pb_DeviceParam));
|
||||
ret->criterion = pb_Device_NAME;
|
||||
ret->name = name;
|
||||
return ret;
|
||||
}
|
||||
|
||||
void
|
||||
pb_FreeDeviceParam(struct pb_DeviceParam *p)
|
||||
{
|
||||
if (p == NULL) return;
|
||||
|
||||
switch(p->criterion) {
|
||||
case pb_Device_NAME:
|
||||
free(p->name);
|
||||
break;
|
||||
case pb_Device_INDEX:
|
||||
case pb_Device_CPU:
|
||||
case pb_Device_ACCELERATOR:
|
||||
break;
|
||||
default:
|
||||
fprintf(stderr, "pb_FreeDeviceParam: Invalid argument\n");
|
||||
exit(-1);
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
pb_FreeParameters(struct pb_Parameters *p)
|
||||
{
|
||||
free(p->outFile);
|
||||
pb_FreeStringArray(p->inpFiles);
|
||||
pb_FreePlatformParam(p->platform);
|
||||
pb_FreeDeviceParam(p->device);
|
||||
free(p);
|
||||
}
|
||||
|
||||
/*****************************************************************************/
|
||||
|
||||
/* Parse a comma-delimited list of strings into a NULL-terminated array of
 * newly allocated strings.  Empty items are kept, so "a,,b" yields three
 * strings and "" yields one empty string.
 *
 * Returns NULL when 'in' is NULL or when memory cannot be allocated; nothing
 * is leaked in that case.  On success the caller owns the array and every
 * string in it. */
static char **
read_string_array(char *in)
{
  char **ret;
  size_t i;
  size_t count;            /* Number of items in the input */
  const char *substring;   /* Current substring within 'in' */

  if (in == NULL) return NULL;

  /* Count the number of items in the string */
  count = 1;
  for (i = 0; in[i]; i++) if (in[i] == ',') count++;

  /* Allocate zeroed storage, so the array is NULL-terminated at every
   * stage and can be unwound if a later allocation fails */
  ret = (char **)calloc(count + 1, sizeof(char *));
  if (ret == NULL) return NULL;

  /* Create copies of the strings from the list */
  substring = in;
  for (i = 0; i < count; i++) {
    /* Length up to the next ',' or the end of the string */
    size_t substring_length = strcspn(substring, ",");

    ret[i] = (char *)malloc(substring_length + 1);
    if (ret[i] == NULL) {
      size_t k;
      for (k = 0; k < i; k++) free(ret[k]);
      free(ret);
      return NULL;
    }
    memcpy(ret[i], substring, substring_length);
    ret[i][substring_length] = 0;

    /* go to next substring, skipping the separator if there is one */
    substring += substring_length;
    if (*substring == ',') substring++;
  }
  ret[count] = NULL;            /* Write the sentinel value */

  return ret;
}
|
||||
|
||||
/* Write a parse-error message to stderr exactly as given (no newline is
 * added). */
static void
report_parse_error(const char *str)
{
  fprintf(stderr, "%s", str);
}
|
||||
|
||||
/* Interpret a string as a 'pb_DeviceParam' value.
 * Return a pointer to a new value, or NULL on failure.
 *
 * A string that is entirely a decimal integer is a device index; negative
 * or out-of-range indices are rejected.  "CPU", "GPU" and "ACCELERATOR"
 * select the corresponding device class.  Anything else is taken as a
 * device name.
 */
static struct pb_DeviceParam *
read_device_param(char *str)
{
  char *end;
  long device_int;
  char *name_copy;
  struct pb_DeviceParam *param;

  /* strtol() does not reliably set errno when no digits are found, so
   * "is it a number" is decided from how much of the string was consumed. */
  errno = 0;
  device_int = strtol(str, &end, 10);
  if (end != str && *end == '\0') {
    /* Negative numbers and values that do not fit in an int are not valid */
    if (errno == ERANGE || device_int < 0 || device_int > INT_MAX)
      return NULL;

    return pb_DeviceParam_index((int)device_int);
  }

  /* Match against predefined strings */
  if (strcmp(str, "CPU") == 0)
    return pb_DeviceParam_cpu();
  if (strcmp(str, "GPU") == 0)
    return pb_DeviceParam_gpu();
  if (strcmp(str, "ACCELERATOR") == 0)
    return pb_DeviceParam_accelerator();

  /* Assume any other string is a device name */
  name_copy = strdup(str);
  if (name_copy == NULL)
    return NULL;

  param = pb_DeviceParam_name(name_copy);
  if (param == NULL)
    free(name_copy);            /* the copy was not adopted */
  return param;
}
|
||||
|
||||
/* Interpret a string as a 'pb_PlatformParam' value.
 * Return a pointer to a new value, or NULL on failure.
 *
 * The text before the last '-' is the platform name and the text after it
 * is the version.  Without a '-', the whole string is the name and the
 * version is NULL.  NULL is returned when memory cannot be allocated.
 */
static struct pb_PlatformParam *
read_platform_param(char *str)
{
  /* Last '-' in 'str', or NULL if there is none */
  const char *separator = strrchr(str, '-');
  size_t name_length;
  char *name_str;
  char *version_str = NULL;

  /* The platform name is either the entire string, or all characters before
   * the separator */
  name_length = separator ? (size_t)(separator - str) : strlen(str);
  name_str = (char *)malloc(name_length + 1);
  if (name_str == NULL)
    return NULL;
  memcpy(name_str, str, name_length);
  name_str[name_length] = 0;

  /* The version is either NULL, or all characters after the separator */
  if (separator != NULL) {
    const char *version_input_str = separator + 1;
    size_t version_length = strlen(version_input_str);

    version_str = (char *)malloc(version_length + 1);
    if (version_str == NULL) {
      free(name_str);
      return NULL;
    }
    /* +1 copies the terminating NUL as well */
    memcpy(version_str, version_input_str, version_length + 1);
  }

  /* Create output structure; it stores both string pointers */
  return pb_PlatformParam(name_str, version_str);
}
|
||||
|
||||
/****************************************************************************/
|
||||
/* Argument parsing state */
|
||||
|
||||
/* Argument parsing state.
|
||||
*
|
||||
* Arguments that are interpreted by the argument parser are removed from
|
||||
* the list. Variables 'argc' and 'argn' do not count arguments that have
|
||||
* been removed.
|
||||
*
|
||||
* During argument parsing, the array of arguments is compacted, overwriting
|
||||
* the erased arguments. Variable 'argv_put' points to the array element
|
||||
* where the next argument will be written. Variable 'argv_get' points to
|
||||
* the array element where the next argument will be read from.
|
||||
*/
|
||||
struct argparse {
  int argc;                     /* Number of arguments.  Mutable: shrinks
                                 * as arguments are deleted. */
  int argn;                     /* Current argument index. */
  char **argv_get;              /* Argument value being read. */
  char **argv_put;              /* Argument value being written.
                                 * argv_put <= argv_get. */
};

/* Start parsing at the first element of argv. */
static void
initialize_argparse(struct argparse *ap, int argc, char **argv)
{
  ap->argc = argc;
  ap->argn = 0;
  ap->argv_get = ap->argv_put = argv;
}

/* Finish argument parsing, without processing the remaining arguments.
 * The remaining arguments are moved down behind the ones already kept,
 * the new argument count is written into _argc, and argv is terminated
 * with NULL at that position. */
static void
finalize_argparse(struct argparse *ap, int *_argc, char **argv)
{
  /* Move the remaining arguments */
  for(; ap->argn < ap->argc; ap->argn++)
    *ap->argv_put++ = *ap->argv_get++;

  /* Update the argument count */
  *_argc = ap->argc;

  /* Insert a terminating NULL */
  argv[ap->argc] = NULL;
}

/* Delete the current argument.  The argument will not be visible
 * when argument parsing is done.
 * Called with no current argument, this prints a diagnostic and changes
 * nothing, instead of stepping past the end of the array. */
static void
delete_argument(struct argparse *ap)
{
  if (ap->argn >= ap->argc) {
    fprintf(stderr, "delete_argument\n");
    return;
  }
  ap->argc--;
  ap->argv_get++;
}

/* Go to the next argument.  Also, move the current argument to its
 * final location in argv.
 * Called with no current argument, this prints a diagnostic and changes
 * nothing, instead of stepping past the end of the array. */
static void
next_argument(struct argparse *ap)
{
  if (ap->argn >= ap->argc) {
    fprintf(stderr, "next_argument\n");
    return;
  }
  /* Move argument to its new location. */
  *ap->argv_put++ = *ap->argv_get++;
  ap->argn++;
}

/* Nonzero once every remaining argument has been visited. */
static int
is_end_of_arguments(struct argparse *ap)
{
  return ap->argn == ap->argc;
}

/* Get the current argument */
static char *
get_argument(struct argparse *ap)
{
  return *ap->argv_get;
}

/* Get the current argument, and also delete it */
static char *
consume_argument(struct argparse *ap)
{
  char *ret = get_argument(ap);
  delete_argument(ap);
  return ret;
}
|
||||
|
||||
/****************************************************************************/
|
||||
|
||||
/* The result of parsing a command-line argument */
|
||||
typedef enum {
  ARGPARSE_OK    = 0,           /* Success */
  ARGPARSE_ERROR = 1,           /* Error */
  ARGPARSE_DONE  = 2            /* Success, and do not continue parsing */
} result;

struct argparse;
struct pb_Parameters;

/* Signature of an option handler: it receives the parser state, positioned
 * after the option itself, and the parameter set to update. */
typedef result parse_action(struct argparse *ap, struct pb_Parameters *params);

/* One entry of the command-line option table */
struct option {
  /* One-character name of this option, or 0 if it has none */
  char short_name;

  /* Long name of this option, or NULL if it has none */
  const char *long_name;

  /* What to do when this option occurs.
   * A NULL action marks the end of the table. */
  parse_action *action;
};
|
||||
|
||||
/* Output file
|
||||
*
|
||||
* -o FILE
|
||||
*/
|
||||
static result
|
||||
parse_output_file(struct argparse *ap, struct pb_Parameters *params)
|
||||
{
|
||||
if (is_end_of_arguments(ap))
|
||||
{
|
||||
report_parse_error("Expecting file name after '-o'\n");
|
||||
return ARGPARSE_ERROR;
|
||||
}
|
||||
|
||||
/* Replace the output file name */
|
||||
free(params->outFile);
|
||||
params->outFile = strdup(consume_argument(ap));
|
||||
|
||||
return ARGPARSE_OK;
|
||||
}
|
||||
|
||||
/* Input files
|
||||
*
|
||||
* -i FILE,FILE,...
|
||||
*/
|
||||
static result
|
||||
parse_input_files(struct argparse *ap, struct pb_Parameters *params)
|
||||
{
|
||||
if (is_end_of_arguments(ap))
|
||||
{
|
||||
report_parse_error("Expecting file name after '-i'\n");
|
||||
return ARGPARSE_ERROR;
|
||||
}
|
||||
|
||||
/* Replace the input file list */
|
||||
pb_FreeStringArray(params->inpFiles);
|
||||
params->inpFiles = read_string_array(consume_argument(ap));
|
||||
return ARGPARSE_OK;
|
||||
}
|
||||
|
||||
/* End of options
|
||||
*
|
||||
* --
|
||||
*/
|
||||
|
||||
static result
|
||||
parse_end_options(struct argparse *ap, struct pb_Parameters *params)
|
||||
{
|
||||
return ARGPARSE_DONE;
|
||||
}
|
||||
|
||||
/* OpenCL device
|
||||
*
|
||||
* --device X
|
||||
*/
|
||||
|
||||
static result
|
||||
parse_device(struct argparse *ap, struct pb_Parameters *params)
|
||||
{
|
||||
/* Read the next argument, which specifies a device */
|
||||
|
||||
if (is_end_of_arguments(ap))
|
||||
{
|
||||
report_parse_error("Expecting device specification after '--device'\n");
|
||||
return ARGPARSE_ERROR;
|
||||
}
|
||||
|
||||
char *device_string = consume_argument(ap);
|
||||
struct pb_DeviceParam *device_param = read_device_param(device_string);
|
||||
|
||||
if (!device_param) {
|
||||
report_parse_error("Unrecognized device specification format on command line\n");
|
||||
return ARGPARSE_ERROR;
|
||||
}
|
||||
|
||||
/* Save the result */
|
||||
pb_FreeDeviceParam(params->device);
|
||||
params->device = device_param;
|
||||
|
||||
return ARGPARSE_OK;
|
||||
}
|
||||
|
||||
static result
|
||||
parse_platform(struct argparse *ap, struct pb_Parameters *params)
|
||||
{
|
||||
/* Read the next argument, which specifies a platform */
|
||||
|
||||
if (is_end_of_arguments(ap))
|
||||
{
|
||||
report_parse_error("Expecting device specification after '--platform'\n");
|
||||
return ARGPARSE_ERROR;
|
||||
}
|
||||
|
||||
char *platform_string = consume_argument(ap);
|
||||
struct pb_PlatformParam *platform_param = read_platform_param(platform_string);
|
||||
|
||||
if (!platform_param) {
|
||||
report_parse_error("Unrecognized platform specification format on command line\n");
|
||||
return ARGPARSE_ERROR;
|
||||
}
|
||||
|
||||
/* Save the result */
|
||||
pb_FreePlatformParam(params->platform);
|
||||
params->platform = platform_param;
|
||||
|
||||
return ARGPARSE_OK;
|
||||
}
|
||||
|
||||
|
||||
static struct option options[] = {
|
||||
{ 'o', NULL, &parse_output_file },
|
||||
{ 'i', NULL, &parse_input_files },
|
||||
{ '-', NULL, &parse_end_options },
|
||||
{ 0, "device", &parse_device },
|
||||
{ 0, "platform", &parse_platform },
|
||||
{ 0, NULL, NULL }
|
||||
};
|
||||
|
||||
static int
|
||||
is_last_option(struct option *op)
|
||||
{
|
||||
return op->action == NULL;
|
||||
}
|
||||
|
||||
/****************************************************************************/
|
||||
|
||||
/* Parse command-line parameters.
|
||||
* Return zero on error, nonzero otherwise.
|
||||
* On error, the other outputs may be invalid.
|
||||
*
|
||||
* The information collected from parameters is used to update
|
||||
* 'ret'. 'ret' should be initialized.
|
||||
*
|
||||
* '_argc' and 'argv' are updated to contain only the unprocessed arguments.
|
||||
*/
|
||||
static int
|
||||
pb_ParseParameters (struct pb_Parameters *ret, int *_argc, char **argv)
|
||||
{
|
||||
char *err_message;
|
||||
struct argparse ap;
|
||||
|
||||
/* Each argument */
|
||||
initialize_argparse(&ap, *_argc, argv);
|
||||
while(!is_end_of_arguments(&ap)) {
|
||||
result arg_result; /* Result of parsing this option */
|
||||
char *arg = get_argument(&ap);
|
||||
|
||||
/* Process this argument */
|
||||
if (arg[0] == '-') {
|
||||
/* Single-character flag */
|
||||
if ((arg[1] != 0) && (arg[2] == 0)) {
|
||||
delete_argument(&ap); /* This argument is consumed here */
|
||||
|
||||
/* Find a matching short option */
|
||||
struct option *op;
|
||||
for (op = options; !is_last_option(op); op++) {
|
||||
if (op->short_name == arg[1]) {
|
||||
arg_result = (*op->action)(&ap, ret);
|
||||
goto option_was_processed;
|
||||
}
|
||||
}
|
||||
|
||||
/* No option matches */
|
||||
report_parse_error("Unexpected command-line parameter\n");
|
||||
arg_result = ARGPARSE_ERROR;
|
||||
goto option_was_processed;
|
||||
}
|
||||
|
||||
/* Long flag */
|
||||
if (arg[1] == '-') {
|
||||
delete_argument(&ap); /* This argument is consumed here */
|
||||
|
||||
/* Find a matching long option */
|
||||
struct option *op;
|
||||
for (op = options; !is_last_option(op); op++) {
|
||||
if (op->long_name && strcmp(&arg[2], op->long_name) == 0) {
|
||||
arg_result = (*op->action)(&ap, ret);
|
||||
goto option_was_processed;
|
||||
}
|
||||
}
|
||||
|
||||
/* No option matches */
|
||||
report_parse_error("Unexpected command-line parameter\n");
|
||||
arg_result = ARGPARSE_ERROR;
|
||||
goto option_was_processed;
|
||||
}
|
||||
}
|
||||
else {
|
||||
/* Other arguments are ignored */
|
||||
next_argument(&ap);
|
||||
arg_result = ARGPARSE_OK;
|
||||
goto option_was_processed;
|
||||
}
|
||||
|
||||
option_was_processed:
|
||||
/* Decide what to do next based on 'arg_result' */
|
||||
switch(arg_result) {
|
||||
case ARGPARSE_OK:
|
||||
/* Continue processing */
|
||||
break;
|
||||
|
||||
case ARGPARSE_ERROR:
|
||||
/* Error exit from the function */
|
||||
return 0;
|
||||
|
||||
case ARGPARSE_DONE:
|
||||
/* Normal exit from the argument parsing loop */
|
||||
goto end_of_options;
|
||||
}
|
||||
} /* end for each argument */
|
||||
|
||||
/* If all arguments were processed, then normal exit from the loop */
|
||||
|
||||
end_of_options:
|
||||
finalize_argparse(&ap, _argc, argv);
|
||||
return 1;
|
||||
}
|
||||
|
||||
/*****************************************************************************/
|
||||
/* Other exported functions */
|
||||
|
||||
struct pb_Parameters *
|
||||
pb_ReadParameters(int *_argc, char **argv)
|
||||
{
|
||||
struct pb_Parameters *ret =
|
||||
(struct pb_Parameters *)malloc(sizeof(struct pb_Parameters));
|
||||
|
||||
/* Initialize the parameters structure */
|
||||
ret->outFile = NULL;
|
||||
ret->inpFiles = (char **)malloc(sizeof(char *));
|
||||
ret->inpFiles[0] = NULL;
|
||||
ret->platform = NULL;
|
||||
ret->device = NULL;
|
||||
|
||||
/* Read parameters and update _argc, argv */
|
||||
if (!pb_ParseParameters(ret, _argc, argv)) {
|
||||
/* Parse error */
|
||||
pb_FreeParameters(ret);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
int
|
||||
pb_Parameters_CountInputs(struct pb_Parameters *p)
|
||||
{
|
||||
int n;
|
||||
|
||||
for (n = 0; p->inpFiles[n]; n++);
|
||||
return n;
|
||||
}
|
||||
|
||||
@@ -1,55 +0,0 @@
|
||||
/***************************************************************************
|
||||
*cr
|
||||
*cr (C) Copyright 2010 The Board of Trustees of the
|
||||
*cr University of Illinois
|
||||
*cr All Rights Reserved
|
||||
*cr
|
||||
***************************************************************************/
|
||||
//#include <endian.h>
|
||||
#include <stdlib.h>
|
||||
#include <malloc.h>
|
||||
#include <stdio.h>
|
||||
#include <inttypes.h>
|
||||
|
||||
#include "gpu_info.h"
|
||||
|
||||
/* Choose a launch configuration for 'task' work units of 'pad' items each.
 *
 * thread  out: thread[0] receives the chosen group size
 * grid    out: grid[0] receives the chosen total size
 * task    number of work units
 * pad     number of items per work unit
 * major   device version, major part (presumably the CUDA compute
 *         capability; confirm against the caller)
 * minor   device version, minor part
 * sm      multiplier applied to the per-unit thread limit (presumably
 *         the number of multiprocessors; confirm against the caller)
 *
 * A thread limit is picked from the version: 768 for 1.0/1.1, 1024 for
 * 1.2 and later 1.x, 1536 for everything else.  When task*pad exceeds
 * sm times that limit, the group size is limit/8 and the total is
 * task*pad rounded up to a multiple of the group size.  Otherwise the
 * group size is 'pad' and the total is task*pad.
 *
 * The products are formed in 'long long' so that large inputs do not
 * overflow 'int'. */
void compute_active_thread(size_t *thread,
			   size_t *grid,
			   int task,
			   int pad,
			   int major,
			   int minor,
			   int sm)
{
	const int max_block = 8;
	int max_thread;
	long long work;
	long long capacity;
	long long group;
	long long total;

	if (major == 1)
		max_thread = (minor >= 2) ? 1024 : 768;
	else
		max_thread = 1536;	/* 2.x; newer GPUs keep using the 2.0 value */

	work = (long long)task * pad;
	capacity = (long long)sm * max_thread;

	if (work > capacity) {
		group = max_thread / max_block;
		total = ((work + group - 1) / group) * group;
	} else {
		group = pad;
		total = work;
	}

	thread[0] = (size_t)group;
	grid[0] = (size_t)total;
}
|
||||
@@ -1,20 +0,0 @@
|
||||
/***************************************************************************
|
||||
*cr
|
||||
*cr (C) Copyright 2010 The Board of Trustees of the
|
||||
*cr University of Illinois
|
||||
*cr All Rights Reserved
|
||||
*cr
|
||||
***************************************************************************/
|
||||
|
||||
#ifndef __GPUINFOH__
#define __GPUINFOH__

#include <stddef.h>	/* size_t, so this header can be included on its own */

/* Choose a launch configuration for 'task' work units of 'pad' items
 * each on a device described by 'major', 'minor' and 'sm'.  The chosen
 * group size is stored in thread[0] and the chosen total size in
 * grid[0]. */
void compute_active_thread(size_t *thread,
			   size_t *grid,
			   int task,
			   int pad,
			   int major,
			   int minor,
			   int sm);

#endif
|
||||
@@ -1,424 +0,0 @@
|
||||
/***************************************************************************
|
||||
*cr
|
||||
*cr (C) Copyright 2010 The Board of Trustees of the
|
||||
*cr University of Illinois
|
||||
*cr All Rights Reserved
|
||||
*cr
|
||||
***************************************************************************/
|
||||
|
||||
#ifndef LBM_KERNEL_CL
|
||||
#define LBM_KERNEL_CL
|
||||
|
||||
|
||||
/***************************************************************************
|
||||
*cr
|
||||
*cr (C) Copyright 2010 The Board of Trustees of the
|
||||
*cr University of Illinois
|
||||
*cr All Rights Reserved
|
||||
*cr
|
||||
***************************************************************************/
|
||||
/***************************************************************************
|
||||
*cr
|
||||
*cr (C) Copyright 2010 The Board of Trustees of the
|
||||
*cr University of Illinois
|
||||
*cr All Rights Reserved
|
||||
*cr
|
||||
***************************************************************************/
|
||||
|
||||
/*############################################################################*/
|
||||
|
||||
#ifndef _LAYOUT_CONFIG_H_
#define _LAYOUT_CONFIG_H_

/*############################################################################*/

//Unchangeable settings: volume simulation size for the given example
#define SIZE_X (32)
#define SIZE_Y (32)
#define SIZE_Z (32)

//Changeable settings
//Padding in each dimension
#define PADDING_X (8)
#define PADDING_Y (0)
#define PADDING_Z (4)

//Pitch in each dimension
#define PADDED_X (SIZE_X+PADDING_X)
#define PADDED_Y (SIZE_Y+PADDING_Y)
#define PADDED_Z (SIZE_Z+PADDING_Z)

#define TOTAL_CELLS (SIZE_X*SIZE_Y*SIZE_Z)
#define TOTAL_PADDED_CELLS (PADDED_X*PADDED_Y*PADDED_Z)

//Flattening function
// This macro will be used to map a 3-D index and element to a value
// The macro below implements the equivalent of a 3-D array of
// 20-element structures in C standard layout.
// Every argument is parenthesized so that an expression may be passed
// for any of them, including the element 'e'.
#define CALC_INDEX(x,y,z,e) ( (e) + N_CELL_ENTRIES*\
                               ((x)+(y)*PADDED_X+(z)*PADDED_X*PADDED_Y) )

// Offset, in elements, of the cell (0,0,0) from the start of a grid
// allocation: two full z-planes.
#define MARGIN (CALC_INDEX(0, 0, 2, 0) - CALC_INDEX(0,0,0,0))

// Set this value to 1 for GATHER, or 0 for SCATTER
#if 1
#define GATHER
#else
#define SCATTER
#endif

//OpenCL block size (not trivially changeable here)
#define BLOCK_SIZE SIZE_X

/*############################################################################*/

// Index of each entry within a cell; N_CELL_ENTRIES is the entry count.
typedef enum {C = 0,
              N, S, E, W, T, B,
              NE, NW, SE, SW,
              NT, NB, ST, SB,
              ET, EB, WT, WB,
              FLAGS, N_CELL_ENTRIES} CELL_ENTRIES;

// The entries before FLAGS are the distribution functions.
#define N_DISTR_FUNCS FLAGS

// Bit values stored in the FLAGS entry of a cell.
typedef enum {OBSTACLE = 1 << 0,
              ACCEL = 1 << 1,
              IN_OUT_FLOW = 1 << 2} CELL_FLAGS;

#endif /* _LAYOUT_CONFIG_H_ */
|
||||
|
||||
|
||||
/* The guard tests the same name that it defines. */
#ifndef _LBM_MACROS_H_
#define _LBM_MACROS_H_

#define OMEGA (1.95f)

#define OUTPUT_PRECISION float

#define BOOL int
#define TRUE (-1)
#define FALSE (0)

#define DFL1 (1.0f/ 3.0f)
#define DFL2 (1.0f/18.0f)
#define DFL3 (1.0f/36.0f)

/*############################################################################*/

typedef float* LBM_Grid;//float LBM_Grid[PADDED_Z*PADDED_Y*PADDED_X*N_CELL_ENTRIES];
typedef LBM_Grid* LBM_GridPtr;

/*############################################################################*/

/* Coordinates of the current cell.  SWEEP_VAR declares them; they are
 * set either by SWEEP_START or directly by the user of these macros. */
#define SWEEP_X __temp_x__
#define SWEEP_Y __temp_y__
#define SWEEP_Z __temp_z__
#define SWEEP_VAR int __temp_x__, __temp_y__, __temp_z__;

/* Open a loop nest over z in [z1, z2), y in [0, SIZE_Y) and
 * x in [0, SIZE_X).  Only z1 and z2 are used; the other four
 * arguments are ignored.  Close the nest with SWEEP_END. */
#define SWEEP_START(x1,y1,z1,x2,y2,z2) \
	for( __temp_z__ = z1; \
	     __temp_z__ < z2; \
	     __temp_z__++) { \
		for( __temp_y__ = 0; \
		     __temp_y__ < SIZE_Y; \
		     __temp_y__++) { \
			for(__temp_x__ = 0; \
			    __temp_x__ < SIZE_X; \
			    __temp_x__++) {

#define SWEEP_END }}}

/* Entry 'e' of the cell at (x,y,z), and of the cell at the given
 * offset from the current sweep position. */
#define GRID_ENTRY(g,x,y,z,e) ((g)[CALC_INDEX( x, y, z, e)])
#define GRID_ENTRY_SWEEP(g,dx,dy,dz,e) ((g)[CALC_INDEX((dx)+SWEEP_X, (dy)+SWEEP_Y, (dz)+SWEEP_Z, e)])

#define LOCAL(g,e) (GRID_ENTRY_SWEEP( g, 0, 0, 0, e ))
#define NEIGHBOR_C(g,e) (GRID_ENTRY_SWEEP( g, 0, 0, 0, e ))
#define NEIGHBOR_N(g,e) (GRID_ENTRY_SWEEP( g, 0, +1, 0, e ))
#define NEIGHBOR_S(g,e) (GRID_ENTRY_SWEEP( g, 0, -1, 0, e ))
#define NEIGHBOR_E(g,e) (GRID_ENTRY_SWEEP( g, +1, 0, 0, e ))
#define NEIGHBOR_W(g,e) (GRID_ENTRY_SWEEP( g, -1, 0, 0, e ))
#define NEIGHBOR_T(g,e) (GRID_ENTRY_SWEEP( g, 0, 0, +1, e ))
#define NEIGHBOR_B(g,e) (GRID_ENTRY_SWEEP( g, 0, 0, -1, e ))
#define NEIGHBOR_NE(g,e) (GRID_ENTRY_SWEEP( g, +1, +1, 0, e ))
#define NEIGHBOR_NW(g,e) (GRID_ENTRY_SWEEP( g, -1, +1, 0, e ))
#define NEIGHBOR_SE(g,e) (GRID_ENTRY_SWEEP( g, +1, -1, 0, e ))
#define NEIGHBOR_SW(g,e) (GRID_ENTRY_SWEEP( g, -1, -1, 0, e ))
#define NEIGHBOR_NT(g,e) (GRID_ENTRY_SWEEP( g, 0, +1, +1, e ))
#define NEIGHBOR_NB(g,e) (GRID_ENTRY_SWEEP( g, 0, +1, -1, e ))
#define NEIGHBOR_ST(g,e) (GRID_ENTRY_SWEEP( g, 0, -1, +1, e ))
#define NEIGHBOR_SB(g,e) (GRID_ENTRY_SWEEP( g, 0, -1, -1, e ))
#define NEIGHBOR_ET(g,e) (GRID_ENTRY_SWEEP( g, +1, 0, +1, e ))
#define NEIGHBOR_EB(g,e) (GRID_ENTRY_SWEEP( g, +1, 0, -1, e ))
#define NEIGHBOR_WT(g,e) (GRID_ENTRY_SWEEP( g, -1, 0, +1, e ))
#define NEIGHBOR_WB(g,e) (GRID_ENTRY_SWEEP( g, -1, 0, -1, e ))


#ifdef SCATTER

/* SCATTER: sources are the current cell, destinations are neighbors */
#define SRC_C(g) (LOCAL( g, C ))
#define SRC_N(g) (LOCAL( g, N ))
#define SRC_S(g) (LOCAL( g, S ))
#define SRC_E(g) (LOCAL( g, E ))
#define SRC_W(g) (LOCAL( g, W ))
#define SRC_T(g) (LOCAL( g, T ))
#define SRC_B(g) (LOCAL( g, B ))
#define SRC_NE(g) (LOCAL( g, NE ))
#define SRC_NW(g) (LOCAL( g, NW ))
#define SRC_SE(g) (LOCAL( g, SE ))
#define SRC_SW(g) (LOCAL( g, SW ))
#define SRC_NT(g) (LOCAL( g, NT ))
#define SRC_NB(g) (LOCAL( g, NB ))
#define SRC_ST(g) (LOCAL( g, ST ))
#define SRC_SB(g) (LOCAL( g, SB ))
#define SRC_ET(g) (LOCAL( g, ET ))
#define SRC_EB(g) (LOCAL( g, EB ))
#define SRC_WT(g) (LOCAL( g, WT ))
#define SRC_WB(g) (LOCAL( g, WB ))

#define DST_C(g) (NEIGHBOR_C ( g, C ))
#define DST_N(g) (NEIGHBOR_N ( g, N ))
#define DST_S(g) (NEIGHBOR_S ( g, S ))
#define DST_E(g) (NEIGHBOR_E ( g, E ))
#define DST_W(g) (NEIGHBOR_W ( g, W ))
#define DST_T(g) (NEIGHBOR_T ( g, T ))
#define DST_B(g) (NEIGHBOR_B ( g, B ))
#define DST_NE(g) (NEIGHBOR_NE( g, NE ))
#define DST_NW(g) (NEIGHBOR_NW( g, NW ))
#define DST_SE(g) (NEIGHBOR_SE( g, SE ))
#define DST_SW(g) (NEIGHBOR_SW( g, SW ))
#define DST_NT(g) (NEIGHBOR_NT( g, NT ))
#define DST_NB(g) (NEIGHBOR_NB( g, NB ))
#define DST_ST(g) (NEIGHBOR_ST( g, ST ))
#define DST_SB(g) (NEIGHBOR_SB( g, SB ))
#define DST_ET(g) (NEIGHBOR_ET( g, ET ))
#define DST_EB(g) (NEIGHBOR_EB( g, EB ))
#define DST_WT(g) (NEIGHBOR_WT( g, WT ))
#define DST_WB(g) (NEIGHBOR_WB( g, WB ))

#else /* GATHER */

/* GATHER: each source is read from the neighbor in the opposite
 * direction, destinations are the current cell */
#define SRC_C(g) (NEIGHBOR_C ( g, C ))
#define SRC_N(g) (NEIGHBOR_S ( g, N ))
#define SRC_S(g) (NEIGHBOR_N ( g, S ))
#define SRC_E(g) (NEIGHBOR_W ( g, E ))
#define SRC_W(g) (NEIGHBOR_E ( g, W ))
#define SRC_T(g) (NEIGHBOR_B ( g, T ))
#define SRC_B(g) (NEIGHBOR_T ( g, B ))
#define SRC_NE(g) (NEIGHBOR_SW( g, NE ))
#define SRC_NW(g) (NEIGHBOR_SE( g, NW ))
#define SRC_SE(g) (NEIGHBOR_NW( g, SE ))
#define SRC_SW(g) (NEIGHBOR_NE( g, SW ))
#define SRC_NT(g) (NEIGHBOR_SB( g, NT ))
#define SRC_NB(g) (NEIGHBOR_ST( g, NB ))
#define SRC_ST(g) (NEIGHBOR_NB( g, ST ))
#define SRC_SB(g) (NEIGHBOR_NT( g, SB ))
#define SRC_ET(g) (NEIGHBOR_WB( g, ET ))
#define SRC_EB(g) (NEIGHBOR_WT( g, EB ))
#define SRC_WT(g) (NEIGHBOR_EB( g, WT ))
#define SRC_WB(g) (NEIGHBOR_ET( g, WB ))

#define DST_C(g) (LOCAL( g, C ))
#define DST_N(g) (LOCAL( g, N ))
#define DST_S(g) (LOCAL( g, S ))
#define DST_E(g) (LOCAL( g, E ))
#define DST_W(g) (LOCAL( g, W ))
#define DST_T(g) (LOCAL( g, T ))
#define DST_B(g) (LOCAL( g, B ))
#define DST_NE(g) (LOCAL( g, NE ))
#define DST_NW(g) (LOCAL( g, NW ))
#define DST_SE(g) (LOCAL( g, SE ))
#define DST_SW(g) (LOCAL( g, SW ))
#define DST_NT(g) (LOCAL( g, NT ))
#define DST_NB(g) (LOCAL( g, NB ))
#define DST_ST(g) (LOCAL( g, ST ))
#define DST_SB(g) (LOCAL( g, SB ))
#define DST_ET(g) (LOCAL( g, ET ))
#define DST_EB(g) (LOCAL( g, EB ))
#define DST_WT(g) (LOCAL( g, WT ))
#define DST_WB(g) (LOCAL( g, WB ))

#endif /* GATHER */

/* The FLAGS entry of a cell is stored in a float slot but accessed as
 * an unsigned int through these macros. */
#define MAGIC_CAST(v) ((unsigned int*) ((void*) (&(v))))
#define FLAG_VAR(v) unsigned int* _aux_ = MAGIC_CAST(v)

#define TEST_FLAG_SWEEP(g,f) ((*MAGIC_CAST(LOCAL(g, FLAGS))) & (f))
#define SET_FLAG_SWEEP(g,f) {FLAG_VAR(LOCAL(g, FLAGS)); (*_aux_) |= (f);}
#define CLEAR_FLAG_SWEEP(g,f) {FLAG_VAR(LOCAL(g, FLAGS)); (*_aux_) &= ~(f);}
#define CLEAR_ALL_FLAGS_SWEEP(g) {FLAG_VAR(LOCAL(g, FLAGS)); (*_aux_) = 0;}

#define TEST_FLAG(g,x,y,z,f) ((*MAGIC_CAST(GRID_ENTRY(g, x, y, z, FLAGS))) & (f))
#define SET_FLAG(g,x,y,z,f) {FLAG_VAR(GRID_ENTRY(g, x, y, z, FLAGS)); (*_aux_) |= (f);}
#define CLEAR_FLAG(g,x,y,z,f) {FLAG_VAR(GRID_ENTRY(g, x, y, z, FLAGS)); (*_aux_) &= ~(f);}
#define CLEAR_ALL_FLAGS(g,x,y,z) {FLAG_VAR(GRID_ENTRY(g, x, y, z, FLAGS)); (*_aux_) = 0;}

/*############################################################################*/

#endif /* _LBM_MACROS_H_ */
|
||||
|
||||
|
||||
/******************************************************************************/
|
||||
|
||||
/* One stream-and-collide update of a single cell.
 *
 * srcGrid  grid that is read; points at the start of the allocation
 * dstGrid  grid that is written; points at the start of the allocation
 *
 * Both pointers are advanced by MARGIN before use.  The cell handled
 * by a work-item is x = get_local_id(0), y = get_group_id(0),
 * z = get_group_id(1).
 */
__kernel void performStreamCollide_kernel( __global float* srcGrid, __global float* dstGrid )
{
	srcGrid += MARGIN;
	dstGrid += MARGIN;

	//Using some predefined macros here.  Consider this the declaration
	//  and initialization of the variables SWEEP_X, SWEEP_Y and SWEEP_Z
	SWEEP_VAR
	SWEEP_X = get_local_id(0);
	SWEEP_Y = get_group_id(0);
	SWEEP_Z = get_group_id(1);

	float temp_swp, tempC, tempN, tempS, tempE, tempW, tempT, tempB;
	float tempNE, tempNW, tempSE, tempSW, tempNT, tempNB, tempST ;
	float tempSB, tempET, tempEB, tempWT, tempWB ;

	//Load all of the input fields
	//This is a gather operation if the SCATTER preprocessor variable
	// is undefined in layout_config.h, or a "local" read otherwise
	tempC = SRC_C(srcGrid);

	tempN = SRC_N(srcGrid);
	tempS = SRC_S(srcGrid);
	tempE = SRC_E(srcGrid);
	tempW = SRC_W(srcGrid);
	tempT = SRC_T(srcGrid);
	tempB = SRC_B(srcGrid);

	tempNE = SRC_NE(srcGrid);
	tempNW = SRC_NW(srcGrid);
	tempSE = SRC_SE(srcGrid);
	tempSW = SRC_SW(srcGrid);
	tempNT = SRC_NT(srcGrid);
	tempNB = SRC_NB(srcGrid);
	tempST = SRC_ST(srcGrid);
	tempSB = SRC_SB(srcGrid);
	tempET = SRC_ET(srcGrid);
	tempEB = SRC_EB(srcGrid);
	tempWT = SRC_WT(srcGrid);
	tempWB = SRC_WB(srcGrid);

	//Test whether the cell is fluid or obstacle
	if(as_uint(LOCAL(srcGrid,FLAGS)) & (OBSTACLE)) {

		//Swizzle the inputs: reflect any fluid coming into this cell
		// back to where it came from
		temp_swp = tempN ; tempN = tempS ; tempS = temp_swp ;
		temp_swp = tempE ; tempE = tempW ; tempW = temp_swp;
		temp_swp = tempT ; tempT = tempB ; tempB = temp_swp;
		temp_swp = tempNE; tempNE = tempSW ; tempSW = temp_swp;
		temp_swp = tempNW; tempNW = tempSE ; tempSE = temp_swp;
		temp_swp = tempNT ; tempNT = tempSB ; tempSB = temp_swp;
		temp_swp = tempNB ; tempNB = tempST ; tempST = temp_swp;
		temp_swp = tempET ; tempET= tempWB ; tempWB = temp_swp;
		temp_swp = tempEB ; tempEB = tempWT ; tempWT = temp_swp;
	}
	else {

		//The math meat of LBM: ignore for optimization
		float ux, uy, uz, rho, u2;
		float temp1, temp2, temp_base;
		rho = tempC + tempN
			+ tempS + tempE
			+ tempW + tempT
			+ tempB + tempNE
			+ tempNW + tempSE
			+ tempSW + tempNT
			+ tempNB + tempST
			+ tempSB + tempET
			+ tempEB + tempWT
			+ tempWB;

		ux = + tempE - tempW
			+ tempNE - tempNW
			+ tempSE - tempSW
			+ tempET + tempEB
			- tempWT - tempWB;

		uy = + tempN - tempS
			+ tempNE + tempNW
			- tempSE - tempSW
			+ tempNT + tempNB
			- tempST - tempSB;

		uz = + tempT - tempB
			+ tempNT - tempNB
			+ tempST - tempSB
			+ tempET - tempEB
			+ tempWT - tempWB;

		ux /= rho;
		uy /= rho;
		uz /= rho;

		//Cells flagged ACCEL get a fixed velocity
		if(as_uint(LOCAL(srcGrid,FLAGS)) & (ACCEL)) {

			ux = 0.005f;
			uy = 0.002f;
			uz = 0.000f;
		}

		u2 = 1.5f * (ux*ux + uy*uy + uz*uz) - 1.0f;

		//Compute the output values for this cell; they stay in the
		// temp variables until they are written out below
		temp_base = OMEGA*rho;
		temp1 = DFL1*temp_base;
		temp2 = 1.0f-OMEGA;
		tempC = temp2*tempC + temp1*( - u2);
		temp1 = DFL2*temp_base;
		tempN = temp2*tempN + temp1*( uy*(4.5f*uy + 3.0f) - u2);
		tempS = temp2*tempS + temp1*( uy*(4.5f*uy - 3.0f) - u2);
		tempT = temp2*tempT + temp1*( uz*(4.5f*uz + 3.0f) - u2);
		tempB = temp2*tempB + temp1*( uz*(4.5f*uz - 3.0f) - u2);
		tempE = temp2*tempE + temp1*( ux*(4.5f*ux + 3.0f) - u2);
		tempW = temp2*tempW + temp1*( ux*(4.5f*ux - 3.0f) - u2);
		temp1 = DFL3*temp_base;
		tempNT= temp2*tempNT + temp1 *( (+uy+uz)*(4.5f*(+uy+uz) + 3.0f) - u2);
		tempNB= temp2*tempNB + temp1 *( (+uy-uz)*(4.5f*(+uy-uz) + 3.0f) - u2);
		tempST= temp2*tempST + temp1 *( (-uy+uz)*(4.5f*(-uy+uz) + 3.0f) - u2);
		tempSB= temp2*tempSB + temp1 *( (-uy-uz)*(4.5f*(-uy-uz) + 3.0f) - u2);
		tempNE = temp2*tempNE + temp1 *( (+ux+uy)*(4.5f*(+ux+uy) + 3.0f) - u2);
		tempSE = temp2*tempSE + temp1 *((+ux-uy)*(4.5f*(+ux-uy) + 3.0f) - u2);
		tempET = temp2*tempET + temp1 *( (+ux+uz)*(4.5f*(+ux+uz) + 3.0f) - u2);
		tempEB = temp2*tempEB + temp1 *( (+ux-uz)*(4.5f*(+ux-uz) + 3.0f) - u2);
		tempNW = temp2*tempNW + temp1 *( (-ux+uy)*(4.5f*(-ux+uy) + 3.0f) - u2);
		tempSW = temp2*tempSW + temp1 *( (-ux-uy)*(4.5f*(-ux-uy) + 3.0f) - u2);
		tempWT = temp2*tempWT + temp1 *( (-ux+uz)*(4.5f*(-ux+uz) + 3.0f) - u2);
		tempWB = temp2*tempWB + temp1 *( (-ux-uz)*(4.5f*(-ux-uz) + 3.0f) - u2);
	}

	//Write the results computed above
	//This is a scatter operation if the SCATTER preprocessor variable
	// is defined in layout_config.h, or a "local" write otherwise
	DST_C ( dstGrid ) = tempC;

	DST_N ( dstGrid ) = tempN;
	DST_S ( dstGrid ) = tempS;
	DST_E ( dstGrid ) = tempE;
	DST_W ( dstGrid ) = tempW;
	DST_T ( dstGrid ) = tempT;
	DST_B ( dstGrid ) = tempB;

	DST_NE( dstGrid ) = tempNE;
	DST_NW( dstGrid ) = tempNW;
	DST_SE( dstGrid ) = tempSE;
	DST_SW( dstGrid ) = tempSW;
	DST_NT( dstGrid ) = tempNT;
	DST_NB( dstGrid ) = tempNB;
	DST_ST( dstGrid ) = tempST;
	DST_SB( dstGrid ) = tempSB;
	DST_ET( dstGrid ) = tempET;
	DST_EB( dstGrid ) = tempEB;
	DST_WT( dstGrid ) = tempWT;
	DST_WB( dstGrid ) = tempWB;
}
|
||||
|
||||
#endif // LBM_KERNEL_CL
|
||||
@@ -1,69 +0,0 @@
|
||||
/***************************************************************************
|
||||
*cr
|
||||
*cr (C) Copyright 2010 The Board of Trustees of the
|
||||
*cr University of Illinois
|
||||
*cr All Rights Reserved
|
||||
*cr
|
||||
***************************************************************************/
|
||||
|
||||
/*############################################################################*/
|
||||
|
||||
#ifndef _LAYOUT_CONFIG_H_
#define _LAYOUT_CONFIG_H_

/*############################################################################*/

//Unchangeable settings: volume simulation size for the given example
#define SIZE_X (32)
#define SIZE_Y (16)
#define SIZE_Z (8)

//Changeable settings
//Padding in each dimension
#define PADDING_X (8)
#define PADDING_Y (0)
#define PADDING_Z (4)

//Pitch in each dimension
#define PADDED_X (SIZE_X+PADDING_X)
#define PADDED_Y (SIZE_Y+PADDING_Y)
#define PADDED_Z (SIZE_Z+PADDING_Z)

#define TOTAL_CELLS (SIZE_X*SIZE_Y*SIZE_Z)
#define TOTAL_PADDED_CELLS (PADDED_X*PADDED_Y*PADDED_Z)

//Flattening function
// This macro will be used to map a 3-D index and element to a value
// The macro below implements the equivalent of a 3-D array of
// 20-element structures in C standard layout.
// Every argument is parenthesized so that an expression may be passed
// for any of them, including the element 'e'.
#define CALC_INDEX(x,y,z,e) ( (e) + N_CELL_ENTRIES*\
                               ((x)+(y)*PADDED_X+(z)*PADDED_X*PADDED_Y) )

// Offset, in elements, of the cell (0,0,0) from the start of a grid
// allocation: two full z-planes.
#define MARGIN (CALC_INDEX(0, 0, 2, 0) - CALC_INDEX(0,0,0,0))

// Set this value to 1 for GATHER, or 0 for SCATTER
#if 1
#define GATHER
#else
#define SCATTER
#endif

//OpenCL block size (not trivially changeable here)
#define BLOCK_SIZE SIZE_X

/*############################################################################*/

// Index of each entry within a cell; N_CELL_ENTRIES is the entry count.
typedef enum {C = 0,
              N, S, E, W, T, B,
              NE, NW, SE, SW,
              NT, NB, ST, SB,
              ET, EB, WT, WB,
              FLAGS, N_CELL_ENTRIES} CELL_ENTRIES;

// The entries before FLAGS are the distribution functions.
#define N_DISTR_FUNCS FLAGS

// Bit values stored in the FLAGS entry of a cell.
typedef enum {OBSTACLE = 1 << 0,
              ACCEL = 1 << 1,
              IN_OUT_FLOW = 1 << 2} CELL_FLAGS;

#endif /* _LAYOUT_CONFIG_H_ */
|
||||
@@ -1,356 +0,0 @@
|
||||
/***************************************************************************
|
||||
*cr
|
||||
*cr (C) Copyright 2010 The Board of Trustees of the
|
||||
*cr University of Illinois
|
||||
*cr All Rights Reserved
|
||||
*cr
|
||||
***************************************************************************/
|
||||
|
||||
/*############################################################################*/
|
||||
|
||||
// includes, system
|
||||
#include <CL/cl.h>
|
||||
#include <math.h>
|
||||
#include <stdlib.h>
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
#include <float.h>
|
||||
|
||||
// includes, project
|
||||
#include "layout_config.h"
|
||||
#include "lbm_macros.h"
|
||||
#include "ocl.h"
|
||||
#include "lbm.h"
|
||||
|
||||
#include "parboil.h"
|
||||
|
||||
/******************************************************************************/
|
||||
|
||||
/* Run the kernel held in prm->clKernel once, with 'srcGrid' as its
 * first argument and 'dstGrid' as its second, and wait for the command
 * queue to finish.  Each OpenCL call is followed by CHECK_ERROR. */
void OpenCL_LBM_performStreamCollide( const OpenCL_Param* prm, cl_mem srcGrid, cl_mem dstGrid ) {
	/* Work-groups of SIZE_X work-items: SIZE_Y groups along the first
	 * dimension and SIZE_Z along the second. */
	size_t localSize[3] = { SIZE_X, 1, 1 };
	size_t globalSize[3] = { SIZE_X*SIZE_Y, SIZE_Z, 1 };
	cl_int clStatus;

	clStatus = clSetKernelArg( prm->clKernel, 0, sizeof(srcGrid), &srcGrid );
	CHECK_ERROR("clSetKernelArg")

	clStatus = clSetKernelArg( prm->clKernel, 1, sizeof(dstGrid), &dstGrid );
	CHECK_ERROR("clSetKernelArg")

	clStatus = clEnqueueNDRangeKernel( prm->clCommandQueue, prm->clKernel, 3,
	                                   NULL, globalSize, localSize,
	                                   0, NULL, NULL );
	CHECK_ERROR("clEnqueueNDRangeKernel")

	clStatus = clFinish( prm->clCommandQueue );
	CHECK_ERROR("clFinish")
}
|
||||
/*############################################################################*/
|
||||
|
||||
void LBM_allocateGrid( float** ptr ) {
|
||||
const size_t size = TOTAL_PADDED_CELLS * N_CELL_ENTRIES * sizeof(float);
|
||||
*ptr = (float*)malloc( size );
|
||||
if( !ptr ) {
|
||||
printf( "LBM_allocateGrid: could not allocate %.1f MByte\n",
|
||||
size / (1024.0*1024.0) );
|
||||
exit( 1 );
|
||||
}
|
||||
|
||||
memset( *ptr, 0, size );
|
||||
|
||||
printf( "LBM_allocateGrid: allocated %.1f MByte\n",
|
||||
size / (1024.0*1024.0) );
|
||||
|
||||
*ptr += MARGIN;
|
||||
}
|
||||
|
||||
/******************************************************************************/
|
||||
|
||||
/* Create a read-write OpenCL buffer in prm->clContext that holds
 * TOTAL_PADDED_CELLS cells of N_CELL_ENTRIES floats and store it in
 * *ptr.  The result of clCreateBuffer is checked with CHECK_ERROR.
 * The requested size is not compared with the device's maximum
 * allocation size. */
void OpenCL_LBM_allocateGrid( const OpenCL_Param* prm, cl_mem* ptr ) {
	const size_t bytes = TOTAL_PADDED_CELLS*N_CELL_ENTRIES*sizeof( float );
	cl_int clStatus;

	*ptr = clCreateBuffer( prm->clContext, CL_MEM_READ_WRITE, bytes, NULL, &clStatus );
	CHECK_ERROR("clCreateBuffer")
}
|
||||
|
||||
/*############################################################################*/
|
||||
|
||||
void LBM_freeGrid( float** ptr ) {
|
||||
free( *ptr-MARGIN );
|
||||
*ptr = NULL;
|
||||
}
|
||||
|
||||
/******************************************************************************/
|
||||
|
||||
/* Release the OpenCL memory object 'ptr'.  The status returned by
 * clReleaseMemObject is not examined. */
void OpenCL_LBM_freeGrid(cl_mem ptr) {
	(void)clReleaseMemObject(ptr);
}
|
||||
|
||||
/*############################################################################*/
|
||||
|
||||
/* Give every cell in z = 0 .. SIZE_Z-1 its initial contents: DFL1 in
 * the C entry, DFL2 in the six entries N, S, E, W, T, B, DFL3 in the
 * twelve remaining distribution entries, and all flags cleared.  The
 * distribution entries are addressed through the SRC_* macros. */
void LBM_initializeGrid( LBM_Grid grid ) {
	SWEEP_VAR

	SWEEP_START( 0, 0, 0, 0, 0, SIZE_Z )
		CLEAR_ALL_FLAGS_SWEEP( grid );

		/* center */
		SRC_C( grid ) = DFL1;

		/* single-letter directions */
		SRC_N( grid ) = DFL2;
		SRC_S( grid ) = DFL2;
		SRC_E( grid ) = DFL2;
		SRC_W( grid ) = DFL2;
		SRC_T( grid ) = DFL2;
		SRC_B( grid ) = DFL2;

		/* two-letter directions */
		SRC_NE( grid ) = DFL3;
		SRC_NW( grid ) = DFL3;
		SRC_SE( grid ) = DFL3;
		SRC_SW( grid ) = DFL3;
		SRC_NT( grid ) = DFL3;
		SRC_NB( grid ) = DFL3;
		SRC_ST( grid ) = DFL3;
		SRC_SB( grid ) = DFL3;
		SRC_ET( grid ) = DFL3;
		SRC_EB( grid ) = DFL3;
		SRC_WT( grid ) = DFL3;
		SRC_WB( grid ) = DFL3;
	SWEEP_END
}
|
||||
|
||||
/******************************************************************************/
|
||||
|
||||
/* Copy a whole host grid into the device buffer 'd_grid' with a
 * blocking write.  'h_grid' is expected to point MARGIN elements past
 * the start of the host allocation; the copy starts at the allocation
 * start and covers TOTAL_PADDED_CELLS*N_CELL_ENTRIES floats. */
void OpenCL_LBM_initializeGrid( const OpenCL_Param* prm, cl_mem d_grid, LBM_Grid h_grid ) {
	const size_t bytes = TOTAL_PADDED_CELLS*N_CELL_ENTRIES*sizeof( float );
	float* const hostBase = h_grid-MARGIN;
	cl_int clStatus;

	clStatus = clEnqueueWriteBuffer( prm->clCommandQueue, d_grid, CL_TRUE,
	                                 0, bytes, hostBase,
	                                 0, NULL, NULL );
	CHECK_ERROR("clEnqueueWriteBuffer")
}
|
||||
|
||||
/* Copy the whole device buffer 'd_grid' back into a host grid with a
 * blocking read.  'h_grid' is expected to point MARGIN elements past
 * the start of the host allocation; the copy starts at the allocation
 * start and covers TOTAL_PADDED_CELLS*N_CELL_ENTRIES floats. */
void OpenCL_LBM_getDeviceGrid( const OpenCL_Param* prm, cl_mem d_grid, LBM_Grid h_grid ) {
	const size_t bytes = TOTAL_PADDED_CELLS*N_CELL_ENTRIES*sizeof( float );
	float* const hostBase = h_grid-MARGIN;
	cl_int clStatus;

	clStatus = clEnqueueReadBuffer( prm->clCommandQueue, d_grid, CL_TRUE,
	                                0, bytes, hostBase,
	                                0, NULL, NULL );
	CHECK_ERROR("clEnqueueReadBuffer")
}
|
||||
|
||||
/*############################################################################*/
|
||||
|
||||
/* Exchange the two buffer handles that 'grid1' and 'grid2' point to. */
void LBM_swapGrids( cl_mem* grid1, cl_mem* grid2 ) {
	cl_mem first = *grid1;
	cl_mem second = *grid2;

	*grid1 = second;
	*grid2 = first;
}
|
||||
|
||||
/*############################################################################*/
|
||||
|
||||
/* Mark obstacle cells of 'grid' according to the file 'filename'.
 *
 * The file is read one character per cell, x fastest, then y, then z.
 * Any character other than '.' sets the OBSTACLE flag of that cell.
 * One extra character is skipped after every row of SIZE_X cells and
 * one more after every plane (presumably line breaks; confirm against
 * the input files).
 *
 * Prints a message to stderr and exits with status 1 when the file
 * cannot be opened. */
void LBM_loadObstacleFile( LBM_Grid grid, const char* filename ) {
	int x, y, z;

	FILE* file = fopen( filename, "rb" );

	if( file == NULL ) {
		fprintf( stderr, "LBM_loadObstacleFile: could not open '%s'\n", filename );
		exit( 1 );
	}

	for( z = 0; z < SIZE_Z; z++ ) {
		for( y = 0; y < SIZE_Y; y++ ) {
			for( x = 0; x < SIZE_X; x++ ) {
				if( fgetc( file ) != '.' ) {
					SET_FLAG( grid, x, y, z, OBSTACLE );
				}
			}
			fgetc( file );	/* skip the character after each row */
		}
		fgetc( file );	/* skip the character after each plane */
	}

	fclose( file );
}
|
||||
|
||||
/*############################################################################*/
|
||||
|
||||
/* Flag the special cells of 'grid'.
 *
 * z runs from -2 to SIZE_Z+1, so the planes on either side of the
 * volume are visited as well.  A cell whose x, y or z is 0 or the last
 * index of its dimension gets the OBSTACLE flag.  Of the remaining
 * cells, those with z equal to 1 or SIZE_Z-2 and with x and y at least
 * two cells away from both ends get the ACCEL flag. */
void LBM_initializeSpecialCellsForLDC( LBM_Grid grid ) {
	int x, y, z;

	for( z = -2; z < SIZE_Z+2; z++ ) {
		const int zOnBorder = ( z == 0 || z == SIZE_Z-1 );
		const int zAccelPlane = ( z == 1 || z == SIZE_Z-2 );

		for( y = 0; y < SIZE_Y; y++ ) {
			const int yOnBorder = ( y == 0 || y == SIZE_Y-1 );
			const int yInside = ( y > 1 && y < SIZE_Y-2 );

			for( x = 0; x < SIZE_X; x++ ) {
				const int xOnBorder = ( x == 0 || x == SIZE_X-1 );
				const int xInside = ( x > 1 && x < SIZE_X-2 );

				if( xOnBorder || yOnBorder || zOnBorder ) {
					SET_FLAG( grid, x, y, z, OBSTACLE );
				}
				else if( zAccelPlane && xInside && yInside ) {
					SET_FLAG( grid, x, y, z, ACCEL );
				}
			}
		}
	}
}
|
||||
|
||||
/*############################################################################*/
|
||||
|
||||
/* Print a summary of 'grid' to stdout.
 *
 * For every cell in z = 0 .. SIZE_Z-1 the sum of the 19 distribution
 * entries (rho) is formed; its minimum, maximum and total are
 * reported.  Cells are counted as obstacle, accel or fluid from their
 * flags.  For the cells that are not obstacles the squared velocity is
 * also formed, and the square roots of its minimum and maximum are
 * reported as minU and maxU. */
void LBM_showGridStatistics( LBM_Grid grid ) {
	int nObstacleCells = 0,
	    nAccelCells = 0,
	    nFluidCells = 0;
	float ux, uy, uz;
	float minU2 = 1e+30, maxU2 = -1e+30, u2;
	float minRho = 1e+30, maxRho = -1e+30, rho;
	float mass = 0;

	SWEEP_VAR

	SWEEP_START( 0, 0, 0, 0, 0, SIZE_Z )
		rho = LOCAL( grid, C ) + LOCAL( grid, N )
		    + LOCAL( grid, S ) + LOCAL( grid, E )
		    + LOCAL( grid, W ) + LOCAL( grid, T )
		    + LOCAL( grid, B ) + LOCAL( grid, NE )
		    + LOCAL( grid, NW ) + LOCAL( grid, SE )
		    + LOCAL( grid, SW ) + LOCAL( grid, NT )
		    + LOCAL( grid, NB ) + LOCAL( grid, ST )
		    + LOCAL( grid, SB ) + LOCAL( grid, ET )
		    + LOCAL( grid, EB ) + LOCAL( grid, WT )
		    + LOCAL( grid, WB );

		minRho = ( rho < minRho ) ? rho : minRho;
		maxRho = ( rho > maxRho ) ? rho : maxRho;
		mass += rho;

		if( TEST_FLAG_SWEEP( grid, OBSTACLE )) {
			nObstacleCells++;
		}
		else {
			if( TEST_FLAG_SWEEP( grid, ACCEL )) {
				nAccelCells++;
			}
			else {
				nFluidCells++;
			}

			ux = + LOCAL( grid, E ) - LOCAL( grid, W )
			     + LOCAL( grid, NE ) - LOCAL( grid, NW )
			     + LOCAL( grid, SE ) - LOCAL( grid, SW )
			     + LOCAL( grid, ET ) + LOCAL( grid, EB )
			     - LOCAL( grid, WT ) - LOCAL( grid, WB );
			uy = + LOCAL( grid, N ) - LOCAL( grid, S )
			     + LOCAL( grid, NE ) + LOCAL( grid, NW )
			     - LOCAL( grid, SE ) - LOCAL( grid, SW )
			     + LOCAL( grid, NT ) + LOCAL( grid, NB )
			     - LOCAL( grid, ST ) - LOCAL( grid, SB );
			uz = + LOCAL( grid, T ) - LOCAL( grid, B )
			     + LOCAL( grid, NT ) - LOCAL( grid, NB )
			     + LOCAL( grid, ST ) - LOCAL( grid, SB )
			     + LOCAL( grid, ET ) - LOCAL( grid, EB )
			     + LOCAL( grid, WT ) - LOCAL( grid, WB );
			u2 = (ux*ux + uy*uy + uz*uz) / (rho*rho);

			minU2 = ( u2 < minU2 ) ? u2 : minU2;
			maxU2 = ( u2 > maxU2 ) ? u2 : maxU2;
		}
	SWEEP_END

	printf( "LBM_showGridStatistics:\n"
	        "\tnObstacleCells: %7i nAccelCells: %7i nFluidCells: %7i\n"
	        "\tminRho: %8.4f maxRho: %8.4f mass: %e\n"
	        "\tminU: %e maxU: %e\n\n",
	        nObstacleCells, nAccelCells, nFluidCells,
	        minRho, maxRho, mass,
	        sqrt( minU2 ), sqrt( maxU2 ) );
}
|
||||
|
||||
/*############################################################################*/
|
||||
|
||||
/*
 * Writes the single value *v to `file` as sizeof( OUTPUT_PRECISION ) raw
 * bytes, least-significant byte first.
 *
 * The host byte order is probed at run time: on a little-endian host the
 * object is written as-is, on a big-endian host a byte-reversed copy is
 * written instead.  The result of fwrite() is not inspected.
 */
static void storeValue( FILE* file, OUTPUT_PRECISION* v ) {
  const int endianProbe = 1;
  const int hostIsBigEndian = ( *((const unsigned char*) &endianProbe) == 0 );
  char reversed[sizeof( OUTPUT_PRECISION )];
  const char* src;
  char* dst;

  if( !hostIsBigEndian ) {
    fwrite( v, sizeof( OUTPUT_PRECISION ), 1, file );
    return;
  }

  /* Walk the source backwards while filling the scratch buffer forwards. */
  src = (const char*) v + sizeof( OUTPUT_PRECISION );
  for( dst = reversed; dst != reversed + sizeof( OUTPUT_PRECISION ); dst++ )
    *dst = *--src;

  fwrite( reversed, sizeof( OUTPUT_PRECISION ), 1, file );
}
|
||||
|
||||
/*############################################################################*/
|
||||
|
||||
/*
 * Reads one value of sizeof( OUTPUT_PRECISION ) bytes from `file` into *v.
 * The bytes in the file are taken to be least-significant first (the layout
 * storeValue() produces); on a big-endian host they are reversed while being
 * copied into *v.
 *
 * If the full value cannot be read (EOF or I/O error), *v is left untouched.
 * Previously the big-endian path copied an uninitialised scratch buffer into
 * *v in that situation.
 */
static void loadValue( FILE* file, OUTPUT_PRECISION* v ) {
  const int litteBigEndianTest = 1;
  const int bigEndian = ( *((const unsigned char*) &litteBigEndianTest) == 0 );
  char buffer[sizeof( OUTPUT_PRECISION )];
  char* vPtr = (char*) v;
  size_t i;

  /* Stage the bytes first so that a short read never modifies *v. */
  if( fread( buffer, sizeof( OUTPUT_PRECISION ), 1, file ) != 1 )
    return;

  for( i = 0; i < sizeof( OUTPUT_PRECISION ); i++ )
    vPtr[i] = bigEndian ? buffer[sizeof( OUTPUT_PRECISION ) - i - 1]
                        : buffer[i];
}
|
||||
|
||||
/*############################################################################*/
|
||||
|
||||
/*
 * Writes one velocity triple per swept cell of `grid` to `filename`.
 *
 * For every cell visited by SWEEP_START(0,0,0,SIZE_X,SIZE_Y,SIZE_Z) the 19
 * SRC_* entries are summed into rho, three signed sums of those entries give
 * ux, uy and uz, and each component is divided by rho.
 *
 *   binary != 0 : the file is opened "wb" and ux, uy, uz are written with
 *                 storeValue().
 *   binary == 0 : the file is opened "w" and each triple is written as a
 *                 "%e %e %e\n" text line.
 *
 * If the file cannot be opened (or `filename` is NULL) a message is printed
 * to stderr and nothing is written; previously the NULL stream was passed on
 * to the output calls and to fclose().
 */
void LBM_storeVelocityField( LBM_Grid grid, const char* filename,
                             const int binary ) {
  OUTPUT_PRECISION rho, ux, uy, uz;
  FILE* file;

  SWEEP_VAR

  file = (filename != NULL) ? fopen( filename, (binary ? "wb" : "w") ) : NULL;
  if( file == NULL ) {
    fprintf( stderr, "LBM_storeVelocityField: cannot open '%s' for writing\n",
             (filename != NULL) ? filename : "<null>" );
    return;
  }

  SWEEP_START(0,0,0,SIZE_X,SIZE_Y,SIZE_Z)
    /* Summation order is kept exactly as before so results stay bit-identical. */
    rho = + SRC_C ( grid ) + SRC_N ( grid )
          + SRC_S ( grid ) + SRC_E ( grid )
          + SRC_W ( grid ) + SRC_T ( grid )
          + SRC_B ( grid ) + SRC_NE( grid )
          + SRC_NW( grid ) + SRC_SE( grid )
          + SRC_SW( grid ) + SRC_NT( grid )
          + SRC_NB( grid ) + SRC_ST( grid )
          + SRC_SB( grid ) + SRC_ET( grid )
          + SRC_EB( grid ) + SRC_WT( grid )
          + SRC_WB( grid );
    ux = + SRC_E ( grid ) - SRC_W ( grid )
         + SRC_NE( grid ) - SRC_NW( grid )
         + SRC_SE( grid ) - SRC_SW( grid )
         + SRC_ET( grid ) + SRC_EB( grid )
         - SRC_WT( grid ) - SRC_WB( grid );
    uy = + SRC_N ( grid ) - SRC_S ( grid )
         + SRC_NE( grid ) + SRC_NW( grid )
         - SRC_SE( grid ) - SRC_SW( grid )
         + SRC_NT( grid ) + SRC_NB( grid )
         - SRC_ST( grid ) - SRC_SB( grid );
    uz = + SRC_T ( grid ) - SRC_B ( grid )
         + SRC_NT( grid ) - SRC_NB( grid )
         + SRC_ST( grid ) - SRC_SB( grid )
         + SRC_ET( grid ) - SRC_EB( grid )
         + SRC_WT( grid ) - SRC_WB( grid );
    ux /= rho;
    uy /= rho;
    uz /= rho;

    if( binary ) {
      storeValue( file, &ux );
      storeValue( file, &uy );
      storeValue( file, &uz );
    } else
      fprintf( file, "%e %e %e\n", ux, uy, uz );

  SWEEP_END;

  /* fclose() flushes buffered output, so a failure here means lost data. */
  if( fclose( file ) != 0 )
    fprintf( stderr, "LBM_storeVelocityField: error closing '%s'\n", filename );
}
|
||||
@@ -1,39 +0,0 @@
|
||||
/***************************************************************************
|
||||
*cr
|
||||
*cr (C) Copyright 2010 The Board of Trustees of the
|
||||
*cr University of Illinois
|
||||
*cr All Rights Reserved
|
||||
*cr
|
||||
***************************************************************************/
|
||||
|
||||
/*############################################################################*/
|
||||
|
||||
#ifndef _LBM_H_
#define _LBM_H_

/*############################################################################*/

#include "ocl.h"
#include "lbm_macros.h"

/* Routines operating on grids addressed through plain float pointers. ********/

/* Allocation / release of a grid through a pointer-to-pointer. */
void LBM_allocateGrid( float** ptr );
void LBM_freeGrid( float** ptr );

/* Grid set-up. */
void LBM_initializeGrid( LBM_Grid grid );
void LBM_initializeSpecialCellsForLDC( LBM_Grid grid );
void LBM_loadObstacleFile( LBM_Grid grid, const char* filename );

/* Exchanges the two buffer handles (called once per time step by main). */
void LBM_swapGrids( cl_mem* grid1, cl_mem* grid2 );

/* Prints cell counts, density range, mass and velocity range to stdout. */
void LBM_showGridStatistics( LBM_Grid Grid );

/* Writes one (ux, uy, uz) triple per cell, binary or "%e" text. */
void LBM_storeVelocityField( LBM_Grid grid, const char* filename,
                             const BOOL binary );

/* OpenCL *********************************************************************/

void OpenCL_LBM_allocateGrid( const OpenCL_Param* prm, cl_mem* ptr );
void OpenCL_LBM_freeGrid( cl_mem ptr );
void OpenCL_LBM_initializeGrid( const OpenCL_Param* prm,
                                cl_mem d_grid, LBM_Grid h_grid );
void OpenCL_LBM_getDeviceGrid( const OpenCL_Param* prm,
                               cl_mem d_grid, LBM_Grid h_grid );
void OpenCL_LBM_performStreamCollide( const OpenCL_Param* prm,
                                      cl_mem srcGrid, cl_mem dstGrid );

/*############################################################################*/

#endif /* _LBM_H_ */
|
||||
@@ -1,177 +0,0 @@
|
||||
/***************************************************************************
|
||||
*cr
|
||||
*cr (C) Copyright 2010 The Board of Trustees of the
|
||||
*cr University of Illinois
|
||||
*cr All Rights Reserved
|
||||
*cr
|
||||
***************************************************************************/
|
||||
|
||||
/*
 * Include guard.  The test used to be spelled _LBM_MARCOS_H while the macro
 * defined below is _LBM_MACROS_H_, so the guard never took effect and a
 * second inclusion re-declared everything in this header.
 */
#ifndef _LBM_MACROS_H_
#define _LBM_MACROS_H_

/* NOTE(review): presumably the LBM relaxation parameter - confirm in kernel. */
#define OMEGA (1.95f)

/* Element type used when writing result files (see storeValue/loadValue). */
#define OUTPUT_PRECISION float

/* Home-grown boolean.  Note that TRUE is -1 (all bits set), not 1. */
#define BOOL int
#define TRUE (-1)
#define FALSE (0)

/* NOTE(review): 1/3, 1/18, 1/36 look like D3Q19 lattice weights - confirm. */
#define DFL1 (1.0f/ 3.0f)
#define DFL2 (1.0f/18.0f)
#define DFL3 (1.0f/36.0f)

/*############################################################################*/

typedef float* LBM_Grid;//float LBM_Grid[PADDED_Z*PADDED_Y*PADDED_X*N_CELL_ENTRIES];
typedef LBM_Grid* LBM_GridPtr;

/*############################################################################*/

/*
 * Sweep helpers: SWEEP_VAR declares the three loop counters, SWEEP_START
 * opens three nested for-loops (z outermost, x innermost) and SWEEP_END
 * closes them.
 *
 * Only z1 and z2 are honoured.  The x and y loops always run over
 * [0, SIZE_X) and [0, SIZE_Y); the x1/y1/x2/y2 arguments are accepted but
 * ignored.  This is left as-is because existing callers may depend on it.
 */
#define SWEEP_X __temp_x__
#define SWEEP_Y __temp_y__
#define SWEEP_Z __temp_z__
#define SWEEP_VAR int __temp_x__, __temp_y__, __temp_z__;

#define SWEEP_START(x1,y1,z1,x2,y2,z2) \
	for( __temp_z__ = z1; \
	     __temp_z__ < z2; \
	     __temp_z__++) { \
		for( __temp_y__ = 0; \
		     __temp_y__ < SIZE_Y; \
		     __temp_y__++) { \
			for(__temp_x__ = 0; \
			    __temp_x__ < SIZE_X; \
			    __temp_x__++) {

#define SWEEP_END }}}


/*
 * Element access.  CALC_INDEX is supplied elsewhere (not by this header).
 * GRID_ENTRY uses absolute coordinates; GRID_ENTRY_SWEEP takes offsets
 * relative to the current sweep position.
 */
#define GRID_ENTRY(g,x,y,z,e)          ((g)[CALC_INDEX( x,  y,  z, e)])
#define GRID_ENTRY_SWEEP(g,dx,dy,dz,e) ((g)[CALC_INDEX((dx)+SWEEP_X, (dy)+SWEEP_Y, (dz)+SWEEP_Z, e)])

/* Entry e of the current cell, and of the cell at the named (dx,dy,dz) offset. */
#define LOCAL(g,e)       (GRID_ENTRY_SWEEP( g,  0,  0,  0, e ))
#define NEIGHBOR_C(g,e)  (GRID_ENTRY_SWEEP( g,  0,  0,  0, e ))
#define NEIGHBOR_N(g,e)  (GRID_ENTRY_SWEEP( g,  0, +1,  0, e ))
#define NEIGHBOR_S(g,e)  (GRID_ENTRY_SWEEP( g,  0, -1,  0, e ))
#define NEIGHBOR_E(g,e)  (GRID_ENTRY_SWEEP( g, +1,  0,  0, e ))
#define NEIGHBOR_W(g,e)  (GRID_ENTRY_SWEEP( g, -1,  0,  0, e ))
#define NEIGHBOR_T(g,e)  (GRID_ENTRY_SWEEP( g,  0,  0, +1, e ))
#define NEIGHBOR_B(g,e)  (GRID_ENTRY_SWEEP( g,  0,  0, -1, e ))
#define NEIGHBOR_NE(g,e) (GRID_ENTRY_SWEEP( g, +1, +1,  0, e ))
#define NEIGHBOR_NW(g,e) (GRID_ENTRY_SWEEP( g, -1, +1,  0, e ))
#define NEIGHBOR_SE(g,e) (GRID_ENTRY_SWEEP( g, +1, -1,  0, e ))
#define NEIGHBOR_SW(g,e) (GRID_ENTRY_SWEEP( g, -1, -1,  0, e ))
#define NEIGHBOR_NT(g,e) (GRID_ENTRY_SWEEP( g,  0, +1, +1, e ))
#define NEIGHBOR_NB(g,e) (GRID_ENTRY_SWEEP( g,  0, +1, -1, e ))
#define NEIGHBOR_ST(g,e) (GRID_ENTRY_SWEEP( g,  0, -1, +1, e ))
#define NEIGHBOR_SB(g,e) (GRID_ENTRY_SWEEP( g,  0, -1, -1, e ))
#define NEIGHBOR_ET(g,e) (GRID_ENTRY_SWEEP( g, +1,  0, +1, e ))
#define NEIGHBOR_EB(g,e) (GRID_ENTRY_SWEEP( g, +1,  0, -1, e ))
#define NEIGHBOR_WT(g,e) (GRID_ENTRY_SWEEP( g, -1,  0, +1, e ))
#define NEIGHBOR_WB(g,e) (GRID_ENTRY_SWEEP( g, -1,  0, -1, e ))


#ifdef SCATTER

/* SCATTER: sources are the current cell, destinations are the neighbours
 * lying in the entry's own direction. */
#define SRC_C(g)  (LOCAL( g, C  ))
#define SRC_N(g)  (LOCAL( g, N  ))
#define SRC_S(g)  (LOCAL( g, S  ))
#define SRC_E(g)  (LOCAL( g, E  ))
#define SRC_W(g)  (LOCAL( g, W  ))
#define SRC_T(g)  (LOCAL( g, T  ))
#define SRC_B(g)  (LOCAL( g, B  ))
#define SRC_NE(g) (LOCAL( g, NE ))
#define SRC_NW(g) (LOCAL( g, NW ))
#define SRC_SE(g) (LOCAL( g, SE ))
#define SRC_SW(g) (LOCAL( g, SW ))
#define SRC_NT(g) (LOCAL( g, NT ))
#define SRC_NB(g) (LOCAL( g, NB ))
#define SRC_ST(g) (LOCAL( g, ST ))
#define SRC_SB(g) (LOCAL( g, SB ))
#define SRC_ET(g) (LOCAL( g, ET ))
#define SRC_EB(g) (LOCAL( g, EB ))
#define SRC_WT(g) (LOCAL( g, WT ))
#define SRC_WB(g) (LOCAL( g, WB ))

#define DST_C(g)  (NEIGHBOR_C ( g, C  ))
#define DST_N(g)  (NEIGHBOR_N ( g, N  ))
#define DST_S(g)  (NEIGHBOR_S ( g, S  ))
#define DST_E(g)  (NEIGHBOR_E ( g, E  ))
#define DST_W(g)  (NEIGHBOR_W ( g, W  ))
#define DST_T(g)  (NEIGHBOR_T ( g, T  ))
#define DST_B(g)  (NEIGHBOR_B ( g, B  ))
#define DST_NE(g) (NEIGHBOR_NE( g, NE ))
#define DST_NW(g) (NEIGHBOR_NW( g, NW ))
#define DST_SE(g) (NEIGHBOR_SE( g, SE ))
#define DST_SW(g) (NEIGHBOR_SW( g, SW ))
#define DST_NT(g) (NEIGHBOR_NT( g, NT ))
#define DST_NB(g) (NEIGHBOR_NB( g, NB ))
#define DST_ST(g) (NEIGHBOR_ST( g, ST ))
#define DST_SB(g) (NEIGHBOR_SB( g, SB ))
#define DST_ET(g) (NEIGHBOR_ET( g, ET ))
#define DST_EB(g) (NEIGHBOR_EB( g, EB ))
#define DST_WT(g) (NEIGHBOR_WT( g, WT ))
#define DST_WB(g) (NEIGHBOR_WB( g, WB ))

#else /* GATHER */

/* GATHER: each source entry is read from the neighbour in the OPPOSITE
 * direction (e.g. SRC_N reads entry N of the southern neighbour);
 * destinations are the current cell. */
#define SRC_C(g)  (NEIGHBOR_C ( g, C  ))
#define SRC_N(g)  (NEIGHBOR_S ( g, N  ))
#define SRC_S(g)  (NEIGHBOR_N ( g, S  ))
#define SRC_E(g)  (NEIGHBOR_W ( g, E  ))
#define SRC_W(g)  (NEIGHBOR_E ( g, W  ))
#define SRC_T(g)  (NEIGHBOR_B ( g, T  ))
#define SRC_B(g)  (NEIGHBOR_T ( g, B  ))
#define SRC_NE(g) (NEIGHBOR_SW( g, NE ))
#define SRC_NW(g) (NEIGHBOR_SE( g, NW ))
#define SRC_SE(g) (NEIGHBOR_NW( g, SE ))
#define SRC_SW(g) (NEIGHBOR_NE( g, SW ))
#define SRC_NT(g) (NEIGHBOR_SB( g, NT ))
#define SRC_NB(g) (NEIGHBOR_ST( g, NB ))
#define SRC_ST(g) (NEIGHBOR_NB( g, ST ))
#define SRC_SB(g) (NEIGHBOR_NT( g, SB ))
#define SRC_ET(g) (NEIGHBOR_WB( g, ET ))
#define SRC_EB(g) (NEIGHBOR_WT( g, EB ))
#define SRC_WT(g) (NEIGHBOR_EB( g, WT ))
#define SRC_WB(g) (NEIGHBOR_ET( g, WB ))

#define DST_C(g)  (LOCAL( g, C  ))
#define DST_N(g)  (LOCAL( g, N  ))
#define DST_S(g)  (LOCAL( g, S  ))
#define DST_E(g)  (LOCAL( g, E  ))
#define DST_W(g)  (LOCAL( g, W  ))
#define DST_T(g)  (LOCAL( g, T  ))
#define DST_B(g)  (LOCAL( g, B  ))
#define DST_NE(g) (LOCAL( g, NE ))
#define DST_NW(g) (LOCAL( g, NW ))
#define DST_SE(g) (LOCAL( g, SE ))
#define DST_SW(g) (LOCAL( g, SW ))
#define DST_NT(g) (LOCAL( g, NT ))
#define DST_NB(g) (LOCAL( g, NB ))
#define DST_ST(g) (LOCAL( g, ST ))
#define DST_SB(g) (LOCAL( g, SB ))
#define DST_ET(g) (LOCAL( g, ET ))
#define DST_EB(g) (LOCAL( g, EB ))
#define DST_WT(g) (LOCAL( g, WT ))
#define DST_WB(g) (LOCAL( g, WB ))

#endif /* GATHER */

/*
 * Flag handling: the FLAGS entry of a cell is accessed as an unsigned int
 * bit set by casting the address of the float slot.
 * NOTE(review): reading a float object through unsigned int* is not covered
 * by the C aliasing rules; a memcpy-based accessor would be the portable
 * form.  Left unchanged here to keep the macro interface intact.
 *
 * The SET_/CLEAR_ macros expand to a brace-enclosed block that declares a
 * local named _aux_.
 */
#define MAGIC_CAST(v) ((unsigned int*) ((void*) (&(v))))
#define FLAG_VAR(v) unsigned int* _aux_ = MAGIC_CAST(v)

#define TEST_FLAG_SWEEP(g,f)     ((*MAGIC_CAST(LOCAL(g, FLAGS))) & (f))
#define SET_FLAG_SWEEP(g,f)      {FLAG_VAR(LOCAL(g, FLAGS)); (*_aux_) |=  (f);}
#define CLEAR_FLAG_SWEEP(g,f)    {FLAG_VAR(LOCAL(g, FLAGS)); (*_aux_) &= ~(f);}
#define CLEAR_ALL_FLAGS_SWEEP(g) {FLAG_VAR(LOCAL(g, FLAGS)); (*_aux_)  =    0;}

#define TEST_FLAG(g,x,y,z,f)     ((*MAGIC_CAST(GRID_ENTRY(g, x, y, z, FLAGS))) & (f))
#define SET_FLAG(g,x,y,z,f)      {FLAG_VAR(GRID_ENTRY(g, x, y, z, FLAGS)); (*_aux_) |=  (f);}
#define CLEAR_FLAG(g,x,y,z,f)    {FLAG_VAR(GRID_ENTRY(g, x, y, z, FLAGS)); (*_aux_) &= ~(f);}
#define CLEAR_ALL_FLAGS(g,x,y,z) {FLAG_VAR(GRID_ENTRY(g, x, y, z, FLAGS)); (*_aux_)  =    0;}

/*############################################################################*/

#endif /* _LBM_MACROS_H_ */
|
||||
Binary file not shown.
@@ -1,238 +0,0 @@
|
||||
/***************************************************************************
|
||||
*cr
|
||||
*cr (C) Copyright 2010 The Board of Trustees of the
|
||||
*cr University of Illinois
|
||||
*cr All Rights Reserved
|
||||
*cr
|
||||
***************************************************************************/
|
||||
|
||||
/*############################################################################*/
|
||||
|
||||
#include <CL/cl.h>
|
||||
#include <parboil.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <sys/stat.h>
|
||||
|
||||
|
||||
#include "layout_config.h"
|
||||
#include "lbm.h"
|
||||
#include "lbm_macros.h"
|
||||
#include "main.h"
|
||||
#include "ocl.h"
|
||||
|
||||
|
||||
/*############################################################################*/
|
||||
|
||||
/* OpenCL buffer handles for the two grids: allocated and filled in
 * MAIN_initialize, passed as source/destination to
 * OpenCL_LBM_performStreamCollide and exchanged by LBM_swapGrids after every
 * time step in main, released in MAIN_finalize. */
static cl_mem OpenCL_srcGrid, OpenCL_dstGrid;
|
||||
|
||||
/*############################################################################*/
|
||||
|
||||
/* Timer set shared by every function in this file: initialised at the start
 * of main, switched between phases with pb_SwitchToTimer, printed at exit. */
struct pb_TimerSet timers;
|
||||
int main(int nArgs, char *arg[]) {
|
||||
MAIN_Param param;
|
||||
int t;
|
||||
|
||||
OpenCL_Param prm;
|
||||
|
||||
pb_InitializeTimerSet(&timers);
|
||||
struct pb_Parameters *params;
|
||||
params = pb_ReadParameters(&nArgs, arg);
|
||||
|
||||
params->inpFiles = (char **)malloc(sizeof(char *) * 2);
|
||||
params->inpFiles[0] = (char *)malloc(100);
|
||||
params->inpFiles[1] = NULL;
|
||||
strncpy(params->inpFiles[0], "120_120_150_ldc.of", 100);
|
||||
|
||||
static LBM_GridPtr TEMP_srcGrid;
|
||||
// Setup TEMP datastructures
|
||||
LBM_allocateGrid((float **)&TEMP_srcGrid);
|
||||
MAIN_parseCommandLine(nArgs, arg, ¶m, params);
|
||||
MAIN_printInfo(¶m);
|
||||
|
||||
OpenCL_initialize(params, &prm);
|
||||
MAIN_initialize(¶m, &prm);
|
||||
|
||||
for (t = 1; t <= param.nTimeSteps; t++) {
|
||||
pb_SwitchToTimer(&timers, pb_TimerID_KERNEL);
|
||||
OpenCL_LBM_performStreamCollide(&prm, OpenCL_srcGrid, OpenCL_dstGrid);
|
||||
pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
|
||||
LBM_swapGrids(&OpenCL_srcGrid, &OpenCL_dstGrid);
|
||||
|
||||
if ((t & 63) == 0) {
|
||||
printf("timestep: %i\n", t);
|
||||
#if 0
|
||||
CUDA_LBM_getDeviceGrid((float**)&CUDA_srcGrid, (float**)&TEMP_srcGrid);
|
||||
LBM_showGridStatistics( *TEMP_srcGrid );
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
MAIN_finalize(¶m, &prm);
|
||||
|
||||
LBM_freeGrid((float **)&TEMP_srcGrid);
|
||||
|
||||
pb_SwitchToTimer(&timers, pb_TimerID_NONE);
|
||||
pb_PrintTimerSet(&timers);
|
||||
pb_FreeParameters(params);
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*############################################################################*/
|
||||
|
||||
/*
 * Fills *param from the already-parsed benchmark parameters.
 *
 *   nTimeSteps       : fixed at 4 (the command line is not consulted);
 *   obstacleFilename : params->inpFiles[0], which may be NULL;
 *   resultFilename   : params->outFile.
 *
 * nArgs and arg are accepted for interface compatibility but unused.
 *
 * The obstacle file used to be validated with stat() against an expected
 * size of SIZE_X*SIZE_Y*SIZE_Z + (SIZE_Y+1)*SIZE_Z bytes; that check was
 * already disabled, so the now-unused `struct stat` local has been removed.
 */
void MAIN_parseCommandLine(int nArgs, char *arg[], MAIN_Param *param,
                           struct pb_Parameters *params) {
  (void)nArgs;
  (void)arg;

  param->nTimeSteps = 4;

  /* Equivalent to the former "if non-NULL use it, else NULL" branch. */
  param->obstacleFilename = params->inpFiles[0];
  param->resultFilename = params->outFile;
}
|
||||
|
||||
/*############################################################################*/
|
||||
|
||||
void MAIN_printInfo(const MAIN_Param *param) {
|
||||
printf("MAIN_printInfo:\n"
|
||||
"\tgrid size : %i x %i x %i = %.2f * 10^6 Cells\n"
|
||||
"\tnTimeSteps : %i\n"
|
||||
"\tresult file : %s\n"
|
||||
"\taction : %s\n"
|
||||
"\tsimulation type: %s\n"
|
||||
"\tobstacle file : %s\n\n",
|
||||
SIZE_X, SIZE_Y, SIZE_Z, 1e-6 * SIZE_X * SIZE_Y * SIZE_Z,
|
||||
param->nTimeSteps, param->resultFilename, "store", "lid-driven cavity",
|
||||
(param->obstacleFilename == NULL) ? "<none>"
|
||||
: param->obstacleFilename);
|
||||
}
|
||||
|
||||
/*############################################################################*/
|
||||
|
||||
void MAIN_initialize(const MAIN_Param *param, const OpenCL_Param *prm) {
|
||||
static LBM_Grid TEMP_srcGrid, TEMP_dstGrid;
|
||||
|
||||
pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
|
||||
// Setup TEMP datastructures
|
||||
LBM_allocateGrid((float **)&TEMP_srcGrid);
|
||||
LBM_allocateGrid((float **)&TEMP_dstGrid);
|
||||
LBM_initializeGrid(TEMP_srcGrid);
|
||||
LBM_initializeGrid(TEMP_dstGrid);
|
||||
|
||||
pb_SwitchToTimer(&timers, pb_TimerID_IO);
|
||||
if (param->obstacleFilename != NULL) {
|
||||
LBM_loadObstacleFile(TEMP_srcGrid, param->obstacleFilename);
|
||||
LBM_loadObstacleFile(TEMP_dstGrid, param->obstacleFilename);
|
||||
}
|
||||
|
||||
pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
|
||||
LBM_initializeSpecialCellsForLDC(TEMP_srcGrid);
|
||||
LBM_initializeSpecialCellsForLDC(TEMP_dstGrid);
|
||||
|
||||
pb_SwitchToTimer(&timers, pb_TimerID_COPY);
|
||||
|
||||
printf("OK+\n");
|
||||
|
||||
// Setup DEVICE datastructures
|
||||
OpenCL_LBM_allocateGrid(prm, &OpenCL_srcGrid);
|
||||
OpenCL_LBM_allocateGrid(prm, &OpenCL_dstGrid);
|
||||
|
||||
printf("OK-\n");
|
||||
|
||||
// Initialize DEVICE datastructures
|
||||
OpenCL_LBM_initializeGrid(prm, OpenCL_srcGrid, TEMP_srcGrid);
|
||||
OpenCL_LBM_initializeGrid(prm, OpenCL_dstGrid, TEMP_dstGrid);
|
||||
|
||||
pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
|
||||
LBM_showGridStatistics(TEMP_srcGrid);
|
||||
|
||||
LBM_freeGrid((float **)&TEMP_srcGrid);
|
||||
LBM_freeGrid((float **)&TEMP_dstGrid);
|
||||
|
||||
printf("OK\n");
|
||||
}
|
||||
|
||||
/*############################################################################*/
|
||||
|
||||
void MAIN_finalize(const MAIN_Param *param, const OpenCL_Param *prm) {
|
||||
LBM_Grid TEMP_srcGrid;
|
||||
|
||||
// Setup TEMP datastructures
|
||||
LBM_allocateGrid((float **)&TEMP_srcGrid);
|
||||
|
||||
pb_SwitchToTimer(&timers, pb_TimerID_COPY);
|
||||
OpenCL_LBM_getDeviceGrid(prm, OpenCL_srcGrid, TEMP_srcGrid);
|
||||
|
||||
pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
|
||||
LBM_showGridStatistics(TEMP_srcGrid);
|
||||
|
||||
LBM_storeVelocityField(TEMP_srcGrid, param->resultFilename, TRUE);
|
||||
|
||||
LBM_freeGrid((float **)&TEMP_srcGrid);
|
||||
OpenCL_LBM_freeGrid(OpenCL_srcGrid);
|
||||
OpenCL_LBM_freeGrid(OpenCL_dstGrid);
|
||||
|
||||
clReleaseProgram(prm->clProgram);
|
||||
clReleaseKernel(prm->clKernel);
|
||||
clReleaseCommandQueue(prm->clCommandQueue);
|
||||
clReleaseContext(prm->clContext);
|
||||
}
|
||||
|
||||
/*
 * Fills *prm with the OpenCL objects used by the rest of the program:
 * device, platform and context (taken from pb_InitOpenCLContext), a
 * profiling-enabled command queue, a program created from the built-in
 * kernel "performStreamCollide_kernel", and the kernel object itself.
 *
 * Every OpenCL call is followed by CHECK_ERROR, which is defined outside this
 * file.
 *
 * Changes against the previous revision:
 *   - when no platform/device is found the process now exits; before, the
 *     function returned with *prm uninitialised and the caller carried on
 *     using it;
 *   - that error message now ends in a newline;
 *   - the CHECK_ERROR label after program creation names the call actually
 *     made (clCreateProgramWithBuiltInKernels).
 */
void OpenCL_initialize(struct pb_Parameters *p, OpenCL_Param *prm) {
  cl_int clStatus;
  pb_Context *pb_context;

  pb_context = pb_InitOpenCLContext(p);
  if (pb_context == NULL) {
    fprintf(stderr, "Error: No OpenCL platform/device can be found.\n");
    exit(1);
  }
  prm->clDevice = (cl_device_id)pb_context->clDeviceId;
  prm->clPlatform = (cl_platform_id)pb_context->clPlatformId;
  prm->clContext = (cl_context)pb_context->clContext;

  prm->clCommandQueue = clCreateCommandQueue(
      prm->clContext, prm->clDevice, CL_QUEUE_PROFILING_ENABLE, &clStatus);
  CHECK_ERROR("clCreateCommandQueue")

  pb_SetOpenCL(&(prm->clContext), &(prm->clCommandQueue));

  /* The kernel is taken from the device's built-in kernels.  The earlier
   * source-based path, kept for reference:
   *   const char *clSource[] = {readFile("src/opencl_base/kernel.cl")};
   *   prm->clProgram = clCreateProgramWithSource(prm->clContext, 1, clSource,
   *                                              NULL, &clStatus);
   *   clBuildProgram(..., "-I src/opencl_base", NULL, NULL);
   *   free((void *)clSource[0]);
   */
  prm->clProgram = clCreateProgramWithBuiltInKernels(
      prm->clContext, 1, &prm->clDevice, "performStreamCollide_kernel",
      &clStatus);
  CHECK_ERROR("clCreateProgramWithBuiltInKernels")

  clStatus =
      clBuildProgram(prm->clProgram, 1, &prm->clDevice, NULL, NULL, NULL);
  CHECK_ERROR("clBuildProgram")

  prm->clKernel =
      clCreateKernel(prm->clProgram, "performStreamCollide_kernel", &clStatus);
  CHECK_ERROR("clCreateKernel")
}
|
||||
@@ -1,31 +0,0 @@
|
||||
/***************************************************************************
|
||||
*cr
|
||||
*cr (C) Copyright 2010 The Board of Trustees of the
|
||||
*cr University of Illinois
|
||||
*cr All Rights Reserved
|
||||
*cr
|
||||
***************************************************************************/
|
||||
|
||||
#ifndef _MAIN_H_
|
||||
#define _MAIN_H_
|
||||
|
||||
/*############################################################################*/
|
||||
|
||||
typedef struct {
|
||||
int nTimeSteps;
|
||||
char* resultFilename;
|
||||
char* obstacleFilename;
|
||||
} MAIN_Param;
|
||||
|
||||
/*############################################################################*/
|
||||
|
||||
void MAIN_parseCommandLine( int nArgs, char* arg[], MAIN_Param* param, struct pb_Parameters* );
|
||||
void MAIN_printInfo( const MAIN_Param* param );
|
||||
void MAIN_initialize( const MAIN_Param* param, const OpenCL_Param* prm );
|
||||
void MAIN_finalize( const MAIN_Param* param, const OpenCL_Param* prm );
|
||||
|
||||
void OpenCL_initialize(struct pb_Parameters*, OpenCL_Param* prm);
|
||||
|
||||
/*############################################################################*/
|
||||
|
||||
#endif /* _MAIN_H_ */
|
||||
@@ -1,40 +0,0 @@
|
||||
#include <CL/cl.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include "ocl.h"
|
||||
|
||||
/*
 * Reads the whole file `fileName` into a freshly allocated, NUL-terminated
 * buffer.
 *
 * Returns the buffer on success; the caller owns it and must free() it.
 * Returns NULL after printing "Error 1!" (cannot open), "Error 2!" (out of
 * memory) or "Error 3!" (size query or read failed).
 *
 * Changes against the previous revision:
 *   - the buffer is released when the read comes up short (it used to leak);
 *   - fseek/ftell failures are detected instead of feeding -1 to malloc;
 *   - the file is opened in binary mode so that the size reported by ftell
 *     matches the byte count fread delivers on platforms that translate line
 *     endings in text mode.
 */
char* readFile(char* fileName)
{
    FILE* fp;
    char* buffer;
    long size;

    fp = fopen(fileName,"rb");
    if(fp == NULL)
    {
        printf("Error 1!\n");
        return NULL;
    }

    /* Determine the file length by seeking to its end. */
    if(fseek(fp,0,SEEK_END) != 0 || (size = ftell(fp)) < 0)
    {
        printf("Error 3!\n");
        fclose(fp);
        return NULL;
    }
    rewind(fp);

    buffer = (char*)malloc((size_t)size + 1);   /* +1 for the terminator */
    if(buffer == NULL)
    {
        printf("Error 2!\n");
        fclose(fp);
        return NULL;
    }

    if(fread(buffer,1,(size_t)size,fp) != (size_t)size)
    {
        printf("Error 3!\n");
        free(buffer);
        fclose(fp);
        return NULL;
    }

    buffer[size] = 0;
    fclose(fp);
    return buffer;
}
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user