project directories reorganization

This commit is contained in:
Blaise Tine
2020-04-14 06:35:20 -04:00
parent 1de06fd9c0
commit fc155e1223
1056 changed files with 8120 additions and 8120 deletions

View File

@@ -1,101 +0,0 @@
/*
* Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*
*/
// Math wrappers used by the kernel code below.
// The condition is the constant 0, so the #else branch is always taken and
// EXP/LOG/SQRT expand to the standard exp/log/sqrt built-ins. Changing the
// condition to 1 switches every call site to the native_* variants instead.
#if(0)
#define EXP(a) native_exp(a)
#define LOG(a) native_log(a)
#define SQRT(a) native_sqrt(a)
#else
#define EXP(a) exp(a)
#define LOG(a) log(a)
#define SQRT(a) sqrt(a)
#endif
///////////////////////////////////////////////////////////////////////////////
// Predefine functions to avoid bug in OpenCL compiler on Mac OSX 10.7 systems
///////////////////////////////////////////////////////////////////////////////
// Cumulative normal distribution approximation (defined below).
float CND(float d);
// Prices one option and stores the call/put results through the two pointers
// (defined below).
void BlackScholesBody(__global float *call, __global float *put, float S,
float X, float T, float R, float V);
///////////////////////////////////////////////////////////////////////////////
// Rational approximation of cumulative normal distribution function
///////////////////////////////////////////////////////////////////////////////
float CND(float d){
    // Coefficients of the degree-5 polynomial in k.
    const float c1 = 0.31938153f;
    const float c2 = -0.356563782f;
    const float c3 = 1.781477937f;
    const float c4 = -1.821255978f;
    const float c5 = 1.330274429f;
    const float invSqrt2Pi = 0.39894228040143267793994605993438f;

    float k = 1.0f / (1.0f + 0.2316419f * fabs(d));

    // Evaluate k*(c1 + k*(c2 + k*(c3 + k*(c4 + k*c5)))) from the inside out.
    float poly = k * c5;
    poly = k * (c4 + poly);
    poly = k * (c3 + poly);
    poly = k * (c2 + poly);
    poly = k * (c1 + poly);

    float tail = invSqrt2Pi * EXP(- 0.5f * d * d) * poly;

    // The expression above is built from |d|; for positive d the result is
    // its complement.
    return (d > 0) ? (1.0f - tail) : tail;
}
///////////////////////////////////////////////////////////////////////////////
// Black-Scholes formula for both call and put
///////////////////////////////////////////////////////////////////////////////
// Computes the call and put price of a single option and writes them through
// the 'call' and 'put' pointers.
//   S: current stock price      X: option strike price
//   T: option years             R: riskless rate of return
//   V: stock volatility
void BlackScholesBody(
    __global float *call,
    __global float *put,
    float S,
    float X,
    float T,
    float R,
    float V
){
    float rootT = SQRT(T);
    float volRootT = V * rootT;

    float d1 = (LOG(S / X) + (R + 0.5f * V * V) * T) / volRootT;
    float d2 = d1 - volRootT;

    float cndD1 = CND(d1);
    float cndD2 = CND(d2);

    // Strike multiplied by exp(-R*T); shared by both prices.
    float discountedStrike = X * EXP(- R * T);

    *call = (S * cndD1 - discountedStrike * cndD2);
    *put = (discountedStrike * (1.0f - cndD2) - S * (1.0f - cndD1));
}
// Kernel entry point: prices optN options.
// Each work-item starts at its global id and advances by the global size, so
// any global size covers all optN entries.
//   d_Call, d_Put: output call/put prices
//   d_S, d_X, d_T: per-option stock price, strike price and option years
//   R, V:          riskless rate of return and stock volatility (shared)
__kernel void BlackScholes(
    __global float *d_Call,
    __global float *d_Put,
    __global float *d_S,
    __global float *d_X,
    __global float *d_T,
    float R,
    float V,
    unsigned int optN
){
    const size_t stride = get_global_size(0);
    for(unsigned int idx = get_global_id(0); idx < optN; idx += stride){
        BlackScholesBody(d_Call + idx, d_Put + idx, d_S[idx], d_X[idx], d_T[idx], R, V);
    }
}

View File

@@ -1,66 +0,0 @@
# Tool and library locations. Each is assigned with ?= so it can be overridden
# from the environment or the command line; $(wildcard ...) yields an empty
# value when the path does not exist.
RISCV_TOOL_PATH ?= $(wildcard ../../../../riscv-gnu-toolchain/drops)
POCL_CC_PATH ?= $(wildcard ../../../../pocl/drops_riscv_cc)
POCL_INC_PATH ?= $(wildcard ../include)
POCL_LIB_PATH ?= $(wildcard ../lib)
VX_RT_PATH ?= $(wildcard ../../../runtime)
VX_SIMX_PATH ?= $(wildcard ../../../simX/obj_dir)
# RISC-V cross toolchain binaries.
CC = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-gcc
CXX = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-g++
DMP = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-objdump
HEX = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-objcopy
GDB = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-gdb
# Runtime sources compiled into the .elf together with the application.
VX_SRCS = $(VX_RT_PATH)/newlib/newlib.c
VX_SRCS += $(VX_RT_PATH)/startup/vx_start.S
VX_SRCS += $(VX_RT_PATH)/intrinsics/vx_intrinsics.s
VX_SRCS += $(VX_RT_PATH)/io/vx_io.s $(VX_RT_PATH)/io/vx_io.c
VX_SRCS += $(VX_RT_PATH)/fileio/fileio.s
VX_SRCS += $(VX_RT_PATH)/tests/tests.c
VX_SRCS += $(VX_RT_PATH)/vx_api/vx_api.c
# NOTE(review): none of the variables on the next line are defined in this
# file, so they expand to nothing unless supplied externally - confirm.
VX_SRCS += $(VX_STR) $(VX_FIO) $(VX_NEWLIB) $(VX_INT) $(VX_IO) $(VX_API) $(VX_TEST)
# Link flags for the .elf build: no default startup files, static linking, and
# the linker script from the runtime directory.
VX_CFLAGS = -nostartfiles -Wl,-Bstatic,-T,$(VX_RT_PATH)/mains/vortex_link.ld
CXXFLAGS = -g -O0 -march=rv32im -mabi=ilp32
CXXFLAGS += -ffreestanding # program may not begin at main()
CXXFLAGS += -Wl,--gc-sections # enable garbage collection of unused input sections
CXXFLAGS += -fno-rtti -fno-non-call-exceptions # disable RTTI and exceptions
CXXFLAGS += -I$(POCL_INC_PATH) -I.
# VX_LIBS and QEMU_LIBS reference $(PROJECT) before it is assigned below; this
# works because '=' defines recursively expanded variables, which are only
# expanded when used inside a recipe.
VX_LIBS = -Wl,--whole-archive lib$(PROJECT).a -Wl,--no-whole-archive $(POCL_LIB_PATH)/libOpenCL.a
QEMU_LIBS = $(VX_RT_PATH)/qemu/vx_api.c -Wl,--whole-archive lib$(PROJECT).a -Wl,--no-whole-archive $(POCL_LIB_PATH)/libOpenCL.a
# Base name of every generated artifact (lib, elf, hex, dump, qemu).
PROJECT=BlackScholes
# Default target: build the disassembly listing and the Intel HEX image.
all: $(PROJECT).dump $(PROJECT).hex

# Compile the OpenCL kernel into a static library with poclcc.
# The input is the rule's own prerequisite ($<) and the output is its target
# ($@), so the file whose change triggers the rebuild is the file that is
# actually compiled.
lib$(PROJECT).a: BlackScholes.cl
	POCL_DEBUG=all POCL_DEBUG_LLVM_PASSES=1 LD_LIBRARY_PATH=$(RISCV_TOOL_PATH)/lib:$(POCL_CC_PATH)/lib $(POCL_CC_PATH)/bin/poclcc -o $@ $<
# Link the application, the runtime sources and the kernel library into the
# target executable.
# NOTE(review): the compile line lists oclBlackScholes_common.h as an input and
# the launcher/gold sources are not prerequisites of this rule - confirm both
# are intended.
$(PROJECT).elf: main.cc lib$(PROJECT).a
$(CXX) $(CXXFLAGS) $(VX_CFLAGS) $(VX_SRCS) main.cc oclBlackScholes_common.h oclBlackScholes_launcher.cpp oclBlackScholes_gold.cpp $(VX_LIBS) -o $(PROJECT).elf
# QEMU build: compiles main.cc only, using the vx_api.c listed in QEMU_LIBS.
$(PROJECT).qemu: main.cc lib$(PROJECT).a
$(CXX) $(CXXFLAGS) main.cc $(QEMU_LIBS) -o $(PROJECT).qemu
# Convert the executable to Intel HEX format.
$(PROJECT).hex: $(PROJECT).elf
$(HEX) -O ihex $(PROJECT).elf $(PROJECT).hex
# Disassemble all sections of the executable into a text listing.
$(PROJECT).dump: $(PROJECT).elf
$(DMP) -D $(PROJECT).elf > $(PROJECT).dump
# Run the HEX image on the simulator; stdout is redirected to emulator.debug.
# NOTE(review): the simulator is invoked with '-a rv32i' while CXXFLAGS uses
# -march=rv32im - confirm this is intended.
run: $(PROJECT).hex
POCL_DEBUG=all $(VX_SIMX_PATH)/Vcache_simX -E -a rv32i --core $(PROJECT).hex -s -b 1> emulator.debug
# Run the QEMU build, logging translated input assembly to debug.log.
qemu: $(PROJECT).qemu
POCL_DEBUG=all $(RISCV_TOOL_PATH)/bin/qemu-riscv32 -d in_asm -D debug.log $(PROJECT).qemu
# gdb-s: same as 'qemu' but passes '-g 1234' to qemu-riscv32.
gdb-s: $(PROJECT).qemu
POCL_DEBUG=all $(RISCV_TOOL_PATH)/bin/qemu-riscv32 -g 1234 -d in_asm -D debug.log $(PROJECT).qemu
# gdb-c: start the cross gdb on the QEMU executable.
gdb-c: $(PROJECT).qemu
$(GDB) $(PROJECT).qemu
# Remove generated .elf, .dump and .hex files.
# NOTE(review): lib$(PROJECT).a, $(PROJECT).qemu and the log files are not
# removed here.
clean:
rm -rf *.elf *.dump *.hex

View File

@@ -1,248 +0,0 @@
/*
* Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*
*/
// standard utilities and systems includes
#include <oclUtils.h>
#include <shrQATest.h>
#include "oclBlackScholes_common.h"
////////////////////////////////////////////////////////////////////////////////
// Helper functions
////////////////////////////////////////////////////////////////////////////////
// Returns the time between the profiling START and END timestamps of 'event',
// in seconds. The return codes of the two profiling queries are not checked.
double executionTime(cl_event &event){
    cl_ulong startNs, endNs;
    clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &endNs, NULL);
    clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &startNs, NULL);
    const cl_ulong elapsedNs = endNs - startNs;
    // Timestamps are in nanoseconds; scale to seconds.
    return (double)1.0e-9 * elapsedNs;
}
////////////////////////////////////////////////////////////////////////////////
// Random float helper
////////////////////////////////////////////////////////////////////////////////
// Draws one value from rand() and uses it to blend linearly between 'low'
// (rand() == 0) and 'high' (rand() == RAND_MAX).
float randFloat(float low, float high){
    const float fraction = (float)rand() / (float)RAND_MAX;
    const float lowWeight = 1.0f - fraction;
    return lowWeight * low + fraction * high;
}
////////////////////////////////////////////////////////////////////////////////
// Main program
////////////////////////////////////////////////////////////////////////////////
// Host driver for the Black-Scholes sample. In order it:
//   1. queries the platform and devices and picks a target device,
//   2. allocates host arrays and fills the inputs with random values,
//   3. creates the context, command queue and device buffers,
//   4. initializes the launcher and runs the kernel once
//      (plus a timed loop when GPU_PROFILING is defined),
//   5. reads the results back and compares them with the CPU reference,
//   6. releases all resources and reports QA_PASSED only when both relative
//      L1 errors are below 1E-6.
// NOTE(review): every oclCheckError/oclCheckErrorEX call in this function is
// commented out, so ciErrNum is assigned but only inspected inside the
// GPU_PROFILING section. The malloc results are not checked either.
int main(int argc, char **argv)
{
cl_platform_id cpPlatform; //OpenCL platform
cl_device_id* cdDevices = NULL; //OpenCL devices list (array)
cl_context cxGPUContext; //OpenCL context
cl_command_queue cqCommandQueue; //OpenCL command queue
cl_mem //OpenCL memory buffer objects
d_Call,
d_Put,
d_S,
d_X,
d_T;
cl_int ciErrNum;
// Host arrays: CPU reference results, results read back from the device, and
// the three per-option inputs.
float
*h_CallCPU,
*h_PutCPU,
*h_CallGPU,
*h_PutGPU,
*h_S,
*h_X,
*h_T;
// Problem size and the two parameters shared by every option.
const unsigned int optionCount = 4000000;
const float R = 0.02f;
const float V = 0.30f;
shrQAStart(argc, argv);
// Get the NVIDIA platform
ciErrNum = oclGetPlatformID(&cpPlatform);
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL);
shrLog("clGetPlatformID...\n");
//Get all the devices
cl_uint uiNumDevices = 0; // Number of devices available
cl_uint uiTargetDevice = 0; // Default Device to compute on
cl_uint uiNumComputeUnits; // Number of compute units (SM's on NV GPU)
shrLog("Get the Device info and select Device...\n");
// First call only retrieves the device count; the second fills the array.
ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_DEFAULT, 0, NULL, &uiNumDevices);
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL);
cdDevices = (cl_device_id *)malloc(uiNumDevices * sizeof(cl_device_id) );
ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_DEFAULT, uiNumDevices, cdDevices, NULL);
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL);
// Get command line device options and config accordingly
shrLog(" # of Devices Available = %u\n", uiNumDevices);
if(shrGetCmdLineArgumentu(argc, (const char**)argv, "device", &uiTargetDevice)== shrTRUE)
{
// NOTE(review): with uiNumDevices == 0 the upper bound (uiNumDevices - 1)
// presumably wraps, since cl_uint is expected to be unsigned - confirm.
uiTargetDevice = CLAMP(uiTargetDevice, 0, (uiNumDevices - 1));
}
shrLog(" Using Device %u: ", uiTargetDevice);
oclPrintDevName(LOGBOTH, cdDevices[uiTargetDevice]);
ciErrNum = clGetDeviceInfo(cdDevices[uiTargetDevice], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(uiNumComputeUnits), &uiNumComputeUnits, NULL);
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL);
shrLog("\n # of Compute Units = %u\n", uiNumComputeUnits);
// set logfile name and start logs
shrSetLogFileName ("oclBlackScholes.txt");
shrLog("%s Starting...\n\n", argv[0]);
shrLog("Allocating and initializing host memory...\n");
h_CallCPU = (float *)malloc(optionCount * sizeof(float));
h_PutCPU = (float *)malloc(optionCount * sizeof(float));
h_CallGPU = (float *)malloc(optionCount * sizeof(float));
h_PutGPU = (float *)malloc(optionCount * sizeof(float));
h_S = (float *)malloc(optionCount * sizeof(float));
h_X = (float *)malloc(optionCount * sizeof(float));
h_T = (float *)malloc(optionCount * sizeof(float));
// Fixed seed: the random inputs are the same on every run.
srand(2009);
for(unsigned int i = 0; i < optionCount; i++){
// CPU outputs start at -1.0f; they are overwritten by BlackScholesCPU below.
h_CallCPU[i] = -1.0f;
h_PutCPU[i] = -1.0f;
h_S[i] = randFloat(5.0f, 30.0f);
h_X[i] = randFloat(1.0f, 100.0f);
h_T[i] = randFloat(0.25f, 10.0f);
}
shrLog("Initializing OpenCL...\n");
// Get the NVIDIA platform
ciErrNum = oclGetPlatformID(&cpPlatform);
//oclCheckError(ciErrNum, CL_SUCCESS);
// Get a GPU device
// NOTE(review): this requests a single device and stores it into the slot
// cdDevices[uiTargetDevice], replacing the entry obtained above.
ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_DEFAULT, 1, &cdDevices[uiTargetDevice], NULL);
//oclCheckError(ciErrNum, CL_SUCCESS);
// Create the context
cxGPUContext = clCreateContext(0, 1, &cdDevices[uiTargetDevice], NULL, NULL, &ciErrNum);
//oclCheckError(ciErrNum, CL_SUCCESS);
//Create a command-queue
cqCommandQueue = clCreateCommandQueue(cxGPUContext, cdDevices[uiTargetDevice], CL_QUEUE_PROFILING_ENABLE, &ciErrNum);
//oclCheckError(ciErrNum, CL_SUCCESS);
shrLog("Creating OpenCL memory objects...\n");
// Output buffers are created empty; the three input buffers are initialized
// from the host arrays via CL_MEM_COPY_HOST_PTR.
d_Call = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE, optionCount * sizeof(float), NULL, &ciErrNum);
//oclCheckError(ciErrNum, CL_SUCCESS);
d_Put = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE, optionCount * sizeof(float), NULL, &ciErrNum);
//oclCheckError(ciErrNum, CL_SUCCESS);
d_S = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, optionCount * sizeof(float), h_S, &ciErrNum);
//oclCheckError(ciErrNum, CL_SUCCESS);
d_X = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, optionCount * sizeof(float), h_X, &ciErrNum);
//oclCheckError(ciErrNum, CL_SUCCESS);
d_T = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, optionCount * sizeof(float), h_T, &ciErrNum);
//oclCheckError(ciErrNum, CL_SUCCESS);
shrLog("Starting up BlackScholes...\n");
initBlackScholes(cxGPUContext, cqCommandQueue, (const char **)argv);
shrLog("Running OpenCL BlackScholes...\n\n");
//Just a single run or a warmup iteration
// A NULL queue is passed here, whereas the timed loop below passes
// cqCommandQueue explicitly.
BlackScholes(
NULL,
d_Call,
d_Put,
d_S,
d_X,
d_T,
R,
V,
optionCount
);
#ifdef GPU_PROFILING
// Timed section: numIterations launches bracketed by two markers; both the
// wall-clock time (shrDeltaT) and the marker profiling timestamps are logged.
const int numIterations = 16;
cl_event startMark, endMark;
ciErrNum = clEnqueueMarker(cqCommandQueue, &startMark);
ciErrNum |= clFinish(cqCommandQueue);
shrCheckError(ciErrNum, CL_SUCCESS);
shrDeltaT(0);
for(int i = 0; i < numIterations; i++){
BlackScholes(
cqCommandQueue,
d_Call,
d_Put,
d_S,
d_X,
d_T,
R,
V,
optionCount
);
}
ciErrNum = clEnqueueMarker(cqCommandQueue, &endMark);
ciErrNum |= clFinish(cqCommandQueue);
shrCheckError(ciErrNum, CL_SUCCESS);
//Calculate performance metrics by wallclock time
double gpuTime = shrDeltaT(0) / numIterations;
// The factor 2 counts one call and one put result per option.
shrLogEx(LOGBOTH | MASTER, 0, "oclBlackScholes, Throughput = %.4f GOptions/s, Time = %.5f s, Size = %u options, NumDevsUsed = %i, Workgroup = %u\n",
(double)(2.0 * optionCount * 1.0e-9)/gpuTime, gpuTime, (2 * optionCount), 1, 0);
//Get profiling info
cl_ulong startTime = 0, endTime = 0;
ciErrNum = clGetEventProfilingInfo(startMark, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &startTime, NULL);
ciErrNum |= clGetEventProfilingInfo(endMark, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &endTime, NULL);
shrCheckError(ciErrNum, CL_SUCCESS);
shrLog("\nOpenCL time: %.5f s\n\n", 1.0e-9 * ((double)endTime - (double)startTime) / (double)numIterations);
// NOTE(review): startMark and endMark are not released in this function.
#endif
shrLog("\nReading back OpenCL BlackScholes results...\n");
// Blocking reads (CL_TRUE): the host arrays are complete when each call returns.
ciErrNum = clEnqueueReadBuffer(cqCommandQueue, d_Call, CL_TRUE, 0, optionCount * sizeof(float), h_CallGPU, 0, NULL, NULL);
//oclCheckError(ciErrNum, CL_SUCCESS);
ciErrNum = clEnqueueReadBuffer(cqCommandQueue, d_Put, CL_TRUE, 0, optionCount * sizeof(float), h_PutGPU, 0, NULL, NULL);
//oclCheckError(ciErrNum, CL_SUCCESS);
shrLog("Comparing against Host/C++ computation...\n");
BlackScholesCPU(h_CallCPU, h_PutCPU, h_S, h_X, h_T, R, V, optionCount);
// Relative L1 error: sum of |CPU - device| divided by sum of |CPU|, computed
// separately for calls and puts.
double deltaCall = 0, deltaPut = 0, sumCall = 0, sumPut = 0;
double L1call, L1put;
for(unsigned int i = 0; i < optionCount; i++)
{
sumCall += fabs(h_CallCPU[i]);
sumPut += fabs(h_PutCPU[i]);
deltaCall += fabs(h_CallCPU[i] - h_CallGPU[i]);
deltaPut += fabs(h_PutCPU[i] - h_PutGPU[i]);
}
L1call = deltaCall / sumCall;
L1put = deltaPut / sumPut;
shrLog("Relative L1 (call, put) = (%.3e, %.3e)\n\n", L1call, L1put);
shrLog("Shutting down...\n");
closeBlackScholes();
// Release device objects; the combined status is not checked.
ciErrNum = clReleaseMemObject(d_T);
ciErrNum |= clReleaseMemObject(d_X);
ciErrNum |= clReleaseMemObject(d_S);
ciErrNum |= clReleaseMemObject(d_Put);
ciErrNum |= clReleaseMemObject(d_Call);
ciErrNum |= clReleaseCommandQueue(cqCommandQueue);
ciErrNum |= clReleaseContext(cxGPUContext);
//oclCheckError(ciErrNum, CL_SUCCESS);
free(h_T);
free(h_X);
free(h_S);
free(h_PutGPU);
free(h_CallGPU);
free(h_PutCPU);
free(h_CallCPU);
if(cdDevices)free(cdDevices);
// Pass only when both relative errors are below 1E-6.
shrQAFinishExit(argc, (const char **)argv, ((L1call < 1E-6) && (L1put < 1E-6)) ? QA_PASSED : QA_FAILED );
}

View File

@@ -1,50 +0,0 @@
/*
* Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*
*/
#include <oclUtils.h>
////////////////////////////////////////////////////////////////////////////////
// Process an array of optN options on CPU
////////////////////////////////////////////////////////////////////////////////
/**
 * Host reference implementation.
 *
 * @param h_Call       call option prices
 * @param h_Put        put option prices
 * @param h_S          current stock prices
 * @param h_X          option strike prices
 * @param h_T          option years
 * @param R            riskless rate of return
 * @param V            stock volatility
 * @param optionCount  number of options in each array
 */
extern "C" void BlackScholesCPU(
    float *h_Call,
    float *h_Put,
    float *h_S,
    float *h_X,
    float *h_T,
    float R,
    float V,
    unsigned int optionCount
);
////////////////////////////////////////////////////////////////////////////////
// OpenCL Black-Scholes kernel launcher
////////////////////////////////////////////////////////////////////////////////
// Prepares the launcher for the given context and records the command queue
// that later launches may fall back to. argv is forwarded by the caller so the
// launcher can use argv[0].
extern "C" void initBlackScholes(cl_context cxGPUContext, cl_command_queue cqParamCommandQueue, const char **argv);
// Releases what initBlackScholes created.
extern "C" void closeBlackScholes(void);
/**
 * Launches the Black-Scholes kernel.
 *
 * @param cqCommandQueue  command queue to launch on
 * @param d_Call          call option prices
 * @param d_Put           put option prices
 * @param d_S             current stock prices
 * @param d_X             option strike prices
 * @param d_T             option years
 * @param R               riskless rate of return
 * @param V               stock volatility
 * @param optionCount     number of options
 */
extern "C" void BlackScholes(
    cl_command_queue cqCommandQueue,
    cl_mem d_Call,
    cl_mem d_Put,
    cl_mem d_S,
    cl_mem d_X,
    cl_mem d_T,
    cl_float R,
    cl_float V,
    cl_uint optionCount
);

View File

@@ -1,92 +0,0 @@
/*
* Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*
*/
#include <math.h>
#include "oclBlackScholes_common.h"
///////////////////////////////////////////////////////////////////////////////
// Rational approximation of cumulative normal distribution function
///////////////////////////////////////////////////////////////////////////////
static double CND(double d){
    // Polynomial coefficients A1..A5, lowest order first.
    static const double COEFF[5] = {
        0.31938153,
        -0.356563782,
        1.781477937,
        -1.821255978,
        1.330274429
    };
    const double INV_SQRT_2PI = 0.39894228040143267793994605993438;

    const double k = 1.0 / (1.0 + 0.2316419 * fabs(d));

    // Horner evaluation of k*(A1 + k*(A2 + k*(A3 + k*(A4 + k*A5)))),
    // starting from the highest-order coefficient.
    double poly = 0.0;
    for(int i = 4; i >= 0; --i){
        poly = k * (COEFF[i] + poly);
    }

    const double tail = INV_SQRT_2PI * exp(- 0.5 * d * d) * poly;

    // The expression is built from |d|; for positive d return its complement.
    return (d > 0) ? (1.0 - tail) : tail;
}
///////////////////////////////////////////////////////////////////////////////
// Black-Scholes formula for both call and put
///////////////////////////////////////////////////////////////////////////////
static void BlackScholesBodyCPU(
float& call, //Call option price
float& put, //Put option price
float Sf, //Current stock price
float Xf, //Option strike price
float Tf, //Option years
float Rf, //Riskless rate of return
float Vf //Stock volatility
){
double S = Sf, X = Xf, T = Tf, R = Rf, V = Vf;
double sqrtT = sqrt(T);
double d1 = (log(S / X) + (R + 0.5 * V * V) * T) / (V * sqrtT);
double d2 = d1 - V * sqrtT;
double CNDD1 = CND(d1);
double CNDD2 = CND(d2);
//Calculate Call and Put simultaneously
double expRT = exp(- R * T);
call = (float)(S * CNDD1 - X * expRT * CNDD2);
put = (float)(X * expRT * (1.0 - CNDD2) - S * (1.0 - CNDD1));
}
////////////////////////////////////////////////////////////////////////////////
// Process an array of optN options
////////////////////////////////////////////////////////////////////////////////
// Prices optionCount options on the host: entry i of h_S/h_X/h_T is the input
// and entry i of h_Call/h_Put receives the result. R and V are shared by all
// options.
extern "C" void BlackScholesCPU(
    float *h_Call,
    float *h_Put,
    float *h_S,
    float *h_X,
    float *h_T,
    float R,
    float V,
    unsigned int optionCount
){
    unsigned int idx = 0;
    while(idx < optionCount){
        BlackScholesBodyCPU(h_Call[idx], h_Put[idx], h_S[idx], h_X[idx], h_T[idx], R, V);
        ++idx;
    }
}

View File

@@ -1,125 +0,0 @@
/*
* Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*
*/
#include <oclUtils.h>
#include "oclBlackScholes_common.h"
// File-scope launcher state. The program and kernel are created in
// initBlackScholes() and released in closeBlackScholes().
static cl_program cpBlackScholes; //OpenCL program
static cl_kernel ckBlackScholes; //OpenCL kernel
// Queue stored by initBlackScholes(); BlackScholes() uses it when the caller
// passes a null queue.
static cl_command_queue cqDefaultCommandQueue;
// Creates the BlackScholes program and kernel for the given context and
// records the queue used when BlackScholes() is called with a null queue.
//
//   cxGPUContext         context the program is created in
//   cqParamCommandQueue  stored as the default launch queue
//   argv                 argv[0] is used to locate BlackScholes.cl
//
// Any failure is reported through shrCheckError; a build failure logs the
// build log and terminates the process with exit(666).
extern "C" void initBlackScholes(cl_context cxGPUContext, cl_command_queue cqParamCommandQueue, const char **argv){
    cl_int ciErrNum;
    size_t kernelLength;

    // NOTE(review): the kernel source is still located and loaded although the
    // program below is created from a built-in kernel, so a missing
    // BlackScholes.cl still trips the checks here. Kept to preserve behavior.
    shrLog("...loading BlackScholes.cl\n");
    char *cPathAndName = shrFindFilePath("BlackScholes.cl", argv[0]);
    shrCheckError(cPathAndName != NULL, shrTRUE);
    char *cBlackScholes = oclLoadProgSource(cPathAndName, "// My comment\n", &kernelLength);
    shrCheckError(cBlackScholes != NULL, shrTRUE);

    shrLog("...creating BlackScholes program\n");
    // Source-based alternative:
    //cpBlackScholes = clCreateProgramWithSource(cxGPUContext, 1, (const char **)&cBlackScholes, &kernelLength, &ciErrNum);
    // The program is created for the first device of the caller's context, and
    // the error code is requested so the check below reads a defined value.
    cl_device_id cdDevice = oclGetFirstDev(cxGPUContext);
    cpBlackScholes = clCreateProgramWithBuiltInKernels(cxGPUContext, 1, &cdDevice, "BlackScholes", &ciErrNum);
    shrCheckError(ciErrNum, CL_SUCCESS);

    shrLog("...building BlackScholes program\n");
    ciErrNum = clBuildProgram(cpBlackScholes, 0, NULL, "-cl-fast-relaxed-math -Werror", NULL, NULL);
    // clBuildProgram returns an error code, so compare against CL_SUCCESS.
    if(ciErrNum != CL_SUCCESS){
        shrLog("*** Compilation failure ***\n");

        // Fetch the build log for the device the program was created for:
        // first its size, then its contents.
        size_t logSize;
        ciErrNum = clGetProgramBuildInfo(cpBlackScholes, cdDevice, CL_PROGRAM_BUILD_LOG, 0, NULL, &logSize);
        shrCheckError(ciErrNum, CL_SUCCESS);
        char *logTxt = (char *)malloc(logSize);
        shrCheckError(logTxt != NULL, shrTRUE);
        ciErrNum = clGetProgramBuildInfo(cpBlackScholes, cdDevice, CL_PROGRAM_BUILD_LOG, logSize, logTxt, NULL);
        shrCheckError(ciErrNum, CL_SUCCESS);

        shrLog("%s\n", logTxt);
        shrLog("*** Exiting ***\n");
        free(logTxt);
        exit(666);
    }

    //Save ptx code to separate file
    oclLogPtx(cpBlackScholes, cdDevice, "BlackScholes.ptx");

    shrLog("...creating BlackScholes kernels\n");
    ckBlackScholes = clCreateKernel(cpBlackScholes, "BlackScholes", &ciErrNum);
    shrCheckError(ciErrNum, CL_SUCCESS);

    cqDefaultCommandQueue = cqParamCommandQueue;

    free(cBlackScholes);
    free(cPathAndName);
}
extern "C" void closeBlackScholes(void){
cl_int ciErrNum;
ciErrNum = clReleaseKernel(ckBlackScholes);
ciErrNum |= clReleaseProgram(cpBlackScholes);
shrCheckError(ciErrNum, CL_SUCCESS);
}
////////////////////////////////////////////////////////////////////////////////
// OpenCL Black-Scholes kernel launcher
////////////////////////////////////////////////////////////////////////////////
// Binds the eight kernel arguments and enqueues one 1-D launch of the
// BlackScholes kernel. When cqCommandQueue is null the queue stored by
// initBlackScholes() is used instead.
extern "C" void BlackScholes(
    cl_command_queue cqCommandQueue,
    cl_mem d_Call,
    cl_mem d_Put,
    cl_mem d_S,
    cl_mem d_X,
    cl_mem d_T,
    cl_float R,
    cl_float V,
    cl_uint optionCount
){
    cl_command_queue launchQueue = cqCommandQueue ? cqCommandQueue : cqDefaultCommandQueue;

    // Kernel arguments in index order (0..7).
    struct KernelArg { size_t size; void *value; };
    const KernelArg kernelArgs[] = {
        { sizeof(cl_mem),   (void *)&d_Call },
        { sizeof(cl_mem),   (void *)&d_Put },
        { sizeof(cl_mem),   (void *)&d_S },
        { sizeof(cl_mem),   (void *)&d_X },
        { sizeof(cl_mem),   (void *)&d_T },
        { sizeof(cl_float), (void *)&R },
        { sizeof(cl_float), (void *)&V },
        { sizeof(cl_uint),  (void *)&optionCount }
    };
    const cl_uint kernelArgCount = (cl_uint)(sizeof(kernelArgs) / sizeof(kernelArgs[0]));

    // Every argument is set even if an earlier one fails; the status codes are
    // OR-ed together and checked once.
    cl_int ciErrNum = 0;
    for(cl_uint argIdx = 0; argIdx < kernelArgCount; ++argIdx){
        ciErrNum |= clSetKernelArg(ckBlackScholes, argIdx, kernelArgs[argIdx].size, kernelArgs[argIdx].value);
    }
    shrCheckError(ciErrNum, CL_SUCCESS);

    //Run the kernel
    size_t globalWorkSize = 60 * 1024;
    size_t localWorkSize = 128;
    ciErrNum = clEnqueueNDRangeKernel(launchQueue, ckBlackScholes, 1, NULL, &globalWorkSize, &localWorkSize, 0, NULL, NULL);
    shrCheckError(ciErrNum, CL_SUCCESS);
}

View File

@@ -1,198 +0,0 @@
/*
* Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*
*/
#ifndef OCL_UTILS_H
#define OCL_UTILS_H
// *********************************************************************
// Utilities specific to OpenCL samples in NVIDIA GPU Computing SDK
// *********************************************************************
// Common headers: Cross-API utilities and OpenCL header
#include <shrUtils.h>
// All OpenCL headers
#if defined (__APPLE__) || defined(MACOSX)
#include <OpenCL/opencl.h>
#else
#include <CL/opencl.h>
#endif
// Includes
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
// For systems with CL_EXT that are not updated with these extensions, we copied these
// extensions from <CL/cl_ext.h>
// Fallback definitions: only used when the included OpenCL headers do not
// already define CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV.
#ifndef CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV
/* cl_nv_device_attribute_query extension - no extension #define since it has no functions */
#define CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV 0x4000
#define CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV 0x4001
#define CL_DEVICE_REGISTERS_PER_BLOCK_NV 0x4002
#define CL_DEVICE_WARP_SIZE_NV 0x4003
#define CL_DEVICE_GPU_OVERLAP_NV 0x4004
#define CL_DEVICE_KERNEL_EXEC_TIMEOUT_NV 0x4005
#define CL_DEVICE_INTEGRATED_MEMORY_NV 0x4006
#endif
// reminders for build output window and log
#ifdef _WIN32
#pragma message ("Note: including shrUtils.h")
#pragma message ("Note: including opencl.h")
#endif
// SDK Revision #
#define OCL_SDKREVISION "7027912"
// Error and Exit Handling Macros...
// *********************************************************************
// Full error handling macro with Cleanup() callback (if supplied)...
// (Companion Inline Function lower on page)
// Forwards to __oclCheckErrorEX, adding the call site's __FILE__ and __LINE__.
#define oclCheckErrorEX(a, b, c) __oclCheckErrorEX(a, b, c, __FILE__ , __LINE__)
// Short version without Cleanup() callback pointer
// Both Input (a) and Reference (b) are specified as args
#define oclCheckError(a, b) oclCheckErrorEX(a, b, 0)
//////////////////////////////////////////////////////////////////////////////
//! Gets the platform ID for NVIDIA if available, otherwise default to platform 0
//!
//! @return the id
//! @param clSelectedPlatformID OpenCL platform ID
//////////////////////////////////////////////////////////////////////////////
extern "C" cl_int oclGetPlatformID(cl_platform_id* clSelectedPlatformID);
//////////////////////////////////////////////////////////////////////////////
//! Print info about the device
//!
//! @param iLogMode enum LOGBOTH, LOGCONSOLE, LOGFILE
//! @param device OpenCL id of the device
//////////////////////////////////////////////////////////////////////////////
extern "C" void oclPrintDevInfo(int iLogMode, cl_device_id device);
//////////////////////////////////////////////////////////////////////////////
//! Get and return device capability
//!
//! @return the 2 digit integer representation of device Cap (major minor). return -1 if NA
//! @param device OpenCL id of the device
//////////////////////////////////////////////////////////////////////////////
extern "C" int oclGetDevCap(cl_device_id device);
//////////////////////////////////////////////////////////////////////////////
//! Print the device name
//!
//! @param iLogMode enum LOGBOTH, LOGCONSOLE, LOGFILE
//! @param device OpenCL id of the device
//////////////////////////////////////////////////////////////////////////////
extern "C" void oclPrintDevName(int iLogMode, cl_device_id device);
//////////////////////////////////////////////////////////////////////////////
//! Gets the id of the first device from the context
//!
//! @return the id
//! @param cxGPUContext OpenCL context
//////////////////////////////////////////////////////////////////////////////
extern "C" cl_device_id oclGetFirstDev(cl_context cxGPUContext);
//////////////////////////////////////////////////////////////////////////////
//! Gets the id of the nth device from the context
//!
//! @return the id or -1 when out of range
//! NOTE(review): the return type is cl_device_id; how "-1" is represented in
//! that type is not visible in this header - confirm against the implementation.
//! @param cxGPUContext OpenCL context
//! @param device_idx index of the device of interest
//////////////////////////////////////////////////////////////////////////////
extern "C" cl_device_id oclGetDev(cl_context cxGPUContext, unsigned int device_idx);
//////////////////////////////////////////////////////////////////////////////
//! Gets the id of device with maximal FLOPS from the context
//!
//! @return the id
//! @param cxGPUContext OpenCL context
//////////////////////////////////////////////////////////////////////////////
extern "C" cl_device_id oclGetMaxFlopsDev(cl_context cxGPUContext);
//////////////////////////////////////////////////////////////////////////////
//! Loads a Program file and prepends the cPreamble to the code.
//!
//! @return the source string if succeeded, 0 otherwise
//! @param cFilename program filename
//! @param cPreamble code that is prepended to the loaded file, typically a set of #defines or a header
//! @param szFinalLength returned length of the code string
//////////////////////////////////////////////////////////////////////////////
extern "C" char* oclLoadProgSource(const char* cFilename, const char* cPreamble, size_t* szFinalLength);
//////////////////////////////////////////////////////////////////////////////
//! Get the binary (PTX) of the program associated with the device
//!
//! @param cpProgram OpenCL program
//! @param cdDevice device of interest
//! @param binary returned code
//! @param length length of returned code
//////////////////////////////////////////////////////////////////////////////
extern "C" void oclGetProgBinary( cl_program cpProgram, cl_device_id cdDevice, char** binary, size_t* length);
//////////////////////////////////////////////////////////////////////////////
//! Get and log the binary (PTX) from the OpenCL compiler for the requested program & device
//!
//! @param cpProgram OpenCL program
//! @param cdDevice device of interest
//! @param cPtxFileName optional PTX file name
//////////////////////////////////////////////////////////////////////////////
extern "C" void oclLogPtx(cl_program cpProgram, cl_device_id cdDevice, const char* cPtxFileName);
//////////////////////////////////////////////////////////////////////////////
//! Get and log the Build Log from the OpenCL compiler for the requested program & device
//!
//! @param cpProgram OpenCL program
//! @param cdDevice device of interest
//////////////////////////////////////////////////////////////////////////////
extern "C" void oclLogBuildInfo(cl_program cpProgram, cl_device_id cdDevice);
// Helper function for de-allocating cl objects
// *********************************************************************
extern "C" void oclDeleteMemObjs(cl_mem* cmMemObjs, int iNumObjs);
// Helper function to get OpenCL error string from constant
// *********************************************************************
extern "C" const char* oclErrorString(cl_int error);
// Helper function to get OpenCL image format string (channel order and type) from constant
// *********************************************************************
extern "C" const char* oclImageFormatString(cl_uint uiImageFormat);
// companion inline function for error checking and exit on error WITH Cleanup Callback (if supplied)
// *********************************************************************
// Compares a sample status against a reference value and handles a mismatch.
//   iSample     value under test (normally an OpenCL status code)
//   iReference  value iSample is expected to equal
//   pCleanup    optional callback invoked with the error code on mismatch
//   cFile/iLine call-site location used in the log message
// On mismatch the error is logged; then either pCleanup is called, or, when it
// is NULL, a closing message is logged and the process exits with the error
// code.
inline void __oclCheckErrorEX(cl_int iSample, cl_int iReference, void (*pCleanup)(int), const char* cFile, const int iLine)
{
    // Equal values mean there is nothing to report.
    if (iReference == iSample)
    {
        return;
    }

    // A mismatch is an error by definition, so a sample value of 0 is
    // replaced with a non-zero code before it is reported.
    if (iSample == 0)
    {
        iSample = -9999;
    }

    shrLog("\n !!! Error # %i (%s) at line %i , in file %s !!!\n\n", iSample, oclErrorString(iSample), iLine, cFile);

    // Without a cleanup callback, close the log and exit with the error code.
    if (pCleanup == NULL)
    {
        shrLogEx(LOGBOTH | CLOSELOG, 0, "Exiting...\n");
        exit(iSample);
    }

    pCleanup(iSample);
}
#endif

View File

@@ -1,238 +0,0 @@
/*
* Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*
*/
#ifndef SHR_QATEST_H
#define SHR_QATEST_H
// *********************************************************************
// Generic utilities for NVIDIA GPU Computing SDK
// *********************************************************************
// OS dependent includes
#ifdef _WIN32
#pragma message ("Note: including windows.h")
#pragma message ("Note: including math.h")
#pragma message ("Note: including assert.h")
#pragma message ("Note: including time.h")
// Headers needed for Windows
#include <windows.h>
#include <time.h>
#else
// Headers needed for Linux
#include <sys/stat.h>
#include <sys/types.h>
#include <sys/time.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdarg.h>
#include <unistd.h>
#include <time.h>
#endif
#ifndef STRCASECMP
#ifdef _WIN32
#define STRCASECMP _stricmp
#else
#define STRCASECMP strcasecmp
#endif
#endif
#ifndef STRNCASECMP
#ifdef _WIN32
#define STRNCASECMP _strnicmp
#else
#define STRNCASECMP strncasecmp
#endif
#endif
// Standardized QA Start/Finish for CUDA SDK tests
#define shrQAStart(a, b) __shrQAStart(a, b)
#define shrQAFinish(a, b, c) __shrQAFinish(a, b, c)
#define shrQAFinish2(a, b, c, d) __shrQAFinish2(a, b, c, d)
// Returns the index in exec_name at which the bare executable name begins, i.e. the index
// just past the last '\\' or '/' path separator, or 0 when the string contains no separator.
//! @param exec_name  NUL-terminated path of the executable (typically argv[0])
//! @return offset of the first character following the last path separator
inline int findExeNameStart(const char *exec_name)
{
    // Walk backwards from the terminating NUL until a separator is hit or index 0 is reached
    int pos = (int)strlen(exec_name);
    for (; pos > 0; --pos)
    {
        const char current = exec_name[pos];
        if (current == '\\' || current == '/')
        {
            break;
        }
    }

    // Index 0 may itself hold a separator, so the stop position is re-examined here
    const char stop = exec_name[pos];
    const int on_separator = (stop == '\\') || (stop == '/');
    return on_separator ? (pos + 1) : pos;
}
// Prints the standardized start-of-test banner.
// When any argument, stripped of its leading '-' characters, equals "qatest" (case-insensitive),
// the QA form "&&&& RUNNING <exe> <args...>" is printed; otherwise "[<exe>] starting...".
//! @param argc  number of entries in argv
//! @param argv  command line; argv[0] is used to derive the executable name
//! @return index in argv[0] where the executable name (without its path) begins
inline int __shrQAStart(int argc, char **argv)
{
    // First clear the output buffer
    fflush(stdout);
    fflush(stdout);

    // Scan the arguments for the "qatest" switch, ignoring any number of leading dashes
    bool bQATest = false;
    for (int arg = 1; arg < argc; ++arg) {
        char *token = argv[arg];
        while (*token == '-') {
            ++token;
        }
        if (STRCASECMP(token, "qatest") == 0) {
            bQATest = true;
        }
    }

    // Only the executable name is printed, not its full path
    const int exename_start = findExeNameStart(argv[0]);
    char *exe_name = argv[0] + exename_start;

    if (!bQATest) {
        fprintf(stdout, "[%s] starting...\n", exe_name);
    } else {
        fprintf(stdout, "&&&& RUNNING %s", exe_name);
        for (int arg = 1; arg < argc; ++arg) {
            fprintf(stdout, " %s", argv[arg]);
        }
        fprintf(stdout, "\n");
    }

    fflush(stdout);
    printf("\n");
    fflush(stdout);
    return exename_start;
}
// Test outcome codes passed as iStatus to the shrQAFinish* helpers in this header.
// The numeric values are used directly as indices into the status-name tables
// { "FAILED", "PASSED", "WAIVED" } inside __shrQAFinish and __shrQAFinish2, and
// shrQAFinishExit / shrQAFinishExit2 map any non-zero value to EXIT_SUCCESS.
enum eQAstatus {
QA_FAILED = 0, // printed as "FAILED"
QA_PASSED = 1, // printed as "PASSED"
QA_WAIVED = 2 // printed as "WAIVED"
};
// Prints a visible countdown and blocks for roughly the requested number of seconds.
// One "<n>..." token is written per one-second sleep, followed by "done!".
//! @param seconds  how long to wait; values <= 0 print the header and "done!" without sleeping
inline void __ExitInTime(int seconds)
{
    fprintf(stdout, "> exiting in %d seconds: ", seconds);
    fflush(stdout);

    time_t t;
    int count;
    for (t = time(0) + seconds, count = seconds; time(0) < t; count--) {
        fprintf(stdout, "%d...", count);
        // The token has no newline, so flush explicitly or the countdown only
        // becomes visible all at once after the wait is over
        fflush(stdout);
        // Use _WIN32, the same macro this header uses to choose between
        // <windows.h> (Sleep) and <unistd.h> (sleep), so the call always
        // matches the header that was actually included
#ifdef _WIN32
        Sleep(1000);
#else
        sleep(1);
#endif
    }

    fprintf(stdout, "done!\n\n");
    fflush(stdout);
}
// Prints the standardized end-of-test banner and then pauses according to the command line.
// Arguments are compared case-insensitively after stripping leading '-' characters:
//   qatest          -> print the QA form "&&&& <STATUS> <exe> <args...>"
//   noprompt / help -> return immediately (no countdown, no prompt)
//   prompt          -> wait for <Enter> before returning
// With none of noprompt/help/prompt given, a 3 second countdown runs before returning.
// When several of these switches are present, the last one on the command line wins.
//! @param argc     number of entries in argv
//! @param argv     command line; argv[0] is used to derive the executable name
//! @param iStatus  0 = FAILED, 1 = PASSED, 2 = WAIVED; any other value is reported as FAILED
inline void __shrQAFinish(int argc, const char **argv, int iStatus)
{
    // By default QATest is disabled and NoPrompt is enabled (times out at seconds passed into __ExitInTime() )
    bool bQATest = false, bNoPrompt = true, bQuitInTime = true;

    // Validate the status before indexing: an out-of-range value would otherwise read
    // outside the table (or hand a NULL pointer to "%s"), so it is reported as FAILED
    const char *sStatus[] = { "FAILED", "PASSED", "WAIVED" };
    const int iNumStatus = (int)(sizeof(sStatus) / sizeof(sStatus[0]));
    const char *sResult = (iStatus >= 0 && iStatus < iNumStatus) ? sStatus[iStatus] : sStatus[0];

    for (int i = 1; i < argc; i++) {
        int string_start = 0;
        while (argv[i][string_start] == '-') {
            string_start++;
        }
        const char *string_argv = &argv[i][string_start];

        if (!STRCASECMP(string_argv, "qatest")) {
            bQATest = true;
        }
        // For SDK individual samples that don't specify -noprompt or -prompt,
        // a 3 second delay will happen before exiting, giving a user time to view results
        if (!STRCASECMP(string_argv, "noprompt") || !STRCASECMP(string_argv, "help")) {
            bNoPrompt = true;
            bQuitInTime = false;
        }
        if (!STRCASECMP(string_argv, "prompt")) {
            bNoPrompt = false;
            bQuitInTime = false;
        }
    }

    // Only the executable name is printed, not its full path
    int exename_start = findExeNameStart(argv[0]);
    if (bQATest) {
        fprintf(stdout, "&&&& %s %s", sResult, &(argv[0][exename_start]));
        for (int i = 1; i < argc; i++) {
            fprintf(stdout, " %s", argv[i]);
        }
        fprintf(stdout, "\n");
    } else {
        fprintf(stdout, "[%s] test results...\n%s\n", &(argv[0][exename_start]), sResult);
    }
    fflush(stdout);
    printf("\n");
    fflush(stdout);

    if (bQuitInTime) {
        __ExitInTime(3);
    } else {
        if (!bNoPrompt) {
            fprintf(stdout, "\nPress <Enter> to exit...\n");
            fflush(stdout);
            getchar();
        }
    }
}
// Variant of the end-of-test banner in which the caller states explicitly whether QA output
// is wanted, instead of having it detected from the command line.
// Arguments are compared case-insensitively after stripping leading '-' characters; any of
// noprompt, help or prompt suppresses the 3 second countdown. This variant never waits for <Enter>.
//! @param bQATest  true to print the QA form "&&&& <STATUS> <exe> <args...>"
//! @param argc     number of entries in argv
//! @param argv     command line; argv[0] is used to derive the executable name
//! @param iStatus  0 = FAILED, 1 = PASSED, 2 = WAIVED; any other value is reported as FAILED
inline void __shrQAFinish2(bool bQATest, int argc, const char **argv, int iStatus)
{
    bool bQuitInTime = true;

    // Validate the status before indexing: an out-of-range value would otherwise read
    // outside the table (or hand a NULL pointer to "%s"), so it is reported as FAILED
    const char *sStatus[] = { "FAILED", "PASSED", "WAIVED" };
    const int iNumStatus = (int)(sizeof(sStatus) / sizeof(sStatus[0]));
    const char *sResult = (iStatus >= 0 && iStatus < iNumStatus) ? sStatus[iStatus] : sStatus[0];

    for (int i = 1; i < argc; i++) {
        int string_start = 0;
        while (argv[i][string_start] == '-') {
            string_start++;
        }
        const char *string_argv = &argv[i][string_start];

        // For SDK individual samples that don't specify -noprompt or -prompt,
        // a 3 second delay will happen before exiting, giving a user time to view results
        if (!STRCASECMP(string_argv, "noprompt") || !STRCASECMP(string_argv, "help")) {
            bQuitInTime = false;
        }
        if (!STRCASECMP(string_argv, "prompt")) {
            bQuitInTime = false;
        }
    }

    // Only the executable name is printed, not its full path
    int exename_start = findExeNameStart(argv[0]);
    if (bQATest) {
        fprintf(stdout, "&&&& %s %s", sResult, &(argv[0][exename_start]));
        for (int i = 1; i < argc; i++) {
            fprintf(stdout, " %s", argv[i]);
        }
        fprintf(stdout, "\n");
    } else {
        fprintf(stdout, "[%s] test results...\n%s\n", &(argv[0][exename_start]), sResult);
    }
    fflush(stdout);

    if (bQuitInTime) {
        __ExitInTime(3);
    }
}
// Prints the end-of-test banner via __shrQAFinish and then terminates the process.
// A zero status exits with EXIT_FAILURE; every non-zero status exits with EXIT_SUCCESS.
//! @param argc     number of entries in argv
//! @param argv     command line as passed to main()
//! @param iStatus  test outcome forwarded to __shrQAFinish
inline void shrQAFinishExit(int argc, const char **argv, int iStatus)
{
    __shrQAFinish(argc, argv, iStatus);

    int exit_code = EXIT_FAILURE;
    if (iStatus != 0)
    {
        exit_code = EXIT_SUCCESS;
    }
    exit(exit_code);
}
// Prints the end-of-test banner via __shrQAFinish2 and then terminates the process.
// A zero status exits with EXIT_FAILURE; every non-zero status exits with EXIT_SUCCESS.
//! @param bQAtest  forwarded to __shrQAFinish2 to select the QA output form
//! @param argc     number of entries in argv
//! @param argv     command line as passed to main()
//! @param iStatus  test outcome forwarded to __shrQAFinish2
inline void shrQAFinishExit2(bool bQAtest, int argc, const char **argv, int iStatus)
{
    __shrQAFinish2(bQAtest, argc, argv, iStatus);

    int exit_code = EXIT_FAILURE;
    if (iStatus != 0)
    {
        exit_code = EXIT_SUCCESS;
    }
    exit(exit_code);
}
#endif

View File

@@ -1,642 +0,0 @@
/*
* Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*
*/
#ifndef SHR_UTILS_H
#define SHR_UTILS_H
// *********************************************************************
// Generic utilities for NVIDIA GPU Computing SDK
// *********************************************************************
// reminders for output window and build log
#ifdef _WIN32
#pragma message ("Note: including windows.h")
#pragma message ("Note: including math.h")
#pragma message ("Note: including assert.h")
#endif
// OS dependent includes
#ifdef _WIN32
// Headers needed for Windows
#include <windows.h>
#else
// Headers needed for Linux
#include <sys/stat.h>
#include <sys/types.h>
#include <sys/time.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdarg.h>
#endif
// Other headers needed for both Windows and Linux
#include <math.h>
#include <assert.h>
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
// Un-comment the following #define to enable profiling code in SDK apps
//#define GPU_PROFILING
// Beginning of GPU Architecture definitions
// Maps a GPU SM (compute capability) version to the number of cores per SM.
// The version is packed as (major << 4) + minor and looked up in a fixed table.
//! @param major  SM major version
//! @param minor  SM minor version
//! @return cores per SM for a known version; otherwise a message is printed and -1 is returned
inline int ConvertSMVer2Cores(int major, int minor)
{
    // One row per known SM version
    typedef struct {
        int SM;    // 0xMm (hexadecimal notation), M = SM major version, m = SM minor version
        int Cores; // cores per SM
    } sSMtoCores;

    const sSMtoCores knownArchs[] = {
        { 0x10,   8 }, // Tesla Generation  (SM 1.0) G80 class
        { 0x11,   8 }, // Tesla Generation  (SM 1.1) G8x class
        { 0x12,   8 }, // Tesla Generation  (SM 1.2) G9x class
        { 0x13,   8 }, // Tesla Generation  (SM 1.3) GT200 class
        { 0x20,  32 }, // Fermi Generation  (SM 2.0) GF100 class
        { 0x21,  48 }, // Fermi Generation  (SM 2.1) GF10x class
        { 0x30, 192 }  // Kepler Generation (SM 3.0) GK10x class
    };
    const int numArchs = (int)(sizeof(knownArchs) / sizeof(knownArchs[0]));

    // Pack the requested version the same way the table keys are encoded
    const int wanted = (major << 4) + minor;
    for (int i = 0; i < numArchs; ++i) {
        if (knownArchs[i].SM == wanted) {
            return knownArchs[i].Cores;
        }
    }

    printf("MapSMtoCores SM %d.%d is undefined (please update to the latest SDK)!\n", major, minor);
    return -1;
}
// end of GPU Architecture definitions
// Defines and enum for use with logging functions
// *********************************************************************
#define DEFAULTLOGFILE "SdkConsoleLog.txt"
#define MASTERLOGFILE "SdkMasterLog.csv"
// Bit flags for the iLogMode argument of shrLogEx; values may be combined with '|'
// (for example LOGBOTH | CLOSELOG or LOGBOTH | ERRORMSG).
enum LOGMODES
{
LOGCONSOLE = 1, // bit to signal "log to console"
LOGFILE = 2, // bit to signal "log to file"
LOGBOTH = 3, // convenience union of first 2 bits to signal "log to both"
APPENDMODE = 4, // bit to set "file append" mode instead of "replace mode" on open
MASTER = 8, // bit to signal master .csv log output
ERRORMSG = 16, // bit to signal "pre-pend Error"
CLOSELOG = 32 // bit to close log file, if open, after any requested file write
};
#define HDASHLINE "-----------------------------------------------------------\n"
// Standardized boolean
// Boolean result type returned by the shr* helper functions declared in this header
// and tested by the ARGCHECK macro.
enum shrBOOL
{
shrFALSE = 0, // failure / condition not met
shrTRUE = 1 // success / condition met
};
// Standardized MAX, MIN and CLAMP
// Every argument is parenthesized so that operands containing lower-precedence operators
// (e.g. MIN(x | y, z) or MAX(c ? p : q, r)) are compared and selected as a whole.
// NOTE: these are macros, so the selected argument is evaluated twice; avoid arguments
// with side effects such as i++.
#define MAX(a, b) (((a) > (b)) ? (a) : (b))
#define MIN(a, b) (((a) < (b)) ? (a) : (b))
#define CLAMP(a, b, c) MIN(MAX(a, b), c) // double sided clip of input a
#define TOPCLAMP(a, b) (((a) < (b)) ? (a) : (b)) // single top side clip of input a
// Error and Exit Handling Macros...
// *********************************************************************
// Full error handling macro with Cleanup() callback (if supplied)...
// (Companion Inline Function lower on page)
#define shrCheckErrorEX(a, b, c) __shrCheckErrorEX(a, b, c, __FILE__ , __LINE__)
// Short version without Cleanup() callback pointer
// Both Input (a) and Reference (b) are specified as args
#define shrCheckError(a, b) shrCheckErrorEX(a, b, 0)
// Standardized Exit Macro for leaving main()... extended version
// (Companion Inline Function lower on page)
#define shrExitEX(a, b, c) __shrExitEX(a, b, c)
// Standardized Exit Macro for leaving main()... short version
// (Companion Inline Function lower on page)
#define shrEXIT(a, b) __shrExitEX(a, b, EXIT_SUCCESS)
// Simple argument checker macro
#define ARGCHECK(a) if((a) != shrTRUE)return shrFALSE
// Define for user-customized error handling
#define STDERROR "file %s, line %i\n\n" , __FILE__ , __LINE__
// Function to deallocate memory allocated within shrUtils
// *********************************************************************
extern "C" void shrFree(void* ptr);
// *********************************************************************
// Helper function to log standardized information to Console, to File or to both
//! Examples: shrLogEx(LOGBOTH, 0, "Function A\n");
//! : shrLogEx(LOGBOTH | ERRORMSG, ciErrNum, STDERROR);
//!
//! Automatically opens file and stores handle if needed and not done yet
//! Closes file and nulls handle on request
//!
//! @param 0 iLogMode: LOGCONSOLE, LOGFILE, LOGBOTH, APPENDMODE, MASTER, ERRORMSG, CLOSELOG.
//! LOGFILE and LOGBOTH may be | 'd with APPENDMODE to select file append mode instead of overwrite mode
//! LOGFILE and LOGBOTH may be | 'd with CLOSELOG to "write and close"
//! First 3 options may be | 'd with MASTER to enable independent write to master data log file
//! First 3 options may be | 'd with ERRORMSG to start line with standard error message
//! @param 2 dValue:
//! Positive val = double value for time in secs to be formatted to 6 decimals.
//! Negative val is an error code and this give error preformatting.
//! @param 3 cFormatString: String with formatting specifiers like printf or fprintf.
//! ALL printf flags, width, precision and type specifiers are supported with this exception:
//! Wide char type specifiers intended for wprintf (%S and %C) are NOT supported
//! Single byte char type specifiers (%s and %c) ARE supported
//! @param 4... variable args: like printf or fprintf. Must match format specifer type above.
//! @return 0 if OK, negative value on error or if error occurs or was passed in.
// *********************************************************************
extern "C" int shrLogEx(int iLogMode, int iErrNum, const char* cFormatString, ...);
// Short version of shrLogEx defaulting to shrLogEx(LOGBOTH, 0,
// *********************************************************************
extern "C" int shrLog(const char* cFormatString, ...);
// *********************************************************************
// Delta timer function for up to 3 independent timers using host high performance counters
// Maintains state for 3 independent counters
//! Example: double dElapsedTime = shrDeltaT(0);
//!
//! @param 0 iCounterID: Which timer to check/reset. (0, 1, 2)
//! @return delta time of specified counter since last call in seconds. Otherwise -9999.0 if error
// *********************************************************************
extern "C" double shrDeltaT(int iCounterID);
// Optional LogFileNameOverride function
// *********************************************************************
extern "C" void shrSetLogFileName (const char* cOverRideName);
// Helper function to init data arrays
// *********************************************************************
extern "C" void shrFillArray(float* pfData, int iSize);
// Helper function to print data arrays
// *********************************************************************
extern "C" void shrPrintArray(float* pfData, int iSize);
////////////////////////////////////////////////////////////////////////////
//! Find the path for a filename
//! @return the path if succeeded, otherwise 0
//! @param filename name of the file
//! @param executablePath optional absolute path of the executable
////////////////////////////////////////////////////////////////////////////
extern "C" char* shrFindFilePath(const char* filename, const char* executablePath);
////////////////////////////////////////////////////////////////////////////
//! Read file \filename containing single precision floating point data
//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE
//! @param filename name of the source file
//! @param data uninitialized pointer, returned initialized and pointing to
//! the data read
//! @param len number of data elements in data, -1 on error
//! @note If a NULL pointer is passed to this function and it is initialized
//! within shrUtils, then free() has to be used to deallocate the memory
////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrReadFilef( const char* filename, float** data, unsigned int* len,
bool verbose = false);
////////////////////////////////////////////////////////////////////////////
//! Read file \filename containing double precision floating point data
//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE
//! @param filename name of the source file
//! @param data uninitialized pointer, returned initialized and pointing to
//! the data read
//! @param len number of data elements in data, -1 on error
//! @note If a NULL pointer is passed to this function and it is initialized
//! within shrUtils, then free() has to be used to deallocate the memory
////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrReadFiled( const char* filename, double** data, unsigned int* len,
bool verbose = false);
////////////////////////////////////////////////////////////////////////////
//! Read file \filename containing integer data
//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE
//! @param filename name of the source file
//! @param data uninitialized pointer, returned initialized and pointing to
//! the data read
//! @param len number of data elements in data, -1 on error
//! @note If a NULL pointer is passed to this function and it is initialized
//! within shrUtils, then free() has to be used to deallocate the memory
////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrReadFilei( const char* filename, int** data, unsigned int* len, bool verbose = false);
////////////////////////////////////////////////////////////////////////////
//! Read file \filename containing unsigned integer data
//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE
//! @param filename name of the source file
//! @param data uninitialized pointer, returned initialized and pointing to
//! the data read
//! @param len number of data elements in data, -1 on error
//! @note If a NULL pointer is passed to this function and it is initialized
//! within shrUtils, then free() has to be used to deallocate the memory
////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrReadFileui( const char* filename, unsigned int** data,
unsigned int* len, bool verbose = false);
////////////////////////////////////////////////////////////////////////////
//! Read file \filename containing char / byte data
//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE
//! @param filename name of the source file
//! @param data uninitialized pointer, returned initialized and pointing to
//! the data read
//! @param len number of data elements in data, -1 on error
//! @note If a NULL pointer is passed to this function and it is initialized
//! within shrUtils, then free() has to be used to deallocate the memory
////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrReadFileb( const char* filename, char** data, unsigned int* len,
bool verbose = false);
////////////////////////////////////////////////////////////////////////////
//! Read file \filename containing unsigned char / byte data
//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE
//! @param filename name of the source file
//! @param data uninitialized pointer, returned initialized and pointing to
//! the data read
//! @param len number of data elements in data, -1 on error
//! @note If a NULL pointer is passed to this function and it is initialized
//! within shrUtils, then free() has to be used to deallocate the memory
////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrReadFileub( const char* filename, unsigned char** data,
unsigned int* len, bool verbose = false);
////////////////////////////////////////////////////////////////////////////
//! Write a data file \filename containing single precision floating point
//! data
//! @return shrTRUE if writing the file succeeded, otherwise shrFALSE
//! @param filename name of the file to write
//! @param data pointer to data to write
//! @param len number of data elements in data, -1 on error
//! @param epsilon epsilon for comparison
////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrWriteFilef( const char* filename, const float* data, unsigned int len,
const float epsilon, bool verbose = false);
////////////////////////////////////////////////////////////////////////////
//! Write a data file \filename containing double precision floating point
//! data
//! @return shrTRUE if writing the file succeeded, otherwise shrFALSE
//! @param filename name of the file to write
//! @param data pointer to data to write
//! @param len number of data elements in data, -1 on error
//! @param epsilon epsilon for comparison
////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrWriteFiled( const char* filename, const float* data, unsigned int len,
const double epsilon, bool verbose = false);
////////////////////////////////////////////////////////////////////////////
//! Write a data file \filename containing integer data
//! @return shrTRUE if writing the file succeeded, otherwise shrFALSE
//! @param filename name of the file to write
//! @param data pointer to data to write
//! @param len number of data elements in data, -1 on error
////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrWriteFilei( const char* filename, const int* data, unsigned int len,
bool verbose = false);
////////////////////////////////////////////////////////////////////////////
//! Write a data file \filename containing unsigned integer data
//! @return shrTRUE if writing the file succeeded, otherwise shrFALSE
//! @param filename name of the file to write
//! @param data pointer to data to write
//! @param len number of data elements in data, -1 on error
////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrWriteFileui( const char* filename, const unsigned int* data,
unsigned int len, bool verbose = false);
////////////////////////////////////////////////////////////////////////////
//! Write a data file \filename containing char / byte data
//! @return shrTRUE if writing the file succeeded, otherwise shrFALSE
//! @param filename name of the file to write
//! @param data pointer to data to write
//! @param len number of data elements in data, -1 on error
////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrWriteFileb( const char* filename, const char* data, unsigned int len,
bool verbose = false);
////////////////////////////////////////////////////////////////////////////
//! Write a data file \filename containing unsigned char / byte data
//! @return shrTRUE if writing the file succeeded, otherwise shrFALSE
//! @param filename name of the file to write
//! @param data pointer to data to write
//! @param len number of data elements in data, -1 on error
////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrWriteFileub( const char* filename, const unsigned char* data,
unsigned int len, bool verbose = false);
////////////////////////////////////////////////////////////////////////////
//! Load PPM image file (with unsigned char as data element type), padding
//! 4th component
//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE
//! @param file name of the image file
//! @param OutData handle to the data read
//! @param w width of the image
//! @param h height of the image
//!
//! Note: If *OutData is NULL this function allocates buffer that must be freed by caller
////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrLoadPPM4ub(const char* file, unsigned char** OutData,
unsigned int *w, unsigned int *h);
////////////////////////////////////////////////////////////////////////////
//! Save PPM image file (with unsigned char as data element type, padded to
//! 4 bytes)
//! @return shrTRUE if saving the file succeeded, otherwise shrFALSE
//! @param file name of the image file
//! @param data handle to the data read
//! @param w width of the image
//! @param h height of the image
////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrSavePPM4ub( const char* file, unsigned char *data,
unsigned int w, unsigned int h);
////////////////////////////////////////////////////////////////////////////////
//! Save PGM image file (with unsigned char as data element type)
//! @return shrTRUE if saving the file succeeded, otherwise shrFALSE
//! @param file name of the image file
//! @param data handle to the data read
//! @param w width of the image
//! @param h height of the image
////////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrSavePGMub( const char* file, unsigned char *data,
unsigned int w, unsigned int h);
////////////////////////////////////////////////////////////////////////////
//! Load PGM image file (with unsigned char as data element type)
//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE
//! @param file name of the image file
//! @param data handle to the data read
//! @param w width of the image
//! @param h height of the image
//! @note If a NULL pointer is passed to this function and it is initialized
//! within shrUtils, then free() has to be used to deallocate the memory
////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrLoadPGMub( const char* file, unsigned char** data,
unsigned int *w,unsigned int *h);
////////////////////////////////////////////////////////////////////////////
// Command line arguments: General notes
// * All command line arguments begin with '--' followed by the token;
// token and value are separated by '='; example --samples=50
// * Arrays have the form --model=[one.obj,two.obj,three.obj]
// (without whitespaces)
////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////
//! Check if command line argument \a flag-name is given
//! @return shrTRUE if command line argument \a flag_name has been given,
//! otherwise shrFALSE
//! @param argc argc as passed to main()
//! @param argv argv as passed to main()
//! @param flag_name name of command line flag
////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrCheckCmdLineFlag( const int argc, const char** argv,
const char* flag_name);
////////////////////////////////////////////////////////////////////////////
//! Get the value of a command line argument of type int
//! @return shrTRUE if command line argument \a arg_name has been given and
//! is of the requested type, otherwise shrFALSE
//! @param argc argc as passed to main()
//! @param argv argv as passed to main()
//! @param arg_name name of the command line argument
//! @param val value of the command line argument
////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrGetCmdLineArgumenti( const int argc, const char** argv,
const char* arg_name, int* val);
////////////////////////////////////////////////////////////////////////////
//! Get the value of a command line argument of type unsigned int
//! @return shrTRUE if command line argument \a arg_name has been given and
//! is of the requested type, otherwise shrFALSE
//! @param argc argc as passed to main()
//! @param argv argv as passed to main()
//! @param arg_name name of the command line argument
//! @param val value of the command line argument
////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrGetCmdLineArgumentu( const int argc, const char** argv,
const char* arg_name, unsigned int* val);
////////////////////////////////////////////////////////////////////////////
//! Get the value of a command line argument of type float
//! @return shrTRUE if command line argument \a arg_name has been given and
//! is of the requested type, otherwise shrFALSE
//! @param argc argc as passed to main()
//! @param argv argv as passed to main()
//! @param arg_name name of the command line argument
//! @param val value of the command line argument
////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrGetCmdLineArgumentf( const int argc, const char** argv,
const char* arg_name, float* val);
////////////////////////////////////////////////////////////////////////////
//! Get the value of a command line argument of type string
//! @return shrTRUE if command line argument \a arg_name has been given and
//! is of the requested type, otherwise shrFALSE
//! @param argc argc as passed to main()
//! @param argv argv as passed to main()
//! @param arg_name name of the command line argument
//! @param val value of the command line argument
////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrGetCmdLineArgumentstr( const int argc, const char** argv,
const char* arg_name, char** val);
////////////////////////////////////////////////////////////////////////////
//! Get the value of a command line argument list whose elements are strings
//! @return shrTRUE if command line argument \a arg_name has been given and
//! is of the requested type, otherwise shrFALSE
//! @param argc argc as passed to main()
//! @param argv argv as passed to main()
//! @param arg_name name of the command line argument
//! @param val command line argument list
//! @param len length of the list / number of elements
////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrGetCmdLineArgumentListstr( const int argc, const char** argv,
const char* arg_name, char** val,
unsigned int* len);
////////////////////////////////////////////////////////////////////////////
//! Compare two float arrays
//! @return shrTRUE if \a reference and \a data are identical, otherwise shrFALSE
//! @param reference handle to the reference data / gold image
//! @param data handle to the computed data
//! @param len number of elements in reference and data
////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrComparef( const float* reference, const float* data,
const unsigned int len);
////////////////////////////////////////////////////////////////////////////
//! Compare two integer arrays
//! @return shrTRUE if \a reference and \a data are identical, otherwise shrFALSE
//! @param reference handle to the reference data / gold image
//! @param data handle to the computed data
//! @param len number of elements in reference and data
////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrComparei( const int* reference, const int* data,
const unsigned int len );
////////////////////////////////////////////////////////////////////////////////
//! Compare two unsigned integer arrays, with epsilon and threshold
//! @return shrTRUE if \a reference and \a data are identical, otherwise shrFALSE
//! @param reference handle to the reference data / gold image
//! @param data handle to the computed data
//! @param len number of elements in reference and data
//! @param threshold tolerance % # of comparison errors (0.15f = 15%)
////////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrCompareuit( const unsigned int* reference, const unsigned int* data,
const unsigned int len, const float epsilon, const float threshold );
////////////////////////////////////////////////////////////////////////////
//! Compare two unsigned char arrays
//! @return shrTRUE if \a reference and \a data are identical, otherwise shrFALSE
//! @param reference handle to the reference data / gold image
//! @param data handle to the computed data
//! @param len number of elements in reference and data
////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrCompareub( const unsigned char* reference, const unsigned char* data,
const unsigned int len );
////////////////////////////////////////////////////////////////////////////////
//! Compare two unsigned char arrays with a tolerance for # of byte errors
//! @return shrTRUE if \a reference and \a data are identical, otherwise shrFALSE
//! @param reference handle to the reference data / gold image
//! @param data handle to the computed data
//! @param len number of elements in reference and data
//! @param epsilon epsilon to use for the comparison
//! @param threshold tolerance % # of comparison errors (0.15f = 15%)
////////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrCompareubt( const unsigned char* reference, const unsigned char* data,
const unsigned int len, const float epsilon, const float threshold );
////////////////////////////////////////////////////////////////////////////////
//! Compare two unsigned char arrays with an epsilon tolerance for equality
//! @return shrTRUE if \a reference and \a data are identical, otherwise shrFALSE
//! @param reference handle to the reference data / gold image
//! @param data handle to the computed data
//! @param len number of elements in reference and data
//! @param epsilon epsilon to use for the comparison
////////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrCompareube( const unsigned char* reference, const unsigned char* data,
const unsigned int len, const float epsilon );
////////////////////////////////////////////////////////////////////////////
//! Compare two float arrays with an epsilon tolerance for equality
//! @return shrTRUE if \a reference and \a data are identical, otherwise shrFALSE
//! @param reference handle to the reference data / gold image
//! @param data handle to the computed data
//! @param len number of elements in reference and data
//! @param epsilon epsilon to use for the comparison
////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrComparefe( const float* reference, const float* data,
const unsigned int len, const float epsilon );
////////////////////////////////////////////////////////////////////////////////
//! Compare two float arrays with an epsilon tolerance for equality and a
//! threshold for # pixel errors
//! @return shrTRUE if \a reference and \a data are identical, otherwise shrFALSE
//! @param reference handle to the reference data / gold image
//! @param data handle to the computed data
//! @param len number of elements in reference and data
//! @param epsilon epsilon to use for the comparison
//! @param threshold threshold for # pixel errors
//!        NOTE(review): presumably a fraction like the other thresholds in this
//!        header (0.15f = 15%) - confirm against the implementation
////////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrComparefet( const float* reference, const float* data,
const unsigned int len, const float epsilon, const float threshold );
////////////////////////////////////////////////////////////////////////////
//! Compare two float arrays using L2-norm with an epsilon tolerance for
//! equality
//! @return shrTRUE if \a reference and \a data are identical, otherwise shrFALSE
//! @param reference handle to the reference data / gold image
//! @param data handle to the computed data
//! @param len number of elements in reference and data
//! @param epsilon epsilon to use for the comparison
////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrCompareL2fe( const float* reference, const float* data,
const unsigned int len, const float epsilon );
////////////////////////////////////////////////////////////////////////////////
//! Compare two PPM image files with an epsilon tolerance for equality
//! @return shrTRUE if \a src_file and \a ref_file are identical, otherwise shrFALSE
//! @param src_file filename for the image to be compared
//! @param ref_file filename for the reference data / gold image
//! @param epsilon epsilon to use for the comparison
//! @param threshold threshold of pixels that can still mismatch to pass (i.e. 0.15f = 15% must pass)
//!        NOTE(review): the wording above is ambiguous (mismatch vs. must pass) - confirm against the implementation
//! NOTE(review): this comment used to list a "verboseErrors" parameter (output details of
//! image mismatch to std::err); the declaration below takes no such parameter
////////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrComparePPM( const char *src_file, const char *ref_file, const float epsilon, const float threshold);
////////////////////////////////////////////////////////////////////////////////
//! Compare two PGM image files with an epsilon tolerance for equality
//! @return shrTRUE if \a src_file and \a ref_file are identical, otherwise shrFALSE
//! @param src_file filename for the image to be compared
//! @param ref_file filename for the reference data / gold image
//! @param epsilon epsilon to use for the comparison
//! @param threshold threshold of pixels that can still mismatch to pass (i.e. 0.15f = 15% must pass)
//!        NOTE(review): the wording above is ambiguous (mismatch vs. must pass) - confirm against the implementation
//! NOTE(review): this comment used to list a "verboseErrors" parameter (output details of
//! image mismatch to std::err); the declaration below takes no such parameter
////////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrComparePGM( const char *src_file, const char *ref_file, const float epsilon, const float threshold);
// Load a raw file by name.
// NOTE(review): presumably reads `size` bytes of `filename` into a newly allocated buffer and
// returns it; ownership of the buffer and the failure value are not visible here - confirm
// against the implementation.
extern "C" unsigned char* shrLoadRawFile(const char* filename, size_t size);
// Round global_size up with respect to group_size.
// The DotProduct sample uses the result as a global work size "rounded up to the nearest
// multiple of the LocalWorkSize"; behaviour for group_size <= 0 is not visible here.
extern "C" size_t shrRoundUp(int group_size, int global_size);
// Companion inline function for error checking and exit on error WITH Cleanup Callback (if supplied).
// Does nothing when iSample equals iReference. On a mismatch it logs the file and line, then
// either calls pCleanup(EXIT_FAILURE) or, when no callback was given, closes the log and exits.
// *********************************************************************
inline void __shrCheckErrorEX(int iSample, int iReference, void (*pCleanup)(int), const char* cFile, const int iLine)
{
    // Sample matches the reference: nothing to report
    if (iSample == iReference)
    {
        return;
    }

    shrLogEx(LOGBOTH | ERRORMSG, iSample, "line %i , in file %s !!!\n\n" , iLine, cFile);

    // No callback supplied: close the log and terminate from here
    if (pCleanup == NULL)
    {
        shrLogEx(LOGBOTH | CLOSELOG, 0, "Exiting...\n");
        exit(EXIT_FAILURE);
    }

    // Callback supplied: it decides what happens next; if it returns, so does this function
    pCleanup(EXIT_FAILURE);
}
// Standardized Exit
// Optionally waits for <Enter>, flushes stderr and terminates the process with iExitCode.
// *********************************************************************
inline void __shrExitEX(int argc, const char** argv, int iExitCode)
{
    // Whether to pause for a key press. With WIN32 defined the pause is the default and
    // "noprompt" disables it; otherwise it must be requested with "prompt".
    // "qatest" suppresses the pause in both cases.
#ifdef WIN32
    const bool bWaitForEnter = !shrCheckCmdLineFlag(argc, argv, "noprompt") && !shrCheckCmdLineFlag(argc, argv, "qatest");
#else
    const bool bWaitForEnter = shrCheckCmdLineFlag(argc, argv, "prompt") && !shrCheckCmdLineFlag(argc, argv, "qatest");
#endif

    if (!bWaitForEnter)
    {
        shrLogEx(LOGBOTH | CLOSELOG, 0, "%s Exiting...\n", argv[0]);
    }
    else
    {
        shrLogEx(LOGBOTH | CLOSELOG, 0, "\nPress <Enter> to Quit...\n");
        getchar();
    }

    fflush(stderr);
    exit(iExitCode);
}
#endif

View File

@@ -1,29 +0,0 @@
/*
* Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*
*/
// One work item per output element: c[i] receives the sum of the four products
// a[4*i + k] * b[4*i + k] for k = 0..3.
__kernel void DotProduct (__global float* a, __global float* b, __global float* c, int iNumElements)
{
    // position of this work item in the global range
    const int iGID = get_global_id(0);

    // bound check: work items at or beyond iNumElements do nothing
    if (iGID < iNumElements)
    {
        // every output consumes four consecutive inputs, so the input offset is iGID * 4
        const int iInOffset = iGID << 2;
        const __global float* pA = a + iInOffset;
        const __global float* pB = b + iInOffset;

        c[iGID] = pA[0] * pB[0]
                + pA[1] * pB[1]
                + pA[2] * pB[2]
                + pA[3] * pB[3];
    }
}

View File

@@ -1,66 +0,0 @@
# Locations of the toolchain, the POCL compiler/headers/libraries and the Vortex runtime/simulator.
# "?=" only assigns when the variable is not already set, so each path can be overridden from
# the environment or the make command line.
RISCV_TOOL_PATH ?= $(wildcard ../../../../riscv-gnu-toolchain/drops)
POCL_CC_PATH ?= $(wildcard ../../../../pocl/drops_riscv_cc)
POCL_INC_PATH ?= $(wildcard ../include)
POCL_LIB_PATH ?= $(wildcard ../lib)
VX_RT_PATH ?= $(wildcard ../../../runtime)
VX_SIMX_PATH ?= $(wildcard ../../../simX/obj_dir)
# RISC-V cross tools (HEX is objcopy, DMP is objdump)
CC = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-gcc
CXX = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-g++
DMP = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-objdump
HEX = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-objcopy
GDB = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-gdb
# Vortex runtime sources compiled directly into the .elf image
VX_SRCS = $(VX_RT_PATH)/newlib/newlib.c
VX_SRCS += $(VX_RT_PATH)/startup/vx_start.S
VX_SRCS += $(VX_RT_PATH)/intrinsics/vx_intrinsics.s
VX_SRCS += $(VX_RT_PATH)/io/vx_io.s $(VX_RT_PATH)/io/vx_io.c
VX_SRCS += $(VX_RT_PATH)/fileio/fileio.s
VX_SRCS += $(VX_RT_PATH)/tests/tests.c
VX_SRCS += $(VX_RT_PATH)/vx_api/vx_api.c
# NOTE(review): none of the variables on the next line are defined in this Makefile, so they
# add nothing unless supplied from outside
VX_SRCS += $(VX_STR) $(VX_FIO) $(VX_NEWLIB) $(VX_INT) $(VX_IO) $(VX_API) $(VX_TEST)
# No default startup files; static link using the Vortex linker script
VX_CFLAGS = -nostartfiles -Wl,-Bstatic,-T,$(VX_RT_PATH)/mains/vortex_link.ld
CXXFLAGS = -g -O0 -march=rv32im -mabi=ilp32
CXXFLAGS += -ffreestanding # program may not begin at main()
CXXFLAGS += -Wl,--gc-sections # enable garbage collection of unused input sections
CXXFLAGS += -fno-rtti -fno-non-call-exceptions # disable RTTI and exceptions
CXXFLAGS += -I$(POCL_INC_PATH) -I.
# The kernel library is linked with --whole-archive so that all of its objects are kept.
# $(PROJECT) is defined further down; that works because "=" variables are expanded when used.
VX_LIBS = -Wl,--whole-archive lib$(PROJECT).a -Wl,--no-whole-archive $(POCL_LIB_PATH)/libOpenCL.a
QEMU_LIBS = $(VX_RT_PATH)/qemu/vx_api.c -Wl,--whole-archive lib$(PROJECT).a -Wl,--no-whole-archive $(POCL_LIB_PATH)/libOpenCL.a
PROJECT=DotProduct
# "all" names an action, not a file
.PHONY: all
all: $(PROJECT).dump $(PROJECT).hex

# Compile the OpenCL kernel into a static library with poclcc.
# The recipe compiles $< (the prerequisite) into $@ (the target), so the file that triggers a
# rebuild is always the file that gets compiled.
lib$(PROJECT).a: DotProduct.cl
	POCL_DEBUG=all POCL_DEBUG_LLVM_PASSES=1 LD_LIBRARY_PATH=$(RISCV_TOOL_PATH)/lib:$(POCL_CC_PATH)/lib $(POCL_CC_PATH)/bin/poclcc -o $@ $<

# Image for the Vortex simulator: host program + Vortex runtime sources + kernel library
$(PROJECT).elf: main.cc lib$(PROJECT).a
	$(CXX) $(CXXFLAGS) $(VX_CFLAGS) $(VX_SRCS) main.cc $(VX_LIBS) -o $(PROJECT).elf

# Image for qemu: host program + qemu flavour of vx_api + kernel library
$(PROJECT).qemu: main.cc lib$(PROJECT).a
	$(CXX) $(CXXFLAGS) main.cc $(QEMU_LIBS) -o $(PROJECT).qemu

# Intel-hex image of the .elf
$(PROJECT).hex: $(PROJECT).elf
	$(HEX) -O ihex $(PROJECT).elf $(PROJECT).hex

# Full disassembly of the .elf
$(PROJECT).dump: $(PROJECT).elf
	$(DMP) -D $(PROJECT).elf > $(PROJECT).dump
# These targets name actions, not files. Declaring them phony keeps a stray file called
# "run" or "clean" from making the target look up to date.
.PHONY: run qemu gdb-s gdb-c clean

# Run the hex image on the simX simulator; stdout goes to emulator.debug
run: $(PROJECT).hex
	POCL_DEBUG=all $(VX_SIMX_PATH)/Vcache_simX -E -a rv32i --core $(PROJECT).hex -s -b 1> emulator.debug

# Run the qemu image under qemu-riscv32, logging translated input assembly to debug.log
qemu: $(PROJECT).qemu
	POCL_DEBUG=all $(RISCV_TOOL_PATH)/bin/qemu-riscv32 -d in_asm -D debug.log $(PROJECT).qemu

# Debug server side: same as "qemu" but started with "-g 1234" for a debugger to attach
gdb-s: $(PROJECT).qemu
	POCL_DEBUG=all $(RISCV_TOOL_PATH)/bin/qemu-riscv32 -g 1234 -d in_asm -D debug.log $(PROJECT).qemu

# Debug client side: start gdb on the qemu image
gdb-c: $(PROJECT).qemu
	$(GDB) $(PROJECT).qemu

# NOTE(review): lib$(PROJECT).a, $(PROJECT).qemu, emulator.debug and debug.log are not removed
clean:
	rm -rf *.elf *.dump *.hex

View File

@@ -1,270 +0,0 @@
/*
* Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*
*/
// *********************************************************************
// oclDotProduct Notes:
//
// A simple OpenCL API demo application that implements a
// vector dot product computation between 2 float arrays.
//
// Runs computations with OpenCL on the GPU device and then checks results
// against basic host CPU/C++ computation.
//
// Uses 'shr' and 'ocl' functions from the oclUtils and shrUtils libraries for compactness,
// but these are NOT required libraries for OpenCL development in general.
// *********************************************************************
// standard utilities and systems includes
#include <oclUtils.h>
#include <shrQATest.h>
// Name of the file with the source code for the computation kernel
// *********************************************************************
const char* cSourceFile = "DotProduct.cl";
// Host buffers for demo (allocated in main(), freed in Cleanup())
// *********************************************************************
void *srcA, *srcB, *dst; // Host buffers for OpenCL test
void* Golden; // Host buffer for host golden processing cross check
// OpenCL Vars (file-scope so that Cleanup() can release them)
cl_platform_id cpPlatform; // OpenCL platform
cl_device_id *cdDevices; // OpenCL device list (allocated in main(), freed in Cleanup())
cl_context cxGPUContext; // OpenCL context
cl_command_queue cqCommandQueue;// OpenCL command queue
cl_program program; // OpenCL program
cl_kernel ckKernel; // OpenCL kernel
cl_mem cmDevSrcA; // OpenCL device source buffer A
cl_mem cmDevSrcB; // OpenCL device source buffer B
cl_mem cmDevDst; // OpenCL device destination buffer
size_t szGlobalWorkSize; // Total # of work items in the 1D range
size_t szLocalWorkSize; // # of work items in the 1D work group
size_t szParmDataBytes; // Byte size of context information
size_t szKernelLength; // Byte size of kernel code
cl_int ciErrNum; // Error code var
char* cPathAndName = NULL; // var for full paths to data, src, etc.
char* cSourceCL = NULL; // Buffer to hold source for compilation
const char* cExecutableName = NULL;
// demo config vars
int iNumElements= 1277944; // Length of float arrays to process (not a multiple of the local work size of 256, for illustration)
shrBOOL bNoPrompt = shrFALSE;
// Forward Declarations
// *********************************************************************
void DotProductHost(const float* pfData1, const float* pfData2, float* pfResult, int iNumElements);
void Cleanup (int iExitCode);
void (*pCleanup)(int) = &Cleanup;
// Pointers to main()'s argc/argv; Cleanup() dereferences them when reporting the QA result
int *gp_argc = NULL;
char ***gp_argv = NULL;
// Main function
// *********************************************************************
int main(int argc, char **argv)
{
gp_argc = &argc;
gp_argv = &argv;
shrQAStart(argc, argv);
// Get the NVIDIA platform
ciErrNum = oclGetPlatformID(&cpPlatform);
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL);
shrLog("clGetPlatformID...\n");
// Get the NVIDIA platform
ciErrNum = oclGetPlatformID(&cpPlatform);
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL);
shrLog("clGetPlatformID...\n");
//Get all the devices
cl_uint uiNumDevices = 0; // Number of devices available
cl_uint uiTargetDevice = 0; // Default Device to compute on
cl_uint uiNumComputeUnits; // Number of compute units (SM's on NV GPU)
shrLog("Get the Device info and select Device...\n");
ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_DEFAULT, 0, NULL, &uiNumDevices);
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL);
cdDevices = (cl_device_id *)malloc(uiNumDevices * sizeof(cl_device_id) );
ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_DEFAULT, uiNumDevices, cdDevices, NULL);
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL);
// Get command line device options and config accordingly
shrLog(" # of Devices Available = %u\n", uiNumDevices);
if(shrGetCmdLineArgumentu(argc, (const char**)argv, "device", &uiTargetDevice)== shrTRUE)
{
uiTargetDevice = CLAMP(uiTargetDevice, 0, (uiNumDevices - 1));
}
shrLog(" Using Device %u: ", uiTargetDevice);
oclPrintDevName(LOGBOTH, cdDevices[uiTargetDevice]);
ciErrNum = clGetDeviceInfo(cdDevices[uiTargetDevice], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(uiNumComputeUnits), &uiNumComputeUnits, NULL);
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL);
shrLog("\n # of Compute Units = %u\n", uiNumComputeUnits);
// get command line arg for quick test, if provided
bNoPrompt = shrCheckCmdLineFlag(argc, (const char**)argv, "noprompt");
// start logs
cExecutableName = argv[0];
shrSetLogFileName ("oclDotProduct.txt");
shrLog("%s Starting...\n\n# of float elements per Array \t= %u\n", argv[0], iNumElements);
// set and log Global and Local work size dimensions
szLocalWorkSize = 256;
szGlobalWorkSize = shrRoundUp((int)szLocalWorkSize, iNumElements); // rounded up to the nearest multiple of the LocalWorkSize
shrLog("Global Work Size \t\t= %u\nLocal Work Size \t\t= %u\n# of Work Groups \t\t= %u\n\n",
szGlobalWorkSize, szLocalWorkSize, (szGlobalWorkSize % szLocalWorkSize + szGlobalWorkSize/szLocalWorkSize));
// Allocate and initialize host arrays
shrLog( "Allocate and Init Host Mem...\n");
srcA = (void *)malloc(sizeof(cl_float4) * szGlobalWorkSize);
srcB = (void *)malloc(sizeof(cl_float4) * szGlobalWorkSize);
dst = (void *)malloc(sizeof(cl_float) * szGlobalWorkSize);
Golden = (void *)malloc(sizeof(cl_float) * iNumElements);
shrFillArray((float*)srcA, 4 * iNumElements);
shrFillArray((float*)srcB, 4 * iNumElements);
// Get the NVIDIA platform
ciErrNum = oclGetPlatformID(&cpPlatform);
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
// Get a GPU device
ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_DEFAULT, 1, &cdDevices[uiTargetDevice], NULL);
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
// Create the context
cxGPUContext = clCreateContext(0, 1, &cdDevices[uiTargetDevice], NULL, NULL, &ciErrNum);
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
// Create a command-queue
shrLog("clCreateCommandQueue...\n");
cqCommandQueue = clCreateCommandQueue(cxGPUContext, cdDevices[uiTargetDevice], 0, &ciErrNum);
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
// Allocate the OpenCL buffer memory objects for source and result on the device GMEM
shrLog("clCreateBuffer (SrcA, SrcB and Dst in Device GMEM)...\n");
cmDevSrcA = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY, sizeof(cl_float) * szGlobalWorkSize * 4, NULL, &ciErrNum);
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
cmDevSrcB = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY, sizeof(cl_float) * szGlobalWorkSize * 4, NULL, &ciErrNum);
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
cmDevDst = clCreateBuffer(cxGPUContext, CL_MEM_WRITE_ONLY, sizeof(cl_float) * szGlobalWorkSize, NULL, &ciErrNum);
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
// Read the OpenCL kernel in from source file
shrLog("oclLoadProgSource (%s)...\n", cSourceFile);
cPathAndName = shrFindFilePath(cSourceFile, argv[0]);
//oclCheckErrorEX(cPathAndName != NULL, shrTRUE, pCleanup);
cSourceCL = oclLoadProgSource(cPathAndName, "", &szKernelLength);
//oclCheckErrorEX(cSourceCL != NULL, shrTRUE, pCleanup);
// Create the program
shrLog("clCreateProgramWithSource...\n");
//program = clCreateProgramWithSource(cxGPUContext, 1, (const char **)&cSourceCL, &szKernelLength, &ciErrNum);
cl_program program =
clCreateProgramWithBuiltInKernels(context, 1, &device_id, "sgemm", NULL);
// Build the program with 'mad' Optimization option
#ifdef MAC
char* flags = "-cl-fast-relaxed-math -DMAC";
#else
char* flags = "-cl-fast-relaxed-math";
#endif
shrLog("clBuildProgram...\n");
ciErrNum = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
if (ciErrNum != CL_SUCCESS)
{
// write out standard error, Build Log and PTX, then cleanup and exit
shrLogEx(LOGBOTH | ERRORMSG, ciErrNum, STDERROR);
oclLogBuildInfo(program, oclGetFirstDev(cxGPUContext));
oclLogPtx(program, oclGetFirstDev(cxGPUContext), "oclDotProduct.ptx");
Cleanup(EXIT_FAILURE);
}
// Create the kernel
shrLog("clCreateKernel (DotProduct)...\n");
ckKernel = clCreateKernel(program, "DotProduct", &ciErrNum);
// Set the Argument values
shrLog("clSetKernelArg 0 - 3...\n\n");
ciErrNum = clSetKernelArg(ckKernel, 0, sizeof(cl_mem), (void*)&cmDevSrcA);
ciErrNum |= clSetKernelArg(ckKernel, 1, sizeof(cl_mem), (void*)&cmDevSrcB);
ciErrNum |= clSetKernelArg(ckKernel, 2, sizeof(cl_mem), (void*)&cmDevDst);
ciErrNum |= clSetKernelArg(ckKernel, 3, sizeof(cl_int), (void*)&iNumElements);
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
// --------------------------------------------------------
// Core sequence... copy input data to GPU, compute, copy results back
// Asynchronous write of data to GPU device
shrLog("clEnqueueWriteBuffer (SrcA and SrcB)...\n");
ciErrNum = clEnqueueWriteBuffer(cqCommandQueue, cmDevSrcA, CL_FALSE, 0, sizeof(cl_float) * szGlobalWorkSize * 4, srcA, 0, NULL, NULL);
ciErrNum |= clEnqueueWriteBuffer(cqCommandQueue, cmDevSrcB, CL_FALSE, 0, sizeof(cl_float) * szGlobalWorkSize * 4, srcB, 0, NULL, NULL);
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
// Launch kernel
shrLog("clEnqueueNDRangeKernel (DotProduct)...\n");
ciErrNum = clEnqueueNDRangeKernel(cqCommandQueue, ckKernel, 1, NULL, &szGlobalWorkSize, &szLocalWorkSize, 0, NULL, NULL);
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
// Read back results and check accumulated errors
shrLog("clEnqueueReadBuffer (Dst)...\n\n");
ciErrNum = clEnqueueReadBuffer(cqCommandQueue, cmDevDst, CL_TRUE, 0, sizeof(cl_float) * szGlobalWorkSize, dst, 0, NULL, NULL);
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
// Compute and compare results for golden-host and report errors and pass/fail
shrLog("Comparing against Host/C++ computation...\n\n");
DotProductHost ((const float*)srcA, (const float*)srcB, (float*)Golden, iNumElements);
shrBOOL bMatch = shrComparefet((const float*)Golden, (const float*)dst, (unsigned int)iNumElements, 0.0f, 0);
// Cleanup and leave
Cleanup (EXIT_SUCCESS);
}
// "Golden" Host processing dot product function for comparison purposes.
// For each of the iNumElements outputs, pfResult[i] receives the sum of the four products
// pfData1[4*i + k] * pfData2[4*i + k], k = 0..3, accumulated in that order.
// *********************************************************************
void DotProductHost(const float* pfData1, const float* pfData2, float* pfResult, int iNumElements)
{
    for (int iOut = 0; iOut < iNumElements; ++iOut)
    {
        // the four inputs that belong to this output element
        const float* pA = pfData1 + 4 * iOut;
        const float* pB = pfData2 + 4 * iOut;
        float* pOut = &pfResult[iOut];

        *pOut = 0.0f;
        for (int iLane = 0; iLane < 4; ++iLane)
        {
            *pOut += pA[iLane] * pB[iLane];
        }
    }
}
// Cleanup and exit code
// Releases everything main() may have created, then reports the QA result (QA_PASSED when
// iExitCode is EXIT_SUCCESS, QA_FAILED otherwise) through shrQAFinishExit().
// *********************************************************************
void Cleanup(int iExitCode)
{
    shrLog("Starting Cleanup...\n\n");

    // Kernel source path and text (free() accepts NULL, so no guard is needed)
    free(cPathAndName);
    free(cSourceCL);

    // OpenCL objects: release only the ones that were created
    if (ckKernel != NULL)       clReleaseKernel(ckKernel);
    if (program != NULL)        clReleaseProgram(program);
    if (cqCommandQueue != NULL) clReleaseCommandQueue(cqCommandQueue);
    if (cxGPUContext != NULL)   clReleaseContext(cxGPUContext);
    if (cmDevSrcA != NULL)      clReleaseMemObject(cmDevSrcA);
    if (cmDevSrcB != NULL)      clReleaseMemObject(cmDevSrcB);
    if (cmDevDst != NULL)       clReleaseMemObject(cmDevDst);

    // Host buffers and the device list
    free(srcA);
    free(srcB);
    free(dst);
    free(Golden);
    free(cdDevices);

    const int iQAStatus = (iExitCode == EXIT_SUCCESS) ? QA_PASSED : QA_FAILED;
    shrQAFinishExit(*gp_argc, (const char **)*gp_argv, iQAStatus);
}

View File

@@ -1,198 +0,0 @@
/*
* Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*
*/
#ifndef OCL_UTILS_H
#define OCL_UTILS_H
// *********************************************************************
// Utilities specific to OpenCL samples in NVIDIA GPU Computing SDK
// *********************************************************************
// Common headers: Cross-API utilities and OpenCL header
#include <shrUtils.h>
// All OpenCL headers
#if defined (__APPLE__) || defined(MACOSX)
#include <OpenCL/opencl.h>
#else
#include <CL/opencl.h>
#endif
// Includes
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
// For systems with CL_EXT that are not updated with these extensions, we copied these
// extensions from <CL/cl_ext.h>
#ifndef CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV
/* cl_nv_device_attribute_query extension - no extension #define since it has no functions */
#define CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV 0x4000
#define CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV 0x4001
#define CL_DEVICE_REGISTERS_PER_BLOCK_NV 0x4002
#define CL_DEVICE_WARP_SIZE_NV 0x4003
#define CL_DEVICE_GPU_OVERLAP_NV 0x4004
#define CL_DEVICE_KERNEL_EXEC_TIMEOUT_NV 0x4005
#define CL_DEVICE_INTEGRATED_MEMORY_NV 0x4006
#endif
// reminders for build output window and log
#ifdef _WIN32
#pragma message ("Note: including shrUtils.h")
#pragma message ("Note: including opencl.h")
#endif
// SDK Revision #
#define OCL_SDKREVISION "7027912"
// Error and Exit Handling Macros...
// *********************************************************************
// Full error handling macro with Cleanup() callback (if supplied)...
// (Companion Inline Function lower on page)
#define oclCheckErrorEX(a, b, c) __oclCheckErrorEX(a, b, c, __FILE__ , __LINE__)
// Short version without Cleanup() callback pointer
// Both Input (a) and Reference (b) are specified as args
#define oclCheckError(a, b) oclCheckErrorEX(a, b, 0)
//////////////////////////////////////////////////////////////////////////////
//! Gets the platform ID for NVIDIA if available, otherwise default to platform 0
//!
//! @return the id
//! @param clSelectedPlatformID OpenCL platform ID
//////////////////////////////////////////////////////////////////////////////
extern "C" cl_int oclGetPlatformID(cl_platform_id* clSelectedPlatformID);
//////////////////////////////////////////////////////////////////////////////
//! Print info about the device
//!
//! @param iLogMode enum LOGBOTH, LOGCONSOLE, LOGFILE
//! @param device OpenCL id of the device
//////////////////////////////////////////////////////////////////////////////
extern "C" void oclPrintDevInfo(int iLogMode, cl_device_id device);
//////////////////////////////////////////////////////////////////////////////
//! Get and return device capability
//!
//! @return the 2 digit integer representation of device Cap (major minor). return -1 if NA
//! @param device OpenCL id of the device
//////////////////////////////////////////////////////////////////////////////
extern "C" int oclGetDevCap(cl_device_id device);
//////////////////////////////////////////////////////////////////////////////
//! Print the device name
//!
//! @param iLogMode enum LOGBOTH, LOGCONSOLE, LOGFILE
//! @param device OpenCL id of the device
//////////////////////////////////////////////////////////////////////////////
extern "C" void oclPrintDevName(int iLogMode, cl_device_id device);
//////////////////////////////////////////////////////////////////////////////
//! Gets the id of the first device from the context
//!
//! @return the id
//! @param cxGPUContext OpenCL context
//////////////////////////////////////////////////////////////////////////////
extern "C" cl_device_id oclGetFirstDev(cl_context cxGPUContext);
//////////////////////////////////////////////////////////////////////////////
//! Gets the id of the nth device from the context
//!
//! @return the id or -1 when out of range
//! @param cxGPUContext OpenCL context
//! @param device_idx index of the device of interest
//////////////////////////////////////////////////////////////////////////////
extern "C" cl_device_id oclGetDev(cl_context cxGPUContext, unsigned int device_idx);
//////////////////////////////////////////////////////////////////////////////
//! Gets the id of device with maximal FLOPS from the context
//!
//! @return the id
//! @param cxGPUContext OpenCL context
//////////////////////////////////////////////////////////////////////////////
extern "C" cl_device_id oclGetMaxFlopsDev(cl_context cxGPUContext);
//////////////////////////////////////////////////////////////////////////////
//! Loads a Program file and prepends the cPreamble to the code.
//!
//! @return the source string if succeeded, 0 otherwise
//! @param cFilename program filename
//! @param cPreamble code that is prepended to the loaded file, typically a set of #defines or a header
//! @param szFinalLength returned length of the code string
//////////////////////////////////////////////////////////////////////////////
extern "C" char* oclLoadProgSource(const char* cFilename, const char* cPreamble, size_t* szFinalLength);
//////////////////////////////////////////////////////////////////////////////
//! Get the binary (PTX) of the program associated with the device
//!
//! @param cpProgram OpenCL program
//! @param cdDevice device of interest
//! @param binary returned code
//! @param length length of returned code
//////////////////////////////////////////////////////////////////////////////
extern "C" void oclGetProgBinary( cl_program cpProgram, cl_device_id cdDevice, char** binary, size_t* length);
//////////////////////////////////////////////////////////////////////////////
//! Get and log the binary (PTX) from the OpenCL compiler for the requested program & device
//!
//! @param cpProgram OpenCL program
//! @param cdDevice device of interest
//! @param cPtxFileName optional PTX file name
//////////////////////////////////////////////////////////////////////////////
extern "C" void oclLogPtx(cl_program cpProgram, cl_device_id cdDevice, const char* cPtxFileName);
//////////////////////////////////////////////////////////////////////////////
//! Get and log the Build Log from the OpenCL compiler for the requested program & device
//!
//! @param cpProgram OpenCL program
//! @param cdDevice device of interest
//////////////////////////////////////////////////////////////////////////////
extern "C" void oclLogBuildInfo(cl_program cpProgram, cl_device_id cdDevice);
// Helper function for De-allocating cl objects
// *********************************************************************
extern "C" void oclDeleteMemObjs(cl_mem* cmMemObjs, int iNumObjs);
// Helper function to get OpenCL error string from constant
// *********************************************************************
extern "C" const char* oclErrorString(cl_int error);
// Helper function to get OpenCL image format string (channel order and type) from constant
// *********************************************************************
extern "C" const char* oclImageFormatString(cl_uint uiImageFormat);
// Companion inline function for error checking and exit on error WITH Cleanup Callback (if supplied).
// An error is any sample/test value that differs from the reference. The error is logged with
// its OpenCL error string, then either pCleanup(error) is called or, when no callback was
// given, the log is closed and the process exits with the error as its exit code.
// *********************************************************************
inline void __oclCheckErrorEX(cl_int iSample, cl_int iReference, void (*pCleanup)(int), const char* cFile, const int iLine)
{
    // Sample equals the reference: no error
    if (iSample == iReference)
    {
        return;
    }

    // A mismatch is an error by definition, so a sample of 0 is replaced by a non-zero code
    if (iSample == 0)
    {
        iSample = -9999;
    }

    // Log the error info
    shrLog("\n !!! Error # %i (%s) at line %i , in file %s !!!\n\n", iSample, oclErrorString(iSample), iLine, cFile);

    // No cleanup callback: close the log and exit with the error code
    if (pCleanup == NULL)
    {
        shrLogEx(LOGBOTH | CLOSELOG, 0, "Exiting...\n");
        exit(iSample);
    }

    // Cleanup callback supplied: hand it the error code
    pCleanup(iSample);
}
#endif

View File

@@ -1,238 +0,0 @@
/*
* Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*
*/
#ifndef SHR_QATEST_H
#define SHR_QATEST_H
// *********************************************************************
// Generic utilities for NVIDIA GPU Computing SDK
// *********************************************************************
// OS dependent includes
#ifdef _WIN32
#pragma message ("Note: including windows.h")
#pragma message ("Note: including math.h")
#pragma message ("Note: including assert.h")
#pragma message ("Note: including time.h")
// Headers needed for Windows
#include <windows.h>
#include <time.h>
#else
// Headers needed for Linux
#include <sys/stat.h>
#include <sys/types.h>
#include <sys/time.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdarg.h>
#include <unistd.h>
#include <time.h>
#endif
#ifndef STRCASECMP
#ifdef _WIN32
#define STRCASECMP _stricmp
#else
#define STRCASECMP strcasecmp
#endif
#endif
#ifndef STRNCASECMP
#ifdef _WIN32
#define STRNCASECMP _strnicmp
#else
#define STRNCASECMP strncasecmp
#endif
#endif
// Standardized QA Start/Finish for CUDA SDK tests
#define shrQAStart(a, b) __shrQAStart(a, b)
#define shrQAFinish(a, b, c) __shrQAFinish(a, b, c)
#define shrQAFinish2(a, b, c, d) __shrQAFinish2(a, b, c, d)
// Returns the index in exec_name of the first character after the last path separator
// ('\\' or '/'), or 0 when exec_name contains no separator.
inline int findExeNameStart(const char *exec_name)
{
    // Walk backwards from the terminating NUL towards the start of the string
    for (int pos = (int)strlen(exec_name); ; --pos)
    {
        const char c = exec_name[pos];
        if (c == '\\' || c == '/')
        {
            return pos + 1;     // the name starts right after the separator
        }
        if (pos == 0)
        {
            return 0;           // reached the start without finding a separator
        }
    }
}
// Prints the standardized start banner for a test and returns the index in argv[0] at which
// the executable's base name starts. With a "qatest" flag the banner is "&&&& RUNNING"
// followed by the base name and all arguments; otherwise it is "[name] starting...".
inline int __shrQAStart(int argc, char **argv)
{
    // First clear the output buffer
    fflush(stdout);
    fflush(stdout);

    // Look for "qatest" (leading dashes are skipped; STRCASECMP ignores case)
    bool bQATest = false;
    for (int iArg = 1; iArg < argc; ++iArg)
    {
        char *pFlag = argv[iArg];
        while (*pFlag == '-')
        {
            ++pFlag;
        }
        if (STRCASECMP(pFlag, "qatest") == 0)
        {
            bQATest = true;
        }
    }

    // We don't want to print the entire path, so find where the base name starts
    const int exename_start = findExeNameStart(argv[0]);
    const char *pExeName = &(argv[0][exename_start]);

    if (!bQATest)
    {
        fprintf(stdout, "[%s] starting...\n", pExeName);
    }
    else
    {
        fprintf(stdout, "&&&& RUNNING %s", pExeName);
        for (int iArg = 1; iArg < argc; ++iArg)
        {
            fprintf(stdout, " %s", argv[iArg]);
        }
        fprintf(stdout, "\n");
    }
    fflush(stdout);

    printf("\n");
    fflush(stdout);
    return exename_start;
}
// QA result codes. The numeric values are used as indices into the
// { "FAILED", "PASSED", "WAIVED" } string tables of the QA finish functions.
enum eQAstatus
{
    QA_FAILED = 0,
    QA_PASSED = 1,
    QA_WAIVED = 2
};
// Prints a countdown to stdout, sleeping one second per step, until `seconds` seconds of
// wall-clock time have passed.
inline void __ExitInTime(int seconds)
{
    fprintf(stdout, "> exiting in %d seconds: ", seconds);
    fflush(stdout);

    const time_t tDeadline = time(0) + seconds;
    int iRemaining = seconds;
    while (time(0) < tDeadline)
    {
        fprintf(stdout, "%d...", iRemaining);
#ifdef WIN32
        Sleep(1000);
#else
        sleep(1);
#endif
        iRemaining--;
    }

    fprintf(stdout,"done!\n\n");
    fflush(stdout);
}
// Prints the standardized result banner for a test and then waits before returning:
//   - no prompt-related flag: a 3 second countdown, giving a user time to view results
//   - "noprompt" or "help":   no wait
//   - "prompt":               waits for <Enter>
// iStatus indexes the status names below (0 = FAILED, 1 = PASSED, 2 = WAIVED).
inline void __shrQAFinish(int argc, const char **argv, int iStatus)
{
    const char *sStatus[] = { "FAILED", "PASSED", "WAIVED", NULL };

    // By default QATest is disabled and NoPrompt is enabled (times out after the seconds passed to __ExitInTime())
    bool bQATest = false;
    bool bNoPrompt = true;
    bool bQuitInTime = true;

    for (int iArg = 1; iArg < argc; ++iArg)
    {
        // skip any leading dashes
        const char *pFlag = argv[iArg];
        while (*pFlag == '-')
        {
            ++pFlag;
        }

        if (STRCASECMP(pFlag, "qatest") == 0)
        {
            bQATest = true;
        }
        if (STRCASECMP(pFlag, "noprompt") == 0 || STRCASECMP(pFlag, "help") == 0)
        {
            bNoPrompt = true;
            bQuitInTime = false;
        }
        if (STRCASECMP(pFlag, "prompt") == 0)
        {
            bNoPrompt = false;
            bQuitInTime = false;
        }
    }

    const char *pExeName = &(argv[0][findExeNameStart(argv[0])]);
    if (!bQATest)
    {
        fprintf(stdout, "[%s] test results...\n%s\n", pExeName, sStatus[iStatus]);
    }
    else
    {
        fprintf(stdout, "&&&& %s %s", sStatus[iStatus], pExeName);
        for (int iArg = 1; iArg < argc; ++iArg)
        {
            fprintf(stdout, " %s", argv[iArg]);
        }
        fprintf(stdout, "\n");
    }
    fflush(stdout);

    printf("\n");
    fflush(stdout);

    if (bQuitInTime)
    {
        __ExitInTime(3);
        return;
    }
    if (!bNoPrompt)
    {
        fprintf(stdout, "\nPress <Enter> to exit...\n");
        fflush(stdout);
        getchar();
    }
}
// Variant of the QA finish report where the caller supplies the QA-mode flag.
//
// Prints "&&&& <STATUS> <exe> <args...>" when bQATest is true, otherwise
// "[<exe>] test results..." followed by the status. Unless one of the
// switches noprompt / help / prompt is present on the command line (leading
// '-' characters ignored, compared with STRCASECMP), a 3 second countdown
// runs via __ExitInTime() before returning.
//
// @param bQATest  true to emit the QA-format result line
// @param argc     argument count as passed to main()
// @param argv     argument vector as passed to main()
// @param iStatus  0 = FAILED, 1 = PASSED, 2 = WAIVED; any other value is
//                 reported as "FAILED" instead of indexing past the name table
inline void __shrQAFinish2(bool bQATest, int argc, const char **argv, int iStatus)
{
    bool bQuitInTime = true;

    const char *const sStatus[] = { "FAILED", "PASSED", "WAIVED" };
    const int nStatus = (int)(sizeof(sStatus) / sizeof(sStatus[0]));
    // Validate before indexing: an unchecked index could hand a NULL or
    // out-of-bounds pointer to the "%s" conversions below.
    const char *sResult = (iStatus >= 0 && iStatus < nStatus) ? sStatus[iStatus] : sStatus[0];

    for (int i = 1; i < argc; i++) {
        const char *arg = argv[i];
        while (*arg == '-') {
            arg++;
        }
        // For SDK individual samples that don't specify -noprompt or -prompt,
        // a 3 second delay will happen before exiting, giving a user time to view results
        if (!STRCASECMP(arg, "noprompt") || !STRCASECMP(arg, "help") || !STRCASECMP(arg, "prompt")) {
            bQuitInTime = false;
        }
    }

    const int exename_start = findExeNameStart(argv[0]);
    const char *exename = &(argv[0][exename_start]);

    if (bQATest) {
        fprintf(stdout, "&&&& %s %s", sResult, exename);
        for (int i = 1; i < argc; i++) {
            fprintf(stdout, " %s", argv[i]);
        }
        fprintf(stdout, "\n");
    } else {
        fprintf(stdout, "[%s] test results...\n%s\n", exename, sResult);
    }
    fflush(stdout);

    if (bQuitInTime) {
        __ExitInTime(3);
    }
}
// Report the final status via __shrQAFinish() and terminate the process.
// A status of 0 exits with EXIT_FAILURE; every non-zero status exits with
// EXIT_SUCCESS.
inline void shrQAFinishExit(int argc, const char **argv, int iStatus)
{
    __shrQAFinish(argc, argv, iStatus);

    if (iStatus != 0) {
        exit(EXIT_SUCCESS);
    }
    exit(EXIT_FAILURE);
}
// Report the final status via __shrQAFinish2() and terminate the process.
// A status of 0 exits with EXIT_FAILURE; every non-zero status exits with
// EXIT_SUCCESS.
inline void shrQAFinishExit2(bool bQAtest, int argc, const char **argv, int iStatus)
{
    __shrQAFinish2(bQAtest, argc, argv, iStatus);

    if (iStatus != 0) {
        exit(EXIT_SUCCESS);
    }
    exit(EXIT_FAILURE);
}
#endif

View File

@@ -1,642 +0,0 @@
/*
* Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*
*/
#ifndef SHR_UTILS_H
#define SHR_UTILS_H
// *********************************************************************
// Generic utilities for NVIDIA GPU Computing SDK
// *********************************************************************
// reminders for output window and build log
#ifdef _WIN32
#pragma message ("Note: including windows.h")
#pragma message ("Note: including math.h")
#pragma message ("Note: including assert.h")
#endif
// OS dependent includes
#ifdef _WIN32
// Headers needed for Windows
#include <windows.h>
#else
// Headers needed for Linux
#include <sys/stat.h>
#include <sys/types.h>
#include <sys/time.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdarg.h>
#endif
// Other headers needed for both Windows and Linux
#include <math.h>
#include <assert.h>
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
// Un-comment the following #define to enable profiling code in SDK apps
//#define GPU_PROFILING
// Beginning of GPU Architecture definitions
// Map an SM (compute capability) version to the number of cores per SM.
//
// @param major  SM major version
// @param minor  SM minor version
// @return cores per SM for a known version; otherwise prints a diagnostic to
//         stdout and returns -1
inline int ConvertSMVer2Cores(int major, int minor)
{
    // One row per known architecture. SM is packed as 0xMm (hexadecimal
    // notation): M = SM major version, m = SM minor version.
    typedef struct {
        int SM;
        int Cores;
    } sSMtoCores;

    const sSMtoCores kCoresPerSM[] = {
        { 0x10,   8 }, // Tesla Generation  (SM 1.0) G80 class
        { 0x11,   8 }, // Tesla Generation  (SM 1.1) G8x class
        { 0x12,   8 }, // Tesla Generation  (SM 1.2) G9x class
        { 0x13,   8 }, // Tesla Generation  (SM 1.3) GT200 class
        { 0x20,  32 }, // Fermi Generation  (SM 2.0) GF100 class
        { 0x21,  48 }, // Fermi Generation  (SM 2.1) GF10x class
        { 0x30, 192 }  // Kepler Generation (SM 3.0) GK10x class
    };
    const int kNumEntries = (int)(sizeof(kCoresPerSM) / sizeof(kCoresPerSM[0]));
    const int smVersion = (major << 4) + minor;

    for (int i = 0; i < kNumEntries; i++) {
        if (kCoresPerSM[i].SM == smVersion) {
            return kCoresPerSM[i].Cores;
        }
    }

    printf("MapSMtoCores SM %d.%d is undefined (please update to the latest SDK)!\n", major, minor);
    return -1;
}
// end of GPU Architecture definitions
// Defines and enum for use with logging functions
// *********************************************************************
#define DEFAULTLOGFILE "SdkConsoleLog.txt"
#define MASTERLOGFILE "SdkMasterLog.csv"
// Bit flags accepted by the iLogMode argument of shrLogEx(); combine with '|'.
enum LOGMODES
{
    LOGCONSOLE = 1 << 0,               // log to console
    LOGFILE    = 1 << 1,               // log to file
    LOGBOTH    = LOGCONSOLE | LOGFILE, // convenience union: log to both
    APPENDMODE = 1 << 2,               // open the file in append mode instead of replace mode
    MASTER     = 1 << 3,               // also write to the master .csv log
    ERRORMSG   = 1 << 4,               // pre-pend "Error" to the message
    CLOSELOG   = 1 << 5                // close the log file, if open, after any requested file write
};
#define HDASHLINE "-----------------------------------------------------------\n"
// Standardized boolean
// Used as the return type of the shr* file, command-line and comparison
// helpers declared in this header (shrTRUE on success, shrFALSE otherwise).
enum shrBOOL
{
shrFALSE = 0, // failure / condition not met
shrTRUE = 1 // success / condition met
};
// Standardized MAX, MIN and CLAMP
// Every argument is parenthesized so that expressions containing low-precedence
// operators (&, |, ?:, ...) compare and select correctly.
// NOTE: arguments may still be evaluated more than once; avoid side effects.
#define MAX(a, b) (((a) > (b)) ? (a) : (b))
#define MIN(a, b) (((a) < (b)) ? (a) : (b))
#define CLAMP(a, b, c) MIN(MAX((a), (b)), (c)) // double sided clip of input a to [b, c]
#define TOPCLAMP(a, b) (((a) < (b)) ? (a) : (b)) // single top side clip of input a
// Error and Exit Handling Macros...
// *********************************************************************
// Full error handling macro with Cleanup() callback (if supplied)...
// (Companion Inline Function lower on page)
#define shrCheckErrorEX(a, b, c) __shrCheckErrorEX(a, b, c, __FILE__ , __LINE__)
// Short version without Cleanup() callback pointer
// Both Input (a) and Reference (b) are specified as args
#define shrCheckError(a, b) shrCheckErrorEX(a, b, 0)
// Standardized Exit Macro for leaving main()... extended version
// (Companion Inline Function lower on page)
#define shrExitEX(a, b, c) __shrExitEX(a, b, c)
// Standardized Exit Macro for leaving main()... short version
// (Companion Inline Function lower on page)
#define shrEXIT(a, b) __shrExitEX(a, b, EXIT_SUCCESS)
// Simple argument checker macro: returns shrFALSE from the enclosing function
// unless (a) equals shrTRUE. Wrapped in do { } while (0) so it behaves as a
// single statement and cannot capture a following 'else'.
#define ARGCHECK(a) do { if ((a) != shrTRUE) return shrFALSE; } while (0)
// Define for user-customized error handling
#define STDERROR "file %s, line %i\n\n" , __FILE__ , __LINE__
// Function to deallocate memory allocated within shrUtils
// *********************************************************************
extern "C" void shrFree(void* ptr);
// *********************************************************************
// Helper function to log standardized information to Console, to File or to both
//! Examples: shrLogEx(LOGBOTH, 0, "Function A\n");
//! : shrLogEx(LOGBOTH | ERRORMSG, ciErrNum, STDERROR);
//!
//! Automatically opens file and stores handle if needed and not done yet
//! Closes file and nulls handle on request
//!
//! @param 0 iLogMode: LOGCONSOLE, LOGFILE, LOGBOTH, APPENDMODE, MASTER, ERRORMSG, CLOSELOG.
//! LOGFILE and LOGBOTH may be | 'd with APPENDMODE to select file append mode instead of overwrite mode
//! LOGFILE and LOGBOTH may be | 'd with CLOSELOG to "write and close"
//! First 3 options may be | 'd with MASTER to enable independent write to master data log file
//! First 3 options may be | 'd with ERRORMSG to start line with standard error message
//! @param 2 dValue:
//! Positive val = double value for time in secs to be formatted to 6 decimals.
//! Negative val is an error code and this give error preformatting.
//! @param 3 cFormatString: String with formatting specifiers like printf or fprintf.
//! ALL printf flags, width, precision and type specifiers are supported with this exception:
//! Wide char type specifiers intended for wprintf (%S and %C) are NOT supported
//! Single byte char type specifiers (%s and %c) ARE supported
//! @param 4... variable args: like printf or fprintf. Must match format specifer type above.
//! @return 0 if OK, negative value on error or if error occurs or was passed in.
// *********************************************************************
extern "C" int shrLogEx(int iLogMode, int iErrNum, const char* cFormatString, ...);
// Short version of shrLogEx, defaulting to shrLogEx(LOGBOTH, 0, cFormatString, ...)
// *********************************************************************
extern "C" int shrLog(const char* cFormatString, ...);
// *********************************************************************
// Delta timer function for up to 3 independent timers using host high performance counters
// Maintains state for 3 independent counters
//! Example: double dElapsedTime = shrDeltaT(0);
//!
//! @param 0 iCounterID: Which timer to check/reset. (0, 1, 2)
//! @return delta time of specified counter since last call in seconds. Otherwise -9999.0 if error
// *********************************************************************
extern "C" double shrDeltaT(int iCounterID);
// Optional LogFileNameOverride function
// *********************************************************************
extern "C" void shrSetLogFileName (const char* cOverRideName);
// Helper function to init data arrays
// *********************************************************************
extern "C" void shrFillArray(float* pfData, int iSize);
// Helper function to print data arrays
// *********************************************************************
extern "C" void shrPrintArray(float* pfData, int iSize);
////////////////////////////////////////////////////////////////////////////
//! Find the path for a filename
//! @return the path if succeeded, otherwise 0
//! @param filename name of the file
//! @param executablePath optional absolute path of the executable
////////////////////////////////////////////////////////////////////////////
extern "C" char* shrFindFilePath(const char* filename, const char* executablePath);
////////////////////////////////////////////////////////////////////////////
//! Read file \filename containing single precision floating point data
//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE
//! @param filename name of the source file
//! @param data uninitialized pointer, returned initialized and pointing to
//! the data read
//! @param len number of data elements in data, -1 on error
//! @note If a NULL pointer is passed to this function and it is initialized
//! within shrUtils, then free() has to be used to deallocate the memory
////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrReadFilef( const char* filename, float** data, unsigned int* len,
bool verbose = false);
////////////////////////////////////////////////////////////////////////////
//! Read file \filename containing double precision floating point data
//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE
//! @param filename name of the source file
//! @param data uninitialized pointer, returned initialized and pointing to
//! the data read
//! @param len number of data elements in data, -1 on error
//! @note If a NULL pointer is passed to this function and it is initialized
//! within shrUtils, then free() has to be used to deallocate the memory
////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrReadFiled( const char* filename, double** data, unsigned int* len,
bool verbose = false);
////////////////////////////////////////////////////////////////////////////
//! Read file \filename containing integer data
//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE
//! @param filename name of the source file
//! @param data uninitialized pointer, returned initialized and pointing to
//! the data read
//! @param len number of data elements in data, -1 on error
//! @note If a NULL pointer is passed to this function and it is initialized
//! within shrUtils, then free() has to be used to deallocate the memory
////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrReadFilei( const char* filename, int** data, unsigned int* len, bool verbose = false);
////////////////////////////////////////////////////////////////////////////
//! Read file \filename containing unsigned integer data
//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE
//! @param filename name of the source file
//! @param data uninitialized pointer, returned initialized and pointing to
//! the data read
//! @param len number of data elements in data, -1 on error
//! @note If a NULL pointer is passed to this function and it is initialized
//! within shrUtils, then free() has to be used to deallocate the memory
////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrReadFileui( const char* filename, unsigned int** data,
unsigned int* len, bool verbose = false);
////////////////////////////////////////////////////////////////////////////
//! Read file \filename containing char / byte data
//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE
//! @param filename name of the source file
//! @param data uninitialized pointer, returned initialized and pointing to
//! the data read
//! @param len number of data elements in data, -1 on error
//! @note If a NULL pointer is passed to this function and it is initialized
//! within shrUtils, then free() has to be used to deallocate the memory
////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrReadFileb( const char* filename, char** data, unsigned int* len,
bool verbose = false);
////////////////////////////////////////////////////////////////////////////
//! Read file \filename containing unsigned char / byte data
//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE
//! @param filename name of the source file
//! @param data uninitialized pointer, returned initialized and pointing to
//! the data read
//! @param len number of data elements in data, -1 on error
//! @note If a NULL pointer is passed to this function and it is initialized
//! within shrUtils, then free() has to be used to deallocate the memory
////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrReadFileub( const char* filename, unsigned char** data,
unsigned int* len, bool verbose = false);
////////////////////////////////////////////////////////////////////////////
//! Write a data file \filename containing single precision floating point
//! data
//! @return shrTRUE if writing the file succeeded, otherwise shrFALSE
//! @param filename name of the file to write
//! @param data pointer to data to write
//! @param len number of data elements in data, -1 on error
//! @param epsilon epsilon for comparison
////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrWriteFilef( const char* filename, const float* data, unsigned int len,
const float epsilon, bool verbose = false);
////////////////////////////////////////////////////////////////////////////
//! Write a data file \filename containing double precision floating point
//! data
//! @return shrTRUE if writing the file succeeded, otherwise shrFALSE
//! @param filename name of the file to write
//! @param data pointer to data to write
//! @param len number of data elements in data, -1 on error
//! @param epsilon epsilon for comparison
////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrWriteFiled( const char* filename, const float* data, unsigned int len,
const double epsilon, bool verbose = false);
////////////////////////////////////////////////////////////////////////////
//! Write a data file \filename containing integer data
//! @return shrTRUE if writing the file succeeded, otherwise shrFALSE
//! @param filename name of the file to write
//! @param data pointer to data to write
//! @param len number of data elements in data, -1 on error
////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrWriteFilei( const char* filename, const int* data, unsigned int len,
bool verbose = false);
////////////////////////////////////////////////////////////////////////////
//! Write a data file \filename containing unsigned integer data
//! @return shrTRUE if writing the file succeeded, otherwise shrFALSE
//! @param filename name of the file to write
//! @param data pointer to data to write
//! @param len number of data elements in data, -1 on error
////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrWriteFileui( const char* filename, const unsigned int* data,
unsigned int len, bool verbose = false);
////////////////////////////////////////////////////////////////////////////
//! Write a data file \filename containing char / byte data
//! @return shrTRUE if writing the file succeeded, otherwise shrFALSE
//! @param filename name of the file to write
//! @param data pointer to data to write
//! @param len number of data elements in data, -1 on error
////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrWriteFileb( const char* filename, const char* data, unsigned int len,
bool verbose = false);
////////////////////////////////////////////////////////////////////////////
//! Write a data file \filename containing unsigned char / byte data
//! @return shrTRUE if writing the file succeeded, otherwise shrFALSE
//! @param filename name of the file to write
//! @param data pointer to data to write
//! @param len number of data elements in data, -1 on error
////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrWriteFileub( const char* filename, const unsigned char* data,
unsigned int len, bool verbose = false);
////////////////////////////////////////////////////////////////////////////
//! Load PPM image file (with unsigned char as data element type), padding
//! 4th component
//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE
//! @param file name of the image file
//! @param OutData handle to the data read
//! @param w width of the image
//! @param h height of the image
//!
//! Note: If *OutData is NULL this function allocates buffer that must be freed by caller
////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrLoadPPM4ub(const char* file, unsigned char** OutData,
unsigned int *w, unsigned int *h);
////////////////////////////////////////////////////////////////////////////
//! Save PPM image file (with unsigned char as data element type, padded to
//! 4 bytes)
//! @return shrTRUE if saving the file succeeded, otherwise shrFALSE
//! @param file name of the image file
//! @param data handle to the data read
//! @param w width of the image
//! @param h height of the image
////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrSavePPM4ub( const char* file, unsigned char *data,
unsigned int w, unsigned int h);
////////////////////////////////////////////////////////////////////////////////
//! Save PGM image file (with unsigned char as data element type)
//! @return shrTRUE if saving the file succeeded, otherwise shrFALSE
//! @param file name of the image file
//! @param data handle to the data read
//! @param w width of the image
//! @param h height of the image
////////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrSavePGMub( const char* file, unsigned char *data,
unsigned int w, unsigned int h);
////////////////////////////////////////////////////////////////////////////
//! Load PGM image file (with unsigned char as data element type)
//! @return shrTRUE if loading the file succeeded, otherwise shrFALSE
//! @param file name of the image file
//! @param data handle to the data read
//! @param w width of the image
//! @param h height of the image
//! @note If a NULL pointer is passed to this function and it is initialized
//! within shrUtils, then free() has to be used to deallocate the memory
////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrLoadPGMub( const char* file, unsigned char** data,
unsigned int *w,unsigned int *h);
////////////////////////////////////////////////////////////////////////////
// Command line arguments: General notes
// * All command line arguments begin with '--' followed by the token;
// token and value are separated by '='; example --samples=50
// * Arrays have the form --model=[one.obj,two.obj,three.obj]
// (without whitespaces)
////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////
//! Check if command line argument \a flag-name is given
//! @return shrTRUE if command line argument \a flag_name has been given,
//! otherwise shrFALSE
//! @param argc argc as passed to main()
//! @param argv argv as passed to main()
//! @param flag_name name of command line flag
////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrCheckCmdLineFlag( const int argc, const char** argv,
const char* flag_name);
////////////////////////////////////////////////////////////////////////////
//! Get the value of a command line argument of type int
//! @return shrTRUE if command line argument \a arg_name has been given and
//! is of the requested type, otherwise shrFALSE
//! @param argc argc as passed to main()
//! @param argv argv as passed to main()
//! @param arg_name name of the command line argument
//! @param val value of the command line argument
////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrGetCmdLineArgumenti( const int argc, const char** argv,
const char* arg_name, int* val);
////////////////////////////////////////////////////////////////////////////
//! Get the value of a command line argument of type unsigned int
//! @return shrTRUE if command line argument \a arg_name has been given and
//! is of the requested type, otherwise shrFALSE
//! @param argc argc as passed to main()
//! @param argv argv as passed to main()
//! @param arg_name name of the command line argument
//! @param val value of the command line argument
////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrGetCmdLineArgumentu( const int argc, const char** argv,
const char* arg_name, unsigned int* val);
////////////////////////////////////////////////////////////////////////////
//! Get the value of a command line argument of type float
//! @return shrTRUE if command line argument \a arg_name has been given and
//! is of the requested type, otherwise shrFALSE
//! @param argc argc as passed to main()
//! @param argv argv as passed to main()
//! @param arg_name name of the command line argument
//! @param val value of the command line argument
////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrGetCmdLineArgumentf( const int argc, const char** argv,
const char* arg_name, float* val);
////////////////////////////////////////////////////////////////////////////
//! Get the value of a command line argument of type string
//! @return shrTRUE if command line argument \a arg_name has been given and
//! is of the requested type, otherwise shrFALSE
//! @param argc argc as passed to main()
//! @param argv argv as passed to main()
//! @param arg_name name of the command line argument
//! @param val value of the command line argument
////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrGetCmdLineArgumentstr( const int argc, const char** argv,
const char* arg_name, char** val);
////////////////////////////////////////////////////////////////////////////
//! Get the value of a command line argument list whose elements are strings
//! @return shrTRUE if command line argument \a arg_name has been given and
//! is of the requested type, otherwise shrFALSE
//! @param argc argc as passed to main()
//! @param argv argv as passed to main()
//! @param arg_name name of the command line argument
//! @param val command line argument list
//! @param len length of the list / number of elements
////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrGetCmdLineArgumentListstr( const int argc, const char** argv,
const char* arg_name, char** val,
unsigned int* len);
////////////////////////////////////////////////////////////////////////////
//! Compare two float arrays
//! @return shrTRUE if \a reference and \a data are identical, otherwise shrFALSE
//! @param reference handle to the reference data / gold image
//! @param data handle to the computed data
//! @param len number of elements in reference and data
////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrComparef( const float* reference, const float* data,
const unsigned int len);
////////////////////////////////////////////////////////////////////////////
//! Compare two integer arrays
//! @return shrTRUE if \a reference and \a data are identical, otherwise shrFALSE
//! @param reference handle to the reference data / gold image
//! @param data handle to the computed data
//! @param len number of elements in reference and data
////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrComparei( const int* reference, const int* data,
const unsigned int len );
////////////////////////////////////////////////////////////////////////////////
//! Compare two unsigned integer arrays, with epsilon and threshold
//! @return shrTRUE if \a reference and \a data are identical, otherwise shrFALSE
//! @param reference handle to the reference data / gold image
//! @param data handle to the computed data
//! @param len number of elements in reference and data
//! @param threshold tolerance % # of comparison errors (0.15f = 15%)
////////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrCompareuit( const unsigned int* reference, const unsigned int* data,
const unsigned int len, const float epsilon, const float threshold );
////////////////////////////////////////////////////////////////////////////
//! Compare two unsigned char arrays
//! @return shrTRUE if \a reference and \a data are identical, otherwise shrFALSE
//! @param reference handle to the reference data / gold image
//! @param data handle to the computed data
//! @param len number of elements in reference and data
////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrCompareub( const unsigned char* reference, const unsigned char* data,
const unsigned int len );
////////////////////////////////////////////////////////////////////////////////
//! Compare two integers with a tolerance for # of byte errors
//! @return shrTRUE if \a reference and \a data are identical, otherwise shrFALSE
//! @param reference handle to the reference data / gold image
//! @param data handle to the computed data
//! @param len number of elements in reference and data
//! @param epsilon epsilon to use for the comparison
//! @param threshold tolerance % # of comparison errors (0.15f = 15%)
////////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrCompareubt( const unsigned char* reference, const unsigned char* data,
const unsigned int len, const float epsilon, const float threshold );
////////////////////////////////////////////////////////////////////////////////
//! Compare two integer arrays with an epsilon tolerance for equality
//! @return shrTRUE if \a reference and \a data are identical, otherwise shrFALSE
//! @param reference handle to the reference data / gold image
//! @param data handle to the computed data
//! @param len number of elements in reference and data
//! @param epsilon epsilon to use for the comparison
////////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrCompareube( const unsigned char* reference, const unsigned char* data,
const unsigned int len, const float epsilon );
////////////////////////////////////////////////////////////////////////////
//! Compare two float arrays with an epsilon tolerance for equality
//! @return shrTRUE if \a reference and \a data are identical, otherwise shrFALSE
//! @param reference handle to the reference data / gold image
//! @param data handle to the computed data
//! @param len number of elements in reference and data
//! @param epsilon epsilon to use for the comparison
////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrComparefe( const float* reference, const float* data,
const unsigned int len, const float epsilon );
////////////////////////////////////////////////////////////////////////////////
//! Compare two float arrays with an epsilon tolerance for equality and a
//! threshold for # pixel errors
//! @return shrTRUE if \a reference and \a data are identical, otherwise shrFALSE
//! @param reference handle to the reference data / gold image
//! @param data handle to the computed data
//! @param len number of elements in reference and data
//! @param epsilon epsilon to use for the comparison
////////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrComparefet( const float* reference, const float* data,
const unsigned int len, const float epsilon, const float threshold );
////////////////////////////////////////////////////////////////////////////
//! Compare two float arrays using L2-norm with an epsilon tolerance for
//! equality
//! @return shrTRUE if \a reference and \a data are identical, otherwise shrFALSE
//! @param reference handle to the reference data / gold image
//! @param data handle to the computed data
//! @param len number of elements in reference and data
//! @param epsilon epsilon to use for the comparison
////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrCompareL2fe( const float* reference, const float* data,
const unsigned int len, const float epsilon );
////////////////////////////////////////////////////////////////////////////////
//! Compare two PPM image files with an epsilon tolerance for equality
//! @return shrTRUE if \a reference and \a data are identical, otherwise shrFALSE
//! @param src_file filename for the image to be compared
//! @param ref_file filename for the reference data / gold image
//! @param epsilon epsilon to use for the comparison
//! @param threshold threshold of pixels that can still mismatch to pass (i.e. 0.15f = 15% must pass)
//! $param verboseErrors output details of image mismatch to std::err
////////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrComparePPM( const char *src_file, const char *ref_file, const float epsilon, const float threshold);
////////////////////////////////////////////////////////////////////////////////
//! Compare two PGM image files with an epsilon tolerance for equality
//! @return shrTRUE if \a reference and \a data are identical, otherwise shrFALSE
//! @param src_file filename for the image to be compared
//! @param ref_file filename for the reference data / gold image
//! @param epsilon epsilon to use for the comparison
//! @param threshold threshold of pixels that can still mismatch to pass (i.e. 0.15f = 15% must pass)
//! $param verboseErrors output details of image mismatch to std::err
////////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrComparePGM( const char *src_file, const char *ref_file, const float epsilon, const float threshold);
extern "C" unsigned char* shrLoadRawFile(const char* filename, size_t size);
extern "C" size_t shrRoundUp(int group_size, int global_size);
// Companion inline function for error checking: compares a sample value
// against a reference value and, on mismatch, logs the location and either
// invokes the supplied cleanup callback or exits.
// *********************************************************************
// iSample     value to test (also passed to the error log call)
// iReference  value iSample is expected to equal
// pCleanup    optional callback invoked with EXIT_FAILURE on mismatch;
//             when NULL the function logs "Exiting..." and calls exit()
// cFile/iLine source location reported in the log message
inline void __shrCheckErrorEX(int iSample, int iReference, void (*pCleanup)(int), const char* cFile, const int iLine)
{
    // Nothing to do when the values agree.
    if (iReference == iSample)
    {
        return;
    }

    shrLogEx(LOGBOTH | ERRORMSG, iSample, "line %i , in file %s !!!\n\n" , iLine, cFile);

    // Without a callback, close the log and terminate here.
    if (pCleanup == NULL)
    {
        shrLogEx(LOGBOTH | CLOSELOG, 0, "Exiting...\n");
        exit(EXIT_FAILURE);
    }

    pCleanup(EXIT_FAILURE);
}
// Standardized Exit
// *********************************************************************
// Writes a closing log message (optionally waiting for <Enter>), flushes
// stderr and terminates the process with iExitCode.
// argc/argv  command line, scanned for "prompt"/"noprompt" and "qatest"
// iExitCode  status passed to exit()
inline void __shrExitEX(int argc, const char** argv, int iExitCode)
{
    // WIN32 builds prompt unless "noprompt" is given; other builds prompt
    // only when "prompt" is given.
#ifdef WIN32
    const bool bPromptRequested = !shrCheckCmdLineFlag(argc, argv, "noprompt");
#else
    const bool bPromptRequested = shrCheckCmdLineFlag(argc, argv, "prompt") ? true : false;
#endif

    // "qatest" always suppresses the prompt (only checked when a prompt was requested).
    if (bPromptRequested && !shrCheckCmdLineFlag(argc, argv, "qatest"))
    {
        shrLogEx(LOGBOTH | CLOSELOG, 0, "\nPress <Enter> to Quit...\n");
        getchar();
    }
    else
    {
        shrLogEx(LOGBOTH | CLOSELOG, 0, "%s Exiting...\n", argv[0]);
    }

    fflush(stderr);
    exit(iExitCode);
}
#endif

View File

@@ -1,66 +0,0 @@
# Tool and dependency locations. Each uses ?= so it can be overridden from the
# environment or the make command line; $(wildcard ...) yields an empty value
# when the path does not exist.
RISCV_TOOL_PATH ?= $(wildcard ../../../../riscv-gnu-toolchain/drops)
POCL_CC_PATH ?= $(wildcard ../../../../pocl/drops_riscv_cc)
POCL_INC_PATH ?= $(wildcard ../include)
POCL_LIB_PATH ?= $(wildcard ../lib)
VX_RT_PATH ?= $(wildcard ../../../runtime)
VX_SIMX_PATH ?= $(wildcard ../../../simX/obj_dir)
# RISC-V cross tools taken from $(RISCV_TOOL_PATH)/bin.
CC = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-gcc
CXX = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-g++
DMP = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-objdump
HEX = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-objcopy
GDB = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-gdb
# Runtime sources under $(VX_RT_PATH) that are compiled into the .elf image.
VX_SRCS = $(VX_RT_PATH)/newlib/newlib.c
VX_SRCS += $(VX_RT_PATH)/startup/vx_start.S
VX_SRCS += $(VX_RT_PATH)/intrinsics/vx_intrinsics.s
VX_SRCS += $(VX_RT_PATH)/io/vx_io.s $(VX_RT_PATH)/io/vx_io.c
VX_SRCS += $(VX_RT_PATH)/fileio/fileio.s
VX_SRCS += $(VX_RT_PATH)/tests/tests.c
VX_SRCS += $(VX_RT_PATH)/vx_api/vx_api.c
# NOTE(review): none of the variables below are defined in this Makefile, so
# this line adds nothing unless they come from the environment - confirm.
VX_SRCS += $(VX_STR) $(VX_FIO) $(VX_NEWLIB) $(VX_INT) $(VX_IO) $(VX_API) $(VX_TEST)
# Link flags used only for the .elf target (custom linker script, no startfiles).
VX_CFLAGS = -nostartfiles -Wl,-Bstatic,-T,$(VX_RT_PATH)/mains/vortex_link.ld
# Compile flags shared by the .elf and .qemu targets.
CXXFLAGS = -g -O0 -march=rv32im -mabi=ilp32
CXXFLAGS += -ffreestanding # program may not begin at main()
CXXFLAGS += -Wl,--gc-sections # enable garbage collection of unused input sections
CXXFLAGS += -fno-rtti -fno-non-call-exceptions # disable RTTI and exceptions
CXXFLAGS += -I$(POCL_INC_PATH) -I.
# Libraries for each target. $(PROJECT) is assigned further down; that works
# because these are recursively expanded (=) variables, evaluated at use time.
VX_LIBS = -Wl,--whole-archive lib$(PROJECT).a -Wl,--no-whole-archive $(POCL_LIB_PATH)/libOpenCL.a
QEMU_LIBS = $(VX_RT_PATH)/qemu/vx_api.c -Wl,--whole-archive lib$(PROJECT).a -Wl,--no-whole-archive $(POCL_LIB_PATH)/libOpenCL.a
# Base name of every generated artifact.
PROJECT=VectorHypot
# Targets that name actions rather than files.
.PHONY: all run qemu gdb-s gdb-c clean

# Default target: disassembly listing plus Intel-HEX image.
all: $(PROJECT).dump $(PROJECT).hex

# Compile the OpenCL kernel into a static library with poclcc.
# The recipe compiles the rule's own prerequisite ($<); it previously named
# kernel.cl, which did not match the VectorHypot.cl dependency.
lib$(PROJECT).a: VectorHypot.cl
	POCL_DEBUG=all POCL_DEBUG_LLVM_PASSES=1 LD_LIBRARY_PATH=$(RISCV_TOOL_PATH)/lib:$(POCL_CC_PATH)/lib $(POCL_CC_PATH)/bin/poclcc -o $@ $<

# Bare-metal image: host program + runtime sources + kernel library.
$(PROJECT).elf: main.cc lib$(PROJECT).a
	$(CXX) $(CXXFLAGS) $(VX_CFLAGS) $(VX_SRCS) main.cc $(VX_LIBS) -o $(PROJECT).elf

# User-mode QEMU build of the same program.
$(PROJECT).qemu: main.cc lib$(PROJECT).a
	$(CXX) $(CXXFLAGS) main.cc $(QEMU_LIBS) -o $(PROJECT).qemu

# Intel-HEX conversion of the .elf.
$(PROJECT).hex: $(PROJECT).elf
	$(HEX) -O ihex $(PROJECT).elf $(PROJECT).hex

# Full disassembly of the .elf.
$(PROJECT).dump: $(PROJECT).elf
	$(DMP) -D $(PROJECT).elf > $(PROJECT).dump

# Run the HEX image on the simX emulator; stdout goes to emulator.debug.
run: $(PROJECT).hex
	POCL_DEBUG=all $(VX_SIMX_PATH)/Vcache_simX -E -a rv32i --core $(PROJECT).hex -s -b 1> emulator.debug

# Run under qemu-riscv32, logging translated input assembly to debug.log.
qemu: $(PROJECT).qemu
	POCL_DEBUG=all $(RISCV_TOOL_PATH)/bin/qemu-riscv32 -d in_asm -D debug.log $(PROJECT).qemu

# Same as qemu, but started with a gdb stub on port 1234.
gdb-s: $(PROJECT).qemu
	POCL_DEBUG=all $(RISCV_TOOL_PATH)/bin/qemu-riscv32 -g 1234 -d in_asm -D debug.log $(PROJECT).qemu

# Start gdb on the qemu binary.
gdb-c: $(PROJECT).qemu
	$(GDB) $(PROJECT).qemu

# Remove the image, listing and HEX outputs.
clean:
	rm -rf *.elf *.dump *.hex

View File

@@ -1,41 +0,0 @@
/*
* Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*
*/
// OpenCL Kernel Function Naive Implementation for hypotenuse
//
// Each work item handles one float4, i.e. four hypotenuses.
//   fg4A, fg4B       input "leg" vectors
//   fg4Hypot         output vector
//   uiOffset         added to get_global_id(0) to form the float4 index
//   iInnerLoopCount  number of times the computation is repeated; when it is
//                    <= 0 the loop does not run and zeros are written
//   uiNumElements    number of scalar float elements in each buffer (the host
//                    code of this sample sizes the buffers as
//                    sizeof(float) * uiNumElements and passes that count)
__kernel void VectorHypot(__global float4* fg4A, __global float4* fg4B, __global float4* fg4Hypot, unsigned int uiOffset, int iInnerLoopCount, unsigned int uiNumElements)
{
    // get index into global data array (in float4 units)
    size_t szGlobalOffset = get_global_id(0) + uiOffset;

    // bound check: the buffers are indexed as float4, so only uiNumElements / 4
    // vector slots exist. Comparing against uiNumElements itself let work items
    // created by rounding the global size up read and write past the buffers.
    if (szGlobalOffset >= uiNumElements / 4)
    {
        return;
    }

    // Processing 4 elements per work item, so read fgA and fgB source values from GMEM
    float4 f4A = fg4A[szGlobalOffset];
    float4 f4B = fg4B[szGlobalOffset];
    float4 f4H = (float4)0.0f;

    // Get the hypotenuses of the vectors of 'legs', but exaggerate the time needed with loop
    for (int i = 0; i < iInnerLoopCount; i++)
    {
        // compute the 4 hypotenuses using built-in function
        f4H.x = hypot (f4A.x, f4B.x);
        f4H.y = hypot (f4A.y, f4B.y);
        f4H.z = hypot (f4A.z, f4B.z);
        f4H.w = hypot (f4A.w, f4B.w);
    }

    // Write 4 result values back out to GMEM
    fg4Hypot[szGlobalOffset] = f4H;
}

View File

@@ -1,686 +0,0 @@
/*
* Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*
*/
// *********************************************************************
// oclCopyComputeOverlap Notes:
//
// OpenCL API demo application for NVIDIA CUDA GPU's that implements a
// element by element vector hypotenuse computation using 2 input float arrays
// and 1 output float array.
//
// Demonstrates host->GPU and GPU->host copies that are asynchronous/overlapped
// with respect to GPU computation (and with respect to host thread).
//
// Because the overlap achievable for this computation and data set on a given system depends upon the GPU being used and the
// GPU/Host bandwidth, the sample adjusts the computation duration to test the most ideal case and test against a consistent standard.
// This sample should be able to achieve up to 30% overlap on GPU's arch 1.2 and 1.3, and up to 50% on arch 2.0+ (Fermi) GPU's.
//
// After setup, warmup and calibration to the system, the sample runs 4 scenarios:
// A) Computations with 2 command queues on GPU
// A multiple-cycle sequence is executed, timed and compared against the host
// B) Computations with 1 command queue on GPU
// A multiple-cycle sequence is executed, timed and compared against the host
//
// The 2-command queue approach ought to be substantially faster
//
// For developmental purposes, the "iInnerLoopCount" variable passes into kernel and independently
// increases compute time without increasing data size (via a loop inside the kernel)
//
// At some value of iInnerLoopCount, # of elements, workgroup size, etc the Overlap percentage should reach 30%:
// (This ~naively assumes time H2D bandwidth is the same as D2H bandwidth, but this is close on most systems)
//
// If we name the time to copy a single input vector H2D (or output vector D2H) as "T", then the optimum comparison case is:
//
// Single Queue with all the data and all the work
// Ttot (serial) = 4T + 4T + 2T = 10T
//
// Dual Queue, where each queue has 1/2 the data and 1/2 the work
// Tq0 (overlap) = 2T + 2T + T ....
// Tq1 (overlap) = .... 2T + 2T + T
//
// Ttot (elapsed, wall) = 2T + 2T + 2T + T = 7T
//
// Best Overlap % = 100.0 * (10T - 7T)/10T = 30.0 % (Tesla arch 1.2 or 1.3, single copy engine)
//
// For multiple independent cycles using arch >= 2.0 with 2 copy engines, input and output copies can also be overlapped.
// This doesn't help for the first cycle, but theoretically can lead to 50% overlap over many independent cycles.
// *********************************************************************
// common SDK header for standard utilities and system libs
#include <oclUtils.h>
#include <shrQATest.h>
// Best possible and Min ratio of compute/copy overlap timing benefit to pass the test
// values greater than 0.0f represent a speed-up relative to non-overlapped
#define EXPECTED_OVERLAP 30.0f
#define EXPECTED_OVERLAP_FERMI 45.0f
// Fraction of the expected overlap that must be measured for the test to pass
#define PASS_FACTOR 0.60f
// Number of additional timing runs allowed when the measured overlap is too low
#define RETRIES_ON_FAILURE 1
// Base sizes for parameters manipulated dynamically or on the command line
#define BASE_WORK_ITEMS 64
#define BASE_ARRAY_LENGTH 40000
#define BASE_LOOP_COUNT 32
// Vars
// *********************************************************************
cl_platform_id cpPlatform; // OpenCL platform
cl_context cxGPUContext; // OpenCL context
cl_command_queue cqCommandQueue[2]; // OpenCL command queues
cl_device_id* cdDevices; // OpenCL device list
cl_program cpProgram; // OpenCL program
cl_kernel ckKernel[2]; // OpenCL kernel, 1 per queue
cl_mem cmPinnedSrcA; // OpenCL pinned host source buffer A
cl_mem cmPinnedSrcB; // OpenCL pinned host source buffer B
cl_mem cmPinnedResult; // OpenCL pinned host result buffer
float* fSourceA = NULL; // Mapped pointer for pinned Host source A buffer
float* fSourceB = NULL; // Mapped pointer for pinned Host source B buffer
float* fResult = NULL; // Mapped pointer for pinned Host result buffer
cl_mem cmDevSrcA; // OpenCL device source buffer A
cl_mem cmDevSrcB; // OpenCL device source buffer B
cl_mem cmDevResult; // OpenCL device result buffer
size_t szBuffBytes; // Size of main buffers
size_t szGlobalWorkSize; // 1D var for Total # of work items in the launched ND range
size_t szLocalWorkSize = BASE_WORK_ITEMS; // initial # of work items in the work group
cl_int ciErrNum; // Error code var
char* cPathAndName = NULL; // Var for full paths to data, src, etc.
char* cSourceCL = NULL; // Buffer to hold source for compilation
// Set to argv[0] at the start of main()
const char* cExecutableName = NULL;
// demo config vars
const char* cSourceFile = "VectorHypot.cl"; // OpenCL computation kernel source code
float* Golden = NULL; // temp buffer to hold golden results for cross check
bool bNoPrompt = false; // Command line switch to skip exit prompt
bool bQATest = false; // Command line switch to test
// Forward Declarations
// *********************************************************************
double DualQueueSequence(int iCycles, unsigned int uiNumElements, bool bShowConfig);
double OneQueueSequence(int iCycles, unsigned int uiNumElements, bool bShowConfig);
int AdjustCompute(cl_device_id cdTargetDevice, unsigned int uiNumElements, int iInitialLoopCount, int iCycles);
void VectorHypotHost(const float* pfData1, const float* pfData2, float* pfResult, unsigned int uiNumElements, int iInnerLoopCount);
void Cleanup (int iExitCode);
// Cleanup callback passed to the oclCheckErrorEX error checks
void (*pCleanup)(int) = &Cleanup;
// Addresses of main()'s argc and argv, stored at the start of main()
int *gp_argc = 0;
const char *** gp_argv = NULL;
// Main function
// *********************************************************************
int main(int argc, const char **argv)
{
//Locals
size_t szKernelLength; // Byte size of kernel code
double dBuildTime; // Compile time
cl_uint uiTargetDevice = 0; // Default Device to compute on
cl_uint uiNumDevsUsed = 1; // Number of devices used in this sample
cl_uint uiNumDevices; // Number of devices available
int iDevCap = -1; // Capability of device
int iInnerLoopCount = BASE_LOOP_COUNT; // Varies "compute intensity" per data within the kernel
const int iTestCycles = 10; // How many times to run the external test loop
const int iWarmupCycles = 8; // How many times to run the warmup sequence
cl_uint uiWorkGroupMultiple = 4; // Command line var (using "workgroupmult=<n>") to optionally increase workgroup size
cl_uint uiNumElements = BASE_ARRAY_LENGTH; // initial # of elements per array to process (note: procesing 4 per work item)
cl_uint uiSizeMultiple = 4; // Command line var (using "sizemult=<n>") to optionally increase vector sizes
bool bPassFlag = false; // Var to accumulate test pass/fail
shrBOOL bMatch = shrFALSE; // Cross check result
shrBOOL bTestOverlap = shrFALSE;
double dAvgGPUTime[2] = {0.0, 0.0}; // Average time of iTestCycles calls for 2-Queue and 1-Queue test
double dHostTime[2] = {0.0, 0.0}; // Host computation time (2nd test is redundant but a good stability indicator)
float fMinPassCriteria[2] = {0.0f, 0.0f}; // Test pass cireria, adjusted dependant on GPU arch
gp_argc = &argc;
gp_argv = &argv;
shrQAStart(argc, (char **)argv);
// start logs
cExecutableName = argv[0];
shrSetLogFileName ("oclCopyComputeOverlap.txt");
shrLog("%s Starting...\n\n", argv[0]);
// get basic command line args
bNoPrompt = (shrTRUE == shrCheckCmdLineFlag(argc, argv, "noprompt"));
bQATest = (shrTRUE == shrCheckCmdLineFlag(argc, argv, "qatest"));
shrGetCmdLineArgumentu(argc, argv, "device", &uiTargetDevice);
// Optional Command-line multiplier for vector size
// Default val of 4 gives 10.24 million float elements per vector
// Range of 3 - 16 (7.68 to 40.96 million floats) is reasonable range (if system and GPU have enough memory)
shrGetCmdLineArgumentu(argc, argv, "sizemult", &uiSizeMultiple);
uiSizeMultiple = CLAMP(uiSizeMultiple, 1, 50);
uiNumElements = uiSizeMultiple * BASE_ARRAY_LENGTH * BASE_WORK_ITEMS;
shrLog("Array sizes = %u float elements\n", uiNumElements);
// Optional Command-line multiplier for workgroup size (x 64 work items)
// Default val of 4 gives szLocalWorkSize of 256.
// Range of 1 - 8 (resulting in workgroup sizes of 64 to 512) is reasonable range
shrGetCmdLineArgumentu(argc, argv, "workgroupmult", &uiWorkGroupMultiple);
uiWorkGroupMultiple = CLAMP(uiWorkGroupMultiple, 1, 10);
szLocalWorkSize = uiWorkGroupMultiple * BASE_WORK_ITEMS;
shrLog("Workgroup Size = %u\n\n", szLocalWorkSize);
// Get the NVIDIA platform if available, otherwise use default
shrLog("Get the Platform ID...\n\n");
ciErrNum = oclGetPlatformID(&cpPlatform);
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
// Get OpenCL platform name and version
char cBuffer[256];
ciErrNum = clGetPlatformInfo (cpPlatform, CL_PLATFORM_NAME, sizeof(cBuffer), cBuffer, NULL);
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
shrLog("Platform Name = %s\n\n", cBuffer);
// Get all the devices
shrLog("Get the Device info and select Device...\n");
ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_DEFAULT, 0, NULL, &uiNumDevices);
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
cdDevices = (cl_device_id*)malloc(uiNumDevices * sizeof(cl_device_id));
// Ethans changes
CL_CHECK(clGetPlatformIDs(1, &platform_id, NULL));
CL_CHECK(clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_DEFAULT, 1, &device_id, NULL));
//ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, uiNumDevices, cdDevices, NULL);
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
// Set target device and check capabilities
shrLog(" # of Devices Available = %u\n", uiNumDevices);
uiTargetDevice = CLAMP(uiTargetDevice, 0, (uiNumDevices - 1));
shrLog(" Using Device %u, ", uiTargetDevice);
oclPrintDevName(LOGBOTH, cdDevices[uiTargetDevice]);
iDevCap = oclGetDevCap(cdDevices[uiTargetDevice]);
if (iDevCap > 0) {
shrLog(", Capability = %d.%d\n\n", iDevCap/10, iDevCap%10);
} else {
shrLog("\n\n", iDevCap);
}
if (strstr(cBuffer, "NVIDIA") != NULL)
{
if (iDevCap < 12)
{
shrLog("Device doesn't have overlap capability. Skipping test...\n");
Cleanup (EXIT_SUCCESS);
}
// Device and Platform eligible for overlap testing
bTestOverlap = shrTRUE;
// If device has overlap capability, proceed
fMinPassCriteria[0] = PASS_FACTOR * EXPECTED_OVERLAP; // 1st cycle overlap is same for 1 or 2 copy engines
if (iDevCap != 20)
{
// Single copy engine
fMinPassCriteria[1] = PASS_FACTOR * EXPECTED_OVERLAP; // avg of many cycles
}
else
{
char cDevName[1024];
clGetDeviceInfo(cdDevices[uiTargetDevice], CL_DEVICE_NAME, sizeof(cDevName), &cDevName, NULL);
if(strstr(cDevName, "Quadro")!=0 || strstr(cDevName, "Tesla")!=0)
{
// Tesla or Quadro (arch = 2.0) ... Dual copy engine
fMinPassCriteria[1] = PASS_FACTOR * EXPECTED_OVERLAP_FERMI; // average of many cycles
}
else
{
// Geforce ... Single copy engine
fMinPassCriteria[1] = PASS_FACTOR * EXPECTED_OVERLAP; // average of many cycles
}
}
}
// Create the context
shrLog("clCreateContext...\n");
cxGPUContext = clCreateContext(0, uiNumDevsUsed, &cdDevices[uiTargetDevice], NULL, NULL, &ciErrNum);
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
// Create 2 command-queues
cqCommandQueue[0] = clCreateCommandQueue(cxGPUContext, cdDevices[uiTargetDevice], 0, &ciErrNum);
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
shrLog("clCreateCommandQueue [0]...\n");
cqCommandQueue[1] = clCreateCommandQueue(cxGPUContext, cdDevices[uiTargetDevice], 0, &ciErrNum);
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
shrLog("clCreateCommandQueue [1]...\n");
// Allocate the OpenCL source and result buffer memory objects on GPU device GMEM
szBuffBytes = sizeof(cl_float) * uiNumElements;
cmDevSrcA = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY, szBuffBytes, NULL, &ciErrNum);
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
cmDevSrcB = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY, szBuffBytes, NULL, &ciErrNum);
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
cmDevResult = clCreateBuffer(cxGPUContext, CL_MEM_WRITE_ONLY, szBuffBytes, NULL, &ciErrNum);
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
shrLog("clCreateBuffer (Src A, Src B and Result GPU Device GMEM, 3 x %u floats) ...\n", uiNumElements);
// Allocate pinned source and result host buffers:
// Note: Pinned (Page Locked) memory is needed for async host<->GPU memory copy operations ***
cmPinnedSrcA = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, szBuffBytes, NULL, &ciErrNum);
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
cmPinnedSrcB = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, szBuffBytes, NULL, &ciErrNum);
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
cmPinnedResult = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, szBuffBytes, NULL, &ciErrNum);
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
shrLog("clCreateBuffer (Src A, Src B and Result Pinned Host buffers, 3 x %u floats)...\n\n", uiNumElements);
// Get mapped pointers to pinned input host buffers
// Note: This allows general (non-OpenCL) host functions to access pinned buffers using standard pointers
fSourceA = (cl_float*)clEnqueueMapBuffer(cqCommandQueue[0], cmPinnedSrcA, CL_TRUE, CL_MAP_WRITE, 0, szBuffBytes, 0, NULL, NULL, &ciErrNum);
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
fSourceB = (cl_float*)clEnqueueMapBuffer(cqCommandQueue[0], cmPinnedSrcB, CL_TRUE, CL_MAP_WRITE, 0, szBuffBytes, 0, NULL, NULL, &ciErrNum);
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
fResult = (cl_float*)clEnqueueMapBuffer(cqCommandQueue[0], cmPinnedResult, CL_TRUE, CL_MAP_READ, 0, szBuffBytes, 0, NULL, NULL, &ciErrNum);
//oclCheckErrorEX (ciErrNum, CL_SUCCESS, pCleanup);
shrLog("clEnqueueMapBuffer (Pointers to 3 pinned host buffers)...\n");
// Alloc temp golden buffer for cross checks
Golden = (float*)malloc(szBuffBytes);
//oclCheckErrorEX(Golden != NULL, shrTRUE, pCleanup);
// Read the OpenCL kernel in from source file
cPathAndName = shrFindFilePath(cSourceFile, argv[0]);
//oclCheckError(cPathAndName != NULL, shrTRUE);
cSourceCL = oclLoadProgSource(cPathAndName, "", &szKernelLength);
// oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
shrLog("oclLoadProgSource (%s)...\n", cSourceFile);
// Create the program object
//cpProgram = clCreateProgramWithSource(cxGPUContext, 1, (const char **)&cSourceCL, &szKernelLength, &ciErrNum);
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
shrLog("clCreateProgramWithSource...\n");
cl_program program =
clCreateProgramWithBuiltInKernels(context, 1, &device_id, "VectorHypot", NULL);
// Build the program for the target device
clFinish(cqCommandQueue[0]);
shrDeltaT(0);
ciErrNum = clBuildProgram(program, uiNumDevsUsed, &cdDevices[uiTargetDevice], "-cl-fast-relaxed-math", NULL, NULL);
shrLog("clBuildProgram...");
if (ciErrNum != CL_SUCCESS)
{
// write out standard error, Build Log and PTX, then cleanup and exit
shrLogEx(LOGBOTH | ERRORMSG, (double)ciErrNum, STDERROR);
oclLogBuildInfo(program, oclGetFirstDev(cxGPUContext));
oclLogPtx(program, oclGetFirstDev(cxGPUContext), "VectorHypot.ptx");
Cleanup(EXIT_FAILURE);
}
dBuildTime = shrDeltaT(0);
// Ethan - Kernel Addition
if (program == NULL) {
std::cerr << "Failed to write program binary" << std::endl;
Cleanup(context, queue, program, kernel, memObjects);
return 1;
} else {
std::cout << "Read program from binary." << std::endl;
}
// Create the kernel
ckKernel[0] = clCreateKernel(program, "VectorHypot", &ciErrNum);
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
ckKernel[1] = clCreateKernel(program, "VectorHypot", &ciErrNum);
//oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
shrLog("clCreateKernel (ckKernel[2])...\n");
// Offsets for 2 queues
cl_uint uiOffset[2] = {0, uiNumElements / (2 * 4)};
// Set the Argument values for the 1st kernel instance (queue 0)
ciErrNum = clSetKernelArg(ckKernel[0], 0, sizeof(cl_mem), (void*)&cmDevSrcA);
ciErrNum |= clSetKernelArg(ckKernel[0], 1, sizeof(cl_mem), (void*)&cmDevSrcB);
ciErrNum |= clSetKernelArg(ckKernel[0], 2, sizeof(cl_mem), (void*)&cmDevResult);
ciErrNum |= clSetKernelArg(ckKernel[0], 3, sizeof(cl_uint), (void*)&uiOffset[0]);
ciErrNum |= clSetKernelArg(ckKernel[0], 4, sizeof(cl_int), (void*)&iInnerLoopCount);
ciErrNum |= clSetKernelArg(ckKernel[0], 5, sizeof(cl_uint), (void*)&uiNumElements);
shrLog("clSetKernelArg ckKernel[0] args 0 - 5...\n");
// Set the Argument values for the 2d kernel instance (queue 1)
ciErrNum |= clSetKernelArg(ckKernel[1], 0, sizeof(cl_mem), (void*)&cmDevSrcA);
ciErrNum |= clSetKernelArg(ckKernel[1], 1, sizeof(cl_mem), (void*)&cmDevSrcB);
ciErrNum |= clSetKernelArg(ckKernel[1], 2, sizeof(cl_mem), (void*)&cmDevResult);
ciErrNum |= clSetKernelArg(ckKernel[1], 3, sizeof(cl_uint), (void*)&uiOffset[1]);
ciErrNum |= clSetKernelArg(ckKernel[1], 4, sizeof(cl_int), (void*)&iInnerLoopCount);
ciErrNum |= clSetKernelArg(ckKernel[1], 5, sizeof(cl_uint), (void*)&uiNumElements);
oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
shrLog("clSetKernelArg ckKernel[1] args 0 - 5...\n\n");
//*******************************************
// Warmup the driver with dual queue sequence
//*******************************************
// Warmup with dual queue sequence for iTestCycles
shrLog("Warmup with 2-Queue sequence, %d cycles...\n", iWarmupCycles);
DualQueueSequence(iWarmupCycles, uiNumElements, false);
// Use single queue config to adjust compute intensity
shrLog("Adjust compute for GPU / system...\n");
iInnerLoopCount = AdjustCompute(cdDevices[uiTargetDevice], uiNumElements, iInnerLoopCount, iTestCycles);
shrLog(" Kernel inner loop count = %d\n", iInnerLoopCount);
//*******************************************
// Run and time with 2 command-queues
//*******************************************
for( int iRun =0; iRun <= RETRIES_ON_FAILURE; ++iRun ) {
// Run the sequence iTestCycles times
dAvgGPUTime[0] = DualQueueSequence(iTestCycles, uiNumElements, false);
// Warmup then Compute on host iTestCycles times (using mapped standard pointer to pinned host cl_mem buffer)
shrLog(" Device vs Host Result Comparison\t: ");
VectorHypotHost(fSourceA, fSourceB, Golden, uiNumElements, iInnerLoopCount);
shrDeltaT(0);
for (int i = 0; i < iTestCycles; i++)
{
VectorHypotHost (fSourceA, fSourceB, Golden, uiNumElements, iInnerLoopCount);
}
dHostTime[0] = shrDeltaT(0)/iTestCycles;
// Compare host and GPU results (using mapped standard pointer to pinned host cl_mem buffer)
bMatch = shrComparefet(Golden, fResult, uiNumElements, 0.0f, 0);
shrLog("gpu %s cpu\n", (bMatch == shrTRUE) ? "MATCHES" : "DOESN'T MATCH");
bPassFlag = (bMatch == shrTRUE);
//*******************************************
// Run and time with 1 command queue
//*******************************************
// Run the sequence iTestCycles times
dAvgGPUTime[1] = OneQueueSequence(iTestCycles, uiNumElements, false);
// Compute on host iTestCycles times (using mapped standard pointer to pinned host cl_mem buffer)
shrLog(" Device vs Host Result Comparison\t: ");
shrDeltaT(0);
for (int i = 0; i < iTestCycles; i++)
{
VectorHypotHost(fSourceA, fSourceB, Golden, (int)uiNumElements, iInnerLoopCount);
}
dHostTime[1] = shrDeltaT(0)/iTestCycles;
// Compare host and GPU results (using mapped standard pointer to pinned host cl_mem buffer)
bMatch = shrComparefet(Golden, fResult, uiNumElements, 0.0f, 0);
shrLog("gpu %s cpu\n", (bMatch == shrTRUE) ? "MATCHES" : "DOESN'T MATCH");
bPassFlag &= (bMatch == shrTRUE);
//*******************************************
// Compare Single and Dual queue timing
shrLog("\nResult Summary:\n");
// Log GPU and CPU Time for 2-queue scenario
shrLog(" Avg GPU Elapsed Time for 2-Queues\t= %.5f s\n", dAvgGPUTime[0]);
shrLog(" Avg Host Elapsed Time\t\t\t= %.5f s\n\n", dHostTime[0]);
// Log GPU and CPU Time for 1-queue scenario
shrLog(" Avg GPU Elapsed Time for 1-Queue\t= %.5f s\n", dAvgGPUTime[1]);
shrLog(" Avg Host Elapsed Time\t\t\t= %.5f s\n\n", dHostTime[1]);
// Log overlap % for GPU (comparison of 2-queue and 1 queue scenarios) and status
double dAvgOverlap = 100.0 * (1.0 - dAvgGPUTime[0]/dAvgGPUTime[1]);
if( bTestOverlap ) {
bool bAvgOverlapOK = (dAvgOverlap >= fMinPassCriteria[1]);
if( iRun == RETRIES_ON_FAILURE || bAvgOverlapOK ) {
shrLog(" Measured and (Acceptable) Avg Overlap\t= %.1f %% (%.1f %%) -> Measured Overlap is %s\n\n", dAvgOverlap, fMinPassCriteria[1], bAvgOverlapOK ? "Acceptable" : "NOT Acceptable");
// Log info to master log in standard format
shrLogEx(LOGBOTH | MASTER, 0, "oclCopyComputeOverlap-Avg, Throughput = %.4f OverlapPercent, Time = %.5f s, Size = %u Elements, NumDevsUsed = %u, Workgroup = %u\n",
dAvgOverlap, dAvgGPUTime[0], uiNumElements, uiNumDevsUsed, szLocalWorkSize);
bPassFlag &= bAvgOverlapOK;
break;
}
}
shrLog(" Measured and (Acceptable) Avg Overlap\t= %.1f %% (%.1f %%) -> Retry %d more time(s)...\n\n", dAvgOverlap, fMinPassCriteria[1], RETRIES_ON_FAILURE - iRun);
}
//*******************************************
// Report pass/fail, cleanup and exit
Cleanup (bPassFlag ? EXIT_SUCCESS : EXIT_FAILURE);
}
// Run 1 queue sequence for n cycles
// *********************************************************************
// Refills the pinned source arrays, then iCycles times enqueues on command
// queue 0: write of both inputs, kernel launch, read of the result.
//   iCycles        number of write/compute/read sequences to time
//   uiNumElements  number of float elements per array (4 per work item)
//   bShowConfig    when true, log the work-size configuration afterwards
// Returns the elapsed time reported by shrDeltaT(0) divided by iCycles.
double OneQueueSequence(int iCycles, unsigned int uiNumElements, bool bShowConfig)
{
    // Use fresh source Data: (re)initialize pinned host array buffers (using mapped standard pointer to pinned host cl_mem buffer)
    shrFillArray(fSourceA, (int)uiNumElements);
    shrFillArray(fSourceB, (int)uiNumElements);

    // Reset Global work size for 1 command-queue, and log work sizes & dimensions
    szGlobalWorkSize = shrRoundUp((int)szLocalWorkSize, (int)(uiNumElements/4));

    // *** Make sure queues are empty and then start timer
    double dAvgTime = 0.0;
    clFinish(cqCommandQueue[0]);
    clFinish(cqCommandQueue[1]);
    shrDeltaT(0);

    // Run the sequence iCycles times
    for (int i = 0; i < iCycles; i++)
    {
        // Nonblocking Write of all of input data from host to device in command-queue 0
        ciErrNum = clEnqueueWriteBuffer(cqCommandQueue[0], cmDevSrcA, CL_FALSE, 0, szBuffBytes, (void*)&fSourceA[0], 0, NULL, NULL);
        ciErrNum |= clEnqueueWriteBuffer(cqCommandQueue[0], cmDevSrcB, CL_FALSE, 0, szBuffBytes, (void*)&fSourceB[0], 0, NULL, NULL);
        shrCheckError(ciErrNum, CL_SUCCESS);

        // Launch kernel computation, command-queue 0
        // (the launch status is not checked: the check below is disabled and
        // ciErrNum is overwritten by the read that follows)
        ciErrNum = clEnqueueNDRangeKernel(cqCommandQueue[0], ckKernel[0], 1, NULL, &szGlobalWorkSize, &szLocalWorkSize, 0, NULL, NULL);
        //oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);

        // Non Blocking Read of output data from device to host, command-queue 0
        ciErrNum = clEnqueueReadBuffer(cqCommandQueue[0], cmDevResult, CL_FALSE, 0, szBuffBytes, (void*)&fResult[0], 0, NULL, NULL);
        shrCheckError(ciErrNum, CL_SUCCESS);

        // Flush sequence to device (may not be necessary on Linux or WinXP or when using the NVIDIA Tesla Computing Cluster driver)
        clFlush(cqCommandQueue[0]);
    }

    // *** Assure sync to host and return average sequence time
    clFinish(cqCommandQueue[0]);
    dAvgTime = shrDeltaT(0)/(double)iCycles;

    // Log config if asked for
    if (bShowConfig)
    {
        shrLog("\n1-Queue sequence Configuration:\n");
        // The work sizes are size_t; cast them to match the %u conversions.
        shrLog(" Global Work Size (per command-queue)\t= %u\n Local Work Size \t\t\t= %u\n # of Work Groups (per command-queue)\t= %u\n # of command-queues\t\t\t= 1\n",
               (unsigned int)szGlobalWorkSize, (unsigned int)szLocalWorkSize, (unsigned int)(szGlobalWorkSize/szLocalWorkSize));
    }
    return dAvgTime;
}
// Run 2 queue sequence for n cycles
// *********************************************************************
// Refills the pinned source arrays, then iCycles times splits the work in two
// halves: command queue 0 writes, computes and reads the first half of the
// buffers while command queue 1 does the same for the second half, with the
// enqueue order interleaved as described in the phase comments below.
//   iCycles        number of sequences to time
//   uiNumElements  number of float elements per array (4 per work item)
//   bShowConfig    when true, log the work-size configuration afterwards
// Returns the elapsed time reported by shrDeltaT(0) divided by iCycles.
double DualQueueSequence(int iCycles, unsigned int uiNumElements, bool bShowConfig)
{
    // Locals
    size_t szHalfBuffer = szBuffBytes / 2;                 // bytes per half
    size_t szHalfOffset = szHalfBuffer / sizeof(float);    // float index of the 2nd half
    double dAvgTime = 0.0;

    // Use fresh source Data: (re)initialize pinned host array buffers (using mapped standard pointer to pinned host cl_mem buffer)
    shrFillArray(fSourceA, (int)uiNumElements);
    shrFillArray(fSourceB, (int)uiNumElements);

    // Set Global work size for 2 command-queues, and log work sizes & dimensions
    szGlobalWorkSize = shrRoundUp((int)szLocalWorkSize, (int)(uiNumElements/(2 * 4)));

    // Make sure queues are empty and then start timer
    clFinish(cqCommandQueue[0]);
    clFinish(cqCommandQueue[1]);
    shrDeltaT(0);
    for (int i = 0; i < iCycles; i++)
    {
        // Mid Phase 0
        // Nonblocking Write of 1st half of input data from host to device in command-queue 0
        ciErrNum = clEnqueueWriteBuffer(cqCommandQueue[0], cmDevSrcA, CL_FALSE, 0, szHalfBuffer, (void*)&fSourceA[0], 0, NULL, NULL);
        ciErrNum |= clEnqueueWriteBuffer(cqCommandQueue[0], cmDevSrcB, CL_FALSE, 0, szHalfBuffer, (void*)&fSourceB[0], 0, NULL, NULL);
        shrCheckError(ciErrNum, CL_SUCCESS);

        // Push out the write for queue 0 (and prior read from queue 1 at end of loop) to the driver
        // (not necessary on Linux, Mac OSX or WinXP)
        clFlush(cqCommandQueue[0]);
        clFlush(cqCommandQueue[1]);

        // Start Phase 1 ***********************************
        // Launch kernel computation, command-queue 0
        // (Note: The order MATTERS here on Fermi ! THE KERNEL IN THIS PHASE SHOULD BE LAUNCHED BEFORE THE WRITE)
        ciErrNum = clEnqueueNDRangeKernel(cqCommandQueue[0], ckKernel[0], 1, NULL, &szGlobalWorkSize, &szLocalWorkSize, 0, NULL, NULL);
        oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);

        // Nonblocking Write of 2nd half of input data from host to device in command-queue 1
        // (Note: The order MATTERS here on Fermi ! THE KERNEL IN THIS PHASE SHOULD BE LAUNCHED BEFORE THE WRITE)
        ciErrNum = clEnqueueWriteBuffer(cqCommandQueue[1], cmDevSrcA, CL_FALSE, szHalfBuffer, szHalfBuffer, (void*)&fSourceA[szHalfOffset], 0, NULL, NULL);
        ciErrNum |= clEnqueueWriteBuffer(cqCommandQueue[1], cmDevSrcB, CL_FALSE, szHalfBuffer, szHalfBuffer, (void*)&fSourceB[szHalfOffset], 0, NULL, NULL);
        shrCheckError(ciErrNum, CL_SUCCESS);

        // Push out the compute for queue 0 and write for queue 1 to the driver
        // (not necessary on Linux, Mac OSX or WinXP)
        clFlush(cqCommandQueue[0]);
        clFlush(cqCommandQueue[1]);

        // Start Phase 2 ***********************************
        // Launch kernel computation, command-queue 1
        // (the launch status is not checked: the check below is disabled and
        // ciErrNum is overwritten by the read that follows)
        ciErrNum = clEnqueueNDRangeKernel(cqCommandQueue[1], ckKernel[1], 1, NULL, &szGlobalWorkSize, &szLocalWorkSize, 0, NULL, NULL);
        //oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);

        // Non Blocking Read of 1st half of output data from device to host, command-queue 0
        ciErrNum = clEnqueueReadBuffer(cqCommandQueue[0], cmDevResult, CL_FALSE, 0, szHalfBuffer, (void*)&fResult[0], 0, NULL, NULL);
        shrCheckError(ciErrNum, CL_SUCCESS);

        // Push out the compute for queue 1 and the read for queue 0 to the driver
        // (not necessary on Linux, Mac OSX or WinXP)
        clFlush(cqCommandQueue[0]);
        clFlush(cqCommandQueue[1]);

        // Start Phase 0 (Rolls over) ***********************************
        // Non Blocking Read of 2nd half of output data from device to host, command-queue 1
        ciErrNum = clEnqueueReadBuffer(cqCommandQueue[1], cmDevResult, CL_FALSE, szHalfBuffer, szHalfBuffer, (void*)&fResult[szHalfOffset], 0, NULL, NULL);
        shrCheckError(ciErrNum, CL_SUCCESS);
    }

    // *** Sync to host and get average sequence time
    clFinish(cqCommandQueue[0]);
    clFinish(cqCommandQueue[1]);
    dAvgTime = shrDeltaT(0)/(double)iCycles;

    // Log config if asked for
    if (bShowConfig)
    {
        shrLog("\n2-Queue sequence Configuration:\n");
        // The work sizes are size_t; cast them to match the %u conversions.
        shrLog(" Global Work Size (per command-queue)\t= %u\n Local Work Size \t\t\t= %u\n # of Work Groups (per command-queue)\t= %u\n # of command-queues\t\t\t= 2\n",
               (unsigned int)szGlobalWorkSize, (unsigned int)szLocalWorkSize, (unsigned int)(szGlobalWorkSize/szLocalWorkSize));
    }
    return dAvgTime;
}
// Function to adjust compute task according to device capability
// This allows a consistent overlap % across a wide variety of GPU's for test purposes
// It also implicitly illustrates the relationship between compute capability and overlap at fixed work size
//
// Times iCycles full-buffer host->device writes and iCycles launches of ckKernel[0] on
// command-queue 0, scales iInitLoopCount by the copy/compute time ratio, clips the result
// (lower bound 2, then upper bound 4 * iInitLoopCount) and sets it as argument 4 of both kernels.
//
// cdTargetDevice : not referenced by this function
// uiNumElements  : number of floats refilled in fSourceA / fSourceB; also sizes the global work size
// iInitLoopCount : loop count that the ratio is scaled by (presumably the count the timed kernel
//                  is currently configured with -- TODO confirm against caller)
// iCycles        : number of timed repetitions of the copy and of the kernel launch
// returns        : the loop count that was set on both kernels
// *********************************************************************
int AdjustCompute(cl_device_id cdTargetDevice, unsigned int uiNumElements, int iInitLoopCount, int iCycles)
{
    // Locals
    double dCopyTime, dComputeTime, dScaledLoopCount;
    int iComputedLoopCount;
    const int iMaxLoopCount = iInitLoopCount * 4;

    (void)cdTargetDevice;   // kept for interface compatibility only

    // Change Source Data
    shrFillArray(fSourceA, (int)uiNumElements);
    shrFillArray(fSourceB, (int)uiNumElements);

    // Reset Global work size for 1 command-queue, and log work sizes & dimensions
    szGlobalWorkSize = shrRoundUp((int)szLocalWorkSize, (int)(uiNumElements/4));

    // *** Make sure queues are empty and then start timer
    clFinish(cqCommandQueue[0]);
    clFinish(cqCommandQueue[1]);
    shrDeltaT(0);

    // Run the copy iCycles times and measure copy time on this system
    for (int i = 0; i < iCycles; i++)
    {
        // Nonblocking Write of all of input data from host to device in command-queue 0
        ciErrNum = clEnqueueWriteBuffer(cqCommandQueue[0], cmDevSrcA, CL_FALSE, 0, szBuffBytes, (void*)&fSourceA[0], 0, NULL, NULL);
        ciErrNum |= clEnqueueWriteBuffer(cqCommandQueue[0], cmDevSrcB, CL_FALSE, 0, szBuffBytes, (void*)&fSourceB[0], 0, NULL, NULL);
        ciErrNum |= clFlush(cqCommandQueue[0]);
        shrCheckError(ciErrNum, CL_SUCCESS);
    }
    clFinish(cqCommandQueue[0]);
    dCopyTime = shrDeltaT(0);

    // Run the compute iCycles times and measure compute time on this system
    for (int i = 0; i < iCycles; i++)
    {
        // Launch kernel computation, command-queue 0
        ciErrNum = clEnqueueNDRangeKernel(cqCommandQueue[0], ckKernel[0], 1, NULL, &szGlobalWorkSize, &szLocalWorkSize, 0, NULL, NULL);
        ciErrNum |= clFlush(cqCommandQueue[0]);
        oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
    }
    clFinish(cqCommandQueue[0]);
    dComputeTime = shrDeltaT(0);

    // Determine number of core loop cycles proportional to copy/compute time ratio
    dComputeTime = MAX(dComputeTime, 1.0e-6);   // guard the division against a zero/negative time
    dScaledLoopCount = (dCopyTime / dComputeTime) * (double)iInitLoopCount;

    // Clip while still in double precision: converting a double that does not fit in an int
    // is undefined behaviour, so the cast is only applied to a value already known to be in range.
    if (dScaledLoopCount < 2.0)
    {
        dScaledLoopCount = 2.0;
    }
    iComputedLoopCount = (dScaledLoopCount < (double)iMaxLoopCount) ? (int)dScaledLoopCount : iMaxLoopCount;

    // Start from a fresh status instead of OR-ing onto whatever ciErrNum held before
    ciErrNum = clSetKernelArg(ckKernel[0], 4, sizeof(cl_int), (void*)&iComputedLoopCount);
    ciErrNum |= clSetKernelArg(ckKernel[1], 4, sizeof(cl_int), (void*)&iComputedLoopCount);
    oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
    return (iComputedLoopCount);
}
// Cleanup/Exit function
// *********************************************************************
void Cleanup (int iExitCode)
{
// Cleanup allocated objects
shrLog("Starting Cleanup...\n\n");
if(cPathAndName)free(cPathAndName);
if(cSourceCL)free(cSourceCL);
if(Golden)free(Golden);
if(ckKernel[0])clReleaseKernel(ckKernel[0]);
if(ckKernel[1])clReleaseKernel(ckKernel[1]);
if(program)clReleaseProgram(program);
if(fSourceA)clEnqueueUnmapMemObject(cqCommandQueue[0], cmPinnedSrcA, (void*)fSourceA, 0, NULL, NULL);
if(fSourceB)clEnqueueUnmapMemObject(cqCommandQueue[0], cmPinnedSrcB, (void*)fSourceB, 0, NULL, NULL);
if(fResult)clEnqueueUnmapMemObject(cqCommandQueue[0], cmPinnedResult, (void*)fResult, 0, NULL, NULL);
if(cmDevSrcA)clReleaseMemObject(cmDevSrcA);
if(cmDevSrcB)clReleaseMemObject(cmDevSrcB);
if(cmDevResult)clReleaseMemObject(cmDevResult);
if(cmPinnedSrcA)clReleaseMemObject(cmPinnedSrcA);
if(cmPinnedSrcB)clReleaseMemObject(cmPinnedSrcB);
if(cmPinnedResult)clReleaseMemObject(cmPinnedResult);
if(cqCommandQueue[0])clReleaseCommandQueue(cqCommandQueue[0]);
if(cqCommandQueue[1])clReleaseCommandQueue(cqCommandQueue[1]);
if(cxGPUContext)clReleaseContext(cxGPUContext);
if(cdDevices)free(cdDevices);
// Master status Pass/Fail (all tests)
shrQAFinishExit( *gp_argc, (const char **)*gp_argv, (iExitCode == EXIT_SUCCESS) ? QA_PASSED : QA_FAILED );
}
// "Golden" host reference for comparison purposes: element-wise vector hypotenuse
// For each of the uiNumElements elements, pfResult[i] = sqrtf(pfData1[i]^2 + pfData2[i]^2).
// iInnerLoopCount is accepted but not used by the host computation.
// *********************************************************************
void VectorHypotHost(const float* pfData1, const float* pfData2, float* pfResult, unsigned int uiNumElements, int iInnerLoopCount)
{
    unsigned int uiIdx = 0;

    (void)iInnerLoopCount;

    while (uiIdx < uiNumElements)
    {
        const float fLegA = pfData1[uiIdx];
        const float fLegB = pfData2[uiIdx];

        pfResult[uiIdx] = sqrtf(fLegA * fLegA + fLegB * fLegB);
        ++uiIdx;
    }
}

View File

@@ -1,198 +0,0 @@
/*
* Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*
*/
#ifndef OCL_UTILS_H
#define OCL_UTILS_H
// *********************************************************************
// Utilities specific to OpenCL samples in NVIDIA GPU Computing SDK
// *********************************************************************
// Common headers: Cross-API utilities and OpenCL header
#include <shrUtils.h>
// All OpenCL headers
#if defined (__APPLE__) || defined(MACOSX)
#include <OpenCL/opencl.h>
#else
#include <CL/opencl.h>
#endif
// Includes
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
// For systems with CL_EXT that are not updated with these extensions, we copied these
// extensions from <CL/cl_ext.h>
#ifndef CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV
/* cl_nv_device_attribute_query extension - no extension #define since it has no functions */
#define CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV 0x4000
#define CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV 0x4001
#define CL_DEVICE_REGISTERS_PER_BLOCK_NV 0x4002
#define CL_DEVICE_WARP_SIZE_NV 0x4003
#define CL_DEVICE_GPU_OVERLAP_NV 0x4004
#define CL_DEVICE_KERNEL_EXEC_TIMEOUT_NV 0x4005
#define CL_DEVICE_INTEGRATED_MEMORY_NV 0x4006
#endif
// reminders for build output window and log
#ifdef _WIN32
#pragma message ("Note: including shrUtils.h")
#pragma message ("Note: including opencl.h")
#endif
// SDK Revision #
#define OCL_SDKREVISION "7027912"
// Error and Exit Handling Macros...
// *********************************************************************
// Full error handling macro with Cleanup() callback (if supplied)...
// (Companion Inline Function lower on page)
#define oclCheckErrorEX(a, b, c) __oclCheckErrorEX(a, b, c, __FILE__ , __LINE__)
// Short version without Cleanup() callback pointer
// Both Input (a) and Reference (b) are specified as args
#define oclCheckError(a, b) oclCheckErrorEX(a, b, 0)
//////////////////////////////////////////////////////////////////////////////
//! Gets the platform ID for NVIDIA if available, otherwise default to platform 0
//!
//! @return the id
//! @param clSelectedPlatformID OpenCL platform ID
//////////////////////////////////////////////////////////////////////////////
extern "C" cl_int oclGetPlatformID(cl_platform_id* clSelectedPlatformID);
//////////////////////////////////////////////////////////////////////////////
//! Print info about the device
//!
//! @param iLogMode enum LOGBOTH, LOGCONSOLE, LOGFILE
//! @param device OpenCL id of the device
//////////////////////////////////////////////////////////////////////////////
extern "C" void oclPrintDevInfo(int iLogMode, cl_device_id device);
//////////////////////////////////////////////////////////////////////////////
//! Get and return device capability
//!
//! @return the 2 digit integer representation of device Cap (major minor). return -1 if NA
//! @param device OpenCL id of the device
//////////////////////////////////////////////////////////////////////////////
extern "C" int oclGetDevCap(cl_device_id device);
//////////////////////////////////////////////////////////////////////////////
//! Print the device name
//!
//! @param iLogMode enum LOGBOTH, LOGCONSOLE, LOGFILE
//! @param device OpenCL id of the device
//////////////////////////////////////////////////////////////////////////////
extern "C" void oclPrintDevName(int iLogMode, cl_device_id device);
//////////////////////////////////////////////////////////////////////////////
//! Gets the id of the first device from the context
//!
//! @return the id
//! @param cxGPUContext OpenCL context
//////////////////////////////////////////////////////////////////////////////
extern "C" cl_device_id oclGetFirstDev(cl_context cxGPUContext);
//////////////////////////////////////////////////////////////////////////////
//! Gets the id of the nth device from the context
//!
//! @return the id or -1 when out of range
//! @param cxGPUContext OpenCL context
//! @param device_idx index of the device of interest
//////////////////////////////////////////////////////////////////////////////
extern "C" cl_device_id oclGetDev(cl_context cxGPUContext, unsigned int device_idx);
//////////////////////////////////////////////////////////////////////////////
//! Gets the id of device with maximal FLOPS from the context
//!
//! @return the id
//! @param cxGPUContext OpenCL context
//////////////////////////////////////////////////////////////////////////////
extern "C" cl_device_id oclGetMaxFlopsDev(cl_context cxGPUContext);
//////////////////////////////////////////////////////////////////////////////
//! Loads a Program file and prepends the cPreamble to the code.
//!
//! @return the source string if succeeded, 0 otherwise
//! @param cFilename program filename
//! @param cPreamble code that is prepended to the loaded file, typically a set of #defines or a header
//! @param szFinalLength returned length of the code string
//////////////////////////////////////////////////////////////////////////////
extern "C" char* oclLoadProgSource(const char* cFilename, const char* cPreamble, size_t* szFinalLength);
//////////////////////////////////////////////////////////////////////////////
//! Get the binary (PTX) of the program associated with the device
//!
//! @param cpProgram OpenCL program
//! @param cdDevice device of interest
//! @param binary returned code
//! @param length length of returned code
//////////////////////////////////////////////////////////////////////////////
extern "C" void oclGetProgBinary( cl_program cpProgram, cl_device_id cdDevice, char** binary, size_t* length);
//////////////////////////////////////////////////////////////////////////////
//! Get and log the binary (PTX) from the OpenCL compiler for the requested program & device
//!
//! @param cpProgram OpenCL program
//! @param cdDevice device of interest
//! @param cPtxFileName optional PTX file name
//////////////////////////////////////////////////////////////////////////////
extern "C" void oclLogPtx(cl_program cpProgram, cl_device_id cdDevice, const char* cPtxFileName);
//////////////////////////////////////////////////////////////////////////////
//! Get and log the Build Log from the OpenCL compiler for the requested program & device
//!
//! @param cpProgram OpenCL program
//! @param cdDevice device of interest
//////////////////////////////////////////////////////////////////////////////
extern "C" void oclLogBuildInfo(cl_program cpProgram, cl_device_id cdDevice);
// Helper function for De-allocating cl objects
// *********************************************************************
extern "C" void oclDeleteMemObjs(cl_mem* cmMemObjs, int iNumObjs);
// Helper function to get OpenCL error string from constant
// *********************************************************************
extern "C" const char* oclErrorString(cl_int error);
// Helper function to get OpenCL image format string (channel order and type) from constant
// *********************************************************************
extern "C" const char* oclImageFormatString(cl_uint uiImageFormat);
// Companion inline function for the oclCheckErrorEX macro:
// error checking and exit on error WITH Cleanup Callback (if supplied)
// iSample     : value under test
// iReference  : value iSample is expected to equal
// pCleanup    : optional callback, called with the error code; when NULL the log is closed and exit() is called
// cFile/iLine : source location written to the log
// *********************************************************************
inline void __oclCheckErrorEX(cl_int iSample, cl_int iReference, void (*pCleanup)(int), const char* cFile, const int iLine)
{
    // Matching values mean there is nothing to report
    if (iReference == iSample)
    {
        return;
    }

    // A mismatch is an error by definition, so a 0 sample/test value is replaced by -9999
    const cl_int iErrCode = (iSample == 0) ? -9999 : iSample;

    // Log the error info
    shrLog("\n !!! Error # %i (%s) at line %i , in file %s !!!\n\n", iErrCode, oclErrorString(iErrCode), iLine, cFile);

    // Without a cleanup callback: close the log and exit, using the error code as process exit code
    if (pCleanup == NULL)
    {
        shrLogEx(LOGBOTH | CLOSELOG, 0, "Exiting...\n");
        exit(iErrCode);
    }

    // Otherwise let the supplied callback handle cleanup
    pCleanup(iErrCode);
}
#endif

View File

@@ -1,238 +0,0 @@
/*
* Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*
*/
#ifndef SHR_QATEST_H
#define SHR_QATEST_H
// *********************************************************************
// Generic utilities for NVIDIA GPU Computing SDK
// *********************************************************************
// OS dependent includes
#ifdef _WIN32
#pragma message ("Note: including windows.h")
#pragma message ("Note: including math.h")
#pragma message ("Note: including assert.h")
#pragma message ("Note: including time.h")
// Headers needed for Windows
#include <windows.h>
#include <time.h>
#else
// Headers needed for Linux
#include <sys/stat.h>
#include <sys/types.h>
#include <sys/time.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdarg.h>
#include <unistd.h>
#include <time.h>
#endif
#ifndef STRCASECMP
#ifdef _WIN32
#define STRCASECMP _stricmp
#else
#define STRCASECMP strcasecmp
#endif
#endif
#ifndef STRNCASECMP
#ifdef _WIN32
#define STRNCASECMP _strnicmp
#else
#define STRNCASECMP strncasecmp
#endif
#endif
// Standardized QA Start/Finish for CUDA SDK tests
#define shrQAStart(a, b) __shrQAStart(a, b)
#define shrQAFinish(a, b, c) __shrQAFinish(a, b, c)
#define shrQAFinish2(a, b, c, d) __shrQAFinish2(a, b, c, d)
// Returns the index in exec_name of the character following the last '\\' or '/',
// or 0 when exec_name contains no path separator.
inline int findExeNameStart(const char *exec_name)
{
    // Walk backwards from the terminating NUL until a separator or the first character is reached
    int pos = (int)strlen(exec_name);
    for (; pos > 0; --pos)
    {
        const char ch = exec_name[pos];
        if (ch == '\\' || ch == '/')
        {
            break;
        }
    }

    // The scan can also stop at index 0, which may or may not hold a separator
    const char stop = exec_name[pos];
    return (stop == '\\' || stop == '/') ? (pos + 1) : pos;
}
// Standardized QA start banner.
// Scans argv[1..argc-1] for the switch "qatest" (leading '-' characters are skipped, comparison
// through STRCASECMP) and prints either the "&&&& RUNNING" line with all arguments or a short
// "[name] starting..." line.
// Returns the index in argv[0] at which the executable name (without path) starts.
inline int __shrQAStart(int argc, char **argv)
{
    // First clear the output buffer
    fflush(stdout);
    fflush(stdout);

    // Detect the qatest switch, whatever the number of leading dashes
    bool bQATest = false;
    for (int iArg = 1; iArg < argc; ++iArg) {
        const char *pcSwitch = argv[iArg];
        while (*pcSwitch == '-') {
            ++pcSwitch;
        }
        if (STRCASECMP(pcSwitch, "qatest") == 0) {
            bQATest = true;
        }
    }

    // We don't want to print the entire path, so we locate where the executable name starts
    const int exename_start = findExeNameStart(argv[0]);
    const char *pcExeName = argv[0] + exename_start;

    if (!bQATest) {
        fprintf(stdout, "[%s] starting...\n", pcExeName);
    } else {
        fprintf(stdout, "&&&& RUNNING %s", pcExeName);
        for (int iArg = 1; iArg < argc; ++iArg) {
            fprintf(stdout, " %s", argv[iArg]);
        }
        fprintf(stdout, "\n");
    }
    fflush(stdout);

    printf("\n");
    fflush(stdout);
    return exename_start;
}
// QA result codes. The numeric values match the indices of the sStatus[] string table
// { "FAILED", "PASSED", "WAIVED" } used by __shrQAFinish / __shrQAFinish2, and
// shrQAFinishExit / shrQAFinishExit2 turn a zero status (QA_FAILED) into EXIT_FAILURE.
enum eQAstatus {
QA_FAILED = 0,
QA_PASSED = 1,
QA_WAIVED = 2
};
// Prints a countdown ("N...") to stdout about once per second and returns once
// `seconds` seconds have elapsed according to time(). A value <= 0 returns immediately
// after printing the header and "done!".
inline void __ExitInTime(int seconds)
{
    fprintf(stdout, "> exiting in %d seconds: ", seconds);
    fflush(stdout);

    time_t t;
    int count;
    for (t = time(0) + seconds, count = seconds; time(0) < t; count--) {
        fprintf(stdout, "%d...", count);
        // The countdown has no newline, so flush explicitly or it stays in the stdio buffer while sleeping
        fflush(stdout);
        // _WIN32 is the macro this header already tests before including <windows.h>
#ifdef _WIN32
        Sleep(1000);
#else
        sleep(1);
#endif
    }

    fprintf(stdout, "done!\n\n");
    fflush(stdout);
}
// Standardized QA finish: prints the result line for iStatus (index into sStatus[]) and then
// either waits 3 seconds (default), waits for <Enter> (switch "prompt"), or returns at once
// (switches "noprompt" / "help"). Switches are read from argv[1..argc-1] with leading '-' skipped.
inline void __shrQAFinish(int argc, const char **argv, int iStatus)
{
    const char *sStatus[] = { "FAILED", "PASSED", "WAIVED", NULL };

    // By default QATest is disabled and NoPrompt is Enabled (times out at seconds passed into __ExitInTime() )
    bool bQATest = false;
    bool bNoPrompt = true;
    bool bQuitInTime = true;

    for (int iArg = 1; iArg < argc; ++iArg) {
        const char *pcSwitch = argv[iArg];
        while (*pcSwitch == '-') {
            ++pcSwitch;
        }

        if (STRCASECMP(pcSwitch, "qatest") == 0) {
            bQATest = true;
        }

        // For SDK individual samples that don't specify -noprompt or -prompt,
        // a 3 second delay will happen before exiting, giving a user time to view results
        if (STRCASECMP(pcSwitch, "noprompt") == 0 || STRCASECMP(pcSwitch, "help") == 0) {
            bNoPrompt = true;
            bQuitInTime = false;
        }
        if (STRCASECMP(pcSwitch, "prompt") == 0) {
            bNoPrompt = false;
            bQuitInTime = false;
        }
    }

    // Only the executable name (no path) is printed
    const char *pcExeName = argv[0] + findExeNameStart(argv[0]);

    if (!bQATest) {
        fprintf(stdout, "[%s] test results...\n%s\n", pcExeName, sStatus[iStatus]);
    } else {
        fprintf(stdout, "&&&& %s %s", sStatus[iStatus], pcExeName);
        for (int iArg = 1; iArg < argc; ++iArg) {
            fprintf(stdout, " %s", argv[iArg]);
        }
        fprintf(stdout, "\n");
    }
    fflush(stdout);

    printf("\n");
    fflush(stdout);

    if (bQuitInTime) {
        __ExitInTime(3);
    } else if (!bNoPrompt) {
        fprintf(stdout, "\nPress <Enter> to exit...\n");
        fflush(stdout);
        getchar();
    }
}
// Variant of __shrQAFinish where the caller states whether QA mode is active (bQATest).
// Prints the result line for iStatus (index into sStatus[]) and waits 3 seconds unless one of
// the switches "noprompt", "help" or "prompt" is present in argv[1..argc-1] (leading '-' skipped).
inline void __shrQAFinish2(bool bQATest, int argc, const char **argv, int iStatus)
{
    const char *sStatus[] = { "FAILED", "PASSED", "WAIVED", NULL };

    // For SDK individual samples that don't specify -noprompt or -prompt,
    // a 3 second delay will happen before exiting, giving a user time to view results
    bool bQuitInTime = true;
    for (int iArg = 1; iArg < argc; ++iArg) {
        const char *pcSwitch = argv[iArg];
        while (*pcSwitch == '-') {
            ++pcSwitch;
        }

        const bool bSuppressDelay = (STRCASECMP(pcSwitch, "noprompt") == 0) ||
                                    (STRCASECMP(pcSwitch, "help") == 0) ||
                                    (STRCASECMP(pcSwitch, "prompt") == 0);
        if (bSuppressDelay) {
            bQuitInTime = false;
        }
    }

    // Only the executable name (no path) is printed
    const char *pcExeName = argv[0] + findExeNameStart(argv[0]);

    if (!bQATest) {
        fprintf(stdout, "[%s] test results...\n%s\n", pcExeName, sStatus[iStatus]);
    } else {
        fprintf(stdout, "&&&& %s %s", sStatus[iStatus], pcExeName);
        for (int iArg = 1; iArg < argc; ++iArg) {
            fprintf(stdout, " %s", argv[iArg]);
        }
        fprintf(stdout, "\n");
    }
    fflush(stdout);

    if (bQuitInTime) {
        __ExitInTime(3);
    }
}
// Runs __shrQAFinish() and then terminates the process:
// a non-zero iStatus exits with EXIT_SUCCESS, a zero iStatus with EXIT_FAILURE.
inline void shrQAFinishExit(int argc, const char **argv, int iStatus)
{
    const int iProcessExitCode = (iStatus != 0) ? EXIT_SUCCESS : EXIT_FAILURE;

    __shrQAFinish(argc, argv, iStatus);
    exit(iProcessExitCode);
}
// Runs __shrQAFinish2() and then terminates the process:
// a non-zero iStatus exits with EXIT_SUCCESS, a zero iStatus with EXIT_FAILURE.
inline void shrQAFinishExit2(bool bQAtest, int argc, const char **argv, int iStatus)
{
    const int iProcessExitCode = (iStatus != 0) ? EXIT_SUCCESS : EXIT_FAILURE;

    __shrQAFinish2(bQAtest, argc, argv, iStatus);
    exit(iProcessExitCode);
}
#endif

View File

@@ -1,642 +0,0 @@
/*
* Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*
*/
#ifndef SHR_UTILS_H
#define SHR_UTILS_H
// *********************************************************************
// Generic utilities for NVIDIA GPU Computing SDK
// *********************************************************************
// reminders for output window and build log
#ifdef _WIN32
#pragma message ("Note: including windows.h")
#pragma message ("Note: including math.h")
#pragma message ("Note: including assert.h")
#endif
// OS dependent includes
#ifdef _WIN32
// Headers needed for Windows
#include <windows.h>
#else
// Headers needed for Linux
#include <sys/stat.h>
#include <sys/types.h>
#include <sys/time.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdarg.h>
#endif
// Other headers needed for both Windows and Linux
#include <math.h>
#include <assert.h>
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
// Un-comment the following #define to enable profiling code in SDK apps
//#define GPU_PROFILING
// Beginning of GPU Architecture definitions
// Maps an SM version (major.minor) to the number of cores per SM.
// Returns -1, after printing a notice to stdout, when the version is not in the table.
inline int ConvertSMVer2Cores(int major, int minor)
{
    // One table row per known SM version
    typedef struct {
        int SM;    // 0xMm (hexadecimal notation), M = SM Major version, and m = SM minor version
        int Cores; // cores per SM
    } sSMtoCores;

    const sSMtoCores coresPerSM[] = {
        { 0x10,   8 }, // Tesla Generation  (SM 1.0) G80 class
        { 0x11,   8 }, // Tesla Generation  (SM 1.1) G8x class
        { 0x12,   8 }, // Tesla Generation  (SM 1.2) G9x class
        { 0x13,   8 }, // Tesla Generation  (SM 1.3) GT200 class
        { 0x20,  32 }, // Fermi Generation  (SM 2.0) GF100 class
        { 0x21,  48 }, // Fermi Generation  (SM 2.1) GF10x class
        { 0x30, 192 }  // Kepler Generation (SM 3.0) GK10x class
    };
    const int numEntries = (int)(sizeof(coresPerSM) / sizeof(coresPerSM[0]));

    // Encode the requested version the same way as the table keys
    const int smVersion = (major << 4) + minor;

    for (int entry = 0; entry < numEntries; ++entry) {
        if (coresPerSM[entry].SM == smVersion) {
            return coresPerSM[entry].Cores;
        }
    }

    printf("MapSMtoCores SM %d.%d is undefined (please update to the latest SDK)!\n", major, minor);
    return -1;
}
// end of GPU Architecture definitions
// Defines and enum for use with logging functions
// *********************************************************************
#define DEFAULTLOGFILE "SdkConsoleLog.txt"
#define MASTERLOGFILE "SdkMasterLog.csv"
// Flag values for the iLogMode argument of shrLogEx().
// Each enumerator except LOGBOTH is a single bit; LOGBOTH equals LOGCONSOLE | LOGFILE.
enum LOGMODES
{
LOGCONSOLE = 1, // bit to signal "log to console"
LOGFILE = 2, // bit to signal "log to file"
LOGBOTH = 3, // convenience union of first 2 bits to signal "log to both"
APPENDMODE = 4, // bit to set "file append" mode instead of "replace mode" on open
MASTER = 8, // bit to signal master .csv log output
ERRORMSG = 16, // bit to signal "pre-pend Error"
CLOSELOG = 32 // bit to close log file, if open, after any requested file write
};
#define HDASHLINE "-----------------------------------------------------------\n"
// Standardized boolean
// Return type of the shrReadFile* / shrWriteFile* helpers declared in this header;
// the ARGCHECK macro compares its argument against shrTRUE and returns shrFALSE otherwise.
enum shrBOOL
{
shrFALSE = 0,
shrTRUE = 1
};
// Standardized MAX, MIN and CLAMP
// Every macro argument is parenthesized so that operands containing operators of lower
// precedence than the comparison (e.g. MAX(x & mask, y)) are compared and selected as a whole.
// NOTE: arguments are still evaluated more than once -- do not pass expressions with side effects.
#define MAX(a, b) (((a) > (b)) ? (a) : (b))
#define MIN(a, b) (((a) < (b)) ? (a) : (b))
#define CLAMP(a, b, c) MIN(MAX(a, b), c) // double sided clip: the larger of a and b, limited to at most c
#define TOPCLAMP(a, b) (((a) < (b)) ? (a) : (b)) // single top side clip of input a
// Error and Exit Handling Macros...
// *********************************************************************
// Full error handling macro with Cleanup() callback (if supplied)...
// (Companion Inline Function lower on page)
#define shrCheckErrorEX(a, b, c) __shrCheckErrorEX(a, b, c, __FILE__ , __LINE__)
// Short version without Cleanup() callback pointer
// Both Input (a) and Reference (b) are specified as args
#define shrCheckError(a, b) shrCheckErrorEX(a, b, 0)
// Standardized Exit Macro for leaving main()... extended version
// (Companion Inline Function lower on page)
#define shrExitEX(a, b, c) __shrExitEX(a, b, c)
// Standardized Exit Macro for leaving main()... short version
// (Companion Inline Function lower on page)
#define shrEXIT(a, b) __shrExitEX(a, b, EXIT_SUCCESS)
// Simple argument checker macro
#define ARGCHECK(a) if((a) != shrTRUE)return shrFALSE
// Define for user-customized error handling
#define STDERROR "file %s, line %i\n\n" , __FILE__ , __LINE__
// Function to deallocate memory allocated within shrUtils
// *********************************************************************
extern "C" void shrFree(void* ptr);
// *********************************************************************
// Helper function to log standardized information to Console, to File or to both
//! Examples: shrLogEx(LOGBOTH, 0, "Function A\n");
//! : shrLogEx(LOGBOTH | ERRORMSG, ciErrNum, STDERROR);
//!
//! Automatically opens file and stores handle if needed and not done yet
//! Closes file and nulls handle on request
//!
//! @param 0 iLogMode: LOGCONSOLE, LOGFILE, LOGBOTH, APPENDMODE, MASTER, ERRORMSG, CLOSELOG.
//! LOGFILE and LOGBOTH may be | 'd with APPENDMODE to select file append mode instead of overwrite mode
//! LOGFILE and LOGBOTH may be | 'd with CLOSELOG to "write and close"
//! First 3 options may be | 'd with MASTER to enable independent write to master data log file
//! First 3 options may be | 'd with ERRORMSG to start line with standard error message
//! @param 2 dValue:
//! Positive val = double value for time in secs to be formatted to 6 decimals.
//! Negative val is an error code and this give error preformatting.
//! @param 3 cFormatString: String with formatting specifiers like printf or fprintf.
//! ALL printf flags, width, precision and type specifiers are supported with this exception:
//! Wide char type specifiers intended for wprintf (%S and %C) are NOT supported
//! Single byte char type specifiers (%s and %c) ARE supported
//! @param 4... variable args: like printf or fprintf. Must match format specifier type above.
//! @return 0 if OK, negative value on error or if error occurs or was passed in.
// *********************************************************************
extern "C" int shrLogEx(int iLogMode, int iErrNum, const char* cFormatString, ...);
// Short version of shrLogEx, defaulting to shrLogEx(LOGBOTH, 0, cFormatString, ...)
// *********************************************************************
extern "C" int shrLog(const char* cFormatString, ...);
// *********************************************************************
// Delta timer function for up to 3 independent timers using host high performance counters
// Maintains state for 3 independent counters
//! Example: double dElapsedTime = shrDeltaT(0);
//!
//! @param 0 iCounterID: Which timer to check/reset. (0, 1, 2)
//! @return delta time of specified counter since last call in seconds. Otherwise -9999.0 if error
// *********************************************************************
extern "C" double shrDeltaT(int iCounterID);
// Optional LogFileNameOverride function
// *********************************************************************
extern "C" void shrSetLogFileName (const char* cOverRideName);
// Helper function to init data arrays
// *********************************************************************
extern "C" void shrFillArray(float* pfData, int iSize);
// Helper function to print data arrays
// *********************************************************************
extern "C" void shrPrintArray(float* pfData, int iSize);
////////////////////////////////////////////////////////////////////////////
//! Find the path for a filename
//! @return the path if succeeded, otherwise 0
//! @param filename name of the file
//! @param executablePath optional absolute path of the executable
////////////////////////////////////////////////////////////////////////////
extern "C" char* shrFindFilePath(const char* filename, const char* executablePath);
////////////////////////////////////////////////////////////////////////////
//! Read file \filename containing single precision floating point data
//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE
//! @param filename name of the source file
//! @param data uninitialized pointer, returned initialized and pointing to
//! the data read
//! @param len number of data elements in data, -1 on error
//! @note If a NULL pointer is passed to this function and it is initialized
//! within shrUtils, then free() has to be used to deallocate the memory
////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrReadFilef( const char* filename, float** data, unsigned int* len,
bool verbose = false);
////////////////////////////////////////////////////////////////////////////
//! Read file \filename containing double precision floating point data
//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE
//! @param filename name of the source file
//! @param data uninitialized pointer, returned initialized and pointing to
//! the data read
//! @param len number of data elements in data, -1 on error
//! @note If a NULL pointer is passed to this function and it is initialized
//! within shrUtils, then free() has to be used to deallocate the memory
////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrReadFiled( const char* filename, double** data, unsigned int* len,
bool verbose = false);
////////////////////////////////////////////////////////////////////////////
//! Read file \filename containing integer data
//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE
//! @param filename name of the source file
//! @param data uninitialized pointer, returned initialized and pointing to
//! the data read
//! @param len number of data elements in data, -1 on error
//! @note If a NULL pointer is passed to this function and it is initialized
//! within shrUtils, then free() has to be used to deallocate the memory
////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrReadFilei( const char* filename, int** data, unsigned int* len, bool verbose = false);
////////////////////////////////////////////////////////////////////////////
//! Read file \filename containing unsigned integer data
//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE
//! @param filename name of the source file
//! @param data uninitialized pointer, returned initialized and pointing to
//! the data read
//! @param len number of data elements in data, -1 on error
//! @note If a NULL pointer is passed to this function and it is initialized
//! within shrUtils, then free() has to be used to deallocate the memory
////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrReadFileui( const char* filename, unsigned int** data,
unsigned int* len, bool verbose = false);
////////////////////////////////////////////////////////////////////////////
//! Read file \filename containing char / byte data
//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE
//! @param filename name of the source file
//! @param data uninitialized pointer, returned initialized and pointing to
//! the data read
//! @param len number of data elements in data, -1 on error
//! @note If a NULL pointer is passed to this function and it is initialized
//! within shrUtils, then free() has to be used to deallocate the memory
////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrReadFileb( const char* filename, char** data, unsigned int* len,
bool verbose = false);
////////////////////////////////////////////////////////////////////////////
//! Read file \filename containing unsigned char / byte data
//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE
//! @param filename name of the source file
//! @param data uninitialized pointer, returned initialized and pointing to
//! the data read
//! @param len number of data elements in data, -1 on error
//! @note If a NULL pointer is passed to this function and it is initialized
//! within shrUtils, then free() has to be used to deallocate the memory
////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrReadFileub( const char* filename, unsigned char** data,
unsigned int* len, bool verbose = false);
////////////////////////////////////////////////////////////////////////////
//! Write a data file \filename containing single precision floating point
//! data
//! @return shrTRUE if writing the file succeeded, otherwise shrFALSE
//! @param filename name of the file to write
//! @param data pointer to data to write
//! @param len number of data elements in data
//! @param epsilon epsilon for comparison
////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrWriteFilef( const char* filename, const float* data, unsigned int len,
const float epsilon, bool verbose = false);
////////////////////////////////////////////////////////////////////////////
//! Write a data file \filename containing double precision floating point
//! data
//! @return shrTRUE if writing the file succeeded, otherwise shrFALSE
//! @param filename name of the file to write
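//! @param data pointer to data to write
//! (NOTE(review): declared as const float* although the
//! description says double precision - confirm against the implementation)
//! @param len number of data elements in data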
//! @param epsilon epsilon for comparison
////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrWriteFiled( const char* filename, const float* data, unsigned int len,
const double epsilon, bool verbose = false);
////////////////////////////////////////////////////////////////////////////
//! Write a data file \filename containing integer data
//! @return shrTRUE if writing the file succeeded, otherwise shrFALSE
//! @param filename name of the file to write
//! @param data pointer to data to write
//! @param len number of data elements in data
////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrWriteFilei( const char* filename, const int* data, unsigned int len,
bool verbose = false);
////////////////////////////////////////////////////////////////////////////
//! Write a data file \filename containing unsigned integer data
//! @return shrTRUE if writing the file succeeded, otherwise shrFALSE
//! @param filename name of the file to write
//! @param data pointer to data to write
//! @param len number of data elements in data
////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrWriteFileui( const char* filename, const unsigned int* data,
unsigned int len, bool verbose = false);
////////////////////////////////////////////////////////////////////////////
//! Write a data file \filename containing char / byte data
//! @return shrTRUE if writing the file succeeded, otherwise shrFALSE
//! @param filename name of the file to write
//! @param data pointer to data to write
//! @param len number of data elements in data
////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrWriteFileb( const char* filename, const char* data, unsigned int len,
bool verbose = false);
////////////////////////////////////////////////////////////////////////////
//! Write a data file \filename containing unsigned char / byte data
//! @return shrTRUE if writing the file succeeded, otherwise shrFALSE
//! @param filename name of the file to write
//! @param data pointer to data to write
//! @param len number of data elements in data
////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrWriteFileub( const char* filename, const unsigned char* data,
unsigned int len, bool verbose = false);
////////////////////////////////////////////////////////////////////////////
//! Load PPM image file (with unsigned char as data element type), padding
//! 4th component
//! @return shrTRUE if reading the file succeeded, otherwise shrFALSE
//! @param file name of the image file
//! @param OutData handle to the data read
//! @param w width of the image
//! @param h height of the image
//!
//! Note: If *OutData is NULL this function allocates buffer that must be freed by caller
////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrLoadPPM4ub(const char* file, unsigned char** OutData,
unsigned int *w, unsigned int *h);
////////////////////////////////////////////////////////////////////////////
//! Save PPM image file (with unsigned char as data element type, padded to
//! 4 bytes)
//! @return shrTRUE if saving the file succeeded, otherwise shrFALSE
//! @param file name of the image file
//! @param data handle to the data to write
//! @param w width of the image
//! @param h height of the image
////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrSavePPM4ub( const char* file, unsigned char *data,
unsigned int w, unsigned int h);
////////////////////////////////////////////////////////////////////////////////
//! Save PGM image file (with unsigned char as data element type)
//! @return shrTRUE if saving the file succeeded, otherwise shrFALSE
//! @param file name of the image file
//! @param data handle to the data to write
//! @param w width of the image
//! @param h height of the image
////////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrSavePGMub( const char* file, unsigned char *data,
unsigned int w, unsigned int h);
////////////////////////////////////////////////////////////////////////////
//! Load PGM image file (with unsigned char as data element type)
//! @return shrTRUE if loading the file succeeded, otherwise shrFALSE
//! @param file name of the image file
//! @param data handle to the data read
//! @param w width of the image
//! @param h height of the image
//! @note If a NULL pointer is passed to this function and it is initialized
//! within shrUtils, then free() has to be used to deallocate the memory
////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrLoadPGMub( const char* file, unsigned char** data,
unsigned int *w,unsigned int *h);
////////////////////////////////////////////////////////////////////////////
// Command line arguments: General notes
// * All command line arguments begin with '--' followed by the token;
// token and value are separated by '='; example --samples=50
// * Arrays have the form --model=[one.obj,two.obj,three.obj]
// (without whitespaces)
////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////
//! Check if command line argument \a flag_name is given
//! @return shrTRUE if command line argument \a flag_name has been given,
//! otherwise shrFALSE
//! @param argc argc as passed to main()
//! @param argv argv as passed to main()
//! @param flag_name name of command line flag
////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrCheckCmdLineFlag( const int argc, const char** argv,
const char* flag_name);
////////////////////////////////////////////////////////////////////////////
//! Get the value of a command line argument of type int
//! @return shrTRUE if command line argument \a arg_name has been given and
//! is of the requested type, otherwise shrFALSE
//! @param argc argc as passed to main()
//! @param argv argv as passed to main()
//! @param arg_name name of the command line argument
//! @param val value of the command line argument
////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrGetCmdLineArgumenti( const int argc, const char** argv,
const char* arg_name, int* val);
////////////////////////////////////////////////////////////////////////////
//! Get the value of a command line argument of type unsigned int
//! @return shrTRUE if command line argument \a arg_name has been given and
//! is of the requested type, otherwise shrFALSE
//! @param argc argc as passed to main()
//! @param argv argv as passed to main()
//! @param arg_name name of the command line argument
//! @param val value of the command line argument
////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrGetCmdLineArgumentu( const int argc, const char** argv,
const char* arg_name, unsigned int* val);
////////////////////////////////////////////////////////////////////////////
//! Get the value of a command line argument of type float
//! @return shrTRUE if command line argument \a arg_name has been given and
//! is of the requested type, otherwise shrFALSE
//! @param argc argc as passed to main()
//! @param argv argv as passed to main()
//! @param arg_name name of the command line argument
//! @param val value of the command line argument
////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrGetCmdLineArgumentf( const int argc, const char** argv,
const char* arg_name, float* val);
////////////////////////////////////////////////////////////////////////////
//! Get the value of a command line argument of type string
//! @return shrTRUE if command line argument \a arg_name has been given and
//! is of the requested type, otherwise shrFALSE
//! @param argc argc as passed to main()
//! @param argv argv as passed to main()
//! @param arg_name name of the command line argument
//! @param val value of the command line argument
////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrGetCmdLineArgumentstr( const int argc, const char** argv,
const char* arg_name, char** val);
////////////////////////////////////////////////////////////////////////////
//! Get the value of a command line argument list whose elements are strings
//! @return shrTRUE if command line argument \a arg_name has been given and
//! is of the requested type, otherwise shrFALSE
//! @param argc argc as passed to main()
//! @param argv argv as passed to main()
//! @param arg_name name of the command line argument
//! @param val command line argument list
//! @param len length of the list / number of elements
////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrGetCmdLineArgumentListstr( const int argc, const char** argv,
const char* arg_name, char** val,
unsigned int* len);
////////////////////////////////////////////////////////////////////////////
//! Compare two float arrays
//! @return shrTRUE if \a reference and \a data are identical, otherwise shrFALSE
//! @param reference handle to the reference data / gold image
//! @param data handle to the computed data
//! @param len number of elements in reference and data
////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrComparef( const float* reference, const float* data,
const unsigned int len);
////////////////////////////////////////////////////////////////////////////
//! Compare two integer arrays
//! @return shrTRUE if \a reference and \a data are identical, otherwise shrFALSE
//! @param reference handle to the reference data / gold image
//! @param data handle to the computed data
//! @param len number of elements in reference and data
////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrComparei( const int* reference, const int* data,
const unsigned int len );
////////////////////////////////////////////////////////////////////////////////
//! Compare two unsigned integer arrays, with epsilon and threshold
//! @return shrTRUE if \a reference and \a data are identical, otherwise shrFALSE
//! @param reference handle to the reference data / gold image
//! @param data handle to the computed data
//! @param len number of elements in reference and data
//! @param epsilon epsilon to use for the comparison
//! @param threshold tolerance % # of comparison errors (0.15f = 15%)
////////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrCompareuit( const unsigned int* reference, const unsigned int* data,
const unsigned int len, const float epsilon, const float threshold );
////////////////////////////////////////////////////////////////////////////
//! Compare two unsigned char arrays
//! @return shrTRUE if \a reference and \a data are identical, otherwise shrFALSE
//! @param reference handle to the reference data / gold image
//! @param data handle to the computed data
//! @param len number of elements in reference and data
////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrCompareub( const unsigned char* reference, const unsigned char* data,
const unsigned int len );
////////////////////////////////////////////////////////////////////////////////
//! Compare two unsigned char arrays with a tolerance for # of byte errors
//! @return shrTRUE if \a reference and \a data are identical, otherwise shrFALSE
//! @param reference handle to the reference data / gold image
//! @param data handle to the computed data
//! @param len number of elements in reference and data
//! @param epsilon epsilon to use for the comparison
//! @param threshold tolerance % # of comparison errors (0.15f = 15%)
////////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrCompareubt( const unsigned char* reference, const unsigned char* data,
const unsigned int len, const float epsilon, const float threshold );
////////////////////////////////////////////////////////////////////////////////
//! Compare two unsigned char arrays with an epsilon tolerance for equality
//! @return shrTRUE if \a reference and \a data are identical, otherwise shrFALSE
//! @param reference handle to the reference data / gold image
//! @param data handle to the computed data
//! @param len number of elements in reference and data
//! @param epsilon epsilon to use for the comparison
////////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrCompareube( const unsigned char* reference, const unsigned char* data,
const unsigned int len, const float epsilon );
////////////////////////////////////////////////////////////////////////////
//! Compare two float arrays with an epsilon tolerance for equality
//! @return shrTRUE if \a reference and \a data are identical, otherwise shrFALSE
//! @param reference handle to the reference data / gold image
//! @param data handle to the computed data
//! @param len number of elements in reference and data
//! @param epsilon epsilon to use for the comparison
////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrComparefe( const float* reference, const float* data,
const unsigned int len, const float epsilon );
////////////////////////////////////////////////////////////////////////////////
//! Compare two float arrays with an epsilon tolerance for equality and a
//! threshold for # pixel errors
//! @return shrTRUE if \a reference and \a data are identical, otherwise shrFALSE
//! @param reference handle to the reference data / gold image
//! @param data handle to the computed data
//! @param len number of elements in reference and data
//! @param epsilon epsilon to use for the comparison
//! @param threshold tolerance % # of comparison errors (0.15f = 15%)
////////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrComparefet( const float* reference, const float* data,
const unsigned int len, const float epsilon, const float threshold );
////////////////////////////////////////////////////////////////////////////
//! Compare two float arrays using L2-norm with an epsilon tolerance for
//! equality
//! @return shrTRUE if \a reference and \a data are identical, otherwise shrFALSE
//! @param reference handle to the reference data / gold image
//! @param data handle to the computed data
//! @param len number of elements in reference and data
//! @param epsilon epsilon to use for the comparison
////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrCompareL2fe( const float* reference, const float* data,
const unsigned int len, const float epsilon );
////////////////////////////////////////////////////////////////////////////////
//! Compare two PPM image files with an epsilon tolerance for equality
//! @return shrTRUE if \a src_file and \a ref_file are identical, otherwise shrFALSE
//! @param src_file filename for the image to be compared
//! @param ref_file filename for the reference data / gold image
//! @param epsilon epsilon to use for the comparison
//! @param threshold threshold of pixels that can still mismatch to pass (i.e. 0.15f = 15% must pass)
//! $param verboseErrors output details of image mismatch to std::err
////////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrComparePPM( const char *src_file, const char *ref_file, const float epsilon, const float threshold);
////////////////////////////////////////////////////////////////////////////////
//! Compare two PGM image files with an epsilon tolerance for equality
//! @return shrTRUE if \a src_file and \a ref_file are identical, otherwise shrFALSE
//! @param src_file filename for the image to be compared
//! @param ref_file filename for the reference data / gold image
//! @param epsilon epsilon to use for the comparison
//! @param threshold threshold of pixels that can still mismatch to pass (i.e. 0.15f = 15% must pass)
//! $param verboseErrors output details of image mismatch to std::err
////////////////////////////////////////////////////////////////////////////////
extern "C" shrBOOL shrComparePGM( const char *src_file, const char *ref_file, const float epsilon, const float threshold);
extern "C" unsigned char* shrLoadRawFile(const char* filename, size_t size);
extern "C" size_t shrRoundUp(int group_size, int global_size);
// Error-check helper with an optional cleanup callback.
// Compares iSample against iReference; on a mismatch it logs the reporting
// line/file and then either invokes pCleanup(EXIT_FAILURE) or, when no
// callback was supplied, closes the log and exits with EXIT_FAILURE.
// If the callback returns, control goes back to the caller.
// *********************************************************************
inline void __shrCheckErrorEX(int iSample, int iReference, void (*pCleanup)(int), const char* cFile, const int iLine)
{
    // Values agree: nothing to report.
    if (iSample == iReference)
        return;

    shrLogEx(LOGBOTH | ERRORMSG, iSample, "line %i , in file %s !!!\n\n", iLine, cFile);

    // No callback supplied: close the log and terminate here.
    if (pCleanup == NULL)
    {
        shrLogEx(LOGBOTH | CLOSELOG, 0, "Exiting...\n");
        exit(EXIT_FAILURE);
    }

    // Callback supplied: let it decide how to tear down.
    pCleanup(EXIT_FAILURE);
}
// Standardized exit path.
// Decides from the command line flags whether to wait for <Enter> before
// quitting, writes the closing log message, flushes stderr and exits with
// iExitCode. Never returns.
// *********************************************************************
inline void __shrExitEX(int argc, const char** argv, int iExitCode)
{
#ifdef WIN32
    // WIN32 build: wait for a key unless "noprompt" or "qatest" is given.
    const bool bWaitForEnter = !(shrCheckCmdLineFlag(argc, argv, "noprompt") || shrCheckCmdLineFlag(argc, argv, "qatest"));
#else
    // Other builds: wait only when "prompt" is given and "qatest" is not.
    const bool bWaitForEnter = shrCheckCmdLineFlag(argc, argv, "prompt") && !shrCheckCmdLineFlag(argc, argv, "qatest");
#endif

    if (bWaitForEnter)
    {
        shrLogEx(LOGBOTH | CLOSELOG, 0, "\nPress <Enter> to Quit...\n");
        getchar();
    }
    else
    {
        shrLogEx(LOGBOTH | CLOSELOG, 0, "%s Exiting...\n", argv[0]);
    }

    fflush(stderr);
    exit(iExitCode);
}
#endif

View File

@@ -39,6 +39,27 @@ string kernel_names[2] = {"BFS_1", "BFS_2"};
int work_group_size = 512;
int device_id_inused = 0; // deviced id used (default : 0)
// Reads the entire contents of `filename` into a newly malloc'ed buffer.
//
// @param filename  path of the kernel binary to load
// @param data      receives the buffer on success; the caller must free() it
// @param size      receives the number of bytes read on success
// @return 0 on success, -1 on any failure (null argument, file cannot be
//         opened, size cannot be determined, allocation failure, short read).
//         On failure *data and *size are left untouched.
int read_kernel_file(const char* filename, uint8_t** data, size_t* size) {
  if (NULL == filename || NULL == data || NULL == size)
    return -1;

  // The kernel image is binary data: open in binary mode so that no
  // newline/EOF translation alters the byte count on text-mode platforms.
  FILE* fp = fopen(filename, "rb");
  if (NULL == fp) {
    fprintf(stderr, "Failed to load kernel.");
    return -1;
  }

  // Determine the file size; ftell() reports -1 on error, which must not
  // reach malloc() as a huge unsigned value.
  long fsize = -1;
  if (0 == fseek(fp, 0, SEEK_END))
    fsize = ftell(fp);
  if (fsize < 0 || 0 != fseek(fp, 0, SEEK_SET)) {
    fprintf(stderr, "Failed to determine kernel file size.\n");
    fclose(fp);
    return -1;
  }

  // Allocate at least one byte so an empty file still yields a valid,
  // free()-able pointer.
  const size_t nbytes = (size_t)fsize;
  uint8_t* buffer = (uint8_t*)malloc(nbytes > 0 ? nbytes : 1);
  if (NULL == buffer) {
    fprintf(stderr, "Failed to allocate kernel buffer.\n");
    fclose(fp);
    return -1;
  }

  const size_t nread = fread(buffer, 1, nbytes, fp);
  fclose(fp);

  // A short read means the buffer is only partially filled; do not hand a
  // truncated kernel image back to the caller.
  if (nread != nbytes) {
    fprintf(stderr, "Failed to read kernel file.\n");
    free(buffer);
    return -1;
  }

  *data = buffer;
  *size = nbytes;
  return 0;
}
/*
* Converts the contents of a file into a string
*/
@@ -222,14 +243,25 @@ free(allPlatforms);*/
const char * source = source_str.c_str();
size_t sourceSize[] = { source_str.length() };*/
oclHandles.program = clCreateProgramWithBuiltInKernels(
oclHandles.context, 1, &oclHandles.devices[DEVICE_ID_INUSED],
"BFS_1;BFS_2", &resultCL);
//oclHandles.program = clCreateProgramWithBuiltInKernels(
// oclHandles.context, 1, &oclHandles.devices[DEVICE_ID_INUSED],
// "BFS_1;BFS_2", &resultCL);
/*oclHandles.program = clCreateProgramWithSource(oclHandles.context,
1,
&source,
sourceSize,
&resultCL);*/
// read kernel binary from file
uint8_t *kernel_bin = NULL;
size_t kernel_size;
cl_int binary_status = 0;
if (0 != read_kernel_file("kernel.pocl", &kernel_bin, &kernel_size))
std::abort();
oclHandles.program = clCreateProgramWithBinary(
oclHandles.context, 1, &oclHandles.devices[DEVICE_ID_INUSED], &kernel_size, &kernel_bin, &binary_status, &resultCL);
free(kernel_bin);
if ((resultCL != CL_SUCCESS) || (oclHandles.program == NULL))
throw(string("InitCL()::Error: Loading Binary into cl_program. "
"(clCreateProgramWithBinary)"));

View File

@@ -1,68 +1,47 @@
RISCV_TOOL_PATH ?= $(wildcard ../../../../riscv-gnu-toolchain/drops)
POCL_CC_PATH ?= $(wildcard ../../../../pocl/drops_riscv_cc)
POCL_INC_PATH ?= $(wildcard ../include)
POCL_LIB_PATH ?= $(wildcard ../lib)
VX_RT_PATH ?= $(wildcard ../../../runtime)
VX_SIMX_PATH ?= $(wildcard ../../../simX/obj_dir)
LLVM_HOME ?= ~/dev/llvm-project/drops
TOOLCHAIN_PATH ?= ~/dev/riscv-gnu-toolchain/drops
SYSROOT ?= $(TOOLCHAIN_PATH)/riscv32-unknown-elf
POCL_CC_PATH ?= $(realpath ../compiler)
POCL_RT_PATH ?= $(realpath ../runtime)
VORTEX_DRV_PATH ?= $(realpath ../../../driver/sw)
VORTEX_RT_PATH ?= $(realpath ../../../runtime)
CC = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-gcc
CXX = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-g++
DMP = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-objdump
HEX = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-objcopy
GDB = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-gdb
CXXFLAGS += -std=c++11 -O0 -g -fpermissive -Wall -Wextra -pedantic -Wfatal-errors
VX_SRCS = $(VX_RT_PATH)/newlib/newlib.c
VX_SRCS += $(VX_RT_PATH)/startup/vx_start.S
VX_SRCS += $(VX_RT_PATH)/intrinsics/vx_intrinsics.s
VX_SRCS += $(VX_RT_PATH)/io/vx_io.s $(VX_RT_PATH)/io/vx_io.c
VX_SRCS += $(VX_RT_PATH)/fileio/fileio.s
VX_SRCS += $(VX_RT_PATH)/tests/tests.c
VX_SRCS += $(VX_RT_PATH)/vx_api/vx_api.c
VX_SRCS += $(VX_STR) $(VX_FIO) $(VX_NEWLIB) $(VX_INT) $(VX_IO) $(VX_API) $(VX_TEST)
CXXFLAGS += -I$(POCL_RT_PATH)/include
VX_CFLAGS = -nostartfiles -Wl,-Bstatic,-T,$(VX_RT_PATH)/mains/vortex_link.ld
CXXFLAGS = -g -O0 -march=rv32im -mabi=ilp32
CXXFLAGS += -ffreestanding # program may not begin at main()
CXXFLAGS += -Wl,--gc-sections # enable garbage collection of unused input sections
CXXFLAGS += -fno-rtti -fno-non-call-exceptions # disable RTTI and exceptions
CXXFLAGS += -I$(POCL_INC_PATH)
VX_LIBS = -Wl,--whole-archive lib$(PROJECT).a -Wl,--no-whole-archive $(POCL_LIB_PATH)/libOpenCL.a
QEMU_LIBS = $(VX_RT_PATH)/qemu/vx_api.c -Wl,--whole-archive lib$(PROJECT).a -Wl,--no-whole-archive $(POCL_LIB_PATH)/libOpenCL.a
LDFLAGS += -L$(POCL_RT_PATH)/lib -L$(VORTEX_DRV_PATH)/stub -lOpenCL -lvortex
PROJECT = bfs
SRCS = main.cc
all: $(PROJECT).dump $(PROJECT).hex
all: $(PROJECT)
lib$(PROJECT).a: kernel.cl
POCL_DEBUG=all POCL_DEBUG_LLVM_PASSES=1 LD_LIBRARY_PATH=$(RISCV_TOOL_PATH)/lib:$(POCL_CC_PATH)/lib $(POCL_CC_PATH)/bin/poclcc -o lib$(PROJECT).a kernel.cl
kernel.pocl: kernel.cl
TOOLCHAIN_PATH=$(TOOLCHAIN_PATH) SYSROOT=$(SYSROOT) LLVM_HOME=$(LLVM_HOME) VORTEX_RUNTIME_PATH=$(VORTEX_RT_PATH) POCL_DEBUG=all LD_LIBRARY_PATH=$(LLVM_HOME)/lib:$(POCL_CC_PATH)/lib $(POCL_CC_PATH)/bin/poclcc -o kernel.pocl kernel.cl
$(PROJECT): $(SRCS)
$(CXX) $(CXXFLAGS) $^ $(LDFLAGS) -o $@
$(PROJECT).elf: $(SRCS) lib$(PROJECT).a
$(CXX) $(CXXFLAGS) $(VX_CFLAGS) $(VX_SRCS) $(SRCS) $(VX_LIBS) -o $(PROJECT).elf
run-fpga: $(PROJECT) kernel.pocl
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/opae:$(LD_LIBRARY_PATH) ./$(PROJECT)
$(PROJECT).qemu: $(SRCS) lib$(PROJECT).a
$(CXX) $(CXXFLAGS) $(SRCS) $(QEMU_LIBS) -o $(PROJECT).qemu
run-ase: $(PROJECT) kernel.pocl
ASE_LOG=0 LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/opae/ase:$(LD_LIBRARY_PATH) ./$(PROJECT)
$(PROJECT).hex: $(PROJECT).elf
$(HEX) -O ihex $(PROJECT).elf $(PROJECT).hex
run-simx: $(PROJECT) kernel.pocl
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/simx:$(LD_LIBRARY_PATH) ./$(PROJECT)
$(PROJECT).dump: $(PROJECT).elf
$(DMP) -D $(PROJECT).elf > $(PROJECT).dump
run-rtlsim: $(PROJECT) kernel.pocl
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/rtlsim:$(LD_LIBRARY_PATH) ./$(PROJECT)
run: $(PROJECT).hex
POCL_DEBUG=all $(VX_SIMX_PATH)/Vcache_simX -E -a rv32i --core $(PROJECT).hex -s -b 1> emulator.debug
qemu: $(PROJECT).qemu
POCL_DEBUG=all $(RISCV_TOOL_PATH)/bin/qemu-riscv32 -strace -d in_asm -D debug.log $(PROJECT).qemu
gdb-s: $(PROJECT).qemu
POCL_DEBUG=all $(RISCV_TOOL_PATH)/bin/qemu-riscv32 -g 1234 -d in_asm -D debug.log $(PROJECT).qemu
gdb-c: $(PROJECT).qemu
$(GDB) $(PROJECT).qemu
.depend: $(SRCS)
$(CXX) $(CXXFLAGS) -MM $^ > .depend;
clean:
rm -rf *.o *.elf *.dump *.hex *.qemu *.log *.debug
rm -rf $(PROJECT) *.o *.dump .depend
ifneq ($(MAKECMDGOALS),clean)
-include .depend
endif

Binary file not shown.

Binary file not shown.

View File

@@ -187,7 +187,7 @@ int main(int argc, char *argv[]) {
FILE *fp;
Node *h_graph_nodes;
char *h_graph_mask, *h_updating_graph_mask, *h_graph_visited;
try {
char *input_f = "graph4096.txt";
printf("Reading File\n");

Binary file not shown.

View File

@@ -0,0 +1 @@
libOpenCL.so.2

View File

@@ -0,0 +1 @@
libOpenCL.so.2.5.0

Binary file not shown.

View File

@@ -0,0 +1,193 @@
/* pocl/_kernel_renames.h - Rename OpenCL builtin functions to avoid name
clashes with libm functions which are called in implementation.
Copyright (c) 2011-2013 Erik Schnetter <eschnetter@perimeterinstitute.ca>
Perimeter Institute for Theoretical Physics
Copyright (c) 2011-2017 Pekka Jääskeläinen / TUT
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#ifndef _KERNEL_RENAMES_H
#define _KERNEL_RENAMES_H
/* Move built-in declarations and libm functions out of the way.
(There should be a better way of doing so.) These functions are
built-in math functions for OpenCL (see Clang's "Builtins.def").
Functions defined in libc or libm may also
interfere with OpenCL's functions, since their prototypes will be
wrong. */
#define abs _cl_abs
#define abs_diff _cl_abs_diff
#define acos _cl_acos
#define acosh _cl_acosh
#define acospi _cl_acospi
#define add_sat _cl_add_sat
#define all _cl_all
#define any _cl_any
#define asin _cl_asin
#define asinh _cl_asinh
#define asinpi _cl_asinpi
#define atan _cl_atan
#define atan2 _cl_atan2
#define atan2pi _cl_atan2pi
#define atanh _cl_atanh
#define atanpi _cl_atanpi
#define bitselect _cl_bitselect
#define cbrt _cl_cbrt
#define ceil _cl_ceil
#define clamp _cl_clamp
#define clz _cl_clz
#define copysign _cl_copysign
#define cos _cl_cos
#define cosh _cl_cosh
#define cospi _cl_cospi
#define cross _cl_cross
#define degrees _cl_degrees
#define distance _cl_distance
#define dot _cl_dot
#define erf _cl_erf
#define erfc _cl_erfc
#define exp _cl_exp
#define exp10 _cl_exp10
#define exp2 _cl_exp2
#define expm1 _cl_expm1
#define fabs _cl_fabs
#define fast_distance _cl_fast_distance
#define fast_length _cl_fast_length
#define fast_normalize _cl_fast_normalize
#define fdim _cl_fdim
#define floor _cl_floor
#define fma _cl_fma
#define fmax _cl_fmax
#define fmin _cl_fmin
#define fmod _cl_fmod
#define fract _cl_fract
#define frexp _cl_frexp
#define hadd _cl_hadd
#define half_cos _cl_half_cos
#define half_divide _cl_half_divide
#define half_exp _cl_half_exp
#define half_exp10 _cl_half_exp10
#define half_exp2 _cl_half_exp2
#define half_log _cl_half_log
#define half_log10 _cl_half_log10
#define half_log2 _cl_half_log2
#define half_powr _cl_half_powr
#define half_recip _cl_half_recip
#define half_rsqrt _cl_half_rsqrt
#define half_sin _cl_half_sin
#define half_sqrt _cl_half_sqrt
#define half_tan _cl_half_tan
#define hypot _cl_hypot
#define ilogb _cl_ilogb
#define isequal _cl_isequal
#define isfinite _cl_isfinite
#define isgreater _cl_isgreater
#define isgreaterequal _cl_isgreaterequal
#define isinf _cl_isinf
#define isless _cl_isless
#define islessequal _cl_islessequal
#define islessgreater _cl_islessgreater
#define isnan _cl_isnan
#define isnormal _cl_isnormal
#define isnotequal _cl_isnotequal
#define isordered _cl_isordered
#define isunordered _cl_isunordered
#define ldexp _cl_ldexp
#define length _cl_length
#define lgamma _cl_lgamma
#define lgamma_r _cl_lgamma_r
#define log _cl_log
#define log10 _cl_log10
#define log1p _cl_log1p
#define log2 _cl_log2
#define logb _cl_logb
#define mad _cl_mad
#define mad24 _cl_mad24
#define mad_hi _cl_mad_hi
#define mad_sat _cl_mad_sat
#define max _cl_max
#define maxmag _cl_maxmag
#define min _cl_min
#define minmag _cl_minmag
#define mix _cl_mix
#define modf _cl_modf
#define mul24 _cl_mul24
#define mul_hi _cl_mul_hi
#define nan _cl_nan
#define native_cos _cl_native_cos
#define native_divide _cl_native_divide
#define native_exp _cl_native_exp
#define native_exp10 _cl_native_exp10
#define native_exp2 _cl_native_exp2
#define native_log _cl_native_log
#define native_log10 _cl_native_log10
#define native_log2 _cl_native_log2
#define native_powr _cl_native_powr
#define native_recip _cl_native_recip
#define native_rsqrt _cl_native_rsqrt
#define native_sin _cl_native_sin
#define native_sqrt _cl_native_sqrt
#define native_tan _cl_native_tan
#define nextafter _cl_nextafter
#define normalize _cl_normalize
#define popcount _cl_popcount
#define pow _cl_pow
#define pown _cl_pown
#define powr _cl_powr
#define radians _cl_radians
#define remainder _cl_remainder
#define remquo _cl_remquo
#define rhadd _cl_rhadd
#define rint _cl_rint
#define rootn _cl_rootn
#define rotate _cl_rotate
#define round _cl_round
#define rsqrt _cl_rsqrt
#define select _cl_select
#define sign _cl_sign
#define signbit _cl_signbit
#define sin _cl_sin
#define sincos _cl_sincos
#define sinh _cl_sinh
#define sinpi _cl_sinpi
#define smoothstep _cl_smoothstep
#define sqrt _cl_sqrt
#define step _cl_step
#define sub_sat _cl_sub_sat
#define tan _cl_tan
#define tanh _cl_tanh
#define tanpi _cl_tanpi
#define tgamma _cl_tgamma
#define trunc _cl_trunc
#define upsample _cl_upsample
/* Alias each atom_* name to the atomic_* name with the same suffix.
Unlike the renames above, these do not add the _cl_ prefix; they only
redirect one spelling to the other.
NOTE(review): presumably atom_* are the older extension-style spellings of
the atomic builtins - confirm against the OpenCL C specification. */
#define atom_add atomic_add
#define atom_sub atomic_sub
#define atom_xchg atomic_xchg
#define atom_inc atomic_inc
#define atom_dec atomic_dec
#define atom_cmpxchg atomic_cmpxchg
#define atom_min atomic_min
#define atom_max atomic_max
#define atom_and atomic_and
#define atom_or atomic_or
#define atom_xor atomic_xor
#endif

View File

@@ -0,0 +1,91 @@
/* This file includes opencl-c.h from Clang and fixes a few pocl extras.
Copyright (c) 2011-2017 Pekka Jääskeläinen / TUT
Copyright (c) 2017 Michal Babej / Tampere University of Technology
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
/* Everything below is skipped when _OPENCL_H_ is already defined
   (presumably by Clang's opencl-c.h having been loaded earlier). */
#ifndef _OPENCL_H_
/* Use the declarations shipped with Clang. */
/* Check for _OPENCL_H_ already here because the kernel compiler loads the
header beforehand, but cannot find the file due to include paths not
set up. */
#include <opencl-c.h>
/* Missing declarations from opencl-c.h. Some of the geometric builtins are
declared only for vectors of up to 4 elements, but we implement them all: */
/* Half-precision 8- and 16-element overloads; only declared when the
   cl_khr_fp16 macro is defined. */
#ifdef cl_khr_fp16
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
half _CL_OVERLOADABLE _CL_READNONE length (half8 p);
half _CL_OVERLOADABLE _CL_READNONE length (half16 p);
half _CL_OVERLOADABLE _CL_READNONE fast_length (half8 p);
half _CL_OVERLOADABLE _CL_READNONE fast_length (half16 p);
half8 _CL_OVERLOADABLE _CL_READNONE normalize (half8 p);
half16 _CL_OVERLOADABLE _CL_READNONE normalize (half16 p);
half8 _CL_OVERLOADABLE _CL_READNONE fast_normalize (half8 p);
half16 _CL_OVERLOADABLE _CL_READNONE fast_normalize (half16 p);
half _CL_OVERLOADABLE _CL_READNONE dot (half8 p0, half8 p1);
half _CL_OVERLOADABLE _CL_READNONE dot (half16 p0, half16 p1);
#endif
/* Single-precision 8- and 16-element overloads (always declared). */
float _CL_OVERLOADABLE _CL_READNONE length (float8 p);
float _CL_OVERLOADABLE _CL_READNONE length (float16 p);
float _CL_OVERLOADABLE _CL_READNONE fast_length (float8 p);
float _CL_OVERLOADABLE _CL_READNONE fast_length (float16 p);
float8 _CL_OVERLOADABLE _CL_READNONE normalize (float8 p);
float16 _CL_OVERLOADABLE _CL_READNONE normalize (float16 p);
float8 _CL_OVERLOADABLE _CL_READNONE fast_normalize (float8 p);
float16 _CL_OVERLOADABLE _CL_READNONE fast_normalize (float16 p);
float _CL_OVERLOADABLE _CL_READNONE dot (float8 p0, float8 p1);
float _CL_OVERLOADABLE _CL_READNONE dot (float16 p0, float16 p1);
/* Double-precision overloads; only declared when the cl_khr_fp64 macro is
   defined. Unlike the other groups, fast_length is declared here for every
   width including the scalar one. */
#ifdef cl_khr_fp64
#pragma OPENCL EXTENSION cl_khr_fp64 : enable
double _CL_OVERLOADABLE _CL_READNONE length (double8 p);
double _CL_OVERLOADABLE _CL_READNONE length (double16 p);
double _CL_OVERLOADABLE _CL_READNONE fast_length (double p);
double _CL_OVERLOADABLE _CL_READNONE fast_length (double2 p);
double _CL_OVERLOADABLE _CL_READNONE fast_length (double3 p);
double _CL_OVERLOADABLE _CL_READNONE fast_length (double4 p);
double _CL_OVERLOADABLE _CL_READNONE fast_length (double8 p);
double _CL_OVERLOADABLE _CL_READNONE fast_length (double16 p);
double8 _CL_OVERLOADABLE _CL_READNONE normalize (double8 p);
double16 _CL_OVERLOADABLE _CL_READNONE normalize (double16 p);
double8 _CL_OVERLOADABLE _CL_READNONE fast_normalize (double8 p);
double16 _CL_OVERLOADABLE _CL_READNONE fast_normalize (double16 p);
double _CL_OVERLOADABLE _CL_READNONE dot (double8 p0, double8 p1);
double _CL_OVERLOADABLE _CL_READNONE dot (double16 p0, double16 p1);
#endif
#endif

View File

@@ -0,0 +1,58 @@
/* Enable all extensions known to pocl, which a device supports.
* This is required at the start of include/_kernel.h for prototypes,
* then at kernel lib compilation phase (because _kernel.h disables
* everything at the end).
*/
/* OpenCL 1.0-only extensions */
/* These are only switched on when the language version is below 1.1
   (__OPENCL_C_VERSION__ < 110); each one is additionally guarded by its
   own feature macro. */
#if (__OPENCL_C_VERSION__ < 110)
#ifdef cl_khr_global_int32_base_atomics
# pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
#endif
#ifdef cl_khr_global_int32_extended_atomics
# pragma OPENCL EXTENSION cl_khr_global_int32_extended_atomics : enable
#endif
#ifdef cl_khr_local_int32_base_atomics
# pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable
#endif
#ifdef cl_khr_local_int32_extended_atomics
# pragma OPENCL EXTENSION cl_khr_local_int32_extended_atomics : enable
#endif
#ifdef cl_khr_byte_addressable_store
# pragma OPENCL EXTENSION cl_khr_byte_addressable_store : enable
#endif
#endif
/* all versions */
/* Enabled regardless of the language version, whenever the matching feature
   macro is defined. */
#ifdef cl_khr_fp16
# pragma OPENCL EXTENSION cl_khr_fp16: enable
#endif
#ifdef cl_khr_fp64
# pragma OPENCL EXTENSION cl_khr_fp64: enable
#endif
#ifdef cl_khr_int64_base_atomics
# pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable
#endif
#ifdef cl_khr_int64_extended_atomics
# pragma OPENCL EXTENSION cl_khr_int64_extended_atomics : enable
#endif
/* 3D image writes are only enabled with Clang major versions above 4. */
#if (__clang_major__ > 4)
#ifdef cl_khr_3d_image_writes
# pragma OPENCL EXTENSION cl_khr_3d_image_writes : enable
#endif
#endif

View File

@@ -0,0 +1,233 @@
/* pocl/_kernel.h - OpenCL types and runtime library
functions declarations. This should be included only from OpenCL C files.
Copyright (c) 2011 Universidad Rey Juan Carlos
Copyright (c) 2011-2017 Pekka Jääskeläinen / TUT
Copyright (c) 2011-2013 Erik Schnetter <eschnetter@perimeterinstitute.ca>
Perimeter Institute for Theoretical Physics
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
/* Language version selection.
 *
 * Without a -cl-std build option, the highest OpenCL C 1.x language version
 * supported by each device is used as the OpenCL C version when compiling
 * the program for that device. If nothing has set __OPENCL_C_VERSION__ by
 * the time this header is read, fall back to 120 (OpenCL C 1.2).
 */
#if !defined(__OPENCL_C_VERSION__)
#  define __OPENCL_C_VERSION__ 120
#endif

/* Publish one CL_VERSION_<major>_<minor> marker for each language version
 * that the selected __OPENCL_C_VERSION__ reaches. */
#if (__OPENCL_C_VERSION__ >= 100)
#  define CL_VERSION_1_0 100
#endif
#if (__OPENCL_C_VERSION__ >= 110)
#  define CL_VERSION_1_1 110
#endif
#if (__OPENCL_C_VERSION__ >= 120)
#  define CL_VERSION_1_2 120
#endif
#if (__OPENCL_C_VERSION__ >= 200)
#  define CL_VERSION_2_0 200
#endif
#include "_enable_all_exts.h"
#include "_builtin_renames.h"
/* Feature-gating helper macros for writing generic code; used mostly in the
 * _pocl_opencl.h header and some .cl files of the kernel library.
 *
 * Each __IF_<FEATURE>(x) expands to its argument when the corresponding
 * feature macro is defined, and to nothing when it is not. */

/* 64-bit integers */
#ifndef cl_khr_int64
#  define __IF_INT64(x)
#else
#  define __IF_INT64(x) x
#endif

/* half-precision floating point */
#ifndef cl_khr_fp16
#  define __IF_FP16(x)
#else
#  define __IF_FP16(x) x
#endif

/* double-precision floating point */
#ifndef cl_khr_fp64
#  define __IF_FP64(x)
#else
#  define __IF_FP64(x) x
#endif

/* 64-bit base atomics */
#ifndef cl_khr_int64_base_atomics
#  define __IF_BA64(x)
#else
#  define __IF_BA64(x) x
#endif

/* 64-bit extended atomics */
#ifndef cl_khr_int64_extended_atomics
#  define __IF_EA64(x)
#else
#  define __IF_EA64(x) x
#endif
/****************************************************************************/
/* Function/type attributes supported by Clang/SPIR.
 *
 * Every _CL_* macro below is always defined: it expands to the matching
 * __attribute__ when __has_attribute reports support for it, and to nothing
 * otherwise. */

/* always_inline */
#if !__has_attribute(__always_inline__)
#  define _CL_ALWAYSINLINE
#else
#  define _CL_ALWAYSINLINE __attribute__((__always_inline__))
#endif

/* noinline */
#if !__has_attribute(__noinline__)
#  define _CL_NOINLINE
#else
#  define _CL_NOINLINE __attribute__((__noinline__))
#endif

/* overloadable */
#if !__has_attribute(__overloadable__)
#  define _CL_OVERLOADABLE
#else
#  define _CL_OVERLOADABLE __attribute__((__overloadable__))
#endif

/* "read only" is spelled with the pure attribute */
#if !__has_attribute(__pure__)
#  define _CL_READONLY
#else
#  define _CL_READONLY __attribute__((__pure__))
#endif

/* "read none" is spelled with the const attribute */
#if !__has_attribute(__const__)
#  define _CL_READNONE
#else
#  define _CL_READNONE __attribute__((__const__))
#endif

/* convergent */
#if !__has_attribute(convergent)
#  define _CL_CONVERGENT
#else
#  define _CL_CONVERGENT __attribute__((convergent))
#endif
/************************ setup Clang version macros ******************/
/* Define the LLVM_<major>_0 marker that matches the major version of the
   compiling Clang. Only major versions 6 through 10 are accepted; anything
   else stops compilation with an #error. */
#if (__clang_major__ == 6)
# undef LLVM_6_0
# define LLVM_6_0
#elif (__clang_major__ == 7)
# undef LLVM_7_0
# define LLVM_7_0
#elif (__clang_major__ == 8)
# undef LLVM_8_0
# define LLVM_8_0
#elif (__clang_major__ == 9)
# undef LLVM_9_0
# define LLVM_9_0
#elif (__clang_major__ == 10)
# undef LLVM_10_0
# define LLVM_10_0
#else
#error Unsupported Clang/LLVM version.
#endif
/* Derive the cumulative LLVM_OLDER_THAN_<major>_0 markers. The checks are
   nested, so an older-than marker is only considered once the marker of
   every newer version was found to be undefined; e.g. with only LLVM_8_0
   defined this yields LLVM_OLDER_THAN_10_0 and LLVM_OLDER_THAN_9_0. */
#ifndef LLVM_10_0
#define LLVM_OLDER_THAN_10_0 1
#ifndef LLVM_9_0
#define LLVM_OLDER_THAN_9_0 1
#ifndef LLVM_8_0
#define LLVM_OLDER_THAN_8_0 1
#ifndef LLVM_7_0
#define LLVM_OLDER_THAN_7_0 1
#ifndef LLVM_6_0
#define LLVM_OLDER_THAN_6_0 1
#endif
#endif
#endif
#endif
#endif
/****************************************************************************/
/* A static assert statement to catch inconsistencies at build time.
 *
 * _CL_STATIC_ASSERT(_t, _x):
 *   _t  identifier-like tag naming the check. It is stringified into the
 *       diagnostic message when _Static_assert is available, and pasted
 *       into the name of a helper typedef otherwise.
 *   _x  integer constant expression that must be true.
 *
 * The fallback declares an array typedef whose size is -1 (and therefore
 * fails to compile) when _x is false. It has to test the macro parameter
 * _x; it previously referred to an unrelated identifier `x`.
 * The fallback keeps its historical trailing semicolon, so call sites should
 * end the invocation with ';' to work with both variants. */
#if __has_extension(__c_static_assert__)
# define _CL_STATIC_ASSERT(_t, _x) _Static_assert(_x, #_t)
#else
# define _CL_STATIC_ASSERT(_t, _x) typedef int __cl_ai##_t[(_x) ? 1 : -1];
#endif
/****************************************************************************/
/* Image access qualifier shorthands.
 * The write-only and read-only qualifiers are always available; the
 * read-write one only exists for language versions above 199 (OpenCL C 2.0
 * and later), in which case CLANG_HAS_RW_IMAGES is defined as well.
 * For older versions IMG_RW_AQ expands to a placeholder identifier. */
#define IMG_WO_AQ __write_only
#define IMG_RO_AQ __read_only
#if (__OPENCL_C_VERSION__ <= 199)
#  undef CLANG_HAS_RW_IMAGES
#  define IMG_RW_AQ __RW_IMAGES_UNSUPPORTED_BEFORE_CL_20
#else
#  define CLANG_HAS_RW_IMAGES
#  define IMG_RW_AQ __read_write
#endif
/****************************************************************************/
/* Use the Clang OpenCL header for definitions.
 *
 * When POCL_DEVICE_ADDRESS_BITS is defined, the compiler-provided
 * __SIZE_TYPE__ is replaced before that header is included, so that
 * opencl-c.h sets size_t to the wanted type: ulong for 64 address bits,
 * uint for 32. Any other value is rejected at preprocessing time. */
#if defined(POCL_DEVICE_ADDRESS_BITS)
/* #undef is a no-op for a macro that is not defined, so no guard is needed */
#  undef __SIZE_TYPE__
#  if POCL_DEVICE_ADDRESS_BITS == 64
#    define __SIZE_TYPE__ ulong
#  elif POCL_DEVICE_ADDRESS_BITS == 32
#    define __SIZE_TYPE__ uint
#  else
#    error Unsupported POCL_DEVICE_ADDRESS_BITS value.
#  endif
#endif
#include "_clang_opencl.h"
/****************************************************************************/
/* Unify the ilogb() special return values: GNU's libm seems to use INT_MIN
   for these while Clang's header uses INT_MAX. The OpenCL specs allow both,
   but one consistent choice avoids failing tests, so both macros are
   replaced with INT_MIN here. */
#undef FP_ILOGB0
#define FP_ILOGB0 INT_MIN
#undef FP_ILOGBNAN
#define FP_ILOGBNAN INT_MIN
/****************************************************************************/
#include "pocl_image_types.h"
/* Turn every OpenCL extension back off at the end of this header; the
   extensions were enabled near the top via _enable_all_exts.h. */
#pragma OPENCL EXTENSION all : disable

View File

@@ -0,0 +1,189 @@
/* pocl/_kernel_c.h - C compatible OpenCL types and runtime library
functions declarations for kernel builtin implementations using C.
Copyright (c) 2011 Universidad Rey Juan Carlos
Copyright (c) 2011-2017 Pekka Jääskeläinen / TUT
Copyright (c) 2011-2013 Erik Schnetter <eschnetter@perimeterinstitute.ca>
Perimeter Institute for Theoretical Physics
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
/**
* Header that can be included in C compiled implementations of
* built-in functions to introduce the OpenCL C compatible types etc.
*/
#ifndef _KERNEL_C_H
#define _KERNEL_C_H
#include "pocl_types.h"
#include "_kernel_constants.h"
/* Function/type attributes supported by Clang/SPIR.
 *
 * Each _CL_* macro is defined unconditionally. It carries the matching
 * __attribute__ only when __has_attribute reports support for it, and is
 * empty otherwise. */

#if !__has_attribute(__always_inline__)
#  define _CL_ALWAYSINLINE
#else
#  define _CL_ALWAYSINLINE __attribute__((__always_inline__))
#endif

#if !__has_attribute(__noinline__)
#  define _CL_NOINLINE
#else
#  define _CL_NOINLINE __attribute__((__noinline__))
#endif

#if !__has_attribute(__overloadable__)
#  define _CL_OVERLOADABLE
#else
#  define _CL_OVERLOADABLE __attribute__((__overloadable__))
#endif

/* _CL_READONLY uses the pure attribute */
#if !__has_attribute(__pure__)
#  define _CL_READONLY
#else
#  define _CL_READONLY __attribute__((__pure__))
#endif

/* _CL_READNONE uses the const attribute */
#if !__has_attribute(__const__)
#  define _CL_READNONE
#else
#  define _CL_READNONE __attribute__((__const__))
#endif

#if !__has_attribute(convergent)
#  define _CL_CONVERGENT
#else
#  define _CL_CONVERGENT __attribute__((convergent))
#endif
/* OpenCL C vector types.
 *
 * For an element type ELEM, the helper below declares the five vector types
 * ELEM2, ELEM3, ELEM4, ELEM8 and ELEM16 with the __ext_vector_type__
 * attribute. The helper is private to this section and is undefined again
 * at its end. */
#define _POCL_DEFINE_VECTOR_TYPES(ELEM)                                  \
  typedef ELEM ELEM##2 __attribute__((__ext_vector_type__(2)));          \
  typedef ELEM ELEM##3 __attribute__((__ext_vector_type__(3)));          \
  typedef ELEM ELEM##4 __attribute__((__ext_vector_type__(4)));          \
  typedef ELEM ELEM##8 __attribute__((__ext_vector_type__(8)));          \
  typedef ELEM ELEM##16 __attribute__((__ext_vector_type__(16)))

/* 8-, 16- and 32-bit integer vectors */
_POCL_DEFINE_VECTOR_TYPES(char);
_POCL_DEFINE_VECTOR_TYPES(uchar);
_POCL_DEFINE_VECTOR_TYPES(short);
_POCL_DEFINE_VECTOR_TYPES(ushort);
_POCL_DEFINE_VECTOR_TYPES(int);
_POCL_DEFINE_VECTOR_TYPES(uint);

#if defined(__CBUILD__) && defined(cl_khr_fp16)
/* NOTE: the Clang's __fp16 does not work robustly in C mode,
   it might produce invalid code at least with half vectors.
   Using the native 'half' type in OpenCL C mode works better. */
typedef __fp16 half;
#endif
/* NOTE(review): the half vectors are declared even when the typedef above
   was not emitted; presumably 'half' is a native type in that case --
   confirm for __CBUILD__ builds without cl_khr_fp16. */
_POCL_DEFINE_VECTOR_TYPES(half);

/* single-precision vectors */
_POCL_DEFINE_VECTOR_TYPES(float);

/* double-precision vectors, only when cl_khr_fp64 is defined */
#ifdef cl_khr_fp64
# ifndef __CBUILD__
# pragma OPENCL EXTENSION cl_khr_fp64 : enable
# endif
_POCL_DEFINE_VECTOR_TYPES(double);
#endif

/* 64-bit integer vectors, only when cl_khr_int64 is defined */
#ifdef cl_khr_int64
_POCL_DEFINE_VECTOR_TYPES(long);
_POCL_DEFINE_VECTOR_TYPES(ulong);
#endif

#undef _POCL_DEFINE_VECTOR_TYPES
/* Numeric address space identifiers; only defined when __TCE__ is defined. */
#ifdef __TCE__
#  define POCL_ADDRESS_SPACE_PRIVATE 0
#  define POCL_ADDRESS_SPACE_GLOBAL 1
#  define POCL_ADDRESS_SPACE_CONSTANT 2
#  define POCL_ADDRESS_SPACE_LOCAL 3
#  define POCL_ADDRESS_SPACE_GENERIC 6
#endif
/* Integer type holding memory fence flags (an alias of uint). */
typedef uint cl_mem_fence_flags;
/* Integer Constants
 *
 * Limits of the integer types, provided only when __CBUILD__ is defined.
 * The long limits additionally require cl_khr_int64. */
#ifdef __CBUILD__
/* char */
#define CHAR_BIT 8
#define SCHAR_MIN (-127 - 1)
#define SCHAR_MAX 127
#define UCHAR_MAX 255
#define CHAR_MIN SCHAR_MIN
#define CHAR_MAX SCHAR_MAX
/* short */
#define SHRT_MIN (-32767 - 1)
#define SHRT_MAX 32767
#define USHRT_MAX 65535
/* int */
#define INT_MIN (-2147483647 - 1)
#define INT_MAX 2147483647
#define UINT_MAX 0xffffffff
/* long */
#ifdef cl_khr_int64
#define LONG_MIN (-0x7fffffffffffffffL - 1)
#define LONG_MAX 0x7fffffffffffffffL
#define ULONG_MAX 0xffffffffffffffffUL
#endif
#endif /* __CBUILD__ */
#endif

View File

@@ -0,0 +1,93 @@
/* pocl/_kernel_constants.h - C compatible OpenCL types and runtime library
constants declarations.
Copyright (c) 2011 Universidad Rey Juan Carlos
Copyright (c) 2011-2013 Pekka Jääskeläinen / TUT
Copyright (c) 2011-2013 Erik Schnetter <eschnetter@perimeterinstitute.ca>
Perimeter Institute for Theoretical Physics
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
/**
* Header that can be included in C compiled implementations of
* built-in functions to introduce the OpenCL C compatible constants.
*/
#ifndef _KERNEL_CONSTANTS_H
#define _KERNEL_CONSTANTS_H
/* Fallback definitions: clang's header defines all of these itself, so they
   are only provided here when that header (_OPENCL_H_) is not in use. */
#if !defined(_OPENCL_H_)

/* cl_channel_order */
#define CLK_R                        0x10B0
#define CLK_A                        0x10B1
#define CLK_RG                       0x10B2
#define CLK_RA                       0x10B3
#define CLK_RGB                      0x10B4
#define CLK_RGBA                     0x10B5
#define CLK_BGRA                     0x10B6
#define CLK_ARGB                     0x10B7
#define CLK_INTENSITY                0x10B8
#define CLK_LUMINANCE                0x10B9
#define CLK_Rx                       0x10BA
#define CLK_RGx                      0x10BB
#define CLK_RGBx                     0x10BC
#define CLK_DEPTH                    0x10BD
#define CLK_DEPTH_STENCIL            0x10BE

/* cl_channel_type */
#define CLK_SNORM_INT8               0x10D0
#define CLK_SNORM_INT16              0x10D1
#define CLK_UNORM_INT8               0x10D2
#define CLK_UNORM_INT16              0x10D3
#define CLK_UNORM_SHORT_565          0x10D4
#define CLK_UNORM_SHORT_555          0x10D5
#define CLK_UNORM_INT_101010         0x10D6
#define CLK_SIGNED_INT8              0x10D7
#define CLK_SIGNED_INT16             0x10D8
#define CLK_SIGNED_INT32             0x10D9
#define CLK_UNSIGNED_INT8            0x10DA
#define CLK_UNSIGNED_INT16           0x10DB
#define CLK_UNSIGNED_INT32           0x10DC
#define CLK_HALF_FLOAT               0x10DD
#define CLK_FLOAT                    0x10DE
#define CLK_UNORM_INT24              0x10DF

/* cl_addressing_mode */
#define CLK_ADDRESS_NONE             0x00
#define CLK_ADDRESS_CLAMP_TO_EDGE    0x02
#define CLK_ADDRESS_CLAMP            0x04
#define CLK_ADDRESS_REPEAT           0x06
#define CLK_ADDRESS_MIRRORED_REPEAT  0x08

/* cl_sampler_info */
#define CLK_NORMALIZED_COORDS_FALSE  0x00
#define CLK_NORMALIZED_COORDS_TRUE   0x01

/* filter_mode */
#define CLK_FILTER_NEAREST           0x10
#define CLK_FILTER_LINEAR            0x20

/* barrier() flags */
#define CLK_LOCAL_MEM_FENCE          0x01
#define CLK_GLOBAL_MEM_FENCE         0x02

#endif /* !_OPENCL_H_ */
#endif

View File

@@ -0,0 +1,571 @@
//===----- opencl-c-base.h - OpenCL C language base definitions -----------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#ifndef _OPENCL_BASE_H_
#define _OPENCL_BASE_H_
// built-in scalar data types:

/** Unsigned 8-bit integer. */
typedef unsigned char uchar;
/** Unsigned 16-bit integer. */
typedef unsigned short ushort;
/** Unsigned 32-bit integer. */
typedef unsigned int uint;
/** Unsigned 64-bit integer. */
typedef unsigned long ulong;

/**
 * Unsigned integer type of the result of the sizeof operator: 32 bits wide
 * if CL_DEVICE_ADDRESS_BITS (table 4.3) is 32-bits, 64 bits wide if it is
 * 64-bits.
 */
typedef __SIZE_TYPE__ size_t;
/**
 * Signed integer type of the result of subtracting two pointers: 32 bits
 * wide if CL_DEVICE_ADDRESS_BITS (table 4.3) is 32-bits, 64 bits wide if it
 * is 64-bits.
 */
typedef __PTRDIFF_TYPE__ ptrdiff_t;
/**
 * Signed integer type such that any valid pointer to void can be converted
 * to it and back, with the result comparing equal to the original pointer.
 */
typedef __INTPTR_TYPE__ intptr_t;
/**
 * Unsigned integer type such that any valid pointer to void can be
 * converted to it and back, with the result comparing equal to the original
 * pointer.
 */
typedef __UINTPTR_TYPE__ uintptr_t;

// built-in vector data types:
// For an element type ELEM the helper declares ELEM2, ELEM3, ELEM4, ELEM8
// and ELEM16 as ext_vector_type vectors. It is local to this section and
// undefined again once all vector types have been declared.
#define __OPENCL_DEFINE_VECTOR_TYPES(ELEM)                         \
  typedef ELEM ELEM##2 __attribute__((ext_vector_type(2)));        \
  typedef ELEM ELEM##3 __attribute__((ext_vector_type(3)));        \
  typedef ELEM ELEM##4 __attribute__((ext_vector_type(4)));        \
  typedef ELEM ELEM##8 __attribute__((ext_vector_type(8)));        \
  typedef ELEM ELEM##16 __attribute__((ext_vector_type(16)))

__OPENCL_DEFINE_VECTOR_TYPES(char);
__OPENCL_DEFINE_VECTOR_TYPES(uchar);
__OPENCL_DEFINE_VECTOR_TYPES(short);
__OPENCL_DEFINE_VECTOR_TYPES(ushort);
__OPENCL_DEFINE_VECTOR_TYPES(int);
__OPENCL_DEFINE_VECTOR_TYPES(uint);
__OPENCL_DEFINE_VECTOR_TYPES(long);
__OPENCL_DEFINE_VECTOR_TYPES(ulong);
__OPENCL_DEFINE_VECTOR_TYPES(float);

// half vectors need the cl_khr_fp16 extension, which is enabled first
#ifdef cl_khr_fp16
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
__OPENCL_DEFINE_VECTOR_TYPES(half);
#endif

// double vectors need cl_khr_fp64; the extension pragma is only issued for
// language versions below CL_VERSION_1_2
#ifdef cl_khr_fp64
#if __OPENCL_C_VERSION__ < CL_VERSION_1_2
#pragma OPENCL EXTENSION cl_khr_fp64 : enable
#endif
__OPENCL_DEFINE_VECTOR_TYPES(double);
#endif

#undef __OPENCL_DEFINE_VECTOR_TYPES
// NULL is only provided here for OpenCL C++ or for language versions of at
// least CL_VERSION_2_0.
#if defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0)
#define NULL ((void*)0)
#endif
/**
* Value of maximum non-infinite single-precision floating-point
* number.
*/
#define MAXFLOAT 0x1.fffffep127f
/**
* A positive float constant expression. HUGE_VALF evaluates
* to +infinity. Used as an error value returned by the built-in
* math functions.
*/
#define HUGE_VALF (__builtin_huge_valf())
/**
* A positive double constant expression. HUGE_VAL evaluates
* to +infinity. Used as an error value returned by the built-in
* math functions.
*/
#define HUGE_VAL (__builtin_huge_val())
/**
* A constant expression of type float representing positive or
* unsigned infinity.
*/
#define INFINITY (__builtin_inff())
/**
* A constant expression of type float representing a quiet NaN.
*/
#define NAN as_float(INT_MAX)
// Values for FP_ILOGB0 / FP_ILOGBNAN, expressed through the integer limits.
#define FP_ILOGB0 INT_MIN
#define FP_ILOGBNAN INT_MAX
// Characteristics of the single-precision float type.
#define FLT_DIG 6
#define FLT_MANT_DIG 24
#define FLT_MAX_10_EXP +38
#define FLT_MAX_EXP +128
#define FLT_MIN_10_EXP -37
#define FLT_MIN_EXP -125
#define FLT_RADIX 2
#define FLT_MAX 0x1.fffffep127f
#define FLT_MIN 0x1.0p-126f
#define FLT_EPSILON 0x1.0p-23f
// Mathematical constants as float literals (_F suffix).
#define M_E_F 2.71828182845904523536028747135266250f
#define M_LOG2E_F 1.44269504088896340735992468100189214f
#define M_LOG10E_F 0.434294481903251827651128918916605082f
#define M_LN2_F 0.693147180559945309417232121458176568f
#define M_LN10_F 2.30258509299404568401799145468436421f
#define M_PI_F 3.14159265358979323846264338327950288f
#define M_PI_2_F 1.57079632679489661923132169163975144f
#define M_PI_4_F 0.785398163397448309615660845819875721f
#define M_1_PI_F 0.318309886183790671537767526745028724f
#define M_2_PI_F 0.636619772367581343075535053490057448f
#define M_2_SQRTPI_F 1.12837916709551257389615890312154517f
#define M_SQRT2_F 1.41421356237309504880168872420969808f
#define M_SQRT1_2_F 0.707106781186547524400844362104849039f
// Characteristics of the double-precision type.
#define DBL_DIG 15
#define DBL_MANT_DIG 53
#define DBL_MAX_10_EXP +308
#define DBL_MAX_EXP +1024
#define DBL_MIN_10_EXP -307
#define DBL_MIN_EXP -1021
#define DBL_RADIX 2
#define DBL_MAX 0x1.fffffffffffffp1023
#define DBL_MIN 0x1.0p-1022
#define DBL_EPSILON 0x1.0p-52
// Mathematical constants as double literals, written in hexadecimal
// floating-point notation.
#define M_E 0x1.5bf0a8b145769p+1
#define M_LOG2E 0x1.71547652b82fep+0
#define M_LOG10E 0x1.bcb7b1526e50ep-2
#define M_LN2 0x1.62e42fefa39efp-1
#define M_LN10 0x1.26bb1bbb55516p+1
#define M_PI 0x1.921fb54442d18p+1
#define M_PI_2 0x1.921fb54442d18p+0
#define M_PI_4 0x1.921fb54442d18p-1
#define M_1_PI 0x1.45f306dc9c883p-2
#define M_2_PI 0x1.45f306dc9c883p-1
#define M_2_SQRTPI 0x1.20dd750429b6dp+0
#define M_SQRT2 0x1.6a09e667f3bcdp+0
#define M_SQRT1_2 0x1.6a09e667f3bcdp-1
// Half-precision characteristics and constants (h-suffixed literals); only
// provided when cl_khr_fp16 is defined.
#ifdef cl_khr_fp16
#define HALF_DIG 3
#define HALF_MANT_DIG 11
#define HALF_MAX_10_EXP +4
#define HALF_MAX_EXP +16
#define HALF_MIN_10_EXP -4
#define HALF_MIN_EXP -13
#define HALF_RADIX 2
#define HALF_MAX ((0x1.ffcp15h))
#define HALF_MIN ((0x1.0p-14h))
#define HALF_EPSILON ((0x1.0p-10h))
#define M_E_H 2.71828182845904523536028747135266250h
#define M_LOG2E_H 1.44269504088896340735992468100189214h
#define M_LOG10E_H 0.434294481903251827651128918916605082h
#define M_LN2_H 0.693147180559945309417232121458176568h
#define M_LN10_H 2.30258509299404568401799145468436421h
#define M_PI_H 3.14159265358979323846264338327950288h
#define M_PI_2_H 1.57079632679489661923132169163975144h
#define M_PI_4_H 0.785398163397448309615660845819875721h
#define M_1_PI_H 0.318309886183790671537767526745028724h
#define M_2_PI_H 0.636619772367581343075535053490057448h
#define M_2_SQRTPI_H 1.12837916709551257389615890312154517h
#define M_SQRT2_H 1.41421356237309504880168872420969808h
#define M_SQRT1_2_H 0.707106781186547524400844362104849039h
#endif //cl_khr_fp16
// Limits of the built-in integer types, grouped by type.
#define CHAR_BIT   8
// char
#define SCHAR_MIN  (-128)
#define SCHAR_MAX  127
#define UCHAR_MAX  255
#define CHAR_MIN   SCHAR_MIN
#define CHAR_MAX   SCHAR_MAX
// short
#define SHRT_MIN   (-32768)
#define SHRT_MAX   32767
#define USHRT_MAX  65535
// int
#define INT_MIN    (-2147483647-1)
#define INT_MAX    2147483647
#define UINT_MAX   0xffffffff
// long
#define LONG_MIN   (-0x7fffffffffffffffL-1)
#define LONG_MAX   0x7fffffffffffffffL
#define ULONG_MAX  0xffffffffffffffffUL
// Synchronization Functions
// (OpenCL v1.1 s6.11.8, v1.2 s6.12.8, v2.0 s6.13.8)
//
// Flag values accepted by barrier, mem_fence, read_mem_fence and
// write_mem_fence, followed by the type that carries them.

/** Queue a memory fence that orders memory operations to local memory. */
#define CLK_LOCAL_MEM_FENCE  0x01
/** Queue a memory fence that orders memory operations to global memory. */
#define CLK_GLOBAL_MEM_FENCE 0x02

typedef uint cl_mem_fence_flags;
// Memory scope / memory order definitions; only available for OpenCL C++
// or for language versions of at least CL_VERSION_2_0.
#if defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0)
// Enumerator values are taken from the compiler-provided
// __OPENCL_MEMORY_SCOPE_* macros. The sub-group scope is only present when
// one of the subgroup extension macros is defined.
typedef enum memory_scope {
memory_scope_work_item = __OPENCL_MEMORY_SCOPE_WORK_ITEM,
memory_scope_work_group = __OPENCL_MEMORY_SCOPE_WORK_GROUP,
memory_scope_device = __OPENCL_MEMORY_SCOPE_DEVICE,
memory_scope_all_svm_devices = __OPENCL_MEMORY_SCOPE_ALL_SVM_DEVICES,
#if defined(cl_intel_subgroups) || defined(cl_khr_subgroups)
memory_scope_sub_group = __OPENCL_MEMORY_SCOPE_SUB_GROUP
#endif
} memory_scope;
/**
* Queue a memory fence to ensure correct ordering of memory
* operations between work-items of a work-group to
* image memory.
*/
#define CLK_IMAGE_MEM_FENCE 0x04
// ATOMIC_VAR_INIT is only defined if nothing else defined it already; it
// expands to its parenthesized argument.
#ifndef ATOMIC_VAR_INIT
#define ATOMIC_VAR_INIT(x) (x)
#endif //ATOMIC_VAR_INIT
#define ATOMIC_FLAG_INIT 0
// enum values aligned with what clang uses in EmitAtomicExpr()
typedef enum memory_order
{
memory_order_relaxed = __ATOMIC_RELAXED,
memory_order_acquire = __ATOMIC_ACQUIRE,
memory_order_release = __ATOMIC_RELEASE,
memory_order_acq_rel = __ATOMIC_ACQ_REL,
memory_order_seq_cst = __ATOMIC_SEQ_CST
} memory_order;
#endif // defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0)
// Image Read and Write Functions
// (OpenCL v1.1 s6.11.3, v1.2 s6.12.14, v2.0 s6.13.14)
// These values need to match the runtime equivalent.

// Sampler: addressing mode
#define CLK_ADDRESS_NONE              0
#define CLK_ADDRESS_CLAMP_TO_EDGE     2
#define CLK_ADDRESS_CLAMP             4
#define CLK_ADDRESS_REPEAT            6
#define CLK_ADDRESS_MIRRORED_REPEAT   8

// Sampler: coordinate normalization
#define CLK_NORMALIZED_COORDS_FALSE   0
#define CLK_NORMALIZED_COORDS_TRUE    1

// Sampler: filtering mode
#define CLK_FILTER_NEAREST            0x10
#define CLK_FILTER_LINEAR             0x20

// Enable MSAA sharing when the extension macro is defined
#ifdef cl_khr_gl_msaa_sharing
#pragma OPENCL EXTENSION cl_khr_gl_msaa_sharing : enable
#endif //cl_khr_gl_msaa_sharing

// Image: channel data type
#define CLK_SNORM_INT8                0x10D0
#define CLK_SNORM_INT16               0x10D1
#define CLK_UNORM_INT8                0x10D2
#define CLK_UNORM_INT16               0x10D3
#define CLK_UNORM_SHORT_565           0x10D4
#define CLK_UNORM_SHORT_555           0x10D5
#define CLK_UNORM_INT_101010          0x10D6
#define CLK_SIGNED_INT8               0x10D7
#define CLK_SIGNED_INT16              0x10D8
#define CLK_SIGNED_INT32              0x10D9
#define CLK_UNSIGNED_INT8             0x10DA
#define CLK_UNSIGNED_INT16            0x10DB
#define CLK_UNSIGNED_INT32            0x10DC
#define CLK_HALF_FLOAT                0x10DD
#define CLK_FLOAT                     0x10DE
#define CLK_UNORM_INT24               0x10DF

// Image: channel order; numbering must be aligned with cl_channel_order
// in cl.h
#define CLK_R                         0x10B0
#define CLK_A                         0x10B1
#define CLK_RG                        0x10B2
#define CLK_RA                        0x10B3
#define CLK_RGB                       0x10B4
#define CLK_RGBA                      0x10B5
#define CLK_BGRA                      0x10B6
#define CLK_ARGB                      0x10B7
#define CLK_INTENSITY                 0x10B8
#define CLK_LUMINANCE                 0x10B9
#define CLK_Rx                        0x10BA
#define CLK_RGx                       0x10BB
#define CLK_RGBx                      0x10BC
#define CLK_DEPTH                     0x10BD
#define CLK_DEPTH_STENCIL             0x10BE
// Additional channel orders for language versions of at least
// CL_VERSION_2_0
#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0
#define CLK_sRGB                      0x10BF
#define CLK_sRGBx                     0x10C0
#define CLK_sRGBA                     0x10C1
#define CLK_sBGRA                     0x10C2
#define CLK_ABGR                      0x10C3
#endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0
// OpenCL v2.0 s6.13.16 - Pipe Functions
#if defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0)
#define CLK_NULL_RESERVE_ID (__builtin_astype(((void*)(__SIZE_MAX__)), reserve_id_t))
// OpenCL v2.0 s6.13.17 - Enqueue Kernels
#define CL_COMPLETE 0x0
#define CL_RUNNING 0x1
#define CL_SUBMITTED 0x2
#define CL_QUEUED 0x3
#define CLK_SUCCESS 0
#define CLK_ENQUEUE_FAILURE -101
#define CLK_INVALID_QUEUE -102
#define CLK_INVALID_NDRANGE -160
#define CLK_INVALID_EVENT_WAIT_LIST -57
#define CLK_DEVICE_QUEUE_FULL -161
#define CLK_INVALID_ARG_SIZE -51
#define CLK_EVENT_ALLOCATION_FAILURE -100
#define CLK_OUT_OF_RESOURCES -5
#define CLK_NULL_QUEUE 0
#define CLK_NULL_EVENT (__builtin_astype(((__SIZE_MAX__)), clk_event_t))
// execution model related definitions
#define CLK_ENQUEUE_FLAGS_NO_WAIT 0x0
#define CLK_ENQUEUE_FLAGS_WAIT_KERNEL 0x1
#define CLK_ENQUEUE_FLAGS_WAIT_WORK_GROUP 0x2
typedef int kernel_enqueue_flags_t;
typedef int clk_profiling_info;
// Profiling info name (see capture_event_profiling_info)
#define CLK_PROFILING_COMMAND_EXEC_TIME 0x1
// Capacity of the per-dimension arrays in ndrange_t.
#define MAX_WORK_DIM 3
// Describes an N-dimensional range: a dimension count plus, for each of up to
// MAX_WORK_DIM dimensions, a global offset, a global size and a local size.
typedef struct {
unsigned int workDimension; // number of dimensions (presumably <= MAX_WORK_DIM)
size_t globalWorkOffset[MAX_WORK_DIM];
size_t globalWorkSize[MAX_WORK_DIM];
size_t localWorkSize[MAX_WORK_DIM];
} ndrange_t;
#endif // defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0)
#ifdef cl_intel_device_side_avc_motion_estimation
#pragma OPENCL EXTENSION cl_intel_device_side_avc_motion_estimation : begin
#define CLK_AVC_ME_MAJOR_16x16_INTEL 0x0
#define CLK_AVC_ME_MAJOR_16x8_INTEL 0x1
#define CLK_AVC_ME_MAJOR_8x16_INTEL 0x2
#define CLK_AVC_ME_MAJOR_8x8_INTEL 0x3
#define CLK_AVC_ME_MINOR_8x8_INTEL 0x0
#define CLK_AVC_ME_MINOR_8x4_INTEL 0x1
#define CLK_AVC_ME_MINOR_4x8_INTEL 0x2
#define CLK_AVC_ME_MINOR_4x4_INTEL 0x3
#define CLK_AVC_ME_MAJOR_FORWARD_INTEL 0x0
#define CLK_AVC_ME_MAJOR_BACKWARD_INTEL 0x1
#define CLK_AVC_ME_MAJOR_BIDIRECTIONAL_INTEL 0x2
#define CLK_AVC_ME_PARTITION_MASK_ALL_INTEL 0x0
#define CLK_AVC_ME_PARTITION_MASK_16x16_INTEL 0x7E
#define CLK_AVC_ME_PARTITION_MASK_16x8_INTEL 0x7D
#define CLK_AVC_ME_PARTITION_MASK_8x16_INTEL 0x7B
#define CLK_AVC_ME_PARTITION_MASK_8x8_INTEL 0x77
#define CLK_AVC_ME_PARTITION_MASK_8x4_INTEL 0x6F
#define CLK_AVC_ME_PARTITION_MASK_4x8_INTEL 0x5F
#define CLK_AVC_ME_PARTITION_MASK_4x4_INTEL 0x3F
#define CLK_AVC_ME_SLICE_TYPE_PRED_INTEL 0x0
#define CLK_AVC_ME_SLICE_TYPE_BPRED_INTEL 0x1
#define CLK_AVC_ME_SLICE_TYPE_INTRA_INTEL 0x2
#define CLK_AVC_ME_SEARCH_WINDOW_EXHAUSTIVE_INTEL 0x0
#define CLK_AVC_ME_SEARCH_WINDOW_SMALL_INTEL 0x1
#define CLK_AVC_ME_SEARCH_WINDOW_TINY_INTEL 0x2
#define CLK_AVC_ME_SEARCH_WINDOW_EXTRA_TINY_INTEL 0x3
#define CLK_AVC_ME_SEARCH_WINDOW_DIAMOND_INTEL 0x4
#define CLK_AVC_ME_SEARCH_WINDOW_LARGE_DIAMOND_INTEL 0x5
#define CLK_AVC_ME_SEARCH_WINDOW_RESERVED0_INTEL 0x6
#define CLK_AVC_ME_SEARCH_WINDOW_RESERVED1_INTEL 0x7
#define CLK_AVC_ME_SEARCH_WINDOW_CUSTOM_INTEL 0x8
#define CLK_AVC_ME_SAD_ADJUST_MODE_NONE_INTEL 0x0
#define CLK_AVC_ME_SAD_ADJUST_MODE_HAAR_INTEL 0x2
#define CLK_AVC_ME_SUBPIXEL_MODE_INTEGER_INTEL 0x0
#define CLK_AVC_ME_SUBPIXEL_MODE_HPEL_INTEL 0x1
#define CLK_AVC_ME_SUBPIXEL_MODE_QPEL_INTEL 0x3
#define CLK_AVC_ME_COST_PRECISION_QPEL_INTEL 0x0
#define CLK_AVC_ME_COST_PRECISION_HPEL_INTEL 0x1
#define CLK_AVC_ME_COST_PRECISION_PEL_INTEL 0x2
#define CLK_AVC_ME_COST_PRECISION_DPEL_INTEL 0x3
#define CLK_AVC_ME_BIDIR_WEIGHT_QUARTER_INTEL 0x10
#define CLK_AVC_ME_BIDIR_WEIGHT_THIRD_INTEL 0x15
#define CLK_AVC_ME_BIDIR_WEIGHT_HALF_INTEL 0x20
#define CLK_AVC_ME_BIDIR_WEIGHT_TWO_THIRD_INTEL 0x2B
#define CLK_AVC_ME_BIDIR_WEIGHT_THREE_QUARTER_INTEL 0x30
#define CLK_AVC_ME_BORDER_REACHED_LEFT_INTEL 0x0
#define CLK_AVC_ME_BORDER_REACHED_RIGHT_INTEL 0x2
#define CLK_AVC_ME_BORDER_REACHED_TOP_INTEL 0x4
#define CLK_AVC_ME_BORDER_REACHED_BOTTOM_INTEL 0x8
#define CLK_AVC_ME_INTRA_16x16_INTEL 0x0
#define CLK_AVC_ME_INTRA_8x8_INTEL 0x1
#define CLK_AVC_ME_INTRA_4x4_INTEL 0x2
#define CLK_AVC_ME_SKIP_BLOCK_PARTITION_16x16_INTEL 0x0
#define CLK_AVC_ME_SKIP_BLOCK_PARTITION_8x8_INTEL 0x4000
#define CLK_AVC_ME_SKIP_BLOCK_16x16_FORWARD_ENABLE_INTEL (0x1 << 24)
#define CLK_AVC_ME_SKIP_BLOCK_16x16_BACKWARD_ENABLE_INTEL (0x2 << 24)
#define CLK_AVC_ME_SKIP_BLOCK_16x16_DUAL_ENABLE_INTEL (0x3 << 24)
#define CLK_AVC_ME_SKIP_BLOCK_8x8_FORWARD_ENABLE_INTEL (0x55 << 24)
#define CLK_AVC_ME_SKIP_BLOCK_8x8_BACKWARD_ENABLE_INTEL (0xAA << 24)
#define CLK_AVC_ME_SKIP_BLOCK_8x8_DUAL_ENABLE_INTEL (0xFF << 24)
#define CLK_AVC_ME_SKIP_BLOCK_8x8_0_FORWARD_ENABLE_INTEL (0x1 << 24)
#define CLK_AVC_ME_SKIP_BLOCK_8x8_0_BACKWARD_ENABLE_INTEL (0x2 << 24)
#define CLK_AVC_ME_SKIP_BLOCK_8x8_1_FORWARD_ENABLE_INTEL (0x1 << 26)
#define CLK_AVC_ME_SKIP_BLOCK_8x8_1_BACKWARD_ENABLE_INTEL (0x2 << 26)
#define CLK_AVC_ME_SKIP_BLOCK_8x8_2_FORWARD_ENABLE_INTEL (0x1 << 28)
#define CLK_AVC_ME_SKIP_BLOCK_8x8_2_BACKWARD_ENABLE_INTEL (0x2 << 28)
#define CLK_AVC_ME_SKIP_BLOCK_8x8_3_FORWARD_ENABLE_INTEL (0x1 << 30)
#define CLK_AVC_ME_SKIP_BLOCK_8x8_3_BACKWARD_ENABLE_INTEL (0x2 << 30)
#define CLK_AVC_ME_BLOCK_BASED_SKIP_4x4_INTEL 0x00
#define CLK_AVC_ME_BLOCK_BASED_SKIP_8x8_INTEL 0x80
#define CLK_AVC_ME_INTRA_LUMA_PARTITION_MASK_ALL_INTEL 0x0
#define CLK_AVC_ME_INTRA_LUMA_PARTITION_MASK_16x16_INTEL 0x6
#define CLK_AVC_ME_INTRA_LUMA_PARTITION_MASK_8x8_INTEL 0x5
#define CLK_AVC_ME_INTRA_LUMA_PARTITION_MASK_4x4_INTEL 0x3
#define CLK_AVC_ME_INTRA_NEIGHBOR_LEFT_MASK_ENABLE_INTEL 0x60
#define CLK_AVC_ME_INTRA_NEIGHBOR_UPPER_MASK_ENABLE_INTEL 0x10
#define CLK_AVC_ME_INTRA_NEIGHBOR_UPPER_RIGHT_MASK_ENABLE_INTEL 0x8
#define CLK_AVC_ME_INTRA_NEIGHBOR_UPPER_LEFT_MASK_ENABLE_INTEL 0x4
#define CLK_AVC_ME_LUMA_PREDICTOR_MODE_VERTICAL_INTEL 0x0
#define CLK_AVC_ME_LUMA_PREDICTOR_MODE_HORIZONTAL_INTEL 0x1
#define CLK_AVC_ME_LUMA_PREDICTOR_MODE_DC_INTEL 0x2
#define CLK_AVC_ME_LUMA_PREDICTOR_MODE_DIAGONAL_DOWN_LEFT_INTEL 0x3
#define CLK_AVC_ME_LUMA_PREDICTOR_MODE_DIAGONAL_DOWN_RIGHT_INTEL 0x4
#define CLK_AVC_ME_LUMA_PREDICTOR_MODE_PLANE_INTEL 0x4
#define CLK_AVC_ME_LUMA_PREDICTOR_MODE_VERTICAL_RIGHT_INTEL 0x5
#define CLK_AVC_ME_LUMA_PREDICTOR_MODE_HORIZONTAL_DOWN_INTEL 0x6
#define CLK_AVC_ME_LUMA_PREDICTOR_MODE_VERTICAL_LEFT_INTEL 0x7
#define CLK_AVC_ME_LUMA_PREDICTOR_MODE_HORIZONTAL_UP_INTEL 0x8
#define CLK_AVC_ME_CHROMA_PREDICTOR_MODE_DC_INTEL 0x0
#define CLK_AVC_ME_CHROMA_PREDICTOR_MODE_HORIZONTAL_INTEL 0x1
#define CLK_AVC_ME_CHROMA_PREDICTOR_MODE_VERTICAL_INTEL 0x2
#define CLK_AVC_ME_CHROMA_PREDICTOR_MODE_PLANE_INTEL 0x3
#define CLK_AVC_ME_FRAME_FORWARD_INTEL 0x1
#define CLK_AVC_ME_FRAME_BACKWARD_INTEL 0x2
#define CLK_AVC_ME_FRAME_DUAL_INTEL 0x3
#define CLK_AVC_ME_INTERLACED_SCAN_TOP_FIELD_INTEL 0x0
#define CLK_AVC_ME_INTERLACED_SCAN_BOTTOM_FIELD_INTEL 0x1
#define CLK_AVC_ME_INITIALIZE_INTEL 0x0
#define CLK_AVC_IME_PAYLOAD_INITIALIZE_INTEL 0x0
#define CLK_AVC_REF_PAYLOAD_INITIALIZE_INTEL 0x0
#define CLK_AVC_SIC_PAYLOAD_INITIALIZE_INTEL 0x0
#define CLK_AVC_IME_RESULT_INITIALIZE_INTEL 0x0
#define CLK_AVC_REF_RESULT_INITIALIZE_INTEL 0x0
#define CLK_AVC_SIC_RESULT_INITIALIZE_INTEL 0x0
#define CLK_AVC_IME_RESULT_SINGLE_REFERENCE_STREAMOUT_INITIALIZE_INTEL 0x0
#define CLK_AVC_IME_RESULT_SINGLE_REFERENCE_STREAMIN_INITIALIZE_INTEL 0x0
#define CLK_AVC_IME_RESULT_DUAL_REFERENCE_STREAMOUT_INITIALIZE_INTEL 0x0
#define CLK_AVC_IME_RESULT_DUAL_REFERENCE_STREAMIN_INITIALIZE_INTEL 0x0
#pragma OPENCL EXTENSION cl_intel_device_side_avc_motion_estimation : end
#endif // cl_intel_device_side_avc_motion_estimation
#endif //_OPENCL_BASE_H_

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,395 @@
/* pocl.h - global pocl declarations for the host side runtime.
Copyright (c) 2011 Universidad Rey Juan Carlos
2011-2019 Pekka Jääskeläinen
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
/**
* @file pocl.h
*
* The declarations in this file are those used both by the libpocl
* CL implementation and by the kernel compiler. Other declarations should be
* moved to pocl_cl.h in lib/CL or under the kernel compiler directory.
* @todo Check if there are extra declarations here that could be moved.
*/
#ifndef POCL_H
#define POCL_H
#ifndef CL_TARGET_OPENCL_VERSION
#define CL_TARGET_OPENCL_VERSION 220
#endif
#include <CL/opencl.h>
#include "config.h"
#include "pocl_context.h"
/* detects restrict, variadic macros etc */
#include "pocl_compiler_features.h"
#define POCL_FILENAME_LENGTH 1024
#define WORKGROUP_STRING_LENGTH 1024
typedef struct _mem_mapping mem_mapping_t;
/* represents a single buffer to host memory mapping */
struct _mem_mapping {
void *host_ptr; /* the location of the mapped buffer chunk in the host memory */
size_t offset; /* offset to the beginning of the buffer */
size_t size; /* size of the mapped chunk (presumably in bytes) */
/* Neighbours in the doubly linked list of mappings (the "buffer->mappings LL"
referred to below). */
mem_mapping_t *prev, *next;
/* This is required, because two clEnqueueMap() with the same buffer+size+offset,
will create two identical mappings in the buffer->mappings LL.
Without this flag, both corresponding clEnqUnmap()s will find
the same mapping (the first one in mappings LL), which will lead
to memory double-free corruption later. */
long unmap_requested;
/* Map flags of this mapping (presumably those passed to the clEnqueueMap*
call that created it -- confirm against the caller). */
cl_map_flags map_flags;
/* image mapping data */
size_t origin[3];
size_t region[3];
size_t row_pitch;
size_t slice_pitch;
};
/* memory identifier: id to point the global memory where memory resides
+ pointer to actual data */
typedef struct _pocl_mem_identifier
{
int available; /* ... in this mem objs context */
int global_mem_id; /* id of the global memory where the data resides */
void *mem_ptr; /* pointer to the actual data */
void *image_data; /* NOTE(review): presumably device-specific image state; confirm */
} pocl_mem_identifier;
typedef struct _mem_destructor_callback mem_destructor_callback_t;
/* represents a memory object destructor callback; instances form a singly
linked list through the next member */
struct _mem_destructor_callback
{
void (CL_CALLBACK * pfn_notify) (cl_mem, void*); /* callback function */
void *user_data; /* user supplied data passed to callback function */
mem_destructor_callback_t *next; /* next callback in the list */
};
typedef struct _build_program_callback build_program_callback_t;
/* represents a program build callback: a function pointer plus the user data
handed to it */
struct _build_program_callback
{
void (CL_CALLBACK * callback_function) (cl_program, void*); /* callback function */
void *user_data; /* user supplied data passed to callback function */
};
// Command Queue datatypes
// Size in bytes of a kernel digest.
#define POCL_KERNEL_DIGEST_SIZE 16
// A kernel digest: a fixed-size array of POCL_KERNEL_DIGEST_SIZE bytes.
typedef uint8_t pocl_kernel_hash_t[POCL_KERNEL_DIGEST_SIZE];
// clEnqueueNDRangeKernel
// Payload of a kernel launch command.
typedef struct
{
void *hash; /* NOTE(review): presumably points to a pocl_kernel_hash_t; confirm */
void *wg; /* The work group function ptr. Device specific. */
cl_kernel kernel; /* the kernel object being launched */
/* The launch data that can be passed to the kernel execution environment. */
struct pocl_context pc;
struct pocl_argument *arguments; /* the kernel arguments for this launch */
/* Can be used to store/cache arbitrary device-specific data. */
void *device_data;
/* If set to 1, disallow any work-group function specialization. */
int force_generic_wg_func;
/* If set to 1, disallow "small grid" WG function specialization. */
int force_large_grid_wg_func;
/* Device index (presumably the same meaning as _cl_command_node.device_i). */
unsigned device_i;
} _cl_command_run;
// clEnqueueNativeKernel
// Payload of a native (host function) kernel command.
typedef struct
{
void *args; /* argument block handed to user_func */
size_t cb_args; /* size of the args block (presumably in bytes, as in the CL API) */
void (*user_func)(void *); /* the host function to invoke */
} _cl_command_native;
// clEnqueueReadBuffer
// Payload of a buffer read: device buffer -> host memory.
typedef struct
{
void *__restrict__ dst_host_ptr; /* host destination */
pocl_mem_identifier *src_mem_id; /* source buffer */
size_t offset; /* offset into the source buffer */
size_t size; /* amount of data to read (presumably in bytes) */
} _cl_command_read;
// clEnqueueWriteBuffer
// Payload of a buffer write: host memory -> device buffer.
typedef struct
{
const void *__restrict__ src_host_ptr; /* host source (read-only) */
pocl_mem_identifier *dst_mem_id; /* destination buffer */
size_t offset; /* offset into the destination buffer */
size_t size; /* amount of data to write (presumably in bytes) */
} _cl_command_write;
// clEnqueueCopyBuffer
// Payload of a buffer-to-buffer copy.
typedef struct
{
pocl_mem_identifier *src_mem_id; /* source buffer */
pocl_mem_identifier *dst_mem_id; /* destination buffer */
size_t src_offset; /* offset into the source buffer */
size_t dst_offset; /* offset into the destination buffer */
size_t size; /* amount of data to copy (presumably in bytes) */
} _cl_command_copy;
// clEnqueueReadBufferRect
// Payload of a rectangular read: region of a device buffer -> host memory.
typedef struct
{
void *__restrict__ dst_host_ptr; /* host destination */
pocl_mem_identifier *src_mem_id; /* source buffer */
/* Three-component origins and extent of the rectangle. */
size_t buffer_origin[3];
size_t host_origin[3];
size_t region[3];
/* Row/slice pitches on the buffer side and on the host side. */
size_t buffer_row_pitch;
size_t buffer_slice_pitch;
size_t host_row_pitch;
size_t host_slice_pitch;
} _cl_command_read_rect;
// clEnqueueWriteBufferRect
// Payload of a rectangular write: host memory -> region of a device buffer.
typedef struct
{
const void *__restrict__ src_host_ptr; /* host source (read-only) */
pocl_mem_identifier *dst_mem_id; /* destination buffer */
/* Three-component origins and extent of the rectangle. */
size_t buffer_origin[3];
size_t host_origin[3];
size_t region[3];
/* Row/slice pitches on the buffer side and on the host side. */
size_t buffer_row_pitch;
size_t buffer_slice_pitch;
size_t host_row_pitch;
size_t host_slice_pitch;
} _cl_command_write_rect;
// clEnqueueCopyBufferRect
// Payload of a rectangular buffer-to-buffer copy.
typedef struct
{
pocl_mem_identifier *src_mem_id; /* source buffer */
pocl_mem_identifier *dst_mem_id; /* destination buffer */
/* Note the member order: the destination origin precedes the source origin. */
size_t dst_origin[3];
size_t src_origin[3];
size_t region[3];
/* Row/slice pitches of the source and of the destination. */
size_t src_row_pitch;
size_t src_slice_pitch;
size_t dst_row_pitch;
size_t dst_slice_pitch;
} _cl_command_copy_rect;
// clEnqueueMapBuffer
// Payload of a map command: the memory object and the mapping record for it.
typedef struct
{
pocl_mem_identifier *mem_id; /* memory object being mapped */
mem_mapping_t *mapping; /* mapping record (see struct _mem_mapping) */
} _cl_command_map;
/* clEnqueueUnmapMemObject */
/* Payload of an unmap command; same layout as _cl_command_map. */
typedef struct
{
pocl_mem_identifier *mem_id; /* memory object being unmapped */
mem_mapping_t *mapping; /* mapping record (see struct _mem_mapping) */
} _cl_command_unmap;
/* clEnqueueFillBuffer */
/* Payload of a buffer fill: a pattern applied to a range of a buffer. */
typedef struct
{
pocl_mem_identifier *dst_mem_id; /* buffer to fill */
size_t size; /* extent of the range to fill */
size_t offset; /* start of the range within the buffer */
void *__restrict__ pattern; /* the fill pattern */
size_t pattern_size; /* size of the pattern */
} _cl_command_fill_mem;
/* clEnqueue(Write/Read)Image */
/* Payload of an image read. The struct carries both a host destination
(dst_host_ptr) and a memory-object destination (dst_mem_id + dst_offset).
NOTE(review): presumably only one of the two is used per command (host read
vs. image-to-buffer copy) -- confirm against the enqueue code. */
typedef struct
{
pocl_mem_identifier *src_mem_id; /* source image */
void *__restrict__ dst_host_ptr;
pocl_mem_identifier *dst_mem_id;
size_t dst_offset;
size_t origin[3];
size_t region[3];
size_t dst_row_pitch;
size_t dst_slice_pitch;
} _cl_command_read_image;
/* Payload of an image write; mirrors _cl_command_read_image. The struct
carries both a host source (src_host_ptr) and a memory-object source
(src_mem_id + src_offset).
NOTE(review): presumably only one of the two is used per command (host write
vs. buffer-to-image copy) -- confirm against the enqueue code. */
typedef struct
{
pocl_mem_identifier *dst_mem_id; /* destination image */
const void *__restrict__ src_host_ptr;
pocl_mem_identifier *src_mem_id;
size_t src_offset;
size_t origin[3];
size_t region[3];
size_t src_row_pitch;
size_t src_slice_pitch;
} _cl_command_write_image;
/* Payload of an image-to-image copy: source and destination memory objects,
their three-component origins, and the extent of the copied region. */
typedef struct
{
pocl_mem_identifier *src_mem_id;
pocl_mem_identifier *dst_mem_id;
/* Note the member order: the destination origin precedes the source origin. */
size_t dst_origin[3];
size_t src_origin[3];
size_t region[3];
} _cl_command_copy_image;
/* clEnqueueFillImage */
/* Payload of an image fill: one pixel value applied to a region of an image. */
typedef struct
{
pocl_mem_identifier *mem_id; /* image to fill */
size_t origin[3];
size_t region[3];
void *__restrict__ fill_pixel; /* the pixel value to fill with */
size_t pixel_size; /* size of fill_pixel */
} _cl_command_fill_image;
/* clEnqueueMarkerWithWaitlist */
/* Payload of a marker command. */
typedef struct
{
void *data; /* NOTE(review): opaque here; meaning not visible in this header */
int has_wait_list; /* presumably non-zero when the command was given a wait list */
} _cl_command_marker;
/* clEnqueueBarrierWithWaitlist */
/* Barriers reuse the marker payload layout. */
typedef _cl_command_marker _cl_command_barrier;
/* clEnqueueMigrateMemObjects */
/* Payload of a memory-object migration command. */
typedef struct
{
void *data; /* NOTE(review): opaque here; meaning not visible in this header */
size_t num_mem_objects; /* presumably the element count of mem_objects */
cl_mem *mem_objects; /* the memory objects to migrate */
cl_device_id *source_devices; /* presumably one source device per memory object -- confirm */
} _cl_command_migrate;
/* Payload of an SVM free command: a set of SVM pointers plus an optional
user-supplied free function. The pfn_free_func signature takes a queue, a
pointer count, the pointer array and user data. */
typedef struct
{
void* data; /* NOTE(review): presumably forwarded as user_data to pfn_free_func; confirm */
void* queue; /* stored as void*, whereas pfn_free_func takes a cl_command_queue */
unsigned num_svm_pointers; /* presumably the element count of svm_pointers */
void **svm_pointers;
void (CL_CALLBACK *pfn_free_func) ( cl_command_queue queue,
cl_uint num_svm_pointers,
void *svm_pointers[],
void *user_data);
} _cl_command_svm_free;
/* Payload of an SVM map command: the SVM region and the map flags. */
typedef struct
{
void* svm_ptr; /* start of the SVM region */
size_t size; /* size of the region */
cl_map_flags flags; /* map flags for this mapping */
} _cl_command_svm_map;
/* Payload of an SVM unmap command: only the SVM pointer is recorded. */
typedef struct
{
void* svm_ptr;
} _cl_command_svm_unmap;
/* Payload of an SVM memcpy command. Both pointers are restrict-qualified. */
typedef struct
{
const void *__restrict__ src; /* source (read-only) */
void *__restrict__ dst; /* destination */
size_t size; /* amount of data to copy */
} _cl_command_svm_cpy;
/* Payload of an SVM fill command: a pattern applied to an SVM region. */
typedef struct
{
void *__restrict__ svm_ptr; /* start of the region to fill */
size_t size; /* size of the region */
void *__restrict__ pattern; /* the fill pattern */
size_t pattern_size; /* size of the pattern */
} _cl_command_svm_fill;
/* Union of all command payloads. A command node stores exactly one of these;
which member is valid is presumably selected by _cl_command_node.type. */
typedef union
{
_cl_command_run run;
_cl_command_native native;
_cl_command_read read;
_cl_command_write write;
_cl_command_copy copy;
_cl_command_read_rect read_rect;
_cl_command_write_rect write_rect;
_cl_command_copy_rect copy_rect;
_cl_command_fill_mem memfill;
_cl_command_read_image read_image;
_cl_command_write_image write_image;
_cl_command_copy_image copy_image;
_cl_command_fill_image fill_image;
_cl_command_map map;
_cl_command_unmap unmap;
_cl_command_marker marker;
_cl_command_barrier barrier;
_cl_command_migrate migrate;
_cl_command_svm_free svm_free;
_cl_command_svm_map svm_map;
_cl_command_svm_unmap svm_unmap;
_cl_command_svm_cpy svm_memcpy;
_cl_command_svm_fill svm_fill;
} _cl_command_t;
// one item in the command queue
typedef struct _cl_command_node _cl_command_node;
struct _cl_command_node
{
_cl_command_t command; // the command payload
cl_command_type type; // the kind of command this node holds
_cl_command_node *next; // for linked-list storage
_cl_command_node *prev;
cl_event event; // event associated with this command
const cl_event *event_wait_list; // wait list for this command (not modified through this pointer)
cl_device_id device; // the targeted device
/* The index of the targeted device in the platform's device list. */
unsigned device_i;
cl_int ready; // NOTE(review): presumably non-zero once the command may be executed; confirm
};
/* LLVM version ladder. LLVM_OLDER_THAN_<N>_0 is defined to 1 exactly when none
of the LLVM_<M>_0 macros for M = 10 down to N is defined. The LLVM_<M>_0
macros themselves come from outside this block (presumably the build
configuration). */
#if !defined(LLVM_10_0)
#define LLVM_OLDER_THAN_10_0 1
#endif
#if !defined(LLVM_10_0) && !defined(LLVM_9_0)
#define LLVM_OLDER_THAN_9_0 1
#endif
#if !defined(LLVM_10_0) && !defined(LLVM_9_0) && !defined(LLVM_8_0)
#define LLVM_OLDER_THAN_8_0 1
#endif
#if !defined(LLVM_10_0) && !defined(LLVM_9_0) && !defined(LLVM_8_0) \
    && !defined(LLVM_7_0)
#define LLVM_OLDER_THAN_7_0 1
#endif
#if !defined(LLVM_10_0) && !defined(LLVM_9_0) && !defined(LLVM_8_0) \
    && !defined(LLVM_7_0) && !defined(LLVM_6_0)
#define LLVM_OLDER_THAN_6_0 1
#endif
#endif /* POCL_H */

View File

@@ -0,0 +1,80 @@
/* pocl_device.h - global pocl declarations to be used in the device binaries,
where applicable to the target
Copyright (c) 2012-2018 Pekka Jääskeläinen / Tampere University of Technology
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#ifndef POCL_DEVICE_H
#define POCL_DEVICE_H
#include "pocl_types.h"
#define MAX_KERNEL_ARGS 64
#define MAX_KERNEL_NAME_LENGTH 64
/* Metadata of a single kernel stored in the device.*/
typedef struct {
/* Kernel name in a fixed MAX_KERNEL_NAME_LENGTH-byte array. */
const uchar name[MAX_KERNEL_NAME_LENGTH];
ushort num_args; /* presumably the number of kernel arguments */
ushort num_locals; /* presumably the number of local buffers -- confirm */
void *work_group_func; /* pointer to the kernel's work-group function */
} __kernel_metadata;
/* Alignment helpers: ALIGN4(decl) / ALIGN8(decl) wrap a declaration so that
   the declared object is aligned to 4 or 8 bytes respectively.
   MSVC spells the attribute as a __declspec prefix, GCC/Clang as an
   __attribute__ suffix.
   The MSVC ALIGN8 must request 8-byte alignment like the GCC/Clang one;
   otherwise structures built with these macros get a different layout
   depending on the compiler. */
#ifdef _MSC_VER
#define ALIGN4(x) __declspec(align(4)) x
#define ALIGN8(x) __declspec(align(8)) x
#else
#define ALIGN4(x) x __attribute__ ((aligned (4)))
#define ALIGN8(x) x __attribute__ ((aligned (8)))
#endif
/* A kernel invocation command. Every member is declared through the
ALIGN4/ALIGN8 macros, which fix the member's alignment explicitly. */
typedef struct {
/* The execution status of this queue slot (one of the POCL_KST_* values). */
ALIGN8(uint status);
/* The kernel to execute. Points to the metadata in the device global
memory. It will be cast to a __kernel_metadata* */
ALIGN8(uint kernel);
/* Pointers to the kernel arguments in the global memory. Will be
cast to 32 bit void* */
ALIGN8(uint args[MAX_KERNEL_ARGS]);
/* Sizes of the dynamically allocated local buffers. */
/* Disabled member, kept for reference:
uint32_t dynamic_local_arg_sizes[MAX_KERNEL_ARGS] ALIGN4; */
/* Number of dimensions in the work space. */
ALIGN4(uint work_dim);
/* Per-dimension (3 entries) work-group counts and global offsets. */
ALIGN4(uint num_groups[3]);
ALIGN4(uint global_offset[3]);
} __kernel_exec_cmd;
/* Kernel execution statuses. */
/* The invocation entry is free to use. */
#define POCL_KST_FREE 1
/* The kernel structure has been populated and is waiting to be
executed. */
#define POCL_KST_READY 2
/* The kernel is currently running in the device. */
#define POCL_KST_RUNNING 3
/* The kernel has finished execution. The results can be collected and the
execution entry be freed (by writing POCL_KST_FREE to the status). */
#define POCL_KST_FINISHED 4
#endif

View File

@@ -0,0 +1,52 @@
/* pocl_image_types.h - image data structure used by device implementations
Copyright (c) 2013 Ville Korhonen
Copyright (c) 2017 Michal Babej / Tampere University of Technology
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#ifndef __X86_IMAGE_H__
#define __X86_IMAGE_H__
#ifdef __CBUILD__
#define INTTYPE cl_int
#else
#define INTTYPE int
#endif
typedef uintptr_t dev_sampler_t;
/* Image data structure used by device implementations. All integer members
use INTTYPE, which is cl_int when __CBUILD__ is defined and int otherwise. */
typedef struct dev_image_t {
void *_data; /* pointer to the image data */
/* Image dimensions. */
INTTYPE _width;
INTTYPE _height;
INTTYPE _depth;
INTTYPE _image_array_size;
/* Row and slice pitches (presumably in bytes -- confirm). */
INTTYPE _row_pitch;
INTTYPE _slice_pitch;
INTTYPE _num_mip_levels; /* maybe not needed */
INTTYPE _num_samples; /* maybe not needed */
/* Channel order and channel data type (presumably CL channel order/type
constants -- confirm). */
INTTYPE _order;
INTTYPE _data_type;
INTTYPE _num_channels;
INTTYPE _elem_size;
} dev_image_t;
#endif

View File

@@ -0,0 +1,33 @@
/* pocl-spir.h - global pocl declarations for the SPIR support.
Copyright (c) 2018-2019 Pekka Jääskeläinen
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#ifndef POCL_SPIR_H
#define POCL_SPIR_H
/* Numeric identifiers of the SPIR address spaces. */
#define SPIR_ADDRESS_SPACE_PRIVATE 0
#define SPIR_ADDRESS_SPACE_GLOBAL 1
#define SPIR_ADDRESS_SPACE_CONSTANT 2
#define SPIR_ADDRESS_SPACE_LOCAL 3
#define SPIR_ADDRESS_SPACE_GENERIC 4
#endif

View File

@@ -0,0 +1,171 @@
/* pocl_types.h - The basic OpenCL C device side scalar data types.
Copyright (c) 2018 Pekka Jääskeläinen / Tampere University of Technology
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
/* This header is designed to be included both from the device and the host.
When compiling OpenCL C sources, __OPENCL_VERSION__ should be set.
When compiling on the host, all but the device-specific types (size_t and
others) are defined. Devices should include this header rather than the C
stdint.h, as OpenCL C size_t et al. are allowed to be of a different width
than when targeting C.
TODO: replace this header (partially) with Clang's opencl-c.h
*/
#ifndef POCL_DEVICE_TYPES_H
#define POCL_DEVICE_TYPES_H
#ifdef __OPENCL_VERSION__
#ifdef __USE_CLANG_OPENCL_C_H
/* Minimal definitions, only the target-specific macro overrides,
just in case Clang exports the C ones, which might differ for
OpenCL C. */
#ifdef __INTPTR_TYPE__
#undef __INTPTR_TYPE__
#endif
#ifdef __UINTPTR_TYPE__
#undef __UINTPTR_TYPE__
#endif
#ifdef __SIZE_TYPE__
#undef __SIZE_TYPE__
#endif
#ifdef __SIZE_MAX__
#undef __SIZE_MAX__
#endif
/* Select the size type from the device address width: uint when
POCL_DEVICE_ADDRESS_BITS is defined and equal to 32, ulong in every other
case (including when the macro is undefined). */
#if defined(POCL_DEVICE_ADDRESS_BITS) && POCL_DEVICE_ADDRESS_BITS == 32
#define __SIZE_TYPE__ uint
#define __SIZE_MAX__ UINT_MAX
#else
#define __SIZE_TYPE__ ulong
#define __SIZE_MAX__ ULONG_MAX
#endif
/* NOTE(review): both pointer-integer macros expand to the unsigned
__SIZE_TYPE__, so a signed intptr_t derived from __INTPTR_TYPE__ would end up
unsigned -- confirm this is intended. */
#define __INTPTR_TYPE__ __SIZE_TYPE__
#define __UINTPTR_TYPE__ __INTPTR_TYPE__
#else
/* Compiling Device-specific OpenCL C or builtin library C. */
#if defined cl_khr_fp64 && !defined cl_khr_int64
#error "cl_khr_fp64 requires cl_khr_int64"
#endif
/* TODO FIXME We should not use these in OpenCL library's C code at all.
* The problem is that 1) these are predefined by glibc, 2) while we can
* re-define "ulong", we cannot control the size of "long" at all.
* which can lead to "ulong" being 64bit and "long" 32bit, resulting in
* mysterious errors and bugs. Therefore OpenCL library's C code should
* use the fixed size C types where integer size matters. */
#ifdef __CBUILD__
/* Builtin library C code definitions. */
/* Rename size_t and uintptr_t while <stdint.h> is processed, so anything it
declares under those names lands on csize_t / cuintptr_t instead. This header
typedefs its own size_t and uintptr_t further down. */
#define size_t csize_t
#define uintptr_t cuintptr_t
#include <stdint.h>
#undef size_t
#undef uintptr_t
/* OpenCL C unsigned scalar names mapped onto the fixed-width C types. */
typedef uint8_t uchar;
typedef uint16_t ushort;
typedef uint32_t uint;
/* Without cl_khr_int64, ulong is only 32 bits wide. */
#ifdef cl_khr_int64
typedef uint64_t ulong;
#else
typedef uint32_t ulong;
#endif
/* Without cl_khr_fp16, half is a plain short (presumably a storage-only
stand-in -- confirm). */
#ifndef cl_khr_fp16
typedef short half;
#endif
#endif
/* The definitions below intentionally lead to errors if these types
are used when they are not available in the language. This prevents
accidentally using them if the compiler does not disable these
types, but only e.g. defines them with an incorrect size.*/
#ifndef cl_khr_fp64
typedef struct error_undefined_type_double error_undefined_type_double;
#define double error_undefined_type_double
#endif
#ifdef __SIZE_TYPE__
#undef __SIZE_TYPE__
#endif
#ifdef __SIZE_MAX__
#undef __SIZE_MAX__
#endif
/* Select the size type from the device address width: uint when
POCL_DEVICE_ADDRESS_BITS is defined and equal to 32, ulong in every other
case (including when the macro is undefined). */
#if defined(POCL_DEVICE_ADDRESS_BITS) && POCL_DEVICE_ADDRESS_BITS == 32
#define __SIZE_TYPE__ uint
#define __SIZE_MAX__ UINT_MAX
#else
#define __SIZE_TYPE__ ulong
#define __SIZE_MAX__ ULONG_MAX
#endif
/* Device-side size and pointer-integer types: size_t follows the selection
above, ptrdiff_t comes from the compiler's __PTRDIFF_TYPE__, and the
pointer-sized integer types are aliases of those two. */
typedef __SIZE_TYPE__ size_t;
typedef __PTRDIFF_TYPE__ ptrdiff_t;
typedef ptrdiff_t intptr_t;
typedef size_t uintptr_t;
#endif /* #ifdef __USE_CLANG_OPENCL_C_H */
#else /* #ifdef __OPENCL_VERSION__ */
/* Including from a host source (runtime API implementation). Introduce
the fixed width datatypes, but do not override C's size_t and other
target specific datatypes. */
typedef unsigned char uchar;
/* FIXME see the above TODO about these types. */
#if !(defined(_SYS_TYPES_H) && defined(__USE_MISC))
/* glibc, when including sys/types.h, typedefs these. */
typedef unsigned long int ulong;
typedef unsigned short int ushort;
typedef unsigned int uint;
#endif
#include <stdint.h>
#endif
#endif

View File

@@ -0,0 +1 @@
convolution

View File

@@ -1,68 +1,47 @@
RISCV_TOOL_PATH ?= $(wildcard ../../../../riscv-gnu-toolchain/drops)
POCL_CC_PATH ?= $(wildcard ../../../../pocl/drops_riscv_cc)
POCL_INC_PATH ?= $(wildcard ../include)
POCL_LIB_PATH ?= $(wildcard ../lib)
VX_RT_PATH ?= $(wildcard ../../../runtime)
VX_SIMX_PATH ?= $(wildcard ../../../simX/obj_dir)
LLVM_HOME ?= ~/dev/llvm-project/drops
TOOLCHAIN_PATH ?= ~/dev/riscv-gnu-toolchain/drops
SYSROOT ?= $(TOOLCHAIN_PATH)/riscv32-unknown-elf
POCL_CC_PATH ?= $(realpath ../compiler)
POCL_RT_PATH ?= $(realpath ../runtime)
VORTEX_DRV_PATH ?= $(realpath ../../../driver/sw)
VORTEX_RT_PATH ?= $(realpath ../../../runtime)
# The compile/link flags and the run-* targets refer to POCLRT_PATH and
# DRIVER_PATH, which are not assigned anywhere in this Makefile. Unset, they
# expand to nothing, so -I, -L and LD_LIBRARY_PATH silently point at /include,
# /lib, /simx, ... Alias them to the paths defined above; ?= keeps any value
# supplied through the environment or the make command line.
POCLRT_PATH ?= $(POCL_RT_PATH)
DRIVER_PATH ?= $(VORTEX_DRV_PATH)
CC = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-gcc
CXX = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-g++
DMP = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-objdump
HEX = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-objcopy
GDB = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-gdb
CXXFLAGS += -std=c++11 -O0 -g -fpermissive -Wall -Wextra -pedantic -Wfatal-errors
VX_SRCS = $(VX_RT_PATH)/newlib/newlib.c
VX_SRCS += $(VX_RT_PATH)/startup/vx_start.S
VX_SRCS += $(VX_RT_PATH)/intrinsics/vx_intrinsics.s
VX_SRCS += $(VX_RT_PATH)/io/vx_io.s $(VX_RT_PATH)/io/vx_io.c
VX_SRCS += $(VX_RT_PATH)/fileio/fileio.s
VX_SRCS += $(VX_RT_PATH)/tests/tests.c
VX_SRCS += $(VX_RT_PATH)/vx_api/vx_api.c
VX_SRCS += $(VX_STR) $(VX_FIO) $(VX_NEWLIB) $(VX_INT) $(VX_IO) $(VX_API) $(VX_TEST)
CXXFLAGS += -I$(POCLRT_PATH)/include
VX_CFLAGS = -nostartfiles -Wl,-Bstatic,-T,$(VX_RT_PATH)/mains/vortex_link.ld
CXXFLAGS = -g -O0 -march=rv32im -mabi=ilp32
CXXFLAGS += -ffreestanding # program may not begin at main()
CXXFLAGS += -Wl,--gc-sections # enable garbage collection of unused input sections
CXXFLAGS += -fno-rtti -fno-non-call-exceptions # disable RTTI and exceptions
CXXFLAGS += -I$(POCL_INC_PATH)
VX_LIBS = -Wl,--whole-archive lib$(PROJECT).a -Wl,--no-whole-archive $(POCL_LIB_PATH)/libOpenCL.a
QEMU_LIBS = $(VX_RT_PATH)/qemu/vx_api.c -Wl,--whole-archive lib$(PROJECT).a -Wl,--no-whole-archive $(POCL_LIB_PATH)/libOpenCL.a
LDFLAGS += -L$(POCLRT_PATH)/lib -L$(DRIVER_PATH)/simx -lOpenCL -lvortex
PROJECT = convolution
SRCS = main.cpp utils.cpp
all: $(PROJECT).dump $(PROJECT).hex
all: $(PROJECT)
lib$(PROJECT).a: kernel.cl
POCL_DEBUG=all POCL_DEBUG_LLVM_PASSES=1 LD_LIBRARY_PATH=$(RISCV_TOOL_PATH)/lib:$(POCL_CC_PATH)/lib $(POCL_CC_PATH)/bin/poclcc -o lib$(PROJECT).a kernel.cl
kernel.pocl: kernel.cl
TOOLCHAIN_PATH=$(TOOLCHAIN_PATH) SYSROOT=$(SYSROOT) LLVM_HOME=$(LLVM_HOME) VORTEX_RUNTIME_PATH=$(VORTEX_RT_PATH) POCL_DEBUG=all LD_LIBRARY_PATH=$(LLVM_HOME)/lib:$(POCL_CC_PATH)/lib $(POCL_CC_PATH)/bin/poclcc -o kernel.pocl kernel.cl
$(PROJECT): $(SRCS)
$(CXX) $(CXXFLAGS) $^ $(LDFLAGS) -o $@
$(PROJECT).elf: $(SRCS) lib$(PROJECT).a
$(CXX) $(CXXFLAGS) $(VX_CFLAGS) $(VX_SRCS) $(SRCS) $(VX_LIBS) -o $(PROJECT).elf
run-fpga: $(PROJECT) kernel.pocl
LD_LIBRARY_PATH=$(POCLRT_PATH)/lib:$(DRIVER_PATH)/opae:$(LD_LIBRARY_PATH) ./$(PROJECT)
$(PROJECT).qemu: $(SRCS) lib$(PROJECT).a
$(CXX) $(CXXFLAGS) $(SRCS) $(QEMU_LIBS) -o $(PROJECT).qemu
run-ase: $(PROJECT) kernel.pocl
LD_LIBRARY_PATH=$(POCLRT_PATH)/lib:$(DRIVER_PATH)/opae/ase:$(LD_LIBRARY_PATH) ./$(PROJECT)
$(PROJECT).hex: $(PROJECT).elf
$(HEX) -O ihex $(PROJECT).elf $(PROJECT).hex
run-simx: $(PROJECT) kernel.pocl
LD_LIBRARY_PATH=$(POCLRT_PATH)/lib:$(DRIVER_PATH)/simx:$(LD_LIBRARY_PATH) ./$(PROJECT)
$(PROJECT).dump: $(PROJECT).elf
$(DMP) -D $(PROJECT).elf > $(PROJECT).dump
run-rtlsim: $(PROJECT) kernel.pocl
LD_LIBRARY_PATH=$(POCLRT_PATH)/lib:$(DRIVER_PATH)/rtlsim:$(LD_LIBRARY_PATH) ./$(PROJECT)
run: $(PROJECT).hex
POCL_DEBUG=all $(VX_SIMX_PATH)/Vcache_simX -E -a rv32i --core $(PROJECT).hex -s -b 1> emulator.debug
qemu: $(PROJECT).qemu
POCL_DEBUG=all $(RISCV_TOOL_PATH)/bin/qemu-riscv32 -d in_asm -D debug.log $(PROJECT).qemu
gdb-s: $(PROJECT).qemu
POCL_DEBUG=all $(RISCV_TOOL_PATH)/bin/qemu-riscv32 -g 1234 -d in_asm -D debug.log $(PROJECT).qemu
gdb-c: $(PROJECT).qemu
$(GDB) $(PROJECT).qemu
.depend: $(SRCS)
$(CXX) $(CXXFLAGS) -MM $^ > .depend;
clean:
rm -rf *.o *.elf *.dump *.hex *.qemu *.log *.debug
rm -rf $(PROJECT) *.o *.dump .depend
ifneq ($(MAKECMDGOALS),clean)
-include .depend
endif

View File

@@ -1,54 +1,54 @@
// Image convolution kernel: each work-item accumulates a weighted sum of the
// source pixels surrounding its own (column, row) position and, if that
// position lies inside rows x cols, writes the sum to the output image.
// The loops visit (2*halfWidth + 1)^2 taps and read `filter` sequentially, so
// the filter is consumed in row-major order; for an even filterWidth this is
// filterWidth + 1 taps per side.
// How out-of-range reads are resolved is decided by `sampler`, not by this code.
__kernel
void convolution(
__read_only image2d_t sourceImage,
__write_only image2d_t outputImage,
int rows,
int cols,
__constant float* filter,
int filterWidth,
sampler_t sampler)
{
// Store each work-items unique row and column
int column = get_global_id(0);
int row = get_global_id(1);
// Half the width of the filter is needed for indexing
// memory later
int halfWidth = (int)(filterWidth/2);
// All accesses to images return data as four-element vector
// (i.e., float4), although only the 'x' component will contain
// meaningful data in this code
float4 sum = {0.0f, 0.0f, 0.0f, 0.0f};
// Iterator for the filter
int filterIdx = 0;
// Each work-item iterates around its local area based on the
// size of the filter
int2 coords; // Coordinates for accessing the image
// Iterate the filter rows
for(int i = -halfWidth; i <= halfWidth; i++) {
coords.y = row + i;
// Iterate over the filter columns
for(int j = -halfWidth; j <= halfWidth; j++) {
coords.x = column + j;
float4 pixel;
// Read a pixel from the image. A single channel image
// stores the pixel in the 'x' coordinate of the returned
// vector.
pixel = read_imagef(sourceImage, sampler, coords);
sum.x += pixel.x * filter[filterIdx++];
}
}
// Copy the data to the output image if the
// work-item is in bounds
if(row < rows && column < cols) {
coords.x = column;
coords.y = row;
write_imagef(outputImage, coords, sum);
}
// Image convolution kernel.
//
// One work-item handles one output pixel at (get_global_id(0), get_global_id(1)).
// It sums (2*radius + 1)^2 source samples around that pixel, each weighted by
// the next entry of `filter` (entries are consumed in row-major order), and
// stores the sum in the x channel of the output pixel. The remaining channels
// are written as zero. Work-items outside rows x cols compute but do not write.
// Out-of-range source reads are resolved by `sampler`.
__kernel
void convolution(
    __read_only image2d_t sourceImage,
    __write_only image2d_t outputImage,
    int rows,
    int cols,
    __constant float* filter,
    int filterWidth,
    sampler_t sampler)
{
    // Position of this work-item in the 2D range
    const int x = get_global_id(0);
    const int y = get_global_id(1);

    // Number of taps on each side of the centre pixel
    const int radius = filterWidth / 2;

    // Only the x channel carries data for a single-channel image
    float4 acc = (float4)(0.0f, 0.0f, 0.0f, 0.0f);

    // Index of the next filter coefficient to apply
    int tap = 0;

    for (int dy = -radius; dy <= radius; dy++) {
        for (int dx = -radius; dx <= radius; dx++) {
            const int2 at = (int2)(x + dx, y + dy);
            const float4 texel = read_imagef(sourceImage, sampler, at);
            acc.x += texel.x * filter[tap++];
        }
    }

    // Only work-items that map onto the image store a result
    if (y < rows && x < cols) {
        write_imagef(outputImage, (int2)(x, y), acc);
    }
}

View File

@@ -1,261 +1,261 @@
#include <stdio.h>
#include <stdlib.h>
#include <CL/cl.h>
#include "utils.h"
// Rounds `value` up to the nearest multiple of `multiple` and returns it.
// A value that is already a multiple is returned unchanged.
// A `multiple` of 0 has nothing to round to, so `value` is returned as-is
// instead of dividing by zero.
unsigned int roundUp(unsigned int value, unsigned int multiple) {
    if(multiple == 0) {
        return value;
    }
    // Determine how far past the nearest multiple the value is
    unsigned int remainder = value % multiple;
    // Add the difference to make the value a multiple
    if(remainder != 0) {
        value += (multiple-remainder);
    }
    return value;
}
// Reads the whole file at `kernelPath` into a newly allocated, NUL-terminated
// buffer and returns it. The caller owns the buffer and releases it with free().
// Any failure (open, seek, tell, allocation, read error) prints a message and
// terminates the program with exit(-1).
char* readSource(char* kernelPath) {
    FILE *fp;
    char *source;
    long int size;
    size_t bytesRead;

    printf("Program file is: %s\n", kernelPath);

    fp = fopen(kernelPath, "rb");
    if(!fp) {
        printf("Could not open kernel file\n");
        exit(-1);
    }
    if(fseek(fp, 0, SEEK_END) != 0) {
        printf("Error seeking to end of file\n");
        exit(-1);
    }
    size = ftell(fp);
    if(size < 0) {
        printf("Error getting file position\n");
        exit(-1);
    }
    rewind(fp);

    // calloc zero-fills, so the buffer is terminated wherever the read stops.
    // The result is checked before the buffer is touched.
    source = (char *)calloc((size_t)size + 1, 1);
    if(source == NULL) {
        printf("Error allocating space for the kernel source\n");
        exit(-1);
    }

    bytesRead = fread(source, 1, (size_t)size, fp);
    if(bytesRead != (size_t)size && ferror(fp)) {
        printf("Error reading the kernel source\n");
        exit(-1);
    }
    source[bytesRead] = '\0';

    fclose(fp);
    return source;
}
// Verifies the status code of an OpenCL call. On anything other than
// CL_SUCCESS it prints the name of the call with the code and exits with -1.
void chk(cl_int status, const char* cmd) {
    if(status == CL_SUCCESS) {
        return;
    }
    printf("%s failed (%d)\n", cmd, status);
    exit(-1);
}
// Host driver: loads input.bmp, runs the "convolution" kernel from kernel.cl
// on the first device of the first platform, stores the result in output.bmp
// and compares it against a reference convolution computed on the host.
int main() {
int i, j, k, l;
// Rows and columns in the input image
int imageHeight;
int imageWidth;
const char* inputFile = "input.bmp";
const char* outputFile = "output.bmp";
// Homegrown function to read a BMP from file
float* inputImage = readImage(inputFile, &imageWidth,
&imageHeight);
// Size of the input and output images on the host
int dataSize = imageHeight*imageWidth*sizeof(float);
// Output image on the host
// NOTE(review): neither malloc result below is checked before use.
float* outputImage = NULL;
outputImage = (float*)malloc(dataSize);
float* refImage = NULL;
refImage = (float*)malloc(dataSize);
// 45 degree motion blur
// NOTE(review): the non-zero taps are [-1 0 1; -2 0 2; -1 0 1] centred in the
// 7x7 grid, i.e. a horizontal-gradient (Sobel-style) filter, which does not
// match the "motion blur" description above.
float filter[49] =
{0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0,
0, 0, -1, 0, 1, 0, 0,
0, 0, -2, 0, 2, 0, 0,
0, 0, -1, 0, 1, 0, 0,
0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0};
// The convolution filter is 7x7
int filterWidth = 7;
int filterSize = filterWidth*filterWidth; // Assume a square kernel
// Set up the OpenCL environment
cl_int status;
// Discovery platform
cl_platform_id platform;
status = clGetPlatformIDs(1, &platform, NULL);
chk(status, "clGetPlatformIDs");
// Discover device
// NOTE(review): the return value of clGetDeviceIDs is discarded, so the chk()
// below re-tests the status left behind by clGetPlatformIDs.
cl_device_id device;
clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 1, &device, NULL);
chk(status, "clGetDeviceIDs");
// Create context
cl_context_properties props[3] = {CL_CONTEXT_PLATFORM,
(cl_context_properties)(platform), 0};
cl_context context;
context = clCreateContext(props, 1, &device, NULL, NULL, &status);
chk(status, "clCreateContext");
// Create command queue
cl_command_queue queue;
queue = clCreateCommandQueue(context, device, 0, &status);
chk(status, "clCreateCommandQueue");
// The image format describes how the data will be stored in memory
cl_image_format format;
format.image_channel_order = CL_R; // single channel
format.image_channel_data_type = CL_FLOAT; // float data type
// Create space for the source image on the device
cl_mem d_inputImage = clCreateImage2D(context, 0, &format, imageWidth,
imageHeight, 0, NULL, &status);
chk(status, "clCreateImage2D");
// Create space for the output image on the device
cl_mem d_outputImage = clCreateImage2D(context, 0, &format, imageWidth,
imageHeight, 0, NULL, &status);
chk(status, "clCreateImage2D");
// Create space for the 7x7 filter on the device
cl_mem d_filter = clCreateBuffer(context, 0, filterSize*sizeof(float),
NULL, &status);
chk(status, "clCreateBuffer");
// Copy the source image to the device
size_t origin[3] = {0, 0, 0}; // Offset within the image to copy from
size_t region[3] = {imageWidth, imageHeight, 1}; // Elements to per dimension
status = clEnqueueWriteImage(queue, d_inputImage, CL_FALSE, origin, region,
0, 0, inputImage, 0, NULL, NULL);
chk(status, "clEnqueueWriteImage");
// Copy the 7x7 filter to the device
status = clEnqueueWriteBuffer(queue, d_filter, CL_FALSE, 0,
filterSize*sizeof(float), filter, 0, NULL, NULL);
chk(status, "clEnqueueWriteBuffer");
// Create the image sampler
cl_sampler sampler = clCreateSampler(context, CL_FALSE,
CL_ADDRESS_CLAMP_TO_EDGE, CL_FILTER_NEAREST, &status);
chk(status, "clCreateSampler");
const char* source = readSource("kernel.cl");
// Create a program object with source and build it
// NOTE(review): errcode_ret is NULL here, so the chk() that follows re-tests
// the status from clCreateSampler rather than this call.
cl_program program;
program = clCreateProgramWithSource(context, 1, &source, NULL, NULL);
chk(status, "clCreateProgramWithSource");
status = clBuildProgram(program, 1, &device, NULL, NULL, NULL);
chk(status, "clBuildProgram");
// Create the kernel object
cl_kernel kernel;
kernel = clCreateKernel(program, "convolution", &status);
chk(status, "clCreateKernel");
// Set the kernel arguments
status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &d_inputImage);
status |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &d_outputImage);
status |= clSetKernelArg(kernel, 2, sizeof(int), &imageHeight);
status |= clSetKernelArg(kernel, 3, sizeof(int), &imageWidth);
status |= clSetKernelArg(kernel, 4, sizeof(cl_mem), &d_filter);
status |= clSetKernelArg(kernel, 5, sizeof(int), &filterWidth);
status |= clSetKernelArg(kernel, 6, sizeof(cl_sampler), &sampler);
chk(status, "clSetKernelArg");
// Set the work item dimensions
size_t globalSize[2] = {imageWidth, imageHeight};
status = clEnqueueNDRangeKernel(queue, kernel, 2, NULL, globalSize, NULL, 0,
NULL, NULL);
chk(status, "clEnqueueNDRange");
// Read the image back to the host
status = clEnqueueReadImage(queue, d_outputImage, CL_TRUE, origin,
region, 0, 0, outputImage, 0, NULL, NULL);
chk(status, "clEnqueueReadImage");
// Write the output image to file
storeImage(outputImage, outputFile, imageHeight, imageWidth, inputFile);
// Compute the reference image
for(i = 0; i < imageHeight; i++) {
for(j = 0; j < imageWidth; j++) {
refImage[i*imageWidth+j] = 0;
}
}
// Iterate over the rows of the source image
int halfFilterWidth = filterWidth/2;
float sum;
for(i = 0; i < imageHeight; i++) {
// Iterate over the columns of the source image
for(j = 0; j < imageWidth; j++) {
sum = 0; // Reset sum for new source pixel
// Apply the filter to the neighborhood
// NOTE(review): out-of-range neighbours are skipped here (zero padding),
// whereas the sampler above is created with CL_ADDRESS_CLAMP_TO_EDGE;
// confirm whether border pixels are expected to agree.
for(k = - halfFilterWidth; k <= halfFilterWidth; k++) {
for(l = - halfFilterWidth; l <= halfFilterWidth; l++) {
if(i+k >= 0 && i+k < imageHeight &&
j+l >= 0 && j+l < imageWidth) {
sum += inputImage[(i+k)*imageWidth + j+l] *
filter[(k+halfFilterWidth)*filterWidth +
l+halfFilterWidth];
}
}
}
refImage[i*imageWidth+j] = sum;
}
}
int failed = 0;
for(i = 0; i < imageHeight; i++) {
for(j = 0; j < imageWidth; j++) {
// NOTE(review): abs() is given a float difference; if this resolves to the
// integer abs(), differences below 1.0 truncate to 0 and never exceed the
// 0.01 tolerance -- confirm a floating-point absolute value was intended.
if(abs(outputImage[i*imageWidth+j]-refImage[i*imageWidth+j]) > 0.01) {
printf("Results are INCORRECT\n");
printf("Pixel mismatch at <%d,%d> (%f vs. %f)\n", i, j,
outputImage[i*imageWidth+j], refImage[i*imageWidth+j]);
failed = 1;
}
if(failed) break;
}
if(failed) break;
}
if(!failed) {
printf("Results are correct\n");
}
return 0;
#include <stdio.h>
#include <stdlib.h>
#include <CL/cl.h>
#include "utils.h"
// Rounds `value` up to the nearest multiple of `multiple` and returns it.
// A value that is already a multiple is returned unchanged.
// A `multiple` of 0 has nothing to round to, so `value` is returned as-is
// instead of dividing by zero.
unsigned int roundUp(unsigned int value, unsigned int multiple) {
    if(multiple == 0) {
        return value;
    }
    // Determine how far past the nearest multiple the value is
    unsigned int remainder = value % multiple;
    // Add the difference to make the value a multiple
    if(remainder != 0) {
        value += (multiple-remainder);
    }
    return value;
}
// Reads the whole file at `kernelPath` into a newly allocated, NUL-terminated
// buffer and returns it. The caller owns the buffer and releases it with free().
// Any failure (open, seek, tell, allocation, read error) prints a message and
// terminates the program with exit(-1).
char* readSource(char* kernelPath) {
    FILE *fp;
    char *source;
    long int size;
    size_t bytesRead;

    printf("Program file is: %s\n", kernelPath);

    fp = fopen(kernelPath, "rb");
    if(!fp) {
        printf("Could not open kernel file\n");
        exit(-1);
    }
    if(fseek(fp, 0, SEEK_END) != 0) {
        printf("Error seeking to end of file\n");
        exit(-1);
    }
    size = ftell(fp);
    if(size < 0) {
        printf("Error getting file position\n");
        exit(-1);
    }
    rewind(fp);

    // calloc zero-fills, so the buffer is terminated wherever the read stops.
    // The result is checked before the buffer is touched.
    source = (char *)calloc((size_t)size + 1, 1);
    if(source == NULL) {
        printf("Error allocating space for the kernel source\n");
        exit(-1);
    }

    bytesRead = fread(source, 1, (size_t)size, fp);
    if(bytesRead != (size_t)size && ferror(fp)) {
        printf("Error reading the kernel source\n");
        exit(-1);
    }
    source[bytesRead] = '\0';

    fclose(fp);
    return source;
}
// Verifies the status code of an OpenCL call. On anything other than
// CL_SUCCESS it prints the name of the call with the code and exits with -1.
void chk(cl_int status, const char* cmd) {
    if(status == CL_SUCCESS) {
        return;
    }
    printf("%s failed (%d)\n", cmd, status);
    exit(-1);
}
int main() {
int i, j, k, l;
// Rows and columns in the input image
int imageHeight;
int imageWidth;
const char* inputFile = "input.bmp";
const char* outputFile = "output.bmp";
// Homegrown function to read a BMP from file
float* inputImage = readImage(inputFile, &imageWidth,
&imageHeight);
// Size of the input and output images on the host
int dataSize = imageHeight*imageWidth*sizeof(float);
// Output image on the host
float* outputImage = NULL;
outputImage = (float*)malloc(dataSize);
float* refImage = NULL;
refImage = (float*)malloc(dataSize);
// 45 degree motion blur
float filter[49] =
{0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0,
0, 0, -1, 0, 1, 0, 0,
0, 0, -2, 0, 2, 0, 0,
0, 0, -1, 0, 1, 0, 0,
0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0};
// The convolution filter is 7x7
int filterWidth = 7;
int filterSize = filterWidth*filterWidth; // Assume a square kernel
// Set up the OpenCL environment
cl_int status;
// Discovery platform
cl_platform_id platform;
status = clGetPlatformIDs(1, &platform, NULL);
chk(status, "clGetPlatformIDs");
// Discover device
cl_device_id device;
clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 1, &device, NULL);
chk(status, "clGetDeviceIDs");
// Create context
cl_context_properties props[3] = {CL_CONTEXT_PLATFORM,
(cl_context_properties)(platform), 0};
cl_context context;
context = clCreateContext(props, 1, &device, NULL, NULL, &status);
chk(status, "clCreateContext");
// Create command queue
cl_command_queue queue;
queue = clCreateCommandQueue(context, device, 0, &status);
chk(status, "clCreateCommandQueue");
// The image format describes how the data will be stored in memory
cl_image_format format;
format.image_channel_order = CL_R; // single channel
format.image_channel_data_type = CL_FLOAT; // float data type
// Create space for the source image on the device
cl_mem d_inputImage = clCreateImage2D(context, 0, &format, imageWidth,
imageHeight, 0, NULL, &status);
chk(status, "clCreateImage2D");
// Create space for the output image on the device
cl_mem d_outputImage = clCreateImage2D(context, 0, &format, imageWidth,
imageHeight, 0, NULL, &status);
chk(status, "clCreateImage2D");
// Create space for the 7x7 filter on the device
cl_mem d_filter = clCreateBuffer(context, 0, filterSize*sizeof(float),
NULL, &status);
chk(status, "clCreateBuffer");
// Copy the source image to the device
size_t origin[3] = {0, 0, 0}; // Offset within the image to copy from
size_t region[3] = {imageWidth, imageHeight, 1}; // Elements to per dimension
status = clEnqueueWriteImage(queue, d_inputImage, CL_FALSE, origin, region,
0, 0, inputImage, 0, NULL, NULL);
chk(status, "clEnqueueWriteImage");
// Copy the 7x7 filter to the device
status = clEnqueueWriteBuffer(queue, d_filter, CL_FALSE, 0,
filterSize*sizeof(float), filter, 0, NULL, NULL);
chk(status, "clEnqueueWriteBuffer");
// Create the image sampler
cl_sampler sampler = clCreateSampler(context, CL_FALSE,
CL_ADDRESS_CLAMP_TO_EDGE, CL_FILTER_NEAREST, &status);
chk(status, "clCreateSampler");
const char* source = readSource("kernel.cl");
// Create a program object with source and build it
cl_program program;
program = clCreateProgramWithSource(context, 1, &source, NULL, NULL);
chk(status, "clCreateProgramWithSource");
status = clBuildProgram(program, 1, &device, NULL, NULL, NULL);
chk(status, "clBuildProgram");
// Create the kernel object
cl_kernel kernel;
kernel = clCreateKernel(program, "convolution", &status);
chk(status, "clCreateKernel");
// Set the kernel arguments
status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &d_inputImage);
status |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &d_outputImage);
status |= clSetKernelArg(kernel, 2, sizeof(int), &imageHeight);
status |= clSetKernelArg(kernel, 3, sizeof(int), &imageWidth);
status |= clSetKernelArg(kernel, 4, sizeof(cl_mem), &d_filter);
status |= clSetKernelArg(kernel, 5, sizeof(int), &filterWidth);
status |= clSetKernelArg(kernel, 6, sizeof(cl_sampler), &sampler);
chk(status, "clSetKernelArg");
// Set the work item dimensions
size_t globalSize[2] = {imageWidth, imageHeight};
status = clEnqueueNDRangeKernel(queue, kernel, 2, NULL, globalSize, NULL, 0,
NULL, NULL);
chk(status, "clEnqueueNDRange");
// Read the image back to the host
status = clEnqueueReadImage(queue, d_outputImage, CL_TRUE, origin,
region, 0, 0, outputImage, 0, NULL, NULL);
chk(status, "clEnqueueReadImage");
// Write the output image to file
storeImage(outputImage, outputFile, imageHeight, imageWidth, inputFile);
// Compute the reference image
for(i = 0; i < imageHeight; i++) {
for(j = 0; j < imageWidth; j++) {
refImage[i*imageWidth+j] = 0;
}
}
// Iterate over the rows of the source image
int halfFilterWidth = filterWidth/2;
float sum;
for(i = 0; i < imageHeight; i++) {
// Iterate over the columns of the source image
for(j = 0; j < imageWidth; j++) {
sum = 0; // Reset sum for new source pixel
// Apply the filter to the neighborhood
for(k = - halfFilterWidth; k <= halfFilterWidth; k++) {
for(l = - halfFilterWidth; l <= halfFilterWidth; l++) {
if(i+k >= 0 && i+k < imageHeight &&
j+l >= 0 && j+l < imageWidth) {
sum += inputImage[(i+k)*imageWidth + j+l] *
filter[(k+halfFilterWidth)*filterWidth +
l+halfFilterWidth];
}
}
}
refImage[i*imageWidth+j] = sum;
}
}
int failed = 0;
for(i = 0; i < imageHeight; i++) {
for(j = 0; j < imageWidth; j++) {
if(abs(outputImage[i*imageWidth+j]-refImage[i*imageWidth+j]) > 0.01) {
printf("Results are INCORRECT\n");
printf("Pixel mismatch at <%d,%d> (%f vs. %f)\n", i, j,
outputImage[i*imageWidth+j], refImage[i*imageWidth+j]);
failed = 1;
}
if(failed) break;
}
if(failed) break;
}
if(!failed) {
printf("Results are correct\n");
}
return 0;
}

View File

@@ -1,68 +0,0 @@
# Build script for the cutcp OpenCL benchmark: kernel.cl is compiled with
# poclcc into lib$(PROJECT).a, the host sources are cross-compiled with the
# riscv32-unknown-elf toolchain, and the result is converted to .hex/.dump
# or run under simX / qemu.

# Tool and library locations; each may be overridden from the environment.
RISCV_TOOL_PATH ?= $(wildcard ../../../../riscv-gnu-toolchain/drops)
POCL_CC_PATH ?= $(wildcard ../../../../pocl/drops_riscv_cc)
POCL_INC_PATH ?= $(wildcard ../include)
POCL_LIB_PATH ?= $(wildcard ../lib)
VX_RT_PATH ?= $(wildcard ../../../runtime)
VX_SIMX_PATH ?= $(wildcard ../../../simX/obj_dir)
# Cross toolchain binaries
CC = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-gcc
CXX = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-g++
DMP = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-objdump
HEX = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-objcopy
GDB = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-gdb
# Runtime sources compiled into the .elf image
VX_SRCS = $(VX_RT_PATH)/newlib/newlib.c
VX_SRCS += $(VX_RT_PATH)/startup/vx_start.S
VX_SRCS += $(VX_RT_PATH)/intrinsics/vx_intrinsics.s
VX_SRCS += $(VX_RT_PATH)/io/vx_io.s $(VX_RT_PATH)/io/vx_io.c
VX_SRCS += $(VX_RT_PATH)/fileio/fileio.s
VX_SRCS += $(VX_RT_PATH)/tests/tests.c
VX_SRCS += $(VX_RT_PATH)/vx_api/vx_api.c
# NOTE(review): VX_STR, VX_FIO, VX_NEWLIB, VX_INT, VX_IO, VX_API and VX_TEST
# are not defined in this file; unless supplied externally they expand to
# nothing.
VX_SRCS += $(VX_STR) $(VX_FIO) $(VX_NEWLIB) $(VX_INT) $(VX_IO) $(VX_API) $(VX_TEST)
# Link flags used only for the .elf image (custom linker script, no startfiles)
VX_CFLAGS = -nostartfiles -Wl,-Bstatic,-T,$(VX_RT_PATH)/mains/vortex_link.ld
CXXFLAGS = -g -O0 -march=rv32im -mabi=ilp32
CXXFLAGS += -ffreestanding # program may not begin at main()
CXXFLAGS += -Wl,--gc-sections # enable garbage collection of unused input sections
CXXFLAGS += -fno-rtti -fno-non-call-exceptions # disable RTTI and exceptions
CXXFLAGS += -I$(POCL_INC_PATH) -I.
# The kernel archive is linked with --whole-archive; PROJECT is assigned
# further down, which works because these are recursively expanded variables.
VX_LIBS = -Wl,--whole-archive lib$(PROJECT).a -Wl,--no-whole-archive $(POCL_LIB_PATH)/libOpenCL.a
QEMU_LIBS = $(VX_RT_PATH)/qemu/vx_api.c -Wl,--whole-archive lib$(PROJECT).a -Wl,--no-whole-archive $(POCL_LIB_PATH)/libOpenCL.a
PROJECT = cutcp
SRCS = main.cc args.c parboil_opencl.c ocl.c gpu_info.c cutoff.c cutcpu.c output.c readatom.c excl.c
# Default target: disassembly listing and Intel-hex image
all: $(PROJECT).dump $(PROJECT).hex
# Compile the OpenCL kernel into a static library with poclcc
lib$(PROJECT).a: kernel.cl
POCL_DEBUG=all POCL_DEBUG_LLVM_PASSES=1 LD_LIBRARY_PATH=$(RISCV_TOOL_PATH)/lib:$(POCL_CC_PATH)/lib $(POCL_CC_PATH)/bin/poclcc -o lib$(PROJECT).a kernel.cl
# Image linked against the runtime sources in VX_SRCS
$(PROJECT).elf: $(SRCS) lib$(PROJECT).a
$(CXX) $(CXXFLAGS) $(VX_CFLAGS) $(VX_SRCS) $(SRCS) $(VX_LIBS) -o $(PROJECT).elf
# Variant linked for qemu-riscv32 (uses the qemu vx_api.c instead of VX_SRCS)
$(PROJECT).qemu: $(SRCS) lib$(PROJECT).a
$(CXX) $(CXXFLAGS) $(SRCS) $(QEMU_LIBS) -o $(PROJECT).qemu
$(PROJECT).hex: $(PROJECT).elf
$(HEX) -O ihex $(PROJECT).elf $(PROJECT).hex
$(PROJECT).dump: $(PROJECT).elf
$(DMP) -D $(PROJECT).elf > $(PROJECT).dump
# Run the hex image in simX; stdout is captured in emulator.debug
# NOTE(review): simX is invoked with -a rv32i while the code is built with
# -march=rv32im -- confirm this is intended.
run: $(PROJECT).hex
POCL_DEBUG=all $(VX_SIMX_PATH)/Vcache_simX -E -a rv32i --core $(PROJECT).hex -s -b 1> emulator.debug
# Run under qemu, logging translated instructions to debug.log
qemu: $(PROJECT).qemu
POCL_DEBUG=all $(RISCV_TOOL_PATH)/bin/qemu-riscv32 -d in_asm -D debug.log $(PROJECT).qemu
# gdb-s starts qemu waiting for a debugger on port 1234; gdb-c launches gdb
gdb-s: $(PROJECT).qemu
POCL_DEBUG=all $(RISCV_TOOL_PATH)/bin/qemu-riscv32 -g 1234 -d in_asm -D debug.log $(PROJECT).qemu
gdb-c: $(PROJECT).qemu
$(GDB) $(PROJECT).qemu
# NOTE(review): lib$(PROJECT).a is not matched by any of these patterns, so
# the kernel archive survives "make clean".
clean:
rm -rf *.o *.elf *.dump *.hex *.qemu *.log *.debug

View File

@@ -1,617 +0,0 @@
#include <parboil.h>
#include <errno.h>
#include <limits.h>
#include <stdlib.h>
#include <string.h>
#include <stdio.h>
/*****************************************************************************/
/* Memory management routines */
/* Release a NULL-terminated array of heap-allocated strings: every element
 * is freed, then the array itself. A NULL array is accepted and ignored. */
void
pb_FreeStringArray(char **string_array)
{
  size_t idx;

  if (string_array != NULL) {
    for (idx = 0; string_array[idx] != NULL; idx++)
      free(string_array[idx]);
    free(string_array);
  }
}
/* Create a platform selector from 'name' and an optional 'version'.
 *
 * The two pointers are stored as given (no copy is made); they are later
 * released with free() by pb_FreePlatformParam(), so they must be heap
 * allocated. 'name' must not be NULL: a NULL name is reported on stderr and
 * terminates the program. 'version' may be NULL.
 *
 * Returns the new structure, or NULL if it could not be allocated, in which
 * case 'name' and 'version' still belong to the caller.
 */
struct pb_PlatformParam *
pb_PlatformParam(char *name, char *version)
{
  struct pb_PlatformParam *ret;

  if (name == NULL) {
    fprintf(stderr, "pb_PlatformParam: Invalid argument\n");
    exit(-1);
  }

  ret = (struct pb_PlatformParam *)malloc(sizeof (struct pb_PlatformParam));
  if (ret == NULL)
    return NULL;

  ret->name = name;
  ret->version = version;
  return ret;
}
/* Release a platform selector together with the name and version strings
 * it holds. A NULL selector is ignored. */
void
pb_FreePlatformParam(struct pb_PlatformParam *p)
{
  if (p != NULL) {
    free(p->version);
    free(p->name);
    free(p);
  }
}
/* Create a device selector that picks a device by its numeric index.
 * Returns NULL if the selector could not be allocated. */
struct pb_DeviceParam *
pb_DeviceParam_index(int index)
{
  struct pb_DeviceParam *ret =
    (struct pb_DeviceParam *)malloc(sizeof (struct pb_DeviceParam));
  if (ret == NULL)
    return NULL;
  ret->criterion = pb_Device_INDEX;
  ret->index = index;
  return ret;
}
/* Create a device selector with the pb_Device_CPU criterion.
 * Returns NULL if the selector could not be allocated. */
struct pb_DeviceParam *
pb_DeviceParam_cpu(void)
{
  struct pb_DeviceParam *ret =
    (struct pb_DeviceParam *)malloc(sizeof (struct pb_DeviceParam));
  if (ret == NULL)
    return NULL;
  ret->criterion = pb_Device_CPU;
  return ret;
}
/* Create a device selector with the pb_Device_GPU criterion.
 * Returns NULL if the selector could not be allocated. */
struct pb_DeviceParam *
pb_DeviceParam_gpu(void)
{
  struct pb_DeviceParam *ret =
    (struct pb_DeviceParam *)malloc(sizeof (struct pb_DeviceParam));
  if (ret == NULL)
    return NULL;
  ret->criterion = pb_Device_GPU;
  return ret;
}
/* Create a device selector with the pb_Device_ACCELERATOR criterion.
 * Returns NULL if the selector could not be allocated. */
struct pb_DeviceParam *
pb_DeviceParam_accelerator(void)
{
  struct pb_DeviceParam *ret =
    (struct pb_DeviceParam *)malloc(sizeof (struct pb_DeviceParam));
  if (ret == NULL)
    return NULL;
  ret->criterion = pb_Device_ACCELERATOR;
  return ret;
}
/* Create a device selector that picks a device by name.
 *
 * 'name' is stored as given (no copy); pb_FreeDeviceParam() releases it with
 * free(), so it must be heap allocated.
 *
 * Returns NULL if the selector could not be allocated, in which case 'name'
 * still belongs to the caller.
 */
struct pb_DeviceParam *
pb_DeviceParam_name(char *name)
{
  struct pb_DeviceParam *ret =
    (struct pb_DeviceParam *)malloc(sizeof (struct pb_DeviceParam));
  if (ret == NULL)
    return NULL;
  ret->criterion = pb_Device_NAME;
  ret->name = name;
  return ret;
}
void
pb_FreeDeviceParam(struct pb_DeviceParam *p)
{
if (p == NULL) return;
switch(p->criterion) {
case pb_Device_NAME:
free(p->name);
break;
case pb_Device_INDEX:
case pb_Device_CPU:
case pb_Device_ACCELERATOR:
break;
default:
fprintf(stderr, "pb_FreeDeviceParam: Invalid argument\n");
exit(-1);
}
}
/* Release a parameters structure and everything it owns: the output file
 * name, the input file list, the platform selector and the device selector.
 * A NULL pointer is ignored, like in the other pb_Free* functions. */
void
pb_FreeParameters(struct pb_Parameters *p)
{
  if (p == NULL) return;

  free(p->outFile);
  pb_FreeStringArray(p->inpFiles);
  pb_FreePlatformParam(p->platform);
  pb_FreeDeviceParam(p->device);
  free(p);
}
/*****************************************************************************/
/* Parse a comma-delimited list of strings into a newly allocated,
 * NULL-terminated array of newly allocated strings.
 *
 * Empty items are kept: "a,,b" yields "a", "" and "b", and an empty input
 * yields a single empty string. 'in' is only read.
 *
 * Returns NULL if memory runs out; nothing is leaked in that case. */
static char **
read_string_array(char *in)
{
  char **ret;
  size_t i;
  size_t count;             /* Number of items in the input */
  const char *substring;    /* Current substring within 'in' */

  /* Count the number of items in the string */
  count = 1;
  for (i = 0; in[i]; i++) if (in[i] == ',') count++;

  /* Allocate storage, including room for the sentinel */
  ret = (char **)malloc((count + 1) * sizeof(char *));
  if (ret == NULL) return NULL;

  /* Create copies of the strings from the list */
  substring = in;
  for (i = 0; i < count; i++) {
    /* Length up to the next ',' or the end of the input */
    size_t substring_length = strcspn(substring, ",");

    ret[i] = (char *)malloc(substring_length + 1);
    if (ret[i] == NULL) {
      /* Undo the copies made so far */
      size_t j;
      for (j = 0; j < i; j++) free(ret[j]);
      free(ret);
      return NULL;
    }
    memcpy(ret[i], substring, substring_length);
    ret[i][substring_length] = 0;

    /* Go to the next substring; do not step past the terminating NUL */
    substring += substring_length;
    if (*substring == ',') substring++;
  }
  ret[count] = NULL;        /* Write the sentinel value */

  return ret;
}
/* Emit a parse-error message on stderr exactly as given (no newline is
 * appended). */
static void
report_parse_error(const char *str)
{
  fprintf(stderr, "%s", str);
}
/* Interpret a string as a 'pb_DeviceParam' value.
 * Return a pointer to a new value, or NULL on failure.
 *
 * Interpretations are tried in this order:
 *   1. a decimal integer (the whole string) -> device index;
 *      negative or out-of-range values are rejected with NULL
 *   2. "CPU", "GPU" or "ACCELERATOR"        -> the matching device class
 *   3. anything else                        -> device name (copied)
 */
static struct pb_DeviceParam *
read_device_param(char *str)
{
  char *end;
  char *name_copy;
  long device_int;

  /* strtol() does not set errno when no digits are found, so a successful
   * conversion is recognised by 'end': it must have moved, and it must have
   * reached the end of the string. */
  errno = 0;
  device_int = strtol(str, &end, 10);
  if (end != str && *end == '\0') {
    /* Negative numbers and values that do not fit an int are not valid */
    if (errno == ERANGE || device_int < 0 || device_int > INT_MAX)
      return NULL;
    return pb_DeviceParam_index((int)device_int);
  }

  /* Match against predefined strings */
  if (strcmp(str, "CPU") == 0)
    return pb_DeviceParam_cpu();
  if (strcmp(str, "GPU") == 0)
    return pb_DeviceParam_gpu();
  if (strcmp(str, "ACCELERATOR") == 0)
    return pb_DeviceParam_accelerator();

  /* Assume any other string is a device name */
  name_copy = strdup(str);
  if (name_copy == NULL)
    return NULL;
  return pb_DeviceParam_name(name_copy);
}
/* Interpret a string as a 'pb_PlatformParam' value.
 * Return a pointer to a new value, or NULL on failure.
 *
 * The string is split at its last '-': the part before it is the platform
 * name and the part after it is the version. Without a '-' the whole string
 * is the name and the version is NULL. Both parts are copied.
 */
static struct pb_PlatformParam *
read_platform_param(char *str)
{
  /* Last occurrence of '-' in 'str', or NULL if there is none */
  const char *separator = strrchr(str, '-');
  size_t name_length = separator ? (size_t)(separator - str) : strlen(str);
  char *name_str;
  char *version_str = NULL;
  struct pb_PlatformParam *param;

  /* The platform name is either the entire string, or all characters before
   * the separator */
  name_str = (char *)malloc(name_length + 1);
  if (name_str == NULL)
    return NULL;
  memcpy(name_str, str, name_length);
  name_str[name_length] = 0;

  /* The version is either NULL, or all characters after the separator */
  if (separator != NULL) {
    const char *version_input_str = separator + 1;
    size_t version_length = strlen(version_input_str);

    version_str = (char *)malloc(version_length + 1);
    if (version_str == NULL) {
      free(name_str);
      return NULL;
    }
    memcpy(version_str, version_input_str, version_length);
    version_str[version_length] = 0;
  }

  /* Create output structure; if that yields NULL the copies are released
   * here so they do not leak */
  param = pb_PlatformParam(name_str, version_str);
  if (param == NULL) {
    free(name_str);
    free(version_str);
  }
  return param;
}
/****************************************************************************/
/* Argument parsing state.
*
* Arguments that are interpreted by the argument parser are removed from
* the list. Variables 'argc' and 'argn' do not count arguments that have
* been removed.
*
* During argument parsing, the array of arguments is compacted, overwriting
* the erased arguments. Variable 'argv_put' points to the array element
* where the next argument will be written. Variable 'argv_get' points to
* the array element where the next argument will be read from.
*
* Both pointers start at the beginning of the same argv array, so the
* compaction happens in place.
*/
struct argparse {
int argc; /* Number of arguments. Mutable. */
int argn; /* Current argument index. */
char **argv_get; /* Argument value being read. */
char **argv_put; /* Argument value being written.
* argv_put <= argv_get. */
};
/* Start parsing 'argv': no argument has been visited yet, and both the read
 * and the write cursor point at the first element. */
static void
initialize_argparse(struct argparse *ap, int argc, char **argv)
{
  ap->argv_put = argv;
  ap->argv_get = argv;
  ap->argn = 0;
  ap->argc = argc;
}
/* End argument parsing without interpreting whatever is left.
 * The remaining arguments are shifted down to their final positions, the
 * surviving argument count is stored in *_argc, and argv is NULL-terminated
 * at that count. */
static void
finalize_argparse(struct argparse *ap, int *_argc, char **argv)
{
  /* Shift every unvisited argument to the write position */
  while (ap->argn < ap->argc) {
    *ap->argv_put = *ap->argv_get;
    ap->argv_put++;
    ap->argv_get++;
    ap->argn++;
  }

  /* Report the new count and terminate the compacted array */
  *_argc = ap->argc;
  argv[ap->argc] = NULL;
}
/* Drop the current argument: the read cursor moves past it without copying
 * it, and the argument count shrinks by one, so it is absent from the final
 * argv. Calling this with no current argument only prints a diagnostic. */
static void
delete_argument(struct argparse *ap)
{
  if (!(ap->argn < ap->argc)) {
    fprintf(stderr, "delete_argument\n");
  }
  ap->argv_get += 1;
  ap->argc -= 1;
}
/* Keep the current argument and advance: it is copied to the write position,
 * both cursors move on, and the argument index grows by one. Calling this
 * with no current argument only prints a diagnostic. */
static void
next_argument(struct argparse *ap)
{
  if (!(ap->argn < ap->argc)) {
    fprintf(stderr, "next_argument\n");
  }

  /* Store the argument at its final location */
  *ap->argv_put = *ap->argv_get;
  ap->argv_put += 1;
  ap->argv_get += 1;
  ap->argn += 1;
}
/* Nonzero once the argument index has reached the argument count. */
static int
is_end_of_arguments(struct argparse *ap)
{
  return (ap->argc == ap->argn) ? 1 : 0;
}
/* Return the argument under the read cursor without advancing. */
static char *
get_argument(struct argparse *ap)
{
  return ap->argv_get[0];
}
/* Return the current argument and remove it from the argument list. */
static char *
consume_argument(struct argparse *ap)
{
  char *current = get_argument(ap);

  delete_argument(ap);
  return current;
}
/****************************************************************************/
/* The result of parsing a command-line argument */
typedef enum {
ARGPARSE_OK, /* Success */
ARGPARSE_ERROR, /* Error */
ARGPARSE_DONE /* Success, and do not continue parsing */
} result;
/* Signature of an option handler. The parser deletes the option flag itself
 * before calling the handler, so 'ap' is positioned at whatever follows the
 * flag; the handler updates 'params' and reports how parsing should go on. */
typedef result parse_action(struct argparse *ap, struct pb_Parameters *params);
/* A command-line option.
 * An option is matched either by its one-character name (given as "-x") or by
 * its long name (given as "--name"); a table of options ends with an entry
 * whose action is NULL. */
struct option {
char short_name; /* If not 0, the one-character
* name of this option */
const char *long_name; /* If not NULL, the long name of this option */
parse_action *action; /* What to do when this option occurs.
* Sentinel value is NULL.
*/
};
/* Output file
 *
 *   -o FILE
 *
 * Consumes FILE and stores a copy of it as the output file name, discarding
 * any name stored earlier. Fails when no argument follows the flag.
 */
static result
parse_output_file(struct argparse *ap, struct pb_Parameters *params)
{
  char *file_name;

  if (!is_end_of_arguments(ap)) {
    /* Replace the output file name */
    free(params->outFile);
    file_name = consume_argument(ap);
    params->outFile = strdup(file_name);
    return ARGPARSE_OK;
  }

  report_parse_error("Expecting file name after '-o'\n");
  return ARGPARSE_ERROR;
}
/* Input files
 *
 *   -i FILE,FILE,...
 *
 * Consumes the comma-separated list and replaces the stored input file list
 * with it. Fails when no argument follows the flag or when the list cannot
 * be built; in the latter case the stored list is left as NULL.
 */
static result
parse_input_files(struct argparse *ap, struct pb_Parameters *params)
{
  if (is_end_of_arguments(ap))
    {
      report_parse_error("Expecting file name after '-i'\n");
      return ARGPARSE_ERROR;
    }

  /* Replace the input file list */
  pb_FreeStringArray(params->inpFiles);
  params->inpFiles = read_string_array(consume_argument(ap));

  /* A NULL list must not be handed on: code that walks inpFiles expects a
   * terminated array */
  if (params->inpFiles == NULL)
    {
      report_parse_error("Unable to read the input file list after '-i'\n");
      return ARGPARSE_ERROR;
    }
  return ARGPARSE_OK;
}
/* End of options
 *
 *   --
 *
 * Touches neither the parser state nor the parameters; it only tells the
 * caller to stop interpreting arguments.
 */
static result
parse_end_options(struct argparse *ap, struct pb_Parameters *params)
{
  (void)ap;
  (void)params;
  return ARGPARSE_DONE;
}
/* OpenCL device
 *
 *   --device X
 *
 * Consumes X, converts it to a device selector and stores it, releasing any
 * selector stored earlier. Fails when no argument follows the flag or when X
 * cannot be converted.
 */
static result
parse_device(struct argparse *ap, struct pb_Parameters *params)
{
  struct pb_DeviceParam *selector;

  /* The device specification is the argument after the flag */
  if (is_end_of_arguments(ap)) {
    report_parse_error("Expecting device specification after '--device'\n");
    return ARGPARSE_ERROR;
  }

  selector = read_device_param(consume_argument(ap));
  if (selector == NULL) {
    report_parse_error("Unrecognized device specification format on command line\n");
    return ARGPARSE_ERROR;
  }

  /* Swap the new selector in */
  pb_FreeDeviceParam(params->device);
  params->device = selector;
  return ARGPARSE_OK;
}
/* OpenCL platform
 *
 *   --platform X
 *
 * Consumes X, converts it to a platform selector and stores it, releasing any
 * selector stored earlier. Fails when no argument follows the flag or when X
 * cannot be converted.
 */
static result
parse_platform(struct argparse *ap, struct pb_Parameters *params)
{
  /* Read the next argument, which specifies a platform */
  if (is_end_of_arguments(ap))
    {
      /* The message names the platform option (it used to say "device") */
      report_parse_error("Expecting platform specification after '--platform'\n");
      return ARGPARSE_ERROR;
    }

  char *platform_string = consume_argument(ap);
  struct pb_PlatformParam *platform_param = read_platform_param(platform_string);

  if (!platform_param) {
    report_parse_error("Unrecognized platform specification format on command line\n");
    return ARGPARSE_ERROR;
  }

  /* Save the result */
  pb_FreePlatformParam(params->platform);
  params->platform = platform_param;
  return ARGPARSE_OK;
}
/* Table of recognised options, ended by an entry whose action is NULL.
 * The entry with short name '-' is what makes a bare "--" act as the
 * end-of-options marker: "--" is a two-character argument, so it is looked up
 * as a one-character flag named '-'. */
static struct option options[] = {
{ 'o', NULL, &parse_output_file },
{ 'i', NULL, &parse_input_files },
{ '-', NULL, &parse_end_options },
{ 0, "device", &parse_device },
{ 0, "platform", &parse_platform },
{ 0, NULL, NULL }
};
/* Nonzero when 'op' is the table terminator, i.e. it has no action. */
static int
is_last_option(struct option *op)
{
  return !op->action;
}
/****************************************************************************/
/* Parse command-line parameters.
 * Return zero on error, nonzero otherwise.
 * On error, the other outputs may be invalid.
 *
 * The information collected from parameters is used to update
 * 'ret'. 'ret' should be initialized.
 *
 * '_argc' and 'argv' are updated to contain only the unprocessed arguments.
 *
 * An argument is treated as an option when it is a one-character flag ("-x",
 * which includes the end marker "--") or a long flag ("--name"). Everything
 * else -- including a lone "-" and "-abc" style arguments -- is left in argv
 * untouched for the application.
 */
static int
pb_ParseParameters (struct pb_Parameters *ret, int *_argc, char **argv)
{
  struct argparse ap;
  int stop = 0;             /* Set when an option ends option parsing */

  initialize_argparse(&ap, *_argc, argv);

  /* Each argument */
  while (!stop && !is_end_of_arguments(&ap)) {
    result arg_result;      /* Result of parsing this argument */
    char *arg = get_argument(&ap);

    /* "-x": exactly one character after the dash */
    int is_short = (arg[0] == '-') && (arg[1] != 0) && (arg[2] == 0);
    /* "--name": two dashes followed by at least one character */
    int is_long = !is_short && (arg[0] == '-') && (arg[1] == '-');

    if (is_short || is_long) {
      struct option *op;

      delete_argument(&ap); /* This argument is consumed here */

      /* Find a matching option; the loop stops on the sentinel otherwise */
      for (op = options; !is_last_option(op); op++) {
        if (is_short) {
          if (op->short_name == arg[1]) break;
        }
        else if (op->long_name && strcmp(&arg[2], op->long_name) == 0) {
          break;
        }
      }

      if (is_last_option(op)) {
        /* No option matches */
        report_parse_error("Unexpected command-line parameter\n");
        arg_result = ARGPARSE_ERROR;
      }
      else {
        arg_result = (*op->action)(&ap, ret);
      }
    }
    else {
      /* Other arguments are kept and skipped, so every path assigns
       * 'arg_result' and makes progress through the argument list */
      next_argument(&ap);
      arg_result = ARGPARSE_OK;
    }

    /* Decide what to do next based on 'arg_result' */
    switch(arg_result) {
    case ARGPARSE_OK:
      /* Continue processing */
      break;

    case ARGPARSE_ERROR:
      /* Error exit from the function */
      return 0;

    case ARGPARSE_DONE:
      /* Normal exit from the argument parsing loop */
      stop = 1;
      break;
    }
  } /* end for each argument */

  /* Move the remaining arguments into place and publish the new count */
  finalize_argparse(&ap, _argc, argv);
  return 1;
}
/*****************************************************************************/
/* Other exported functions */
/* Allocate a parameters structure, initialize it to "nothing specified"
 * (no output file, an empty NULL-terminated input-file list, no platform or
 * device), and fill it in from the command line.
 *
 * '_argc' and 'argv' are updated by the parser to hold only the arguments
 * it did not process.
 *
 * Returns the new structure, or NULL if the command line could not be
 * parsed or if memory could not be allocated. */
struct pb_Parameters *
pb_ReadParameters(int *_argc, char **argv)
{
  struct pb_Parameters *ret =
    (struct pb_Parameters *)malloc(sizeof(struct pb_Parameters));

  if (ret == NULL)
    return NULL;

  /* Initialize the parameters structure */
  ret->outFile = NULL;
  ret->platform = NULL;
  ret->device = NULL;
  ret->inpFiles = (char **)malloc(sizeof(char *));
  if (ret->inpFiles == NULL) {
    /* Nothing but 'ret' itself has been allocated yet, so release it
     * directly rather than through pb_FreeParameters. */
    free(ret);
    return NULL;
  }
  ret->inpFiles[0] = NULL;

  /* Read parameters and update _argc, argv */
  if (!pb_ParseParameters(ret, _argc, argv)) {
    /* Parse error */
    pb_FreeParameters(ret);
    return NULL;
  }

  return ret;
}
/* Count the entries of the NULL-terminated input-file list of 'p'.
 * The terminating NULL is not counted. */
int
pb_Parameters_CountInputs(struct pb_Parameters *p)
{
  int count = 0;

  while (p->inpFiles[count] != NULL)
    count++;

  return count;
}

View File

@@ -1,37 +0,0 @@
/***************************************************************************
*cr
*cr (C) Copyright 2008-2010 The Board of Trustees of the
*cr University of Illinois
*cr All Rights Reserved
*cr
***************************************************************************/
/* Atom data types and atom-file helpers shared by the cutoff-potential
 * sources. */
#ifndef ATOM_H
#define ATOM_H
#ifdef __cplusplus
extern "C" {
#endif
/* A single atom: position (x, y, z) and charge q.  The potential code
 * treats atoms with q == 0 as non-contributing and skips them. */
typedef struct Atom_t {
float x, y, z, q;
} Atom;
/* An array of 'size' atoms. */
typedef struct Atoms_t {
Atom *atoms;
int size;
} Atoms;
/* A point or extent in 3D space. */
typedef struct Vec3_t {
float x, y, z;
} Vec3;
/* Read atoms from the file named 'fname'.
 * NOTE(review): implementation not visible here; presumably the result is
 * released with free_atom() -- confirm. */
Atoms *read_atom_file(const char *fname);
/* Release an Atoms object.
 * NOTE(review): callers also hand it objects they built themselves with
 * malloc/calloc, so it presumably frees both the array and the struct --
 * confirm against the implementation. */
void free_atom(Atoms *atom);
/* Compute the bounding box of 'atom': callers receive the minimum extent in
 * '*lo' and the maximum extent in '*hi'. */
void get_atom_extent(Vec3 *lo, Vec3 *hi, Atoms *atom);
#ifdef __cplusplus
}
#endif
#endif /* ATOM_H */

View File

@@ -1,195 +0,0 @@
/***************************************************************************
*cr
*cr (C) Copyright 2008-2010 The Board of Trustees of the
*cr University of Illinois
*cr All Rights Reserved
*cr
***************************************************************************/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <parboil.h>
#include "atom.h"
#include "cutoff.h"
/* Define DEBUG_PASS_RATE to count and print how many lattice points passed
 * or failed the cutoff test. */
#undef DEBUG_PASS_RATE
/* When defined, a whole row of lattice points is skipped as soon as its
 * y/z distance alone already exceeds the cutoff. */
#define CHECK_CYLINDER_CPU
/* Edge length of the cells used to hash atoms, and its reciprocal. */
#define CELLEN 4.f
#define INV_CELLEN (1.f/CELLEN)

/*
 * CPU implementation of the cutoff potential.
 *
 * For every atom with nonzero charge q, and for every lattice point whose
 * squared distance r2 to the atom is below cutoff^2, the value
 *
 *     q * (1 / sqrt(r2)) * (1 - r2 / cutoff^2)^2
 *
 * is ADDED to the lattice point, so existing lattice contents are kept.
 * Atoms are first hashed into cells of edge CELLEN (linked through the
 * 'first'/'next' cursor lists) and then visited cell by cell.
 *
 * Returns 0 on success and -1 if the temporary cell lists cannot be
 * allocated.
 */
extern int cpu_compute_cutoff_potential_lattice(
    Lattice *lattice,                  /* the lattice */
    float cutoff,                      /* cutoff distance */
    Atoms *atoms                       /* array of atoms */
    )
{
  int nx = lattice->dim.nx;
  int ny = lattice->dim.ny;
  int nz = lattice->dim.nz;
  float xlo = lattice->dim.lo.x;
  float ylo = lattice->dim.lo.y;
  float zlo = lattice->dim.lo.z;
  float gridspacing = lattice->dim.h;
  int natoms = atoms->size;
  Atom *atom = atoms->atoms;

  const float a2 = cutoff * cutoff;
  const float inv_a2 = 1.f / a2;
  float s;
  const float inv_gridspacing = 1.f / gridspacing;
  const int radius = (int) ceilf(cutoff * inv_gridspacing) - 1;
    /* lattice point radius about each atom */

  int n;
  int i, j, k;
  int ia, ib, ic;
  int ja, jb, jc;
  int ka, kb, kc;
  int index;
  int koff, jkoff;

  float x, y, z, q;
  float dx, dy, dz;
  float dz2, dydz2, r2;
  float e;
  float xstart, ystart;

  float *pg;

  int gindex;
  int ncell, nxcell, nycell, nzcell;
  int *first, *next;
  float inv_cellen = INV_CELLEN;
  Vec3 minext, maxext;		/* Extent of atom bounding box */

#ifdef DEBUG_PASS_RATE
  unsigned long long pass_count = 0;
  unsigned long long fail_count = 0;
#endif

  /* find min and max extent */
  get_atom_extent(&minext, &maxext, atoms);

  /* number of cells in each dimension */
  nxcell = (int) floorf((maxext.x-minext.x) * inv_cellen) + 1;
  nycell = (int) floorf((maxext.y-minext.y) * inv_cellen) + 1;
  nzcell = (int) floorf((maxext.z-minext.z) * inv_cellen) + 1;
  ncell = nxcell * nycell * nzcell;

  /* allocate for cursor link list implementation */
  first = (int *) malloc(ncell * sizeof(int));
  next = (int *) malloc(natoms * sizeof(int));
  /* A zero-sized request may legitimately yield NULL, so only treat NULL as
   * a failure when something was actually requested. */
  if ((ncell > 0 && first == NULL) || (natoms > 0 && next == NULL)) {
    fprintf(stderr, "cpu_compute_cutoff_potential_lattice(): "
        "out of memory\n");
    free(next);
    free(first);
    return -1;
  }
  for (gindex = 0;  gindex < ncell;  gindex++) {
    first[gindex] = -1;
  }
  for (n = 0;  n < natoms;  n++) {
    next[n] = -1;
  }

  /* geometric hashing */
  for (n = 0;  n < natoms;  n++) {
    if (0==atom[n].q) continue;  /* skip any non-contributing atoms */
    i = (int) floorf((atom[n].x - minext.x) * inv_cellen);
    j = (int) floorf((atom[n].y - minext.y) * inv_cellen);
    k = (int) floorf((atom[n].z - minext.z) * inv_cellen);
    gindex = (k*nycell + j)*nxcell + i;
    /* push atom n onto the front of its cell's list */
    next[n] = first[gindex];
    first[gindex] = n;
  }

  /* traverse the grid cells */
  for (gindex = 0;  gindex < ncell;  gindex++) {
    for (n = first[gindex];  n != -1;  n = next[n]) {
      x = atom[n].x - xlo;
      y = atom[n].y - ylo;
      z = atom[n].z - zlo;
      q = atom[n].q;

      /* find closest grid point with position less than or equal to atom */
      ic = (int) (x * inv_gridspacing);
      jc = (int) (y * inv_gridspacing);
      kc = (int) (z * inv_gridspacing);

      /* find extent of surrounding box of grid points */
      ia = ic - radius;
      ib = ic + radius + 1;
      ja = jc - radius;
      jb = jc + radius + 1;
      ka = kc - radius;
      kb = kc + radius + 1;

      /* trim box edges so that they are within grid point lattice */
      if (ia < 0)   ia = 0;
      if (ib >= nx) ib = nx-1;
      if (ja < 0)   ja = 0;
      if (jb >= ny) jb = ny-1;
      if (ka < 0)   ka = 0;
      if (kb >= nz) kb = nz-1;

      /* loop over surrounding grid points */
      xstart = ia*gridspacing - x;
      ystart = ja*gridspacing - y;
      dz = ka*gridspacing - z;
      for (k = ka;  k <= kb;  k++, dz += gridspacing) {
        koff = k*ny;
        dz2 = dz*dz;
        dy = ystart;
        for (j = ja;  j <= jb;  j++, dy += gridspacing) {
          jkoff = (koff + j)*nx;
          dydz2 = dy*dy + dz2;
#ifdef CHECK_CYLINDER_CPU
          if (dydz2 >= a2) continue;
#endif

          dx = xstart;
          index = jkoff + ia;
          pg = lattice->lattice + index;

#if defined(__INTEL_COMPILER)
          for (i = ia;  i <= ib;  i++, pg++, dx += gridspacing) {
            r2 = dx*dx + dydz2;
            s = (1.f - r2 * inv_a2) * (1.f - r2 * inv_a2);
            e = q * (1/sqrtf(r2)) * s;
            *pg += (r2 < a2 ? e : 0);  /* LOOP VECTORIZED!! */
          }
#else
          for (i = ia;  i <= ib;  i++, pg++, dx += gridspacing) {
            r2 = dx*dx + dydz2;
            if (r2 >= a2)
            {
#ifdef DEBUG_PASS_RATE
              fail_count++;
#endif
              continue;
            }
#ifdef DEBUG_PASS_RATE
            pass_count++;
#endif
            s = (1.f - r2 * inv_a2);
            e = q * (1/sqrtf(r2)) * s * s;
            *pg += e;
          }
#endif
        }
      } /* end loop over surrounding grid points */

    } /* end loop over atoms in a gridcell */
  } /* end loop over gridcells */

  /* free memory */
  free(next);
  free(first);

  /* For debugging: print the number of times that the test passed/failed */
#ifdef DEBUG_PASS_RATE
  printf ("Pass :%llu\n", pass_count);
  printf ("Fail :%llu\n", fail_count);
#endif

  return 0;
}

View File

@@ -1,499 +0,0 @@
/***************************************************************************
*cr
*cr (C) Copyright 2008-2010 The Board of Trustees of the
*cr University of Illinois
*cr All Rights Reserved
*cr
***************************************************************************/
#include <CL/cl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <parboil.h>
#include "atom.h"
#include "cutoff.h"
#include "macros.h"
#include "ocl.h"
// OpenCL 1.1 support for int3 is not uniform on all implementations, so
// we use int4 instead. Only the 'x', 'y', and 'z' fields of xyz are used.
typedef cl_int4 xyz;
//extern "C" int gpu_compute_cutoff_potential_lattice(
int gpu_compute_cutoff_potential_lattice(
struct pb_TimerSet *timers,
Lattice *lattice, /* the lattice */
float cutoff, /* cutoff distance */
Atoms *atoms, /* array of atoms */
int verbose, /* print info/debug messages */
struct pb_Parameters *parameters
)
{
int nx = lattice->dim.nx;
int ny = lattice->dim.ny;
int nz = lattice->dim.nz;
float xlo = lattice->dim.lo.x;
float ylo = lattice->dim.lo.y;
float zlo = lattice->dim.lo.z;
float h = lattice->dim.h;
int natoms = atoms->size;
Atom *atom = atoms->atoms;
xyz nbrlist[NBRLIST_MAXLEN];
int nbrlistlen = 0;
int binHistoFull[BIN_DEPTH+1] = { 0 }; /* clear every array element */
int binHistoCover[BIN_DEPTH+1] = { 0 }; /* clear every array element */
int num_excluded = 0;
int xRegionDim, yRegionDim, zRegionDim;
int xRegionIndex, yRegionIndex, zRegionIndex;
int xOffset, yOffset, zOffset;
int lnx, lny, lnz, lnall;
float *regionZeroAddr, *thisRegion;
cl_mem regionZeroCl;
int index, indexRegion;
int c;
xyz binDim;
int nbins;
cl_float4 *binBaseAddr, *binZeroAddr;
cl_mem binBaseCl, binZeroCl;
int *bincntBaseAddr, *bincntZeroAddr;
Atoms *extra = NULL;
cl_mem NbrListLen;
cl_mem NbrList;
int i, j, k, n;
int sum, total;
float avgFillFull, avgFillCover;
const float cutoff2 = cutoff * cutoff;
const float inv_cutoff2 = 1.f / cutoff2;
size_t gridDim[3], blockDim[3];
// The "compute" timer should be active upon entry to this function
/* pad lattice to be factor of 8 in each dimension */
xRegionDim = (int) ceilf(nx/8.f);
yRegionDim = (int) ceilf(ny/8.f);
zRegionDim = (int) ceilf(nz/8.f);
lnx = 8 * xRegionDim;
lny = 8 * yRegionDim;
lnz = 8 * zRegionDim;
lnall = lnx * lny * lnz;
/* will receive energies from OpenCL */
regionZeroAddr = (float *) malloc(lnall * sizeof(float));
/* create bins */
c = (int) ceil(cutoff * BIN_INVLEN); /* count extra bins around lattice */
binDim.x = (int) ceil(lnx * h * BIN_INVLEN) + 2*c;
binDim.y = (int) ceil(lny * h * BIN_INVLEN) + 2*c;
binDim.z = (int) ceil(lnz * h * BIN_INVLEN) + 2*c;
nbins = binDim.x * binDim.y * binDim.z;
binBaseAddr = (cl_float4 *) calloc(nbins * BIN_DEPTH, sizeof(cl_float4));
binZeroAddr = binBaseAddr + ((c * binDim.y + c) * binDim.x + c) * BIN_DEPTH;
bincntBaseAddr = (int *) calloc(nbins, sizeof(int));
bincntZeroAddr = bincntBaseAddr + (c * binDim.y + c) * binDim.x + c;
/* create neighbor list */
if (ceilf(BIN_LENGTH / (8*h)) == floorf(BIN_LENGTH / (8*h))) {
float s = sqrtf(3);
float r2 = (cutoff + s*BIN_LENGTH) * (cutoff + s*BIN_LENGTH);
int cnt = 0;
/* develop neighbor list around 1 cell */
if (2*c + 1 > NBRLIST_DIM) {
fprintf(stderr, "must have cutoff <= %f\n",
(NBRLIST_DIM-1)/2 * BIN_LENGTH);
return -1;
}
for (k = -c; k <= c; k++) {
for (j = -c; j <= c; j++) {
for (i = -c; i <= c; i++) {
if ((i*i + j*j + k*k)*BIN_LENGTH*BIN_LENGTH >= r2) continue;
nbrlist[cnt].x = i;
nbrlist[cnt].y = j;
nbrlist[cnt].z = k;
cnt++;
}
}
}
nbrlistlen = cnt;
}
else if (8*h <= 2*BIN_LENGTH) {
float s = 2.f*sqrtf(3);
float r2 = (cutoff + s*BIN_LENGTH) * (cutoff + s*BIN_LENGTH);
int cnt = 0;
/* develop neighbor list around 3-cube of cells */
if (2*c + 3 > NBRLIST_DIM) {
fprintf(stderr, "must have cutoff <= %f\n",
(NBRLIST_DIM-3)/2 * BIN_LENGTH);
return -1;
}
for (k = -c; k <= c; k++) {
for (j = -c; j <= c; j++) {
for (i = -c; i <= c; i++) {
if ((i*i + j*j + k*k)*BIN_LENGTH*BIN_LENGTH >= r2) continue;
nbrlist[cnt].x = i;
nbrlist[cnt].y = j;
nbrlist[cnt].z = k;
cnt++;
}
}
}
nbrlistlen = cnt;
}
else {
fprintf(stderr, "must have h <= %f\n", 0.25 * BIN_LENGTH);
return -1;
}
/* perform geometric hashing of atoms into bins */
{
/* array of extra atoms, permit average of one extra per bin */
Atom *extra_atoms = (Atom *) calloc(nbins, sizeof(Atom));
int extra_len = 0;
for (n = 0; n < natoms; n++) {
cl_float4 p;
p.x = atom[n].x - xlo;
p.y = atom[n].y - ylo;
p.z = atom[n].z - zlo;
p.w = atom[n].q;
i = (int) floorf(p.x * BIN_INVLEN);
j = (int) floorf(p.y * BIN_INVLEN);
k = (int) floorf(p.z * BIN_INVLEN);
if (i >= -c && i < binDim.x - c &&
j >= -c && j < binDim.y - c &&
k >= -c && k < binDim.z - c &&
atom[n].q != 0) {
int index = (k * binDim.y + j) * binDim.x + i;
cl_float4 *bin = binZeroAddr + index * BIN_DEPTH;
int bindex = bincntZeroAddr[index];
if (bindex < BIN_DEPTH) {
/* copy atom into bin and increase counter for this bin */
bin[bindex] = p;
bincntZeroAddr[index]++;
}
else {
/* add index to array of extra atoms to be computed with CPU */
if (extra_len >= nbins) {
fprintf(stderr, "exceeded space for storing extra atoms\n");
return -1;
}
extra_atoms[extra_len] = atom[n];
extra_len++;
}
}
else {
/* excluded atoms are either outside bins or neutrally charged */
num_excluded++;
}
}
/* Save result */
extra = (Atoms *)malloc(sizeof(Atoms));
extra->atoms = extra_atoms;
extra->size = extra_len;
}
/* bin stats */
sum = total = 0;
for (n = 0; n < nbins; n++) {
binHistoFull[ bincntBaseAddr[n] ]++;
sum += bincntBaseAddr[n];
total += BIN_DEPTH;
}
avgFillFull = sum / (float) total;
sum = total = 0;
for (k = 0; k < binDim.z - 2*c; k++) {
for (j = 0; j < binDim.y - 2*c; j++) {
for (i = 0; i < binDim.x - 2*c; i++) {
int index = (k * binDim.y + j) * binDim.x + i;
binHistoCover[ bincntZeroAddr[index] ]++;
sum += bincntZeroAddr[index];
total += BIN_DEPTH;
}
}
}
avgFillCover = sum / (float) total;
if (verbose) {
/* report */
printf("number of atoms = %d\n", natoms);
printf("lattice spacing = %g\n", h);
printf("cutoff distance = %g\n", cutoff);
printf("\n");
printf("requested lattice dimensions = %d %d %d\n", nx, ny, nz);
printf("requested space dimensions = %g %g %g\n", nx*h, ny*h, nz*h);
printf("expanded lattice dimensions = %d %d %d\n", lnx, lny, lnz);
printf("expanded space dimensions = %g %g %g\n", lnx*h, lny*h, lnz*h);
printf("number of bytes for lattice data = %u\n", (unsigned int) (lnall*sizeof(float)));
printf("\n");
printf("bin padding thickness = %d\n", c);
printf("bin cover dimensions = %d %d %d\n",
binDim.x - 2*c, binDim.y - 2*c, binDim.z - 2*c);
printf("bin full dimensions = %d %d %d\n", binDim.x, binDim.y, binDim.z);
printf("number of bins = %d\n", nbins);
printf("total number of atom slots = %d\n", nbins * BIN_DEPTH);
printf("%% overhead space = %g\n",
(natoms / (double) (nbins * BIN_DEPTH)) * 100);
printf("number of bytes for bin data = %u\n",
(unsigned int)(nbins * BIN_DEPTH * sizeof(cl_float4)));
printf("\n");
printf("bin histogram with padding:\n");
sum = 0;
for (n = 0; n <= BIN_DEPTH; n++) {
printf(" number of bins with %d atoms: %d\n", n, binHistoFull[n]);
sum += binHistoFull[n];
}
printf(" total number of bins: %d\n", sum);
printf(" %% average fill: %g\n", avgFillFull * 100);
printf("\n");
printf("bin histogram excluding padding:\n");
sum = 0;
for (n = 0; n <= BIN_DEPTH; n++) {
printf(" number of bins with %d atoms: %d\n", n, binHistoCover[n]);
sum += binHistoCover[n];
}
printf(" total number of bins: %d\n", sum);
printf(" %% average fill: %g\n", avgFillCover * 100);
printf("\n");
printf("number of extra atoms = %d\n", extra->size);
printf("%% atoms that are extra = %g\n", (extra->size / (double) natoms) * 100);
printf("\n");
/* sanity check on bins */
sum = 0;
for (n = 0; n <= BIN_DEPTH; n++) {
sum += n * binHistoFull[n];
}
sum += extra->size + num_excluded;
printf("sanity check on bin histogram with edges: "
"sum + others = %d\n", sum);
sum = 0;
for (n = 0; n <= BIN_DEPTH; n++) {
sum += n * binHistoCover[n];
}
sum += extra->size + num_excluded;
printf("sanity check on bin histogram excluding edges: "
"sum + others = %d\n", sum);
printf("\n");
/* neighbor list */
printf("neighbor list length = %d\n", nbrlistlen);
printf("\n");
}
printf("Ok!\n");
pb_Context* pb_context;
pb_context = pb_InitOpenCLContext(parameters);
if (pb_context == NULL) {
fprintf (stderr, "Error: No OpenCL platform/device can be found.");
return -1;
}
printf("Ok!\n");
cl_int clStatus;
cl_device_id clDevice = (cl_device_id) pb_context->clDeviceId;
cl_platform_id clPlatform = (cl_platform_id) pb_context->clPlatformId;
cl_context clContext = (cl_context) pb_context->clContext;
cl_command_queue clCommandQueue = clCreateCommandQueue(clContext,clDevice,CL_QUEUE_PROFILING_ENABLE,&clStatus);
CHECK_ERROR("clCreateCommandQueue")
pb_SetOpenCL(&clContext, &clCommandQueue);
//const char* clSource[] = {readFile("src/opencl_base/kernel.cl")};
//cl_program clProgram = clCreateProgramWithSource(clContext,1,clSource,NULL,&clStatus);
cl_program clProgram = clCreateProgramWithBuiltInKernels(
clContext, 1, &clDevice, "opencl_cutoff_potential_lattice", &clStatus);
CHECK_ERROR("clCreateProgramWithSource")
char clOptions[50];
sprintf(clOptions,"-I src/opencl_base"); //-cl-nv-verbose
clStatus = clBuildProgram(clProgram,1,&clDevice,clOptions,NULL,NULL);
if (clStatus != CL_SUCCESS) {
size_t string_size = 0;
clGetProgramBuildInfo(clProgram, clDevice, CL_PROGRAM_BUILD_LOG,
0, NULL, &string_size);
char* string = (char*)malloc(string_size*sizeof(char));
clGetProgramBuildInfo(clProgram, clDevice, CL_PROGRAM_BUILD_LOG,
string_size, string, NULL);
puts(string);
}
CHECK_ERROR("clBuildProgram")
cl_kernel clKernel = clCreateKernel(clProgram,"opencl_cutoff_potential_lattice",&clStatus);
CHECK_ERROR("clCreateKernel")
/* setup OpenCL kernel parameters */
blockDim[0] = 8;
blockDim[1] = 8;
blockDim[2] = 2;
gridDim[0] = 4 * xRegionDim * blockDim[0];
gridDim[1] = yRegionDim * blockDim[1];
gridDim[2] = 1 * blockDim[2];
/* allocate and initialize memory on OpenCL device */
pb_SwitchToTimer(timers, pb_TimerID_COPY);
if (verbose) {
printf("Allocating %.2fMB on OpenCL device for potentials\n",
lnall * sizeof(float) / (double) (1024*1024));
}
regionZeroCl = clCreateBuffer(clContext,CL_MEM_WRITE_ONLY,lnall*sizeof(float),NULL,&clStatus);
CHECK_ERROR("clCreateBuffer")
// clMemSet(clCommandQueue,regionZeroCl,0,lnall*sizeof(float));
if (verbose) {
printf("Allocating %.2fMB on OpenCL device for atom bins\n",
nbins * BIN_DEPTH * sizeof(cl_float4) / (double) (1024*1024));
}
binBaseCl = clCreateBuffer(clContext,CL_MEM_READ_ONLY,nbins*BIN_DEPTH*sizeof(cl_float4),NULL,&clStatus);
CHECK_ERROR("clCreateBuffer")
clStatus = clEnqueueWriteBuffer(clCommandQueue,binBaseCl,CL_TRUE,0,nbins*BIN_DEPTH*sizeof(cl_float4),binBaseAddr,0,NULL,NULL);
CHECK_ERROR("clEnqueueWriteBuffer")
//Sub buffers are not supported in OpenCL v1.0
int offset = ((c * binDim.y + c) * binDim.x + c) * BIN_DEPTH;
NbrListLen = clCreateBuffer(clContext,CL_MEM_READ_ONLY,sizeof(int),NULL,&clStatus);
CHECK_ERROR("clCreateBuffer")
clStatus = clEnqueueWriteBuffer(clCommandQueue,NbrListLen,CL_TRUE,0,sizeof(int),&nbrlistlen,0,NULL,NULL);
CHECK_ERROR("clEnqueueWriteBuffer")
NbrList = clCreateBuffer(clContext,CL_MEM_READ_ONLY,NBRLIST_MAXLEN*sizeof(xyz),NULL,&clStatus);
CHECK_ERROR("clCreateBuffer")
clStatus = clEnqueueWriteBuffer(clCommandQueue,NbrList,CL_TRUE,0,nbrlistlen*sizeof(xyz),nbrlist,0,NULL,NULL);
CHECK_ERROR("clEnqueueWriteBuffer")
if (verbose)
printf("\n");
clStatus = clSetKernelArg(clKernel,0,sizeof(int),&(binDim.x));
clStatus = clSetKernelArg(clKernel,1,sizeof(int),&(binDim.y));
clStatus = clSetKernelArg(clKernel,2,sizeof(cl_mem),&binBaseCl);
clStatus = clSetKernelArg(clKernel,3,sizeof(int),&offset);
clStatus = clSetKernelArg(clKernel,4,sizeof(float),&h);
clStatus = clSetKernelArg(clKernel,5,sizeof(float),&cutoff2);
clStatus = clSetKernelArg(clKernel,6,sizeof(float),&inv_cutoff2);
clStatus = clSetKernelArg(clKernel,7,sizeof(cl_mem),&regionZeroCl);
clStatus = clSetKernelArg(clKernel,9,sizeof(cl_mem),&NbrListLen);
clStatus = clSetKernelArg(clKernel,10,sizeof(cl_mem),&NbrList);
CHECK_ERROR("clSetKernelArg")
printf("Ok!!\n");
/* loop over z-dimension, invoke OpenCL kernel for each x-y plane */
pb_SwitchToTimer(timers, pb_TimerID_KERNEL);
printf("Invoking OpenCL kernel on %d region planes...\n", zRegionDim);
for (zRegionIndex = 0; zRegionIndex < zRegionDim; zRegionIndex++) {
printf(" computing plane %d\r", zRegionIndex);
fflush(stdout);
clStatus = clSetKernelArg(clKernel,8,sizeof(int),&zRegionIndex);
CHECK_ERROR("clSetKernelArg")
printf("Ok**!2\n");
clStatus = clEnqueueNDRangeKernel(clCommandQueue,clKernel,3,NULL,gridDim,blockDim,0,NULL,NULL);
printf("Ok**!2\n");
CHECK_ERROR("clEnqueueNDRangeKernel")
printf("Ok**!2\n");
clStatus = clFinish(clCommandQueue);
printf("Ok**!2\n");
CHECK_ERROR("clFinish")
}
printf("Ok++!\n");
printf("Finished OpenCL kernel calls \n");
/* copy result regions from OpenCL device */
pb_SwitchToTimer(timers, pb_TimerID_COPY);
clStatus = clEnqueueReadBuffer(clCommandQueue,regionZeroCl,CL_TRUE,0,lnall*sizeof(float),regionZeroAddr,0,NULL,NULL);
CHECK_ERROR("clEnqueueReadBuffer")
/* free OpenCL memory allocations */
clStatus = clReleaseMemObject(regionZeroCl);
clStatus = clReleaseMemObject(binBaseCl);
clStatus = clReleaseMemObject(NbrListLen);
clStatus = clReleaseMemObject(NbrList);
CHECK_ERROR("clReleaseMemObject")
clStatus = clReleaseKernel(clKernel);
clStatus = clReleaseProgram(clProgram);
clStatus = clReleaseCommandQueue(clCommandQueue);
clStatus = clReleaseContext(clContext);
//free((void*)clSource[0]);
/* transpose regions back into lattice */
pb_SwitchToTimer(timers, pb_TimerID_COMPUTE);
for (k = 0; k < nz; k++) {
zRegionIndex = (k >> 3);
zOffset = (k & 7);
for (j = 0; j < ny; j++) {
yRegionIndex = (j >> 3);
yOffset = (j & 7);
for (i = 0; i < nx; i++) {
xRegionIndex = (i >> 3);
xOffset = (i & 7);
thisRegion = regionZeroAddr
+ ((zRegionIndex * yRegionDim + yRegionIndex) * xRegionDim
+ xRegionIndex) * REGION_SIZE;
indexRegion = (zOffset * 8 + yOffset) * 8 + xOffset;
index = (k * ny + j) * nx + i;
lattice->lattice[index] = thisRegion[indexRegion];
}
}
}
/* handle extra atoms */
if (extra->size > 0) {
printf("computing extra atoms on CPU\n");
if (cpu_compute_cutoff_potential_lattice(lattice, cutoff, extra)) {
fprintf(stderr, "cpu_compute_cutoff_potential_lattice() failed "
"for extra atoms\n");
return -1;
}
printf("\n");
}
/* cleanup memory allocations */
free(regionZeroAddr);
free(binBaseAddr);
free(bincntBaseAddr);
free_atom(extra);
return 0;
}

View File

@@ -1,72 +0,0 @@
/***************************************************************************
*cr
*cr (C) Copyright 2008-2010 The Board of Trustees of the
*cr University of Illinois
*cr All Rights Reserved
*cr
***************************************************************************/
/* Lattice data types and the entry points of the cutoff-potential
 * computation.
 * NOTE(review): this header uses Vec3 and Atoms but does not include
 * "atom.h" itself; the sources seen so far include "atom.h" first. */
#ifndef CUTOFF_H
#define CUTOFF_H
#ifdef __cplusplus
extern "C" {
#endif
/* NOTE(review): no use of SHIFTED is visible in the sources reviewed here --
 * confirm whether anything still tests it. */
#define SHIFTED
/* A structure to record how points in 3D space map to array
elements. Array element (z, y, x)
where 0 <= x < nx, 0 <= y < ny, 0 <= z < nz
maps to coordinate (xlo, ylo, zlo) + h * (x, y, z).
*/
typedef struct LatticeDim_t {
/* Number of lattice points in x, y, z dimensions */
int nx, ny, nz;
/* Lowest corner of lattice */
Vec3 lo;
/* Lattice spacing */
float h;
} LatticeDim;
/* An electric potential field sampled on a regular grid. The
lattice size and grid point positions are specified by 'dim'.
*/
typedef struct Lattice_t {
LatticeDim dim;
/* Potential values; element (z, y, x) is at index (z*ny + y)*nx + x */
float *lattice;
} Lattice;
/* Build lattice dimensions with spacing 'h' from the box spanned by 'lo'
 * and 'hi' (implementation not visible here). */
LatticeDim lattice_from_bounding_box(Vec3 lo, Vec3 hi, float h);
/* Create / destroy a lattice with the given dimensions (implementations not
 * visible here). */
Lattice *create_lattice(LatticeDim dim);
void destroy_lattice(Lattice *);
/* Compute the cutoff potential on an OpenCL device, overwriting the lattice
 * values.  Returns 0 on success, -1 on failure. */
int gpu_compute_cutoff_potential_lattice(
struct pb_TimerSet *timers,
Lattice *lattice,
float cutoff, /* cutoff distance */
Atoms *atom, /* array of atoms */
int verbose, /* print info/debug messages */
struct pb_Parameters *parameters
);
/* Add the cutoff potential of 'atoms' to the existing lattice values on the
 * CPU.  Returns 0 on success; the GPU path treats nonzero as failure. */
int cpu_compute_cutoff_potential_lattice(
Lattice *lattice, /* the lattice */
float cutoff, /* cutoff distance */
Atoms *atoms /* array of atoms */
);
/* Set to zero every lattice point that lies closer than 'exclcutoff' to an
 * atom with nonzero charge.  Returns 0. */
int remove_exclusions(
Lattice *lattice, /* the lattice */
float exclcutoff, /* exclusion cutoff distance */
Atoms *atom /* array of atoms */
);
#ifdef __cplusplus
}
#endif
#endif /* CUTOFF_H */

View File

@@ -1,157 +0,0 @@
/***************************************************************************
*cr
*cr (C) Copyright 2008-2010 The Board of Trustees of the
*cr University of Illinois
*cr All Rights Reserved
*cr
***************************************************************************/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <parboil.h>
#include "atom.h"
#include "cutoff.h"
/* Edge length of the cells used to hash atoms, and its reciprocal. */
#define CELLEN 4.f
#define INV_CELLEN (1.f/CELLEN)

/*
 * Zero every lattice point that lies too close to an atom.
 *
 * For each atom with nonzero charge, every lattice point whose squared
 * distance to the atom is below cutoff^2 is set to 0.  Atoms are first
 * hashed into cells of edge CELLEN (linked through the 'first'/'next'
 * cursor lists) and then visited cell by cell.
 *
 * Returns 0 on success and -1 if the temporary cell lists cannot be
 * allocated.
 */
extern int remove_exclusions(
    Lattice *lattice,                  /* the lattice */
    float cutoff,                      /* exclusion cutoff distance */
    Atoms *atoms                       /* array of atoms */
    )
{
  int nx = lattice->dim.nx;
  int ny = lattice->dim.ny;
  int nz = lattice->dim.nz;
  float xlo = lattice->dim.lo.x;
  float ylo = lattice->dim.lo.y;
  float zlo = lattice->dim.lo.z;
  float gridspacing = lattice->dim.h;
  int natoms = atoms->size;
  Atom *atom = atoms->atoms;

  const float a2 = cutoff * cutoff;
  const float inv_gridspacing = 1.f / gridspacing;
  const int radius = (int) ceilf(cutoff * inv_gridspacing) - 1;
    /* lattice point radius about each atom */

  int n;
  int i, j, k;
  int ia, ib, ic;
  int ja, jb, jc;
  int ka, kb, kc;
  int index;
  int koff, jkoff;

  float x, y, z;
  float dx, dy, dz;
  float dz2, dydz2, r2;
  float xstart, ystart;

  float *pg;

  int gindex;
  int ncell, nxcell, nycell, nzcell;
  int *first, *next;
  float inv_cellen = INV_CELLEN;
  Vec3 minext, maxext;

  /* find min and max extent */
  get_atom_extent(&minext, &maxext, atoms);

  /* number of cells in each dimension */
  nxcell = (int) floorf((maxext.x-minext.x) * inv_cellen) + 1;
  nycell = (int) floorf((maxext.y-minext.y) * inv_cellen) + 1;
  nzcell = (int) floorf((maxext.z-minext.z) * inv_cellen) + 1;
  ncell = nxcell * nycell * nzcell;

  /* allocate for cursor link list implementation */
  first = (int *) malloc(ncell * sizeof(int));
  next = (int *) malloc(natoms * sizeof(int));
  /* A zero-sized request may legitimately yield NULL, so only treat NULL as
   * a failure when something was actually requested. */
  if ((ncell > 0 && first == NULL) || (natoms > 0 && next == NULL)) {
    fprintf(stderr, "remove_exclusions(): out of memory\n");
    free(next);
    free(first);
    return -1;
  }
  for (gindex = 0;  gindex < ncell;  gindex++) {
    first[gindex] = -1;
  }
  for (n = 0;  n < natoms;  n++) {
    next[n] = -1;
  }

  /* geometric hashing */
  for (n = 0;  n < natoms;  n++) {
    if (0==atom[n].q) continue;  /* skip any non-contributing atoms */
    i = (int) floorf((atom[n].x - minext.x) * inv_cellen);
    j = (int) floorf((atom[n].y - minext.y) * inv_cellen);
    k = (int) floorf((atom[n].z - minext.z) * inv_cellen);
    gindex = (k*nycell + j)*nxcell + i;
    /* push atom n onto the front of its cell's list */
    next[n] = first[gindex];
    first[gindex] = n;
  }

  /* traverse the grid cells */
  for (gindex = 0;  gindex < ncell;  gindex++) {
    for (n = first[gindex];  n != -1;  n = next[n]) {
      x = atom[n].x - xlo;
      y = atom[n].y - ylo;
      z = atom[n].z - zlo;

      /* find closest grid point with position less than or equal to atom */
      ic = (int) (x * inv_gridspacing);
      jc = (int) (y * inv_gridspacing);
      kc = (int) (z * inv_gridspacing);

      /* find extent of surrounding box of grid points */
      ia = ic - radius;
      ib = ic + radius + 1;
      ja = jc - radius;
      jb = jc + radius + 1;
      ka = kc - radius;
      kb = kc + radius + 1;

      /* trim box edges so that they are within grid point lattice */
      if (ia < 0)   ia = 0;
      if (ib >= nx) ib = nx-1;
      if (ja < 0)   ja = 0;
      if (jb >= ny) jb = ny-1;
      if (ka < 0)   ka = 0;
      if (kb >= nz) kb = nz-1;

      /* loop over surrounding grid points */
      xstart = ia*gridspacing - x;
      ystart = ja*gridspacing - y;
      dz = ka*gridspacing - z;
      for (k = ka;  k <= kb;  k++, dz += gridspacing) {
        koff = k*ny;
        dz2 = dz*dz;

        dy = ystart;
        for (j = ja;  j <= jb;  j++, dy += gridspacing) {
          jkoff = (koff + j)*nx;
          dydz2 = dy*dy + dz2;

          dx = xstart;
          index = jkoff + ia;
          pg = lattice->lattice + index;

          for (i = ia;  i <= ib;  i++, pg++, dx += gridspacing) {
            r2 = dx*dx + dydz2;

            /* If atom and lattice point are too close, set the lattice value
             * to zero */
            if (r2 < a2) *pg = 0;
          }
        }
      } /* end loop over surrounding grid points */

    } /* end loop over atoms in a gridcell */
  } /* end loop over gridcells */

  /* free memory */
  free(next);
  free(first);

  return 0;
}

View File

@@ -1,55 +0,0 @@
/***************************************************************************
*cr
*cr (C) Copyright 2010 The Board of Trustees of the
*cr University of Illinois
*cr All Rights Reserved
*cr
***************************************************************************/
//#include <endian.h>
#include <stdlib.h>
#include <malloc.h>
#include <stdio.h>
#include <inttypes.h>
#include "gpu_info.h"
/*
 * Choose a 1-D launch configuration for 'task' work items of 'pad' threads
 * each.
 *
 *   thread[0]  receives the work-group (block) size
 *   grid[0]    receives the total number of threads to launch
 *
 * 'major'/'minor' select the per-multiprocessor thread limit (768 for 1.0
 * and 1.1, 1024 for 1.2 and later 1.x, 1536 otherwise) and 'sm' is the
 * number of multiprocessors.  When the work fits on the device at once, one
 * group of 'pad' threads is used per task; otherwise the group size is one
 * eighth of the per-multiprocessor limit and the total is rounded up to a
 * whole number of groups.
 */
void compute_active_thread(size_t *thread,
                           size_t *grid,
                           int task,
                           int pad,
                           int major,
                           int minor,
                           int sm)
{
  const int blocks_per_sm = 8;
  int threads_per_sm = 1536;   /* compute capability 2.0 and anything newer */

  if (major == 1)
    threads_per_sm = (minor >= 2) ? 1024 : 768;

  const int wanted = task * pad;

  if (wanted > sm * threads_per_sm) {
    /* more work than the device holds at once */
    const int group = threads_per_sm / blocks_per_sm;
    thread[0] = group;
    grid[0] = ((wanted + group - 1) / group) * group;
    return;
  }

  thread[0] = pad;
  grid[0] = wanted;
}

View File

@@ -1,20 +0,0 @@
/***************************************************************************
*cr
*cr (C) Copyright 2010 The Board of Trustees of the
*cr University of Illinois
*cr All Rights Reserved
*cr
***************************************************************************/
#ifndef __GPUINFOH__
#define __GPUINFOH__

#include <stddef.h>  /* size_t -- keeps this header usable on its own */

/* Choose a 1-D launch configuration for 'task' work items of 'pad' threads
 * each.  The work-group size is written to thread[0] and the total number of
 * threads to grid[0].  'major'/'minor' give the device's compute capability
 * and 'sm' its number of multiprocessors. */
void compute_active_thread(size_t *thread,
                           size_t *grid,
                           int task,
                           int pad,
                           int major,
                           int minor,
                           int sm);
#endif

View File

@@ -1,104 +0,0 @@
/*
* potential lattice is decomposed into size 8^3 lattice point "regions"
*
* THIS IMPLEMENTATION: one thread per lattice point
* thread block size 128 gives 4 thread blocks per region
* kernel is invoked for each x-y plane of regions,
* where gridDim.x is 4*(x region dimension) so that blockIdx.x
* can absorb the z sub-region index in its 2 lowest order bits
*
* Regions are stored contiguously in memory in row-major order
*
* The bins must not only cover the region; they also need to surround
* its outer edges so that region sides and corners can still use the
* neighbor list stencil. The binZeroAddr is actually a shifted pointer into
* the bin array
* (binZeroAddr = binBaseAddr + ((c*binDim_y + c)*binDim_x + c) * BIN_DEPTH)
* where c = ceil(cutoff / binsize). This allows negative offsets to
* be added to a bin index.
*
* The (0,0,0) spatial origin corresponds to lower left corner of both
* regionZeroAddr and binZeroAddr. The atom coordinates are translated
* during binning to enforce this assumption.
*/
#include "macros.h"
// OpenCL 1.1 support for int3 is not uniform on all implementations, so
// we use int4 instead.  Only the 'x', 'y', and 'z' fields of xyz are used.
typedef int4 xyz;

/*
 * Compute the cutoff potential for one x-y plane of 8x8x8 lattice regions.
 *
 * Each work-item owns one lattice point.  Work-groups are 8x8x2, so four
 * consecutive groups along dimension 0 make up one region: the two low bits
 * of get_group_id(0) select the z sub-region and the remaining bits select
 * the region along x.  For every bin named by the neighbor list (offsets
 * relative to the bin holding the region's center) the work-item sums
 *
 *     q * rsqrt(r2) * (1 - r2 * inv_cutoff2)^2
 *
 * over the atoms with nonzero charge q and r2 < cutoff2, then stores the sum
 * in its slot of the region-ordered output.
 */
__kernel void opencl_cutoff_potential_lattice(
    int binDim_x,
    int binDim_y,
    __global float4 *binBaseAddr,
    int offset,
    float h,                /* lattice spacing */
    float cutoff2,          /* square of cutoff distance */
    float inv_cutoff2,
    __global float *regionZeroAddr,  /* address of lattice regions starting at origin */
    int zRegionIndex,
    __constant int *NbrListLen,
    __constant xyz *NbrList
    )
{
  /* first bin that is not padding; neighbor offsets may be negative */
  __global float4 *bins = binBaseAddr + offset;

  /* decompose the group id along dimension 0 */
  const size_t groupY = get_group_id(1);
  const size_t regionX = get_group_id(0) >> 2;
  const size_t subRegion = get_group_id(0) & 3;

  /* flattened id of this work-item inside its work-group */
  const int tid = (get_local_id(2)*get_local_size(1) +
    get_local_id(1))*get_local_size(0) + get_local_id(0);

  /* start of the sub-region this work-group writes */
  __global float *out = regionZeroAddr + ((zRegionIndex*get_num_groups(1)
      + groupY)*(get_num_groups(0)>>2) + regionX)*REGION_SIZE
      + subRegion*SUB_REGION_SIZE;

  /* spatial coordinate of this lattice point */
  const float x = (8 * regionX + get_local_id(0)) * h;
  const float y = (8 * groupY + get_local_id(1)) * h;
  const float z = (8 * zRegionIndex + 2*subRegion + get_local_id(2)) * h;

  /* bin number determined by center of region */
  const int Bx = (int) floor((8 * regionX + 4) * h * BIN_INVLEN);
  const int By = (int) floor((8 * groupY + 4) * h * BIN_INVLEN);
  const int Bz = (int) floor((8 * zRegionIndex + 4) * h * BIN_INVLEN);

  const int nbrCount = *NbrListLen;
  float energy = 0.f;

  /* loop over neighbor list */
  for (int nbr = 0; nbr < nbrCount; nbr++) {
    const int i = Bx + NbrList[nbr].x;
    const int j = By + NbrList[nbr].y;
    const int k = Bz + NbrList[nbr].z;

    __global float4 *bin = bins +
      (((k*binDim_y + j)*binDim_x + i) * BIN_DEPTH);

    /* loop over atom slots in the bin; slots with zero charge are skipped */
    for (int m = 0; m < BIN_DEPTH; m++) {
      const float aq = bin[m].w;
      if (0.f != aq) {
        const float dx = bin[m].x - x;
        const float dy = bin[m].y - y;
        const float dz = bin[m].z - z;
        const float r2 = dx*dx + dy*dy + dz*dz;
        if (r2 < cutoff2) {
          const float s = (1.f - r2 * inv_cutoff2);
          energy += aq * rsqrt(r2) * s * s;
        }
      }
    }
  }

  /* store into global memory */
  out[tid] = energy;
}

Binary file not shown.

View File

@@ -1,69 +0,0 @@
#ifndef __MACROSH__
#define __MACROSH__
/*
 * Debug-print helpers, active only when __DEVICE_EMULATION__ is defined.
 *
 * When active, DEBUG is defined, EMU(code) runs `code` only for the single
 * block/thread selected by BX, BY, TX, TY and TZ, and INT / FLOAT / INT3 /
 * FLOAT4 printf the text of an expression next to its value.
 * Otherwise all five macros expand to nothing.
 *
 * NOTE(review): EMU tests blockIdx / threadIdx, which are CUDA built-ins;
 * presumably left over from a CUDA version of this code - confirm before
 * enabling emulation in an OpenCL build.
 */
#ifdef __DEVICE_EMULATION__
#define DEBUG
/* define which grid block and which thread to examine */
#define BX 0
#define BY 0
#define TX 0
#define TY 0
#define TZ 0
#define EMU(code) do { \
if (blockIdx.x==BX && blockIdx.y==BY && \
threadIdx.x==TX && threadIdx.y==TY && threadIdx.z==TZ) { \
code; \
} \
} while (0)
#define INT(n) printf("%s = %d\n", #n, n)
#define FLOAT(f) printf("%s = %g\n", #f, (double)(f))
#define INT3(n) printf("%s = %d %d %d\n", #n, (n).x, (n).y, (n).z)
#define FLOAT4(f) printf("%s = %g %g %g %g\n", #f, (double)(f).x, \
(double)(f).y, (double)(f).z, (double)(f).w)
#else
#define EMU(code)
#define INT(n)
#define FLOAT(f)
#define INT3(n)
#define FLOAT4(f)
#endif
/*
 * Report an error from OpenCL and terminate.
 *
 * Reads a variable named clStatus from the enclosing scope; when it is not
 * CL_SUCCESS, prints errorMessage and the current source line to stdout and
 * calls exit(1).
 *
 * The expansion is a bare if-block (it is not wrapped in do { } while (0)),
 * so an `else` written after an invocation would bind to this hidden `if`.
 */
#define CHECK_ERROR(errorMessage) \
if(clStatus != CL_SUCCESS) \
{ \
printf("Error: %s!\n",errorMessage); \
printf("Line: %d\n",__LINE__); \
exit(1); \
}
/*
* neighbor list:
* stored in constant memory as table of offsets
* flat index addressing is computed by kernel
*
* reserve enough memory for 11^3 stencil of grid cells
* this fits within 16K of memory
*/
#define NBRLIST_DIM 11
#define NBRLIST_MAXLEN (NBRLIST_DIM * NBRLIST_DIM * NBRLIST_DIM)
/*
* atom bins cached into shared memory for processing
*
* this reserves 4K of shared memory for 32 atom bins each containing 8 atoms,
* should permit scheduling of up to 3 thread blocks per SM
*/
#define BIN_DEPTH 8 /* max number of atoms per bin */
#define BIN_SIZE 32 /* size of bin in floats */
#define BIN_CACHE_MAXLEN 32 /* max number of atom bins to cache */
#define BIN_LENGTH 4.f /* spatial length in Angstroms */
#define BIN_INVLEN (1.f / BIN_LENGTH)
/* assuming density of 1 atom / 10 A^3, expectation is 6.4 atoms per bin
* so that bin fill should be 80% (for non-empty regions of space) */
#define REGION_SIZE 512 /* number of floats in lattice region */
#define SUB_REGION_SIZE 128 /* number of floats in lattice sub-region */
#endif

View File

@@ -1,194 +0,0 @@
/***************************************************************************
*cr
*cr (C) Copyright 2008-2010 The Board of Trustees of the
*cr University of Illinois
*cr All Rights Reserved
*cr
***************************************************************************/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <parboil.h>
#include "atom.h"
#include "cutoff.h"
#include "output.h"
#define ERRTOL 1e-4f
#define NOKERNELS 0
#define CUTOFF1 1
#define CUTOFF6 32
#define CUTOFF6OVERLAP 64
#define CUTOFFCPU 16384
/*
 * Append one "<size> <time>" record (time printed with three decimals) to
 * the text file `filename`, creating the file if it does not exist.
 *
 * Returns 0 on success and -1 if the file cannot be opened or the record
 * cannot be written; a diagnostic is printed to stdout on failure.
 */
int appenddata(const char *filename, int size, double time) {
  FILE *fp = fopen(filename, "a");
  if (fp == NULL) {
    printf("error appending to file %s..\n", filename);
    return -1;
  }

  /* Output is buffered, so a write error may only surface at fclose(). */
  int failed = (fprintf(fp, "%d %.3f\n", size, time) < 0);
  if (fclose(fp) != 0) {
    failed = 1;
  }
  if (failed) {
    printf("error appending to file %s..\n", filename);
    return -1;
  }
  return 0;
}
/*
 * Derive lattice dimensions for the axis-aligned box [lo, hi] sampled with
 * spacing h. Each axis receives floor(extent / h) + 1 points; the box origin
 * and the spacing are recorded alongside the point counts.
 */
LatticeDim
lattice_from_bounding_box(Vec3 lo, Vec3 hi, float h)
{
  LatticeDim dim;
  const float len_x = hi.x - lo.x;
  const float len_y = hi.y - lo.y;
  const float len_z = hi.z - lo.z;

  dim.lo = lo;
  dim.h = h;
  dim.nx = 1 + (int) floorf(len_x / h);
  dim.ny = 1 + (int) floorf(len_y / h);
  dim.nz = 1 + (int) floorf(len_z / h);
  return dim;
}
/*
 * Allocate a Lattice with the given dimensions and a zero-filled array of
 * potentials. The array length is nx*ny*nz rounded up to a multiple of 8
 * floats.
 *
 * Never returns NULL: on allocation failure "Out of memory" is printed to
 * stderr and the process exits with status 1. Release the result with
 * destroy_lattice().
 */
Lattice *
create_lattice(LatticeDim dim)
{
  size_t npoints;
  size_t size;
  Lattice *lat = (Lattice *)malloc(sizeof(Lattice));
  if (lat == NULL) {
    fprintf(stderr, "Out of memory\n");
    exit(1);
  }
  lat->dim = dim;

  /* Count points in size_t: the product of three ints can overflow int
   * for large lattices. */
  npoints = (size_t)dim.nx * (size_t)dim.ny * (size_t)dim.nz;

  /* Round up the allocated size to a multiple of 8 */
  size = (npoints + 7) & ~(size_t)7;

  lat->lattice = (float *)calloc(size, sizeof(float));
  if (lat->lattice == NULL) {
    fprintf(stderr, "Out of memory\n");
    exit(1);
  }
  return lat;
}
/* Release a Lattice and its potential array; a NULL argument is ignored. */
void
destroy_lattice(Lattice *lat)
{
  if (lat == NULL) {
    return;
  }
  free(lat->lattice);
  free(lat);
}
/*
 * Benchmark driver.
 *
 * Parses the Parboil command line, obtains the atoms through
 * read_atom_file(), pads their bounding box by `padding`, builds a lattice
 * with spacing `h`, runs gpu_compute_cutoff_potential_lattice() on it,
 * applies remove_exclusions(), and finally prints the timer set.
 *
 * Returns 0 on success; every failure path calls exit(1).
 */
int main(int argc, char *argv[]) {
Atoms *atom;
LatticeDim lattice_dim;
Lattice *gpu_lattice;
Vec3 min_ext, max_ext; /* Bounding box of atoms */
Vec3 lo, hi; /* Bounding box with padding */
float h = 0.5f; /* Lattice spacing */
float cutoff = 12.f; /* Cutoff radius */
float exclcutoff = 1.f; /* Radius for exclusion */
float padding = 0.5f; /* Bounding box padding distance */
int n; /* NOTE(review): not referenced anywhere in main */
struct pb_Parameters *parameters;
struct pb_TimerSet timers;
/* Read input parameters */
parameters = pb_ReadParameters(&argc, argv);
if (parameters == NULL) {
exit(1);
}
/*
 * Replace the parsed input list with a single hard-coded file name.
 * NOTE(review): the inpFiles pointer set by pb_ReadParameters() is
 * overwritten without being freed, and neither malloc() result is checked.
 */
parameters->inpFiles = (char **)malloc(sizeof(char *) * 2);
parameters->inpFiles[0] = (char *)malloc(100);
parameters->inpFiles[1] = NULL;
/* The literal is shorter than 100 bytes, so the copy is NUL-terminated. */
strncpy(parameters->inpFiles[0], "watbox.sl40.pqr", 100);
/* Expect one input file */
if (pb_Parameters_CountInputs(parameters) != 1) {
fprintf(stderr, "Expecting one input file\n");
exit(1);
}
pb_InitializeTimerSet(&timers);
pb_SwitchToTimer(&timers, pb_TimerID_IO);
/* Progress marker printed before the atoms are loaded. */
printf("OK\n");
{
const char *pqrfilename = parameters->inpFiles[0];
if (!(atom = read_atom_file(pqrfilename))) {
fprintf(stderr, "read_atom_file() failed\n");
exit(1);
}
printf("read %d atoms from file '%s'\n", atom->size, pqrfilename);
}
/* Progress marker printed after the atoms are loaded. */
printf("OK\n");
/* find extent of domain */
pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
get_atom_extent(&min_ext, &max_ext, atom);
printf("extent of domain is:\n");
printf(" minimum %g %g %g\n", min_ext.x, min_ext.y, min_ext.z);
printf(" maximum %g %g %g\n", max_ext.x, max_ext.y, max_ext.z);
printf("padding domain by %g Angstroms\n", padding);
/* Grow the atom bounding box by `padding` on every side. */
lo = (Vec3) {min_ext.x - padding, min_ext.y - padding, min_ext.z - padding};
hi = (Vec3) {max_ext.x + padding, max_ext.y + padding, max_ext.z + padding};
printf("domain lengths are %g by %g by %g\n", hi.x-lo.x, hi.y-lo.y, hi.z-lo.z);
lattice_dim = lattice_from_bounding_box(lo, hi, h);
gpu_lattice = create_lattice(lattice_dim);
printf("\n");
/*
* Run OpenCL kernel
* (Begin and end with COMPUTE timer active)
*/
/* A non-zero return value is treated as failure. */
if (gpu_compute_cutoff_potential_lattice(&timers, gpu_lattice, cutoff, atom, 0, parameters)) {
fprintf(stderr, "Computation failed\n");
exit(1);
}
/*
* Zero the lattice points that are too close to an atom. This is
* necessary for numerical stability.
*/
if (remove_exclusions(gpu_lattice, exclcutoff, atom)) {
fprintf(stderr, "remove_exclusions() failed for gpu lattice\n");
exit(1);
}
printf("\n");
pb_SwitchToTimer(&timers, pb_TimerID_IO);
/* Print output */
/* The summary writer call is commented out, so nothing is written even
 * when an output file was requested. */
if (parameters->outFile) {
//write_lattice_summary(parameters->outFile, gpu_lattice);
}
pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
/* Cleanup */
destroy_lattice(gpu_lattice);
free_atom(atom);
/* Stop all timers before reporting them. */
pb_SwitchToTimer(&timers, pb_TimerID_NONE);
pb_PrintTimerSet(&timers);
pb_FreeParameters(parameters);
return 0;
}

View File

@@ -1,49 +0,0 @@
#include <CL/cl.h>
#include <stdio.h>
#include <string.h>
#include "ocl.h"
/*
 * Read an entire file into a newly allocated, NUL-terminated buffer.
 *
 * fileName: path of the file to read.
 *
 * Returns a malloc()ed buffer holding the file contents followed by a
 * terminating 0 byte. The caller owns the buffer and must free() it.
 *
 * Never returns NULL. On failure a short diagnostic is printed to stdout and
 * the process exits with status 1:
 *   "Error 1!" - the file cannot be opened or its size cannot be determined
 *   "Error 2!" - out of memory
 *   "Error 3!" - fewer bytes were read than the file size
 */
char* readFile(const char* fileName)
{
    /* Binary mode, so that the byte count reported by ftell() matches what
     * fread() delivers on platforms that translate line endings in text
     * mode. */
    FILE* fp = fopen(fileName, "rb");
    if (fp == NULL)
    {
        printf("Error 1!\n");
        exit(1);
    }

    long size = -1;
    if (fseek(fp, 0, SEEK_END) == 0)
    {
        size = ftell(fp);
    }
    if (size < 0)
    {
        printf("Error 1!\n");
        fclose(fp);
        exit(1);
    }
    rewind(fp);

    char* buffer = (char*)malloc((size_t)size + 1);
    if (buffer == NULL)
    {
        printf("Error 2!\n");
        fclose(fp);
        exit(1);
    }

    size_t res = fread(buffer, 1, (size_t)size, fp);
    if (res != (size_t)size)
    {
        printf("Error 3!\n");
        free(buffer);
        fclose(fp);
        exit(1);
    }

    buffer[size] = 0;
    fclose(fp);
    return buffer;
}
/*
 * Fill the first `size` bytes of an OpenCL buffer with the byte value `val`.
 *
 * A host staging buffer of `size` bytes is filled with memset() and copied
 * to `buf` with a blocking clEnqueueWriteBuffer() on `clCommandQueue`.
 *
 * Does not return on failure: if the staging buffer cannot be allocated or
 * the write fails, an error and the source line are printed to stdout and
 * the process exits with status 1.
 */
void clMemSet(cl_command_queue clCommandQueue, cl_mem buf, int val, size_t size)
{
    cl_int clStatus;
    char* temp = (char*)malloc(size);
    if (temp == NULL)
    {
        /* Same report format as CHECK_ERROR; memset(NULL, ...) is not safe. */
        printf("Error: %s!\n", "malloc");
        printf("Line: %d\n", __LINE__);
        exit(1);
    }
    memset(temp, val, size);
    /* CL_TRUE makes the write blocking, so temp may be freed right after. */
    clStatus = clEnqueueWriteBuffer(clCommandQueue,buf,CL_TRUE,0,size,temp,0,NULL,NULL);
    CHECK_ERROR("clEnqueueWriteBuffer")
    free(temp);
}

View File

@@ -1,17 +0,0 @@
#ifndef __OCLH__
#define __OCLH__
#include <stdlib.h>
void clMemSet(cl_command_queue, cl_mem, int, size_t);
char* readFile(const char*);
/*
 * Report an OpenCL error and terminate.
 *
 * Reads a variable named clStatus from the enclosing scope; when it is not
 * CL_SUCCESS, prints errorMessage and the current source line to stdout and
 * calls exit(1).
 *
 * The expansion is a bare if-block (it is not wrapped in do { } while (0)),
 * so an `else` written after an invocation would bind to this hidden `if`.
 */
#define CHECK_ERROR(errorMessage) \
if(clStatus != CL_SUCCESS) \
{ \
printf("Error: %s!\n",errorMessage); \
printf("Line: %d\n",__LINE__); \
exit(1); \
}
#endif

View File

@@ -1,67 +0,0 @@
/***************************************************************************
*cr
*cr (C) Copyright 2008-2010 The Board of Trustees of the
*cr University of Illinois
*cr All Rights Reserved
*cr
***************************************************************************/
#include <stdio.h>
#include <stdlib.h>
#include <inttypes.h>
#include <math.h>
#include <parboil.h>
#include "atom.h"
#include "cutoff.h"
/*
 * Write a compact binary summary of a lattice to `filename`.
 *
 * The records are raw in-memory values (native byte order), in this order:
 *   1. float        - sum of the absolute values of all nx*ny*nz lattice
 *                     potentials (accumulated in double)
 *   2. uint32_t     - number of points in one z-plane (nx*ny)
 *   3. float[nx*ny] - the plane at z = 0
 *   4. float[nx*ny] - the plane at z = nz-1
 *
 * Exits with status 1 if the file cannot be opened or written.
 */
void
write_lattice_summary(const char *filename, Lattice *lattice)
{
  const float *lattice_data = lattice->lattice;
  /* Index arithmetic is done in size_t so large lattices cannot overflow int. */
  const size_t plane_size = (size_t) lattice->dim.nx * (size_t) lattice->dim.ny;
  const size_t nz = (size_t) lattice->dim.nz;
  const size_t npoints = plane_size * nz;
  int ok = 1;

  /* Open output file; "wb" because the payload is raw binary data */
  FILE *outfile = fopen(filename, "wb");
  if (outfile == NULL) {
    fprintf(stderr, "Cannot open output file\n");
    exit(1);
  }

  /* Write the sum of the absolute values of all lattice potentials */
  {
    double abspotential = 0.0;
    float tmp;
    size_t i;

    for (i = 0; i < npoints; i++)
      abspotential += fabs((double) lattice_data[i]);

    tmp = (float) abspotential;
    if (fwrite(&tmp, sizeof(float), 1, outfile) != 1)
      ok = 0;
  }

  /* Write the size of a lattice plane */
  {
    uint32_t tmp = (uint32_t) plane_size;
    if (fwrite(&tmp, sizeof(uint32_t), 1, outfile) != 1)
      ok = 0;
  }

  /* Write the plane of lattice data at z=0 and z = nz-1
   * (assumes nz >= 1, as the original code did) */
  {
    const float *last_plane = lattice_data + (nz - 1) * plane_size;

    if (fwrite(lattice_data, sizeof(float), plane_size, outfile) != plane_size)
      ok = 0;
    if (fwrite(last_plane, sizeof(float), plane_size, outfile) != plane_size)
      ok = 0;
  }

  /* Cleanup; buffered data is flushed here, so fclose() can fail too */
  if (fclose(outfile) != 0)
    ok = 0;

  if (!ok) {
    fprintf(stderr, "Cannot write output file\n");
    exit(1);
  }
}

View File

@@ -1,25 +0,0 @@
/***************************************************************************
*cr
*cr (C) Copyright 2008-2010 The Board of Trustees of the
*cr University of Illinois
*cr All Rights Reserved
*cr
***************************************************************************/
#ifndef OUTPUT_H
#define OUTPUT_H
#include "cutoff.h"
#ifdef __cplusplus
extern "C" {
#endif
/* Write a binary summary of `lattice` to the file named `filename`. */
void
write_lattice_summary(const char *filename, Lattice *lattice);
#ifdef __cplusplus
}
#endif
#endif

View File

@@ -1,348 +0,0 @@
/*
* (c) 2010 The Board of Trustees of the University of Illinois.
*/
#ifndef PARBOIL_HEADER
#define PARBOIL_HEADER
#include <stdio.h>
#include <string.h>
#ifdef __cplusplus
extern "C" {
#endif
#include <unistd.h>
/* A platform as specified by the user on the command line */
struct pb_PlatformParam {
char *name; /* The platform name. This string is owned. */
char *version; /* The platform version; may be NULL.
* This string is owned. */
};
/* Create a PlatformParam from the given strings.
* 'name' must not be NULL. 'version' may be NULL.
* If not NULL, the strings should have been allocated by malloc(),
* and they will be owned by the returned object.
*/
struct pb_PlatformParam *
pb_PlatformParam(char *name, char *version);
void
pb_FreePlatformParam(struct pb_PlatformParam *);
/* A criterion for how to select a device */
enum pb_DeviceSelectionCriterion {
pb_Device_INDEX, /* Enumerate the devices and select one
* by its number */
pb_Device_CPU, /* Select a CPU device */
pb_Device_GPU, /* Select a GPU device */
pb_Device_ACCELERATOR, /* Select an accelerator device */
pb_Device_NAME /* Select a device by name */
};
/* A device as specified by the user on the command line */
struct pb_DeviceParam {
enum pb_DeviceSelectionCriterion criterion;
union {
int index; /* If criterion == pb_Device_INDEX,
* the index of the device */
char *name; /* If criterion == pb_Device_NAME,
* the name of the device.
* This string is owned. */
};
};
struct pb_DeviceParam *
pb_DeviceParam_index(int index);
struct pb_DeviceParam *
pb_DeviceParam_cpu(void);
struct pb_DeviceParam *
pb_DeviceParam_gpu(void);
struct pb_DeviceParam *
pb_DeviceParam_accelerator(void);
/* Create a by-name device selection criterion.
* The string should have been allocated by malloc(), and it will be
* owned by the returned object.
*/
struct pb_DeviceParam *
pb_DeviceParam_name(char *name);
void
pb_FreeDeviceParam(struct pb_DeviceParam *);
/* Command line parameters for benchmarks */
struct pb_Parameters {
char *outFile; /* If not NULL, the raw output of the
* computation should be saved to this
* file. The string is owned. */
char **inpFiles; /* A NULL-terminated array of strings
* holding the input file(s) for the
* computation. The array and strings
* are owned. */
struct pb_PlatformParam *platform; /* If not NULL, the platform
* specified on the command line. */
struct pb_DeviceParam *device; /* If not NULL, the device
* specified on the command line. */
};
/* Read command-line parameters.
*
* The argc and argv parameters to main are read, and any parameters
* interpreted by this function are removed from the argument list.
*
* A new instance of struct pb_Parameters is returned.
* If there is an error, then an error message is printed on stderr
* and NULL is returned.
*/
struct pb_Parameters *
pb_ReadParameters(int *_argc, char **argv);
/* Free an instance of struct pb_Parameters.
*/
void
pb_FreeParameters(struct pb_Parameters *p);
void
pb_FreeStringArray(char **);
/* Count the number of input files in a pb_Parameters instance.
*/
int
pb_Parameters_CountInputs(struct pb_Parameters *p);
/* A time or duration. */
//#if _POSIX_VERSION >= 200112L
typedef unsigned long long pb_Timestamp; /* time in microseconds */
//#else
//# error "Timestamps not implemented"
//#endif
enum pb_TimerState {
pb_Timer_STOPPED,
pb_Timer_RUNNING,
};
struct pb_Timer {
enum pb_TimerState state;
pb_Timestamp elapsed; /* Amount of time elapsed so far */
pb_Timestamp init; /* Beginning of the current time interval,
* if state is RUNNING. End of the last
* recorded time interval otherwise. */
};
/* Reset a timer.
* Use this to initialize a timer or to clear
* its elapsed time. The reset timer is stopped.
*/
void
pb_ResetTimer(struct pb_Timer *timer);
/* Start a timer. The timer is set to RUNNING mode and
* time elapsed while the timer is running is added to
* the timer.
* The timer should not already be running.
*/
void
pb_StartTimer(struct pb_Timer *timer);
/* Stop a timer.
* This stops adding elapsed time to the timer.
* The timer should not already be stopped.
*/
void
pb_StopTimer(struct pb_Timer *timer);
/* Get the elapsed time in seconds. */
double
pb_GetElapsedTime(struct pb_Timer *timer);
/* Execution time is assigned to one of these categories. */
enum pb_TimerID {
pb_TimerID_NONE = 0,
pb_TimerID_IO, /* Time spent in input/output */
pb_TimerID_KERNEL, /* Time spent computing on the device,
* recorded asynchronously */
pb_TimerID_COPY, /* Time spent synchronously moving data
* to/from device and allocating/freeing
* memory on the device */
pb_TimerID_DRIVER, /* Time spent in the host interacting with the
* driver, primarily for recording the time
* spent queueing asynchronous operations */
pb_TimerID_COPY_ASYNC, /* Time spent in asynchronous transfers */
pb_TimerID_COMPUTE, /* Time for all program execution other
* than parsing command line arguments,
* I/O, kernel, and copy */
pb_TimerID_OVERLAP, /* Time double-counted in asynchronous and
* host activity: automatically filled in,
* not intended for direct usage */
pb_TimerID_LAST /* Number of timer IDs */
};
/* Dynamic list of asynchronously tracked times between events */
struct pb_async_time_marker_list {
char *label; // actually just a pointer to a string
enum pb_TimerID timerID; /* The ID to which the interval beginning
* with this marker should be attributed */
void * marker;
//cudaEvent_t marker; /* The driver event for this marker */
struct pb_async_time_marker_list *next;
};
/* One labeled timer; `next` points to another pb_SubTimer, so nodes can be
 * chained into a singly linked list. */
struct pb_SubTimer {
char *label;
struct pb_Timer timer;
struct pb_SubTimer *next;
};
/* Bookkeeping for a chain of pb_SubTimer nodes.
 * NOTE(review): presumably `subtimer_list` is the head of the chain and
 * `current` the sub-timer currently selected - confirm against the parboil
 * implementation. */
struct pb_SubTimerList {
struct pb_SubTimer *current;
struct pb_SubTimer *subtimer_list;
};
/* A set of timers for recording execution times. */
struct pb_TimerSet {
enum pb_TimerID current;
struct pb_async_time_marker_list* async_markers;
pb_Timestamp async_begin;
pb_Timestamp wall_begin;
struct pb_Timer timers[pb_TimerID_LAST];
struct pb_SubTimerList *sub_timer_list[pb_TimerID_LAST];
};
/* Reset all timers in the set. */
void
pb_InitializeTimerSet(struct pb_TimerSet *timers);
void
pb_AddSubTimer(struct pb_TimerSet *timers, char *label, enum pb_TimerID pb_Category);
/* Select which timer the next interval of time should be accounted
* to. The selected timer is started and other timers are stopped.
* Using pb_TimerID_NONE stops all timers. */
void
pb_SwitchToTimer(struct pb_TimerSet *timers, enum pb_TimerID timer);
void
pb_SwitchToSubTimer(struct pb_TimerSet *timers, char *label, enum pb_TimerID category);
/* Print timer values to standard output. */
void
pb_PrintTimerSet(struct pb_TimerSet *timers);
/* Release timer resources */
void
pb_DestroyTimerSet(struct pb_TimerSet * timers);
void
pb_SetOpenCL(void *clContextPtr, void *clCommandQueuePtr);
typedef struct pb_Device_tag {
char* name;
void* clDevice;
int id;
unsigned int in_use;
unsigned int available;
} pb_Device;
struct pb_Context_tag;
typedef struct pb_Context_tag pb_Context;
typedef struct pb_Platform_tag {
char* name;
char* version;
void* clPlatform;
unsigned int in_use;
pb_Context** contexts;
pb_Device** devices;
} pb_Platform;
struct pb_Context_tag {
void* clPlatformId;
void* clContext;
void* clDeviceId;
pb_Platform* pb_platform;
pb_Device* pb_device;
};
// verbosely print out list of platforms and their devices to the console.
pb_Platform**
pb_GetPlatforms();
// Choose a platform according to the given platform specification
pb_Platform*
pb_GetPlatform(struct pb_PlatformParam *platform);
// choose a platform: by name, name & version
pb_Platform*
pb_GetPlatformByName(const char* name);
pb_Platform*
pb_GetPlatformByNameAndVersion(const char* name, const char* version);
// Choose a device according to the given device specification
pb_Device*
pb_GetDevice(pb_Platform* pb_platform, struct pb_DeviceParam *device);
pb_Device**
pb_GetDevices(pb_Platform* pb_platform);
// choose a device by name.
pb_Device*
pb_GetDeviceByName(pb_Platform* pb_platform, const char* name);
pb_Platform*
pb_GetPlatformByEnvVars();
pb_Context*
pb_InitOpenCLContext(struct pb_Parameters* parameters);
void
pb_ReleasePlatforms();
void
pb_ReleaseContext(pb_Context* c);
void
pb_PrintPlatformInfo(pb_Context* c);
void
perf_init();
//#define MEASURE_KERNEL_TIME
#include <CL/cl.h>
#ifdef MEASURE_KERNEL_TIME
#define clEnqueueNDRangeKernel(q,k,d,o,dg,db,a,b,c) pb_clEnqueueNDRangeKernel((q), (k), (d), (o), (dg), (db), (a), (b), (c))
cl_int
pb_clEnqueueNDRangeKernel(cl_command_queue /* command_queue */,
cl_kernel /* kernel */,
cl_uint /* work_dim */,
const size_t * /* global_work_offset */,
const size_t * /* global_work_size */,
const size_t * /* local_work_size */,
cl_uint /* num_events_in_wait_list */,
const cl_event * /* event_wait_list */,
cl_event * /* event */);
#endif
enum { T_FLOAT, T_DOUBLE, T_SHORT, T_INT, T_UCHAR };
void pb_sig_float(char*, float*, int);
void pb_sig_double(char*, double*, int);
void pb_sig_short(char*, short*, int);
void pb_sig_int(char*, int*, int);
void pb_sig_uchar(char*, unsigned char*, unsigned int);
void pb_sig_clmem(char*, cl_command_queue, cl_mem, int);
#ifdef __cplusplus
}
#endif
#endif //PARBOIL_HEADER

File diff suppressed because it is too large Load Diff

View File

@@ -1,139 +0,0 @@
/***************************************************************************
*cr
*cr (C) Copyright 2008-2010 The Board of Trustees of the
*cr University of Illinois
*cr All Rights Reserved
*cr
***************************************************************************/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include "atom.h"
#define LINELEN 96
#define INITLEN 20
/*
 * Build an Atoms container.
 *
 * The PQR parser below is compiled out (#if 0), so `fname` is currently
 * ignored: the function instead fills an array of INITLEN synthetic atoms
 * placed at (i, i+1, i+2) with charge 1.
 *
 * NOTE(review): with the parser disabled `cnt` is never incremented, so the
 * returned `size` is 0 even though INITLEN atoms were initialised - confirm
 * whether `size` was meant to be INITLEN.
 *
 * Returns a heap-allocated Atoms (release it with free_atom()), or NULL if
 * memory cannot be allocated.
 */
Atoms *read_atom_file(const char *fname)
{
  Atom *atom; /* Atom array */
  int len = INITLEN; /* Size of atom array */
  int cnt = 0; /* Number of atoms read */
  int i;

  /* allocate initial atom array */
  atom = (Atom *) malloc(len * sizeof(Atom));
  if (NULL==atom) {
    fprintf(stderr, "can't allocate memory\n");
    return NULL;
  }

  /* synthetic placeholder atoms used while the file parser is disabled */
  for (i = 0; i < len; ++i) {
    atom[i].x = i+0;
    atom[i].y = i+1;
    atom[i].z = i+2;
    atom[i].q = 1;
  }

#if 0
  /* parser state lives inside the disabled region so that it does not
   * trigger unused-variable warnings while the parser is compiled out */
  FILE *file;
  char line[LINELEN];

  /* open atom "pqr" file */
  file = fopen(fname, "r");
  if (NULL==file) {
    fprintf(stderr, "can't open file \"%s\" for reading\n", fname);
    return NULL;
  }
  /* loop to read pqr file line by line */
  while (fgets(line, LINELEN, file) != NULL) {
    if (strncmp(line, "ATOM ", 6) != 0 && strncmp(line, "HETATM", 6) != 0) {
      continue; /* skip anything that isn't an atom record */
    }
    if (cnt==len) { /* extend atom array */
      void *tmp = realloc(atom, 2*len*sizeof(Atom));
      if (NULL==tmp) {
        fprintf(stderr, "can't allocate more memory\n");
        return NULL;
      }
      atom = (Atom *) tmp;
      len *= 2;
    }
    /* read position coordinates and charge from atom record */
    if (sscanf(line, "%*s %*d %*s %*s %*d %f %f %f %f", &(atom[cnt].x),
          &(atom[cnt].y), &(atom[cnt].z), &(atom[cnt].q)) != 4) {
      fprintf(stderr, "atom record %d does not have expected format\n", cnt+1);
      return NULL;
    }
    cnt++; /* count atoms as we store them */
  }
  /* verify EOF and close file */
  if ( !feof(file) ) {
    fprintf(stderr, "did not find EOF\n");
    return NULL;
  }
  if (fclose(file)) {
    fprintf(stderr, "can't close file\n");
    return NULL;
  }
#endif

  /* Build the output data structure */
  {
    Atoms *out = (Atoms *)malloc(sizeof(Atoms));
    if (NULL == out) {
      fprintf(stderr, "can't allocate memory\n");
      free(atom); /* do not leak the atom array on this failure path */
      return NULL;
    }
    out->size = cnt;
    out->atoms = atom;
    return out;
  }
}
/* Release an Atoms container together with its atom array; NULL is ignored. */
void free_atom(Atoms *atom)
{
  if (NULL == atom) {
    return;
  }
  free(atom->atoms);
  free(atom);
}
/*
 * Compute the axis-aligned bounding box of the atom positions.
 *
 * The box is seeded from the first array element and then widened with every
 * further atom up to atom->size. The minimum corner is stored in *out_lo and
 * the maximum corner in *out_hi.
 */
void
get_atom_extent(Vec3 *out_lo, Vec3 *out_hi, Atoms *atom)
{
  Atom *list = atom->atoms;
  int count = atom->size;
  Vec3 box_min;
  Vec3 box_max;
  int idx;

  /* seed both corners with the first atom */
  box_min.x = box_max.x = list[0].x;
  box_min.y = box_max.y = list[0].y;
  box_min.z = box_max.z = list[0].z;

  for (idx = 1; idx < count; idx++) {
    const Atom *a = &list[idx];

    box_min.x = fminf(box_min.x, a->x);
    box_min.y = fminf(box_min.y, a->y);
    box_min.z = fminf(box_min.z, a->z);

    box_max.x = fmaxf(box_max.x, a->x);
    box_max.y = fmaxf(box_max.y, a->y);
    box_max.z = fmaxf(box_max.z, a->z);
  }

  *out_lo = box_min;
  *out_hi = box_max;
}

File diff suppressed because it is too large Load Diff

View File

@@ -1,68 +1,47 @@
RISCV_TOOL_PATH ?= $(wildcard ../../../../riscv-gnu-toolchain/drops)
POCL_CC_PATH ?= $(wildcard ../../../../pocl/drops_riscv_cc)
POCL_INC_PATH ?= $(wildcard ../include)
POCL_LIB_PATH ?= $(wildcard ../lib)
VX_RT_PATH ?= $(wildcard ../../../runtime)
VX_SIMX_PATH ?= $(wildcard ../../../simX/obj_dir)
LLVM_HOME ?= ~/dev/llvm-project/drops
TOOLCHAIN_PATH ?= ~/dev/riscv-gnu-toolchain/drops
SYSROOT ?= $(TOOLCHAIN_PATH)/riscv32-unknown-elf
POCL_CC_PATH ?= $(realpath ../compiler)
POCL_RT_PATH ?= $(realpath ../runtime)
VORTEX_DRV_PATH ?= $(realpath ../../../driver/sw)
VORTEX_RT_PATH ?= $(realpath ../../../runtime)
CC = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-gcc
CXX = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-g++
DMP = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-objdump
HEX = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-objcopy
GDB = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-gdb
CXXFLAGS += -std=c++11 -O0 -g -fpermissive -Wall -Wextra -pedantic -Wfatal-errors
VX_SRCS = $(VX_RT_PATH)/newlib/newlib.c
VX_SRCS += $(VX_RT_PATH)/startup/vx_start.S
VX_SRCS += $(VX_RT_PATH)/intrinsics/vx_intrinsics.s
VX_SRCS += $(VX_RT_PATH)/io/vx_io.s $(VX_RT_PATH)/io/vx_io.c
VX_SRCS += $(VX_RT_PATH)/fileio/fileio.s
VX_SRCS += $(VX_RT_PATH)/tests/tests.c
VX_SRCS += $(VX_RT_PATH)/vx_api/vx_api.c
VX_SRCS += $(VX_STR) $(VX_FIO) $(VX_NEWLIB) $(VX_INT) $(VX_IO) $(VX_API) $(VX_TEST)
CXXFLAGS += -I$(POCL_RT_PATH)/include
VX_CFLAGS = -nostartfiles -Wl,-Bstatic,-T,$(VX_RT_PATH)/mains/vortex_link.ld
LDFLAGS += -L$(POCL_RT_PATH)/lib -L$(VORTEX_DRV_PATH)/stub -lOpenCL -lvortex
CXXFLAGS = -g -O0 -march=rv32im -mabi=ilp32
CXXFLAGS += -ffreestanding # program may not begin at main()
CXXFLAGS += -Wl,--gc-sections # enable garbage collection of unused input sections
CXXFLAGS += -fno-rtti -fno-non-call-exceptions # disable RTTI and exceptions
CXXFLAGS += -I$(POCL_INC_PATH)
VX_LIBS = -Wl,--whole-archive lib$(PROJECT).a -Wl,--no-whole-archive $(POCL_LIB_PATH)/libOpenCL.a
QEMU_LIBS = $(VX_RT_PATH)/qemu/vx_api.c -Wl,--whole-archive lib$(PROJECT).a -Wl,--no-whole-archive $(POCL_LIB_PATH)/libOpenCL.a
PROJECT = gaussian
PROJECT = guassian
SRCS = main.cc clutils.cpp utils.cpp
all: $(PROJECT).dump $(PROJECT).hex
all: $(PROJECT)
lib$(PROJECT).a: kernel.cl
POCL_DEBUG=all POCL_DEBUG_LLVM_PASSES=1 LD_LIBRARY_PATH=$(RISCV_TOOL_PATH)/lib:$(POCL_CC_PATH)/lib $(POCL_CC_PATH)/bin/poclcc -o lib$(PROJECT).a kernel.cl
kernel.pocl: kernel.cl
TOOLCHAIN_PATH=$(TOOLCHAIN_PATH) SYSROOT=$(SYSROOT) LLVM_HOME=$(LLVM_HOME) VORTEX_RUNTIME_PATH=$(VORTEX_RT_PATH) POCL_DEBUG=all LD_LIBRARY_PATH=$(LLVM_HOME)/lib:$(POCL_CC_PATH)/lib $(POCL_CC_PATH)/bin/poclcc -o kernel.pocl kernel.cl
$(PROJECT): $(SRCS)
$(CXX) $(CXXFLAGS) $^ $(LDFLAGS) -o $@
$(PROJECT).elf: $(SRCS) lib$(PROJECT).a
$(CXX) $(CXXFLAGS) $(VX_CFLAGS) $(VX_SRCS) $(SRCS) $(VX_LIBS) -o $(PROJECT).elf
run-fpga: $(PROJECT) kernel.pocl
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/opae:$(LD_LIBRARY_PATH) ./$(PROJECT)
$(PROJECT).qemu: $(SRCS) lib$(PROJECT).a
$(CXX) $(CXXFLAGS) $(SRCS) $(QEMU_LIBS) -o $(PROJECT).qemu
run-ase: $(PROJECT) kernel.pocl
ASE_LOG=0 LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/opae/ase:$(LD_LIBRARY_PATH) ./$(PROJECT)
$(PROJECT).hex: $(PROJECT).elf
$(HEX) -O ihex $(PROJECT).elf $(PROJECT).hex
run-simx: $(PROJECT) kernel.pocl
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/simx:$(LD_LIBRARY_PATH) ./$(PROJECT)
$(PROJECT).dump: $(PROJECT).elf
$(DMP) -D $(PROJECT).elf > $(PROJECT).dump
run-rtlsim: $(PROJECT) kernel.pocl
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/rtlsim:$(LD_LIBRARY_PATH) ./$(PROJECT)
run: $(PROJECT).hex
POCL_DEBUG=all $(VX_SIMX_PATH)/Vcache_simX -E -a rv32i --core $(PROJECT).hex -s -b 1> emulator.debug
qemu: $(PROJECT).qemu
POCL_DEBUG=all $(RISCV_TOOL_PATH)/bin/qemu-riscv32 -d in_asm -D debug.log $(PROJECT).qemu
gdb-s: $(PROJECT).qemu
POCL_DEBUG=all $(RISCV_TOOL_PATH)/bin/qemu-riscv32 -g 1234 -d in_asm -D debug.log $(PROJECT).qemu
gdb-c: $(PROJECT).qemu
$(GDB) $(PROJECT).qemu
.depend: $(SRCS)
$(CXX) $(CXXFLAGS) -MM $^ > .depend;
clean:
rm -rf *.o *.elf *.dump *.hex *.qemu *.log *.debug
rm -rf $(PROJECT) *.o *.dump .depend
ifneq ($(MAKECMDGOALS),clean)
-include .depend
endif

View File

@@ -782,6 +782,27 @@ void cl_writeToZCBuffer(cl_mem mem, void* data, size_t size)
cl_unmapBuffer(mem, ptr);
}
// Read an entire file into a newly malloc()ed buffer.
//
// filename: path of the kernel binary to load.
// data:     receives the buffer; the caller must free() it.
// size:     receives the number of bytes read.
//
// Returns 0 on success. Returns -1 on a null argument or on any I/O or
// allocation failure, in which case *data and *size are left untouched.
static int read_kernel_file(const char* filename, uint8_t** data, size_t* size) {
  if (nullptr == filename || nullptr == data || nullptr == size)
    return -1;

  // Binary mode: the byte count from ftell() must match what fread() returns.
  FILE* fp = fopen(filename, "rb");
  if (NULL == fp) {
    fprintf(stderr, "Failed to load kernel.");
    return -1;
  }

  long fsize = -1;
  if (0 == fseek(fp, 0, SEEK_END))
    fsize = ftell(fp);
  if (fsize < 0) {
    fclose(fp);
    return -1;
  }
  rewind(fp);

  // Allocate at least one byte so an empty file still yields a valid pointer.
  uint8_t* buffer = (uint8_t*)malloc(fsize > 0 ? (size_t)fsize : 1);
  if (nullptr == buffer) {
    fclose(fp);
    return -1;
  }

  size_t nread = fread(buffer, 1, (size_t)fsize, fp);
  fclose(fp);

  // A short read would hand a truncated binary to the caller; report it.
  if (nread != (size_t)fsize) {
    free(buffer);
    return -1;
  }

  *data = buffer;
  *size = nread;
  return 0;
}
//-------------------------------------------------------
// Program and kernels
//-------------------------------------------------------
@@ -839,11 +860,20 @@ cl_program cl_compileProgram(char* kernelPath, char* compileoptions, bool verbos
// Create the program object
//cl_program clProgramReturn = clCreateProgramWithSource(context, 1, (const char **)&source, NULL, &status);
cl_program clProgramReturn = clCreateProgramWithBuiltInKernels(context, 1, &device, "Fan1;Fan2", &status);
//cl_program clProgramReturn = clCreateProgramWithBuiltInKernels(context, 1, &device, "Fan1;Fan2", &status);
// read kernel binary from file
uint8_t *kernel_bin = NULL;
size_t kernel_size;
cl_int binary_status = 0;
status = read_kernel_file("kernel.pocl", &kernel_bin, &kernel_size);
cl_errChk(status, "read_kernel_file", true);
cl_program clProgramReturn = clCreateProgramWithBinary(
context, 1, &device, &kernel_size, &kernel_bin, &binary_status, &status);
free(kernel_bin);
cl_errChk(status, "Creating program", true);
free(source);
fclose(fp);
//free(source);
//fclose(fp);
// Try to compile the program
status = clBuildProgram(clProgramReturn, 0, NULL, compileoptions, NULL, NULL);

Binary file not shown.

View File

@@ -94,10 +94,9 @@ void ForwardSub(cl_context context, float *a, float *b, float *m, int size,
cl_event writeEvent, kernelEvent, readEvent;
float writeTime = 0, readTime = 0, kernelTime = 0;
float writeMB = 0, readMB = 0;
gaussianElim_program =
cl_compileProgram((char *)"gaussianElim_kernels.cl", NULL);
gaussianElim_program = cl_compileProgram((char *)"gaussianElim_kernels.cl", NULL);
fan1_kernel = clCreateKernel(gaussianElim_program, "Fan1", &status);
status = cl_errChk(status, (char *)"Error Creating Fan1 kernel", true);
if (status)

2
benchmarks/opencl/kmeans/.gitignore vendored Normal file
View File

@@ -0,0 +1,2 @@
kmeans

View File

@@ -1,79 +1,47 @@
RISCV_TOOL_PATH ?= $(wildcard ../../../../riscv-gnu-toolchain/drops)
POCL_CC_PATH ?= $(wildcard ../../../../pocl/drops_riscv_cc)
POCL_INC_PATH ?= $(wildcard ../include)
POCL_LIB_PATH ?= $(wildcard ../lib)
VX_RT_PATH ?= $(wildcard ../../../runtime)
VX_SIMX_PATH ?= $(wildcard ../../../simX/obj_dir)
LLVM_HOME ?= ~/dev/llvm-project/drops
TOOLCHAIN_PATH ?= ~/dev/riscv-gnu-toolchain/drops
SYSROOT ?= $(TOOLCHAIN_PATH)/riscv32-unknown-elf
POCL_CC_PATH ?= $(realpath ../compiler)
POCL_RT_PATH ?= $(realpath ../runtime)
VORTEX_DRV_PATH ?= $(realpath ../../../driver/sw)
VORTEX_RT_PATH ?= $(realpath ../../../runtime)
CC = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-gcc
CXX = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-g++
DMP = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-objdump
HEX = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-objcopy
GDB = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-gdb
CXXFLAGS += -std=c++11 -O0 -g -fpermissive -Wall -Wextra -pedantic -Wfatal-errors
VX_SRCS = $(VX_RT_PATH)/newlib/newlib.c
VX_SRCS += $(VX_RT_PATH)/startup/vx_start.S
VX_SRCS += $(VX_RT_PATH)/intrinsics/vx_intrinsics.s
VX_SRCS += $(VX_RT_PATH)/io/vx_io.s $(VX_RT_PATH)/io/vx_io.c
VX_SRCS += $(VX_RT_PATH)/fileio/fileio.s
VX_SRCS += $(VX_RT_PATH)/tests/tests.c
VX_SRCS += $(VX_RT_PATH)/vx_api/vx_api.c
VX_SRCS += $(VX_STR) $(VX_FIO) $(VX_NEWLIB) $(VX_INT) $(VX_IO) $(VX_API) $(VX_TEST)
CXXFLAGS += -I$(POCL_RT_PATH)/include
VX_CFLAGS = -nostartfiles -Wl,-Bstatic,-T,$(VX_RT_PATH)/mains/vortex_link.ld
CXXFLAGS = -g -O0 -march=rv32im -mabi=ilp32
CXXFLAGS += -ffreestanding # program may not begin at main()
CXXFLAGS += -Wl,--gc-sections # enable garbage collection of unused input sections
CXXFLAGS += -fno-rtti -fno-non-call-exceptions # disable RTTI and exceptions
CXXFLAGS += -I$(POCL_INC_PATH)
VX_LIBS = -Wl,--whole-archive lib$(PROJECT).a -Wl,--no-whole-archive $(POCL_LIB_PATH)/libOpenCL.a
QEMU_LIBS = $(VX_RT_PATH)/qemu/vx_api.c -Wl,--whole-archive lib$(PROJECT).a -Wl,--no-whole-archive $(POCL_LIB_PATH)/libOpenCL.a
LDFLAGS += -L$(POCL_RT_PATH)/lib -L$(VORTEX_DRV_PATH)/stub -lOpenCL -lvortex
PROJECT = kmeans
SRCS = main.cc read_input.c rmse.c cluster.c kmeans_clustering.c
all: $(PROJECT).dump $(PROJECT).hex
SRCS = main.cc read_input.c rmse.c kmeans_clustering.c cluster.c getopt.c
lib$(PROJECT).a: kernel.cl
POCL_DEBUG=all POCL_DEBUG_LLVM_PASSES=1 LD_LIBRARY_PATH=$(RISCV_TOOL_PATH)/lib:$(POCL_CC_PATH)/lib $(POCL_CC_PATH)/bin/poclcc -o lib$(PROJECT).a kernel.cl
all: $(PROJECT)
kmeans_clustering.o: kmeans_clustering.c
$(CC) $(CXXFLAGS) -c kmeans_clustering.c
kernel.pocl: kernel.cl
TOOLCHAIN_PATH=$(TOOLCHAIN_PATH) SYSROOT=$(SYSROOT) LLVM_HOME=$(LLVM_HOME) VORTEX_RUNTIME_PATH=$(VORTEX_RT_PATH) POCL_DEBUG=all LD_LIBRARY_PATH=$(LLVM_HOME)/lib:$(POCL_CC_PATH)/lib $(POCL_CC_PATH)/bin/poclcc -o kernel.pocl kernel.cl
$(PROJECT): $(SRCS)
$(CXX) $(CXXFLAGS) $^ $(LDFLAGS) -o $@
cluster.o: cluster.c
$(CC) $(CXXFLAGS) -c cluster.c
run-fpga: $(PROJECT) kernel.pocl
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/opae:$(LD_LIBRARY_PATH) ./$(PROJECT)
read_input.o: read_input.c
$(CC) $(CXXFLAGS) -c read_input.c
run-ase: $(PROJECT) kernel.pocl
ASE_LOG=0 LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/opae/ase:$(LD_LIBRARY_PATH) ./$(PROJECT)
rmse.o: rmse.c
$(CC) $(CXXFLAGS) -c rmse.c
run-simx: $(PROJECT) kernel.pocl
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/simx:$(LD_LIBRARY_PATH) ./$(PROJECT)
$(PROJECT).elf: $(SRCS) lib$(PROJECT).a
$(CXX) $(CXXFLAGS) $(VX_CFLAGS) $(VX_SRCS) $(SRCS) $(VX_LIBS) -o $(PROJECT).elf
run-rtlsim: $(PROJECT) kernel.pocl
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/rtlsim:$(LD_LIBRARY_PATH) ./$(PROJECT)
$(PROJECT).qemu: $(SRCS) lib$(PROJECT).a
$(CXX) $(CXXFLAGS) $(SRCS) $(QEMU_LIBS) -o $(PROJECT).qemu
$(PROJECT).hex: $(PROJECT).elf
$(HEX) -O ihex $(PROJECT).elf $(PROJECT).hex
$(PROJECT).dump: $(PROJECT).elf
$(DMP) -D $(PROJECT).elf > $(PROJECT).dump
run: $(PROJECT).hex
POCL_DEBUG=all $(VX_SIMX_PATH)/Vcache_simX -E -a rv32i --core $(PROJECT).hex -s -b 1> emulator.debug
qemu: $(PROJECT).qemu
POCL_DEBUG=all $(RISCV_TOOL_PATH)/bin/qemu-riscv32 -strace -d in_asm -D debug.log $(PROJECT).qemu
gdb-s: $(PROJECT).qemu
POCL_DEBUG=all $(RISCV_TOOL_PATH)/bin/qemu-riscv32 -g 1234 -d in_asm -D debug.log $(PROJECT).qemu
gdb-c: $(PROJECT).qemu
$(GDB) $(PROJECT).qemu
.depend: $(SRCS)
$(CXX) $(CXXFLAGS) -MM $^ > .depend;
clean:
rm -rf *.o *.elf *.dump *.hex *.qemu *.log *.debug
rm -rf $(PROJECT) *.o *.dump .depend
ifneq ($(MAKECMDGOALS),clean)
-include .depend
endif

File diff suppressed because it is too large Load Diff

View File

@@ -1,191 +1,191 @@
/* getopt.h */
/* Declarations for getopt.
Copyright (C) 1989-1994, 1996-1999, 2001 Free Software
Foundation, Inc. This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute
it and/or modify it under the terms of the GNU Lesser
General Public License as published by the Free Software
Foundation; either version 2.1 of the License, or
(at your option) any later version.
The GNU C Library is distributed in the hope that it will
be useful, but WITHOUT ANY WARRANTY; without even the
implied warranty of MERCHANTABILITY or FITNESS FOR A
PARTICULAR PURPOSE. See the GNU Lesser General Public
License for more details.
You should have received a copy of the GNU Lesser General
Public License along with the GNU C Library; if not, write
to the Free Software Foundation, Inc., 59 Temple Place,
Suite 330, Boston, MA 02111-1307 USA. */
#ifndef _GETOPT_H
#ifndef __need_getopt
# define _GETOPT_H 1
#endif
/* If __GNU_LIBRARY__ is not already defined, either we are being used
standalone, or this is the first header included in the source file.
If we are being used with glibc, we need to include <features.h>, but
that does not exist if we are standalone. So: if __GNU_LIBRARY__ is
not defined, include <ctype.h>, which will pull in <features.h> for us
if it's from glibc. (Why ctype.h? It's guaranteed to exist and it
doesn't flood the namespace with stuff the way some other headers do.) */
#if !defined __GNU_LIBRARY__
# include <ctype.h>
#endif
#ifdef __cplusplus
extern "C" {
#endif
/* For communication from `getopt' to the caller.
When `getopt' finds an option that takes an argument,
the argument value is returned here.
Also, when `ordering' is RETURN_IN_ORDER,
each non-option ARGV-element is returned here. */
extern char *optarg;
/* Index in ARGV of the next element to be scanned.
This is used for communication to and from the caller
and for communication between successive calls to `getopt'.
On entry to `getopt', zero means this is the first call; initialize.
When `getopt' returns -1, this is the index of the first of the
non-option elements that the caller should itself scan.
Otherwise, `optind' communicates from one call to the next
how much of ARGV has been scanned so far. */
extern int optind;
/* Callers store zero here to inhibit the error message `getopt' prints
for unrecognized options. */
extern int opterr;
/* Set to an option character which was unrecognized. */
extern int optopt;
#ifndef __need_getopt
/* Describe the long-named options requested by the application.
The LONG_OPTIONS argument to getopt_long or getopt_long_only is a vector
of `struct option' terminated by an element containing a name which is
zero.
The field `has_arg' is:
no_argument (or 0) if the option does not take an argument,
required_argument (or 1) if the option requires an argument,
optional_argument (or 2) if the option takes an optional argument.
If the field `flag' is not NULL, it points to a variable that is set
to the value given in the field `val' when the option is found, but
left unchanged if the option is not found.
To have a long-named option do something other than set an `int' to
a compiled-in constant, such as set a value from `optarg', set the
option's `flag' field to zero and its `val' field to a nonzero
value (the equivalent single-letter option character, if there is
one). For long options that have a zero `flag' field, `getopt'
returns the contents of the `val' field. */
/* One entry of the long-option vector handed to getopt_long and
getopt_long_only. */
struct option
{
/* Option name; an entry whose name is zero terminates the vector.
Declared const only where the compiler understands `const'. */
# if (defined __STDC__ && __STDC__) || defined __cplusplus
const char *name;
# else
char *name;
# endif
/* has_arg can't be an enum because some compilers complain about
type mismatches in all the code that assumes it is an int. */
/* One of no_argument, required_argument or optional_argument. */
int has_arg;
/* If not NULL, points to a variable that is set to `val' when the
option is found. */
int *flag;
/* Value stored through `flag', or returned when `flag' is zero. */
int val;
};
/* Names for the values of the `has_arg' field of `struct option'. */
# define no_argument 0
# define required_argument 1
# define optional_argument 2
#endif /* need getopt */
/* Get definitions and prototypes for functions to process the
arguments in ARGV (ARGC of them, minus the program name) for
options given in OPTS.
Return the option character from OPTS just read. Return -1 when
there are no more options. For unrecognized options, or options
missing arguments, `optopt' is set to the option letter, and '?' is
returned.
The OPTS string is a list of characters which are recognized option
letters, optionally followed by colons, specifying that that letter
takes an argument, to be placed in `optarg'.
If a letter in OPTS is followed by two colons, its argument is
optional. This behavior is specific to the GNU `getopt'.
The argument `--' causes premature termination of argument
scanning, explicitly telling `getopt' that there are no more
options.
If OPTS begins with `-', then non-option arguments are treated as
arguments to the option '\1'. This behavior is specific to the GNU
`getopt'. */
/* Prototyped declarations for ISO C and C++ compilers. */
#if (defined __STDC__ && __STDC__) || defined __cplusplus
# ifdef __GNU_LIBRARY__
/* Many other libraries have conflicting prototypes for getopt, with
differences in the consts, in stdlib.h. To avoid compilation
errors, only prototype getopt for the GNU C library. */
extern int getopt (int ___argc, char *const *___argv, const char *__shortopts);
# else /* not __GNU_LIBRARY__ */
extern int getopt ();
# endif /* __GNU_LIBRARY__ */
/* The long-option entry points are omitted when the includer defined
__need_getopt, i.e. asked for plain getopt only. */
# ifndef __need_getopt
extern int getopt_long (int ___argc, char *const *___argv,
const char *__shortopts,
const struct option *__longopts, int *__longind);
extern int getopt_long_only (int ___argc, char *const *___argv,
const char *__shortopts,
const struct option *__longopts, int *__longind);
/* Internal only. Users should not call this directly. */
extern int _getopt_internal (int ___argc, char *const *___argv,
const char *__shortopts,
const struct option *__longopts, int *__longind,
int __long_only);
# endif
#else /* not __STDC__ */
/* Non-__STDC__ compilers: the same functions declared without
parameter lists. */
extern int getopt ();
# ifndef __need_getopt
extern int getopt_long ();
extern int getopt_long_only ();
extern int _getopt_internal ();
# endif
#endif /* __STDC__ */
#ifdef __cplusplus
}
#endif
/* Make sure we later can get all the definitions and declarations. */
#undef __need_getopt
#endif /* getopt.h */
/* getopt.h */
/* Declarations for getopt.
Copyright (C) 1989-1994, 1996-1999, 2001 Free Software
Foundation, Inc. This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute
it and/or modify it under the terms of the GNU Lesser
General Public License as published by the Free Software
Foundation; either version 2.1 of the License, or
(at your option) any later version.
The GNU C Library is distributed in the hope that it will
be useful, but WITHOUT ANY WARRANTY; without even the
implied warranty of MERCHANTABILITY or FITNESS FOR A
PARTICULAR PURPOSE. See the GNU Lesser General Public
License for more details.
You should have received a copy of the GNU Lesser General
Public License along with the GNU C Library; if not, write
to the Free Software Foundation, Inc., 59 Temple Place,
Suite 330, Boston, MA 02111-1307 USA. */
#ifndef _GETOPT_H
#ifndef __need_getopt
# define _GETOPT_H 1
#endif
/* If __GNU_LIBRARY__ is not already defined, either we are being used
standalone, or this is the first header included in the source file.
If we are being used with glibc, we need to include <features.h>, but
that does not exist if we are standalone. So: if __GNU_LIBRARY__ is
not defined, include <ctype.h>, which will pull in <features.h> for us
if it's from glibc. (Why ctype.h? It's guaranteed to exist and it
doesn't flood the namespace with stuff the way some other headers do.) */
#if !defined __GNU_LIBRARY__
# include <ctype.h>
#endif
#ifdef __cplusplus
extern "C" {
#endif
/* For communication from `getopt' to the caller.
When `getopt' finds an option that takes an argument,
the argument value is returned here.
Also, when `ordering' is RETURN_IN_ORDER,
each non-option ARGV-element is returned here. */
extern char *optarg;
/* Index in ARGV of the next element to be scanned.
This is used for communication to and from the caller
and for communication between successive calls to `getopt'.
On entry to `getopt', zero means this is the first call; initialize.
When `getopt' returns -1, this is the index of the first of the
non-option elements that the caller should itself scan.
Otherwise, `optind' communicates from one call to the next
how much of ARGV has been scanned so far. */
extern int optind;
/* Callers store zero here to inhibit the error message `getopt' prints
for unrecognized options. */
extern int opterr;
/* Set to an option character which was unrecognized. */
extern int optopt;
#ifndef __need_getopt
/* Describe the long-named options requested by the application.
The LONG_OPTIONS argument to getopt_long or getopt_long_only is a vector
of `struct option' terminated by an element containing a name which is
zero.
The field `has_arg' is:
no_argument (or 0) if the option does not take an argument,
required_argument (or 1) if the option requires an argument,
optional_argument (or 2) if the option takes an optional argument.
If the field `flag' is not NULL, it points to a variable that is set
to the value given in the field `val' when the option is found, but
left unchanged if the option is not found.
To have a long-named option do something other than set an `int' to
a compiled-in constant, such as set a value from `optarg', set the
option's `flag' field to zero and its `val' field to a nonzero
value (the equivalent single-letter option character, if there is
one). For long options that have a zero `flag' field, `getopt'
returns the contents of the `val' field. */
/* One entry of the long-option vector handed to getopt_long and
getopt_long_only. */
struct option
{
/* Option name; an entry whose name is zero terminates the vector.
Declared const only where the compiler understands `const'. */
# if (defined __STDC__ && __STDC__) || defined __cplusplus
const char *name;
# else
char *name;
# endif
/* has_arg can't be an enum because some compilers complain about
type mismatches in all the code that assumes it is an int. */
/* One of no_argument, required_argument or optional_argument. */
int has_arg;
/* If not NULL, points to a variable that is set to `val' when the
option is found. */
int *flag;
/* Value stored through `flag', or returned when `flag' is zero. */
int val;
};
/* Names for the values of the `has_arg' field of `struct option'. */
# define no_argument 0
# define required_argument 1
# define optional_argument 2
#endif /* need getopt */
/* Get definitions and prototypes for functions to process the
arguments in ARGV (ARGC of them, minus the program name) for
options given in OPTS.
Return the option character from OPTS just read. Return -1 when
there are no more options. For unrecognized options, or options
missing arguments, `optopt' is set to the option letter, and '?' is
returned.
The OPTS string is a list of characters which are recognized option
letters, optionally followed by colons, specifying that that letter
takes an argument, to be placed in `optarg'.
If a letter in OPTS is followed by two colons, its argument is
optional. This behavior is specific to the GNU `getopt'.
The argument `--' causes premature termination of argument
scanning, explicitly telling `getopt' that there are no more
options.
If OPTS begins with `-', then non-option arguments are treated as
arguments to the option '\1'. This behavior is specific to the GNU
`getopt'. */
/* Prototyped declarations for ISO C and C++ compilers. */
#if (defined __STDC__ && __STDC__) || defined __cplusplus
# ifdef __GNU_LIBRARY__
/* Many other libraries have conflicting prototypes for getopt, with
differences in the consts, in stdlib.h. To avoid compilation
errors, only prototype getopt for the GNU C library. */
extern int getopt (int ___argc, char *const *___argv, const char *__shortopts);
# else /* not __GNU_LIBRARY__ */
extern int getopt ();
# endif /* __GNU_LIBRARY__ */
/* The long-option entry points are omitted when the includer defined
__need_getopt, i.e. asked for plain getopt only. */
# ifndef __need_getopt
extern int getopt_long (int ___argc, char *const *___argv,
const char *__shortopts,
const struct option *__longopts, int *__longind);
extern int getopt_long_only (int ___argc, char *const *___argv,
const char *__shortopts,
const struct option *__longopts, int *__longind);
/* Internal only. Users should not call this directly. */
extern int _getopt_internal (int ___argc, char *const *___argv,
const char *__shortopts,
const struct option *__longopts, int *__longind,
int __long_only);
# endif
#else /* not __STDC__ */
/* Non-__STDC__ compilers: the same functions declared without
parameter lists. */
extern int getopt ();
# ifndef __need_getopt
extern int getopt_long ();
extern int getopt_long_only ();
extern int _getopt_internal ();
# endif
#endif /* __STDC__ */
#ifdef __cplusplus
}
#endif
/* Make sure we later can get all the definitions and declarations. */
#undef __need_getopt
#endif /* getopt.h */

View File

@@ -1,61 +1,61 @@
#ifndef FLT_MAX
#define FLT_MAX 3.40282347e+38
#endif
/*
 * Assigns one point per work-item to its nearest cluster centre.
 *
 * feature    : point data, read as feature[f * npoints + point]
 * clusters   : centre data, read as clusters[c * nfeatures + f]
 * membership : out, index of the nearest centre for each point
 * offset, size are accepted but not used by this kernel.
 *
 * Work-items whose global id is not below npoints do nothing.
 */
__kernel void
kmeans_kernel_c(__global float *feature,
                __global float *clusters,
                __global int   *membership,
                int npoints,
                int nclusters,
                int nfeatures,
                int offset,
                int size)
{
    unsigned int point_id = get_global_id(0);

    if (!(point_id < npoints))
        return;

    int   best    = 0;        /* stays 0 when no centre beats FLT_MAX */
    float nearest = FLT_MAX;

    for (int c = 0; c < nclusters; ++c) {
        /* squared Euclidean distance between the point and centre c */
        float sq_dist = 0;
        for (int f = 0; f < nfeatures; ++f) {
            float diff = feature[f * npoints + point_id] - clusters[c * nfeatures + f];
            sq_dist += diff * diff;
        }

        /* strict '<' keeps the lowest-numbered centre on ties */
        if (sq_dist < nearest) {
            nearest = sq_dist;
            best    = c;
        }
    }

    membership[point_id] = best;
}
/*
 * Transposes the feature matrix, one point per work-item:
 * feature[point * nfeatures + f] is copied to feature_swap[f * npoints + point].
 *
 * Work-items whose global id is not below npoints do nothing, so a global
 * size larger than npoints does not write out of range.
 */
__kernel void
kmeans_swap(__global float *feature,
            __global float *feature_swap,
            int npoints,
            int nfeatures)
{
    unsigned int tid = get_global_id(0);

    if (!(tid < npoints))
        return;

    int f = 0;
    while (f < nfeatures) {
        feature_swap[f * npoints + tid] = feature[tid * nfeatures + f];
        ++f;
    }
}
#ifndef FLT_MAX
#define FLT_MAX 3.40282347e+38
#endif
/*
 * Assigns one point per work-item to its nearest cluster centre.
 *
 * feature    : point data, read as feature[f * npoints + point]
 * clusters   : centre data, read as clusters[c * nfeatures + f]
 * membership : out, index of the nearest centre for each point
 * offset, size are accepted but not used by this kernel.
 *
 * Work-items whose global id is not below npoints do nothing.
 */
__kernel void
kmeans_kernel_c(__global float *feature,
                __global float *clusters,
                __global int   *membership,
                int npoints,
                int nclusters,
                int nfeatures,
                int offset,
                int size)
{
    unsigned int point_id = get_global_id(0);

    if (!(point_id < npoints))
        return;

    int   best    = 0;        /* stays 0 when no centre beats FLT_MAX */
    float nearest = FLT_MAX;

    for (int c = 0; c < nclusters; ++c) {
        /* squared Euclidean distance between the point and centre c */
        float sq_dist = 0;
        for (int f = 0; f < nfeatures; ++f) {
            float diff = feature[f * npoints + point_id] - clusters[c * nfeatures + f];
            sq_dist += diff * diff;
        }

        /* strict '<' keeps the lowest-numbered centre on ties */
        if (sq_dist < nearest) {
            nearest = sq_dist;
            best    = c;
        }
    }

    membership[point_id] = best;
}
/*
 * Transposes the feature matrix, one point per work-item:
 * feature[point * nfeatures + f] is copied to feature_swap[f * npoints + point].
 *
 * Work-items whose global id is not below npoints do nothing, so a global
 * size larger than npoints does not write out of range.
 */
__kernel void
kmeans_swap(__global float *feature,
            __global float *feature_swap,
            int npoints,
            int nfeatures)
{
    unsigned int tid = get_global_id(0);

    if (!(tid < npoints))
        return;

    int f = 0;
    while (f < nfeatures) {
        feature_swap[f * npoints + tid] = feature[tid * nfeatures + f];
        ++f;
    }
}

Binary file not shown.

Binary file not shown.

View File

@@ -1,176 +1,176 @@
/*****************************************************************************/
/*IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. */
/*By downloading, copying, installing or using the software you agree */
/*to this license. If you do not agree to this license, do not download, */
/*install, copy or use the software. */
/* */
/* */
/*Copyright (c) 2005 Northwestern University */
/*All rights reserved. */
/*Redistribution of the software in source and binary forms, */
/*with or without modification, is permitted provided that the */
/*following conditions are met: */
/* */
/*1 Redistributions of source code must retain the above copyright */
/* notice, this list of conditions and the following disclaimer. */
/* */
/*2 Redistributions in binary form must reproduce the above copyright */
/* notice, this list of conditions and the following disclaimer in the */
/* documentation and/or other materials provided with the distribution.*/
/* */
/*3 Neither the name of Northwestern University nor the names of its */
/* contributors may be used to endorse or promote products derived */
/* from this software without specific prior written permission. */
/* */
/*THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ``AS */
/*IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED */
/*TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY, NON-INFRINGEMENT AND */
/*FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL */
/*NORTHWESTERN UNIVERSITY OR ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, */
/*INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/*(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR */
/*SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) */
/*HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, */
/*STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN */
/*ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/*POSSIBILITY OF SUCH DAMAGE. */
/******************************************************************************/
/*************************************************************************/
/** File: kmeans_clustering.c **/
/** Description: Implementation of regular k-means clustering **/
/** algorithm **/
/** Author: Wei-keng Liao **/
/** ECE Department, Northwestern University **/
/** email: wkliao@ece.northwestern.edu **/
/** **/
/** Edited by: Jay Pisharath **/
/** Northwestern University. **/
/** **/
/** ================================================================ **/
/** **/
/** Edited by: Shuai Che, David Tarjan, Sang-Ha Lee **/
/** University of Virginia **/
/** **/
/** Description: No longer supports fuzzy c-means clustering; **/
/** only regular k-means clustering. **/
/** No longer performs "validity" function to analyze **/
/** compactness and separation criteria; instead **/
/** calculate root mean squared error. **/
/** **/
/*************************************************************************/
#include <stdio.h>
#include <stdlib.h>
#include <float.h>
#include <math.h>
#include "kmeans.h"
#define RANDOM_MAX 2147483647
extern double wtime(void);
/*----< kmeans_clustering() >---------------------------------------------*/
/*
 * Runs regular k-means over `feature` and returns the cluster centres.
 *
 * feature    in:  [npoints][nfeatures] point coordinates
 * nfeatures  number of attributes per point
 * npoints    number of points
 * nclusters  requested number of clusters; clamped to npoints
 * threshold  iteration stops once the value returned by kmeansOCL() is no
 *            longer greater than this
 * membership out: [npoints], reset to -1 here and then handed to kmeansOCL()
 *
 * Returns a [nclusters][nfeatures] table whose rows all live in one
 * contiguous allocation starting at result[0]; this function does not free
 * it. Returns NULL when there is nothing to cluster (nclusters or npoints
 * below 1) or when an allocation fails.
 *
 * Centres are seeded deterministically from the first `nclusters` points.
 */
float** kmeans_clustering(float **feature,    /* in: [npoints][nfeatures] */
                          int     nfeatures,
                          int     npoints,
                          int     nclusters,
                          float   threshold,
                          int    *membership) /* out: [npoints] */
{
    int     i, j;                       /* counters */
    int     loop = 0;                   /* bounds the number of passes */
    int     c = 0;                      /* passes actually performed */
    int    *new_centers_len = NULL;     /* [nclusters]: no. of points in each cluster */
    float   delta;                      /* value reported by kmeansOCL() for a pass */
    float **clusters = NULL;            /* out: [nclusters][nfeatures] */
    float **new_centers = NULL;         /* [nclusters][nfeatures] */

    /* nclusters should never be > npoints
       that would guarantee a cluster without points */
    if (nclusters > npoints)
        nclusters = npoints;

    /* with no cluster there is no row 0 to allocate or fill */
    if (nclusters < 1)
        return NULL;

    /* allocate space for and initialize returning variable clusters[] */
    clusters = (float**) malloc((size_t)nclusters * sizeof(float*));
    if (clusters == NULL)
        goto fail;
    clusters[0] = (float*) malloc((size_t)nclusters * (size_t)nfeatures * sizeof(float));
    if (clusters[0] == NULL)
        goto fail;
    for (i = 1; i < nclusters; i++)
        clusters[i] = clusters[i-1] + nfeatures;

    /* Seed centre i with point i. The previous index-shuffling scheme, once
       its random pick had been replaced by a running counter, selected the
       same point twice whenever nclusters exceeded about npoints/2. */
    for (i = 0; i < nclusters; i++) {
        for (j = 0; j < nfeatures; j++)
            clusters[i][j] = feature[i][j];
    }

    /* initialize the membership to -1 for all */
    for (i = 0; i < npoints; i++)
        membership[i] = -1;

    /* allocate space for and initialize new_centers_len and new_centers */
    new_centers_len = (int*) calloc((size_t)nclusters, sizeof(int));
    new_centers     = (float**) malloc((size_t)nclusters * sizeof(float*));
    if (new_centers_len == NULL || new_centers == NULL)
        goto fail;
    new_centers[0] = (float*) calloc((size_t)nclusters * (size_t)nfeatures, sizeof(float));
    if (new_centers[0] == NULL)
        goto fail;
    for (i = 1; i < nclusters; i++)
        new_centers[i] = new_centers[i-1] + nfeatures;

    /* iterate until convergence */
    do {
        /* device side: assign points and accumulate per-cluster sums */
        delta = (float) kmeansOCL(feature,          /* in: [npoints][nfeatures] */
                                  nfeatures,        /* number of attributes for each point */
                                  npoints,          /* number of data points */
                                  nclusters,        /* number of clusters */
                                  membership,       /* which cluster the point belongs to */
                                  clusters,         /* out: [nclusters][nfeatures] */
                                  new_centers_len,  /* out: number of points in each cluster */
                                  new_centers       /* sum of points in each cluster */
                                  );

        /* replace old cluster centers with new_centers */
        /* CPU side of reduction */
        for (i = 0; i < nclusters; i++) {
            for (j = 0; j < nfeatures; j++) {
                /* an empty cluster keeps its previous centre */
                if (new_centers_len[i] > 0)
                    clusters[i][j] = new_centers[i][j] / new_centers_len[i]; /* take average i.e. sum/n */
                new_centers[i][j] = 0.0;    /* set back to 0 */
            }
            new_centers_len[i] = 0;         /* set back to 0 */
        }
        c++;
    } while ((delta > threshold) && (loop++ < 500)); /* makes sure loop terminates */

    printf("iterated %d times\n", c);

    free(new_centers[0]);
    free(new_centers);
    free(new_centers_len);

    return clusters;

fail:
    printf("ERROR: kmeans_clustering() out of memory\n");
    if (new_centers != NULL)
        free(new_centers[0]);
    free(new_centers);
    free(new_centers_len);
    if (clusters != NULL)
        free(clusters[0]);
    free(clusters);
    return NULL;
}
/*****************************************************************************/
/*IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. */
/*By downloading, copying, installing or using the software you agree */
/*to this license. If you do not agree to this license, do not download, */
/*install, copy or use the software. */
/* */
/* */
/*Copyright (c) 2005 Northwestern University */
/*All rights reserved. */
/*Redistribution of the software in source and binary forms, */
/*with or without modification, is permitted provided that the */
/*following conditions are met: */
/* */
/*1 Redistributions of source code must retain the above copyright */
/* notice, this list of conditions and the following disclaimer. */
/* */
/*2 Redistributions in binary form must reproduce the above copyright */
/* notice, this list of conditions and the following disclaimer in the */
/* documentation and/or other materials provided with the distribution.*/
/* */
/*3 Neither the name of Northwestern University nor the names of its */
/* contributors may be used to endorse or promote products derived */
/* from this software without specific prior written permission. */
/* */
/*THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ``AS */
/*IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED */
/*TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY, NON-INFRINGEMENT AND */
/*FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL */
/*NORTHWESTERN UNIVERSITY OR ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, */
/*INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/*(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR */
/*SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) */
/*HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, */
/*STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN */
/*ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/*POSSIBILITY OF SUCH DAMAGE. */
/******************************************************************************/
/*************************************************************************/
/** File: kmeans_clustering.c **/
/** Description: Implementation of regular k-means clustering **/
/** algorithm **/
/** Author: Wei-keng Liao **/
/** ECE Department, Northwestern University **/
/** email: wkliao@ece.northwestern.edu **/
/** **/
/** Edited by: Jay Pisharath **/
/** Northwestern University. **/
/** **/
/** ================================================================ **/
/** **/
/** Edited by: Shuai Che, David Tarjan, Sang-Ha Lee **/
/** University of Virginia **/
/** **/
/** Description: No longer supports fuzzy c-means clustering; **/
/** only regular k-means clustering. **/
/** No longer performs "validity" function to analyze **/
/** compactness and separation criteria; instead **/
/** calculate root mean squared error. **/
/** **/
/*************************************************************************/
#include <stdio.h>
#include <stdlib.h>
#include <float.h>
#include <math.h>
#include "kmeans.h"
#define RANDOM_MAX 2147483647
extern double wtime(void);
/*----< kmeans_clustering() >---------------------------------------------*/
/*
 * Runs regular k-means over `feature` and returns the cluster centres.
 *
 * feature    in:  [npoints][nfeatures] point coordinates
 * nfeatures  number of attributes per point
 * npoints    number of points
 * nclusters  requested number of clusters; clamped to npoints
 * threshold  iteration stops once the value returned by kmeansOCL() is no
 *            longer greater than this
 * membership out: [npoints], reset to -1 here and then handed to kmeansOCL()
 *
 * Returns a [nclusters][nfeatures] table whose rows all live in one
 * contiguous allocation starting at result[0]; this function does not free
 * it. Returns NULL when there is nothing to cluster (nclusters or npoints
 * below 1) or when an allocation fails.
 *
 * Centres are seeded deterministically from the first `nclusters` points.
 */
float** kmeans_clustering(float **feature,    /* in: [npoints][nfeatures] */
                          int     nfeatures,
                          int     npoints,
                          int     nclusters,
                          float   threshold,
                          int    *membership) /* out: [npoints] */
{
    int     i, j;                       /* counters */
    int     loop = 0;                   /* bounds the number of passes */
    int     c = 0;                      /* passes actually performed */
    int    *new_centers_len = NULL;     /* [nclusters]: no. of points in each cluster */
    float   delta;                      /* value reported by kmeansOCL() for a pass */
    float **clusters = NULL;            /* out: [nclusters][nfeatures] */
    float **new_centers = NULL;         /* [nclusters][nfeatures] */

    /* nclusters should never be > npoints
       that would guarantee a cluster without points */
    if (nclusters > npoints)
        nclusters = npoints;

    /* with no cluster there is no row 0 to allocate or fill */
    if (nclusters < 1)
        return NULL;

    /* allocate space for and initialize returning variable clusters[] */
    clusters = (float**) malloc((size_t)nclusters * sizeof(float*));
    if (clusters == NULL)
        goto fail;
    clusters[0] = (float*) malloc((size_t)nclusters * (size_t)nfeatures * sizeof(float));
    if (clusters[0] == NULL)
        goto fail;
    for (i = 1; i < nclusters; i++)
        clusters[i] = clusters[i-1] + nfeatures;

    /* Seed centre i with point i. The previous index-shuffling scheme, once
       its random pick had been replaced by a running counter, selected the
       same point twice whenever nclusters exceeded about npoints/2. */
    for (i = 0; i < nclusters; i++) {
        for (j = 0; j < nfeatures; j++)
            clusters[i][j] = feature[i][j];
    }

    /* initialize the membership to -1 for all */
    for (i = 0; i < npoints; i++)
        membership[i] = -1;

    /* allocate space for and initialize new_centers_len and new_centers */
    new_centers_len = (int*) calloc((size_t)nclusters, sizeof(int));
    new_centers     = (float**) malloc((size_t)nclusters * sizeof(float*));
    if (new_centers_len == NULL || new_centers == NULL)
        goto fail;
    new_centers[0] = (float*) calloc((size_t)nclusters * (size_t)nfeatures, sizeof(float));
    if (new_centers[0] == NULL)
        goto fail;
    for (i = 1; i < nclusters; i++)
        new_centers[i] = new_centers[i-1] + nfeatures;

    /* iterate until convergence */
    do {
        /* device side: assign points and accumulate per-cluster sums */
        delta = (float) kmeansOCL(feature,          /* in: [npoints][nfeatures] */
                                  nfeatures,        /* number of attributes for each point */
                                  npoints,          /* number of data points */
                                  nclusters,        /* number of clusters */
                                  membership,       /* which cluster the point belongs to */
                                  clusters,         /* out: [nclusters][nfeatures] */
                                  new_centers_len,  /* out: number of points in each cluster */
                                  new_centers       /* sum of points in each cluster */
                                  );

        /* replace old cluster centers with new_centers */
        /* CPU side of reduction */
        for (i = 0; i < nclusters; i++) {
            for (j = 0; j < nfeatures; j++) {
                /* an empty cluster keeps its previous centre */
                if (new_centers_len[i] > 0)
                    clusters[i][j] = new_centers[i][j] / new_centers_len[i]; /* take average i.e. sum/n */
                new_centers[i][j] = 0.0;    /* set back to 0 */
            }
            new_centers_len[i] = 0;         /* set back to 0 */
        }
        c++;
    } while ((delta > threshold) && (loop++ < 500)); /* makes sure loop terminates */

    printf("iterated %d times\n", c);

    free(new_centers[0]);
    free(new_centers);
    free(new_centers_len);

    return clusters;

fail:
    printf("ERROR: kmeans_clustering() out of memory\n");
    if (new_centers != NULL)
        free(new_centers[0]);
    free(new_centers);
    free(new_centers_len);
    if (clusters != NULL)
        free(clusters[0]);
    free(clusters);
    return NULL;
}

Binary file not shown.

View File

@@ -1,394 +1,382 @@
#include "kmeans.h"
#include <iostream>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <string>
#ifdef WIN
#include <windows.h>
#else
#include <pthread.h>
#include <sys/time.h>
// Returns the current time of day from gettimeofday() as seconds, with the
// microsecond part as the fraction.
double gettime() {
  struct timeval now;
  gettimeofday(&now, NULL);

  const double whole_seconds = now.tv_sec;
  const double microseconds = now.tv_usec;
  return whole_seconds + microseconds * 1e-6;
}
#endif
#ifdef NV
#include <oclUtils.h>
#else
#include <CL/cl.h>
#endif
#ifndef FLT_MAX
#define FLT_MAX 3.40282347e+38
#endif
#ifdef RD_WG_SIZE_0_0
#define BLOCK_SIZE RD_WG_SIZE_0_0
#elif defined(RD_WG_SIZE_0)
#define BLOCK_SIZE RD_WG_SIZE_0
#elif defined(RD_WG_SIZE)
#define BLOCK_SIZE RD_WG_SIZE
#else
#define BLOCK_SIZE 256
#endif
#ifdef RD_WG_SIZE_1_0
#define BLOCK_SIZE2 RD_WG_SIZE_1_0
#elif defined(RD_WG_SIZE_1)
#define BLOCK_SIZE2 RD_WG_SIZE_1
#elif defined(RD_WG_SIZE)
#define BLOCK_SIZE2 RD_WG_SIZE
#else
#define BLOCK_SIZE2 256
#endif
// local variables
// OpenCL state shared by initialize(), shutdown() and the rest of this file.
static cl_context context;            // created in initialize(), released in shutdown()
static cl_command_queue cmd_queue;    // queue on device_list[0]; created in initialize()
static cl_device_type device_type;    // only reset in shutdown() in the code visible here
static cl_device_id *device_list;     // array allocated with new[] in initialize()
static cl_int num_devices;            // number of entries in device_list (set to 1)
// Reads the whole file `filename` into a freshly malloc'ed buffer.
//
// filename : path of the file to load
// data     : out, receives the buffer; the caller releases it with free()
// size     : out, receives the number of bytes read
//
// Returns 0 on success. Returns -1, leaving *data and *size untouched, when
// an argument is null, the file cannot be opened or sized, memory cannot be
// allocated, or fewer bytes than the file size could be read.
static int read_kernel_file(const char* filename, uint8_t** data, size_t* size) {
  if (nullptr == filename || nullptr == data || nullptr == size)
    return -1;

  // Binary mode: the bytes are handed on verbatim (the caller passes them to
  // clCreateProgramWithBinary), so no text translation may happen.
  FILE* fp = fopen(filename, "rb");
  if (NULL == fp) {
    fprintf(stderr, "Failed to load kernel.");
    return -1;
  }

  // Determine the file length by seeking to the end and back.
  long fsize = -1;
  if (fseek(fp, 0, SEEK_END) == 0)
    fsize = ftell(fp);
  if (fsize < 0 || fseek(fp, 0, SEEK_SET) != 0) {
    fprintf(stderr, "Failed to determine kernel file size.\n");
    fclose(fp);
    return -1;
  }

  // Request at least one byte so an empty file still yields a valid pointer.
  const size_t wanted = (size_t)fsize;
  uint8_t* buffer = (uint8_t*)malloc(wanted > 0 ? wanted : 1);
  if (NULL == buffer) {
    fprintf(stderr, "Failed to allocate kernel buffer.\n");
    fclose(fp);
    return -1;
  }

  const size_t got = fread(buffer, 1, wanted, fp);
  fclose(fp);

  // A short read would hand a truncated image to the caller; treat it as failure.
  if (got != wanted) {
    fprintf(stderr, "Failed to read kernel file.\n");
    free(buffer);
    return -1;
  }

  *data = buffer;
  *size = got;
  return 0;
}
static int initialize(int use_gpu) {
cl_int result;
size_t size;
/*// create OpenCL context
cl_platform_id platform_id;
if (clGetPlatformIDs(1, &platform_id, NULL) != CL_SUCCESS) {
printf("ERROR: clGetPlatformIDs(1,*,0) failed\n");
return -1;
}
cl_context_properties ctxprop[] = {CL_CONTEXT_PLATFORM,
(cl_context_properties)platform_id, 0};
device_type = use_gpu ? CL_DEVICE_TYPE_GPU : CL_DEVICE_TYPE_CPU;
context = clCreateContextFromType(ctxprop, device_type, NULL, NULL, NULL);
if (!context) {
printf("ERROR: clCreateContextFromType(%s) failed\n",
use_gpu ? "GPU" : "CPU");
return -1;
}
// get the list of GPUs
result = clGetContextInfo(context, CL_CONTEXT_DEVICES, 0, NULL, &size);
num_devices = (int)(size / sizeof(cl_device_id));
if (result != CL_SUCCESS || num_devices < 1) {
printf("ERROR: clGetContextInfo() failed\n");
return -1;
}
device_list = new cl_device_id[num_devices];
if (!device_list) {
printf("ERROR: new cl_device_id[] failed\n");
return -1;
}
result =
clGetContextInfo(context, CL_CONTEXT_DEVICES, size, device_list, NULL);
if (result != CL_SUCCESS) {
printf("ERROR: clGetContextInfo() failed\n");
return -1;
}*/
cl_platform_id platform_id;
num_devices = 1;
device_list = new cl_device_id[num_devices];
result = clGetPlatformIDs(1, &platform_id, NULL);
result = clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_DEFAULT, 1, device_list, NULL);
context = clCreateContext(NULL, 1, device_list, NULL, NULL, &result);
// create command queue for the first device
cmd_queue = clCreateCommandQueue(context, device_list[0], 0, NULL);
if (!cmd_queue) {
printf("ERROR: clCreateCommandQueue() failed\n");
return -1;
}
return 0;
}
// Releases the OpenCL objects created by initialize() and resets the
// file-level state to zero. Always returns 0.
static int shutdown() {
  // release resources
  if (cmd_queue)
    clReleaseCommandQueue(cmd_queue);
  if (context)
    clReleaseContext(context);

  // device_list comes from new[], so it must be released with delete[];
  // delete[] on a null pointer is a no-op.
  delete[] device_list;

  // reset all variables
  cmd_queue = 0;
  context = 0;
  device_list = 0;
  num_devices = 0;
  device_type = 0;

  return 0;
}
// Device buffers. d_feature, d_feature_swap and d_cluster are created with
// clCreateBuffer in allocate(); d_membership is not set in the part of the
// file visible here.
cl_mem d_feature;
cl_mem d_feature_swap;
cl_mem d_cluster;
cl_mem d_membership;
// Kernel handles. allocate() binds kernel_s to "kmeans_kernel_c" and kernel2
// to "kmeans_swap"; `kernel` is not assigned in the code visible here.
cl_kernel kernel;
cl_kernel kernel_s;
cl_kernel kernel2;
// NOTE(review): the next five pointers are not referenced in the visible
// part of the file; from the names they look like host/device staging
// arrays -- confirm against the rest of the file.
int *membership_OCL;
int *membership_d;
float *feature_d;
float *clusters_d;
float *center_d;
// Program binary loaded by read_kernel_file("kernel.pocl", ...) in allocate()
// and its length in bytes.
uint8_t* kernel_bin = NULL;
size_t kernel_size = 0;
// Per-device load status written by clCreateProgramWithBinary in allocate().
cl_int binary_status = 0;
int allocate(int n_points, int n_features, int n_clusters, float **feature) {
/*int sourcesize = 1024 * 1024;
char *source = (char *)calloc(sourcesize, sizeof(char));
if (!source) {
printf("ERROR: calloc(%d) failed\n", sourcesize);
return -1;
}
// read the kernel core source
char *tempchar = "./kmeans.cl";
FILE *fp = fopen(tempchar, "rb");
if (!fp) {
printf("ERROR: unable to open '%s'\n", tempchar);
return -1;
}
fread(source + strlen(source), sourcesize, 1, fp);
fclose(fp);*/
// OpenCL initialization
int use_gpu = 1;
if (initialize(use_gpu))
return -1;
// Load Kernel
if (read_kernel_file("kernel.pocl", &kernel_bin, &kernel_size)) {
return -1;
}
// compile kernel
cl_int err = 0;
//const char *slist[2] = {source, 0};
//cl_program prog = clCreateProgramWithSource(context, 1, slist, NULL, &err);
cl_program prog = clCreateProgramWithBinary(
context, 1, device_list, &kernel_size, &kernel_bin, &binary_status, &err);
// cl_program prog = clCreateProgramWithBuiltInKernels(context, 1, device_list, "kmeans_kernel_c;kmeans_swap", &err);
if (err != CL_SUCCESS) {
printf("ERROR: clCreateProgramWithSource() => %d\n", err);
return -1;
}
err = clBuildProgram(prog, 0, NULL, NULL, NULL, NULL);
{ // show warnings/errors
// static char log[65536]; memset(log, 0, sizeof(log));
// cl_device_id device_id = 0;
// err = clGetContextInfo(context, CL_CONTEXT_DEVICES, sizeof(device_id),
//&device_id, NULL);
// clGetProgramBuildInfo(prog, device_id, CL_PROGRAM_BUILD_LOG,
// sizeof(log)-1, log, NULL);
// if(err || strstr(log,"warning:") || strstr(log, "error:"))
// printf("<<<<\n%s\n>>>>\n", log);
}
if (err != CL_SUCCESS) {
printf("ERROR: clBuildProgram() => %d\n", err);
return -1;
}
char *kernel_kmeans_c = "kmeans_kernel_c";
char *kernel_swap = "kmeans_swap";
kernel_s = clCreateKernel(prog, kernel_kmeans_c, &err);
if (err != CL_SUCCESS) {
printf("ERROR: clCreateKernel() 0 => %d\n", err);
return -1;
}
kernel2 = clCreateKernel(prog, kernel_swap, &err);
if (err != CL_SUCCESS) {
printf("ERROR: clCreateKernel() 0 => %d\n", err);
return -1;
}
clReleaseProgram(prog);
d_feature = clCreateBuffer(context, CL_MEM_READ_WRITE,
n_points * n_features * sizeof(float), NULL, &err);
if (err != CL_SUCCESS) {
printf("ERROR: clCreateBuffer d_feature (size:%d) => %d\n",
n_points * n_features, err);
return -1;
}
d_feature_swap =
clCreateBuffer(context, CL_MEM_READ_WRITE,
n_points * n_features * sizeof(float), NULL, &err);
if (err != CL_SUCCESS) {
printf("ERROR: clCreateBuffer d_feature_swap (size:%d) => %d\n",
n_points * n_features, err);
return -1;
}
d_cluster =
clCreateBuffer(context, CL_MEM_READ_WRITE,
n_clusters * n_features * sizeof(float), NULL, &err);
if (err != CL_SUCCESS) {
printf("ERROR: clCreateBuffer d_cluster (size:%d) => %d\n",
n_clusters * n_features, err);
return -1;
}
d_membership = clCreateBuffer(context, CL_MEM_READ_WRITE,
n_points * sizeof(int), NULL, &err);
if (err != CL_SUCCESS) {
printf("ERROR: clCreateBuffer d_membership (size:%d) => %d\n", n_points,
err);
return -1;
}
// write buffers
err = clEnqueueWriteBuffer(cmd_queue, d_feature, 1, 0,
n_points * n_features * sizeof(float), feature[0],
0, 0, 0);
if (err != CL_SUCCESS) {
printf("ERROR: clEnqueueWriteBuffer d_feature (size:%d) => %d\n",
n_points * n_features, err);
return -1;
}
clSetKernelArg(kernel2, 0, sizeof(void *), (void *)&d_feature);
clSetKernelArg(kernel2, 1, sizeof(void *), (void *)&d_feature_swap);
clSetKernelArg(kernel2, 2, sizeof(cl_int), (void *)&n_points);
clSetKernelArg(kernel2, 3, sizeof(cl_int), (void *)&n_features);
size_t global_work[3] = {n_points, 1, 1};
/// Ke Wang adjustable local group size 2013/08/07 10:37:33
size_t local_work_size = BLOCK_SIZE; // work group size is defined by
// RD_WG_SIZE_0 or RD_WG_SIZE_0_0
// 2014/06/10 17:00:51
if (global_work[0] % local_work_size != 0)
global_work[0] = (global_work[0] / local_work_size + 1) * local_work_size;
err = clEnqueueNDRangeKernel(cmd_queue, kernel2, 1, NULL, global_work,
&local_work_size, 0, 0, 0);
if (err != CL_SUCCESS) {
printf("ERROR: clEnqueueNDRangeKernel()=>%d failed\n", err);
return -1;
}
membership_OCL = (int *)malloc(n_points * sizeof(int));
}
void deallocateMemory() {
clReleaseMemObject(d_feature);
clReleaseMemObject(d_feature_swap);
clReleaseMemObject(d_cluster);
clReleaseMemObject(d_membership);
if (kernel_bin) free(kernel_bin);
free(membership_OCL);
}
/*
 * Program entry point: print the two compile-time work-group sizes, hand the
 * command line to setup(), then release the OpenCL resources.
 */
int main(int argc, char **argv) {
  // Report the work-group sizes selected at compile time
  printf("WG size of kernel_swap = %d, WG size of kernel_kmeans = %d \n",
         BLOCK_SIZE, BLOCK_SIZE2);

  // setup() is defined outside this block; its result is not inspected here
  setup(argc, argv);

  shutdown();
  return 0;
}
/*
 * Run one assignment step on the device and fold the result into the host
 * accumulators.
 *
 * The current centers (clusters[0], n_clusters * n_features contiguous
 * floats) are uploaded to d_cluster, kernel_s is launched with one work-item
 * per point (rounded up to a multiple of BLOCK_SIZE2), and d_membership is
 * read back into membership_OCL.
 *
 * For every point i the function then
 *   - increments new_centers_len[cluster],
 *   - adds feature[i][*] into new_centers[cluster][*],
 *   - updates membership[i] when it changed.
 * new_centers_len / new_centers are only added to, never cleared here.
 *
 * Returns the number of points whose membership changed, or -1 when an
 * OpenCL call fails (after printing a message).
 */
int kmeansOCL(float **feature, /* in: [npoints][nfeatures] */
              int n_features, int n_points, int n_clusters, int *membership,
              float **clusters, int *new_centers_len, float **new_centers) {
  int delta = 0;
  int i, j;
  cl_int err = 0;

  // One work-item per point, rounded up to a whole number of work-groups.
  // The work-group size comes from BLOCK_SIZE2 (RD_WG_SIZE_1 / RD_WG_SIZE_1_0).
  size_t global_work[3] = {(size_t)n_points, 1, 1};
  size_t local_work_size = BLOCK_SIZE2;
  if (global_work[0] % local_work_size != 0)
    global_work[0] = (global_work[0] / local_work_size + 1) * local_work_size;

  err = clEnqueueWriteBuffer(cmd_queue, d_cluster, 1, 0,
                             n_clusters * n_features * sizeof(float),
                             clusters[0], 0, 0, 0);
  if (err != CL_SUCCESS) {
    // Report the element count that was actually being written
    printf("ERROR: clEnqueueWriteBuffer d_cluster (size:%d) => %d\n",
           n_clusters * n_features, err);
    return -1;
  }

  int size = 0;
  int offset = 0;

  clSetKernelArg(kernel_s, 0, sizeof(void *), (void *)&d_feature_swap);
  clSetKernelArg(kernel_s, 1, sizeof(void *), (void *)&d_cluster);
  clSetKernelArg(kernel_s, 2, sizeof(void *), (void *)&d_membership);
  clSetKernelArg(kernel_s, 3, sizeof(cl_int), (void *)&n_points);
  clSetKernelArg(kernel_s, 4, sizeof(cl_int), (void *)&n_clusters);
  clSetKernelArg(kernel_s, 5, sizeof(cl_int), (void *)&n_features);
  clSetKernelArg(kernel_s, 6, sizeof(cl_int), (void *)&offset);
  clSetKernelArg(kernel_s, 7, sizeof(cl_int), (void *)&size);

  err = clEnqueueNDRangeKernel(cmd_queue, kernel_s, 1, NULL, global_work,
                               &local_work_size, 0, 0, 0);
  if (err != CL_SUCCESS) {
    printf("ERROR: clEnqueueNDRangeKernel()=>%d failed\n", err);
    return -1;
  }
  clFinish(cmd_queue);

  // Blocking read of the per-point assignment
  err = clEnqueueReadBuffer(cmd_queue, d_membership, 1, 0,
                            n_points * sizeof(int), membership_OCL, 0, 0, 0);
  if (err != CL_SUCCESS) {
    printf("ERROR: Memcopy Out\n");
    return -1;
  }

  delta = 0;
  for (i = 0; i < n_points; i++) {
    int cluster_id = membership_OCL[i];
    new_centers_len[cluster_id]++;
    if (membership_OCL[i] != membership[i]) {
      delta++;
      membership[i] = membership_OCL[i];
    }
    for (j = 0; j < n_features; j++) {
      new_centers[cluster_id][j] += feature[i][j];
    }
  }
  return delta;
}
#include "kmeans.h"
#include <iostream>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <string>
#ifdef WIN
#include <windows.h>
#else
#include <pthread.h>
#include <sys/time.h>
/* Current wall-clock time from gettimeofday(), in seconds as a double. */
double gettime() {
  struct timeval now;
  gettimeofday(&now, NULL);
  /* tv_usec counts microseconds; scale it to seconds before adding */
  const double fraction = now.tv_usec * 1e-6;
  return now.tv_sec + fraction;
}
#endif
#ifdef NV
#include <oclUtils.h>
#else
#include <CL/cl.h>
#endif
#ifndef FLT_MAX
#define FLT_MAX 3.40282347e+38
#endif
#ifdef RD_WG_SIZE_0_0
#define BLOCK_SIZE RD_WG_SIZE_0_0
#elif defined(RD_WG_SIZE_0)
#define BLOCK_SIZE RD_WG_SIZE_0
#elif defined(RD_WG_SIZE)
#define BLOCK_SIZE RD_WG_SIZE
#else
#define BLOCK_SIZE 256
#endif
#ifdef RD_WG_SIZE_1_0
#define BLOCK_SIZE2 RD_WG_SIZE_1_0
#elif defined(RD_WG_SIZE_1)
#define BLOCK_SIZE2 RD_WG_SIZE_1
#elif defined(RD_WG_SIZE)
#define BLOCK_SIZE2 RD_WG_SIZE
#else
#define BLOCK_SIZE2 256
#endif
// local variables
// File-local OpenCL state: set up by initialize(), torn down by shutdown().
static cl_context context;           // created with clCreateContext() in initialize()
static cl_command_queue cmd_queue;   // queue on device_list[0]
static cl_device_type device_type;   // only reset in shutdown(); the code that set it is commented out
static cl_device_id *device_list;    // allocated with new[] in initialize()
static cl_int num_devices;           // set to 1 in initialize()
static int initialize(int use_gpu) {
cl_int result;
size_t size;
/*// create OpenCL context
cl_platform_id platform_id;
if (clGetPlatformIDs(1, &platform_id, NULL) != CL_SUCCESS) {
printf("ERROR: clGetPlatformIDs(1,*,0) failed\n");
return -1;
}
cl_context_properties ctxprop[] = {CL_CONTEXT_PLATFORM,
(cl_context_properties)platform_id, 0};
device_type = use_gpu ? CL_DEVICE_TYPE_GPU : CL_DEVICE_TYPE_CPU;
context = clCreateContextFromType(ctxprop, device_type, NULL, NULL, NULL);
if (!context) {
printf("ERROR: clCreateContextFromType(%s) failed\n",
use_gpu ? "GPU" : "CPU");
return -1;
}
// get the list of GPUs
result = clGetContextInfo(context, CL_CONTEXT_DEVICES, 0, NULL, &size);
num_devices = (int)(size / sizeof(cl_device_id));
if (result != CL_SUCCESS || num_devices < 1) {
printf("ERROR: clGetContextInfo() failed\n");
return -1;
}
device_list = new cl_device_id[num_devices];
if (!device_list) {
printf("ERROR: new cl_device_id[] failed\n");
return -1;
}
result =
clGetContextInfo(context, CL_CONTEXT_DEVICES, size, device_list, NULL);
if (result != CL_SUCCESS) {
printf("ERROR: clGetContextInfo() failed\n");
return -1;
}*/
cl_platform_id platform_id;
num_devices = 1;
device_list = new cl_device_id[num_devices];
result = clGetPlatformIDs(1, &platform_id, NULL);
result = clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_DEFAULT, 1, device_list, NULL);
context = clCreateContext(NULL, 1, device_list, NULL, NULL, &result);
// create command queue for the first device
cmd_queue = clCreateCommandQueue(context, device_list[0], 0, NULL);
if (!cmd_queue) {
printf("ERROR: clCreateCommandQueue() failed\n");
return -1;
}
return 0;
}
/*
 * Release the OpenCL objects held in this file's globals and reset the
 * bookkeeping variables to zero.
 *
 * Always returns 0.
 */
static int shutdown() {
  // release resources
  if (cmd_queue)
    clReleaseCommandQueue(cmd_queue);
  if (context)
    clReleaseContext(context);
  // device_list is allocated with `new cl_device_id[...]` in initialize(), so
  // the array form of delete is required; delete[] on null is a no-op.
  delete[] device_list;

  // reset all variables
  cmd_queue = 0;
  context = 0;
  device_list = 0;
  num_devices = 0;
  device_type = 0;
  return 0;
}
// Device buffers: created in allocate(), released in deallocateMemory().
cl_mem d_feature;       // receives the host feature data; arg 0 of kernel2
cl_mem d_feature_swap;  // arg 1 of kernel2 and arg 0 of kernel_s
cl_mem d_cluster;       // cluster centers, rewritten by every kmeansOCL() call
cl_mem d_membership;    // per-point result read back in kmeansOCL()
cl_kernel kernel;       // not referenced in the visible part of this file
cl_kernel kernel_s;     // "kmeans_kernel_c", launched by kmeansOCL()
cl_kernel kernel2;      // "kmeans_swap", launched once by allocate()
int *membership_OCL;    // host staging buffer of n_points ints (malloc'd in allocate())
// The four pointers below are not referenced in the visible part of this file.
int *membership_d;
float *feature_d;
float *clusters_d;
float *center_d;
/*
 * Read an entire file into a freshly malloc'd buffer.
 *
 * filename  path of the file to load
 * data      out: receives the buffer; the caller releases it with free()
 * size      out: receives the number of bytes read
 *
 * Returns 0 on success.  Returns -1 when an argument is null, the file cannot
 * be opened, is empty, cannot be sized, memory cannot be allocated, or the
 * file cannot be read completely; *data and *size are left untouched then.
 */
static int read_kernel_file(const char* filename, uint8_t** data, size_t* size) {
  if (nullptr == filename || nullptr == data || nullptr == size)
    return -1;

  // Binary mode: the payload is a compiled kernel, not text
  FILE* fp = fopen(filename, "rb");
  if (nullptr == fp) {
    fprintf(stderr, "Failed to load kernel.\n");
    return -1;
  }

  // Determine the file length; ftell() reports failure as -1
  long fsize = -1;
  if (fseek(fp, 0, SEEK_END) == 0)
    fsize = ftell(fp);
  if (fsize <= 0 || fseek(fp, 0, SEEK_SET) != 0) {
    fprintf(stderr, "Failed to determine kernel size.\n");
    fclose(fp);
    return -1;
  }

  uint8_t* buffer = (uint8_t*)malloc((size_t)fsize);
  if (nullptr == buffer) {
    fprintf(stderr, "Failed to allocate kernel buffer.\n");
    fclose(fp);
    return -1;
  }

  size_t got = fread(buffer, 1, (size_t)fsize, fp);
  fclose(fp);
  if (got != (size_t)fsize) {
    // A truncated binary is useless to the caller
    fprintf(stderr, "Failed to read kernel.\n");
    free(buffer);
    return -1;
  }

  *data = buffer;
  *size = got;
  return 0;
}
/*
 * Prepare the OpenCL state used by kmeansOCL().
 *
 * Steps, in order: initialize the context and queue, create a program from
 * the device's built-in kernels "kmeans_kernel_c" and "kmeans_swap", build it
 * and create the two kernels, create the four device buffers, upload the
 * feature data and enqueue the "kmeans_swap" kernel with d_feature /
 * d_feature_swap as its buffer arguments.
 *
 * n_points, n_features and n_clusters size the device buffers.  feature[0]
 * must point at n_points * n_features contiguous floats, because that many
 * floats are copied from it in a single write.
 *
 * Returns 0 on success and -1 on failure (after printing a message).
 */
int allocate(int n_points, int n_features, int n_clusters, float **feature) {
  // OpenCL initialization
  int use_gpu = 1;
  if (initialize(use_gpu))
    return -1;

  // Create the program from the kernels built into the device
  cl_int err = 0;
  cl_program prog = clCreateProgramWithBuiltInKernels(
      context, 1, device_list, "kmeans_kernel_c;kmeans_swap", &err);
  if (err != CL_SUCCESS) {
    printf("ERROR: clCreateProgramWithBuiltInKernels() => %d\n", err);
    return -1;
  }

  err = clBuildProgram(prog, 0, NULL, NULL, NULL, NULL);
  if (err != CL_SUCCESS) {
    printf("ERROR: clBuildProgram() => %d\n", err);
    return -1;
  }

  const char *kernel_kmeans_c = "kmeans_kernel_c";
  const char *kernel_swap = "kmeans_swap";

  kernel_s = clCreateKernel(prog, kernel_kmeans_c, &err);
  if (err != CL_SUCCESS) {
    printf("ERROR: clCreateKernel() 0 => %d\n", err);
    return -1;
  }
  kernel2 = clCreateKernel(prog, kernel_swap, &err);
  if (err != CL_SUCCESS) {
    printf("ERROR: clCreateKernel() 0 => %d\n", err);
    return -1;
  }

  // The kernels keep their own reference to the program
  clReleaseProgram(prog);

  d_feature = clCreateBuffer(context, CL_MEM_READ_WRITE,
                             n_points * n_features * sizeof(float), NULL, &err);
  if (err != CL_SUCCESS) {
    printf("ERROR: clCreateBuffer d_feature (size:%d) => %d\n",
           n_points * n_features, err);
    return -1;
  }
  d_feature_swap =
      clCreateBuffer(context, CL_MEM_READ_WRITE,
                     n_points * n_features * sizeof(float), NULL, &err);
  if (err != CL_SUCCESS) {
    printf("ERROR: clCreateBuffer d_feature_swap (size:%d) => %d\n",
           n_points * n_features, err);
    return -1;
  }
  d_cluster =
      clCreateBuffer(context, CL_MEM_READ_WRITE,
                     n_clusters * n_features * sizeof(float), NULL, &err);
  if (err != CL_SUCCESS) {
    printf("ERROR: clCreateBuffer d_cluster (size:%d) => %d\n",
           n_clusters * n_features, err);
    return -1;
  }
  d_membership = clCreateBuffer(context, CL_MEM_READ_WRITE,
                                n_points * sizeof(int), NULL, &err);
  if (err != CL_SUCCESS) {
    printf("ERROR: clCreateBuffer d_membership (size:%d) => %d\n", n_points,
           err);
    return -1;
  }

  // write buffers (blocking write of the whole feature matrix)
  err = clEnqueueWriteBuffer(cmd_queue, d_feature, 1, 0,
                             n_points * n_features * sizeof(float), feature[0],
                             0, 0, 0);
  if (err != CL_SUCCESS) {
    printf("ERROR: clEnqueueWriteBuffer d_feature (size:%d) => %d\n",
           n_points * n_features, err);
    return -1;
  }

  clSetKernelArg(kernel2, 0, sizeof(void *), (void *)&d_feature);
  clSetKernelArg(kernel2, 1, sizeof(void *), (void *)&d_feature_swap);
  clSetKernelArg(kernel2, 2, sizeof(cl_int), (void *)&n_points);
  clSetKernelArg(kernel2, 3, sizeof(cl_int), (void *)&n_features);

  // One work-item per point, rounded up to a whole number of work-groups.
  // The work-group size comes from BLOCK_SIZE (RD_WG_SIZE_0 / RD_WG_SIZE_0_0).
  size_t global_work[3] = {(size_t)n_points, 1, 1};
  size_t local_work_size = BLOCK_SIZE;
  if (global_work[0] % local_work_size != 0)
    global_work[0] = (global_work[0] / local_work_size + 1) * local_work_size;

  err = clEnqueueNDRangeKernel(cmd_queue, kernel2, 1, NULL, global_work,
                               &local_work_size, 0, 0, 0);
  if (err != CL_SUCCESS) {
    printf("ERROR: clEnqueueNDRangeKernel()=>%d failed\n", err);
    return -1;
  }

  // Host staging buffer that kmeansOCL() reads d_membership into
  membership_OCL = (int *)malloc(n_points * sizeof(int));
  if (!membership_OCL) {
    printf("ERROR: malloc membership_OCL (size:%d) failed\n", n_points);
    return -1;
  }

  // The original fell off the end of this non-void function
  return 0;
}
/*
 * Release the device buffers created by allocate() and free the host-side
 * membership staging buffer.
 *
 * membership_OCL is reset after being freed so that calling this function
 * again does not free the same memory twice.
 */
void deallocateMemory() {
  clReleaseMemObject(d_feature);
  clReleaseMemObject(d_feature_swap);
  clReleaseMemObject(d_cluster);
  clReleaseMemObject(d_membership);

  free(membership_OCL);
  membership_OCL = NULL;
}
/*
 * Program entry point: print the two compile-time work-group sizes, hand the
 * command line to setup(), then release the OpenCL resources.
 */
int main(int argc, char **argv) {
  // Report the work-group sizes selected at compile time
  printf("WG size of kernel_swap = %d, WG size of kernel_kmeans = %d \n",
         BLOCK_SIZE, BLOCK_SIZE2);

  // setup() is defined outside this block; its result is not inspected here
  setup(argc, argv);

  shutdown();
  return 0;
}
/*
 * Run one assignment step on the device and fold the result into the host
 * accumulators.
 *
 * The current centers (clusters[0], n_clusters * n_features contiguous
 * floats) are uploaded to d_cluster, kernel_s is launched with one work-item
 * per point (rounded up to a multiple of BLOCK_SIZE2), and d_membership is
 * read back into membership_OCL.
 *
 * For every point i the function then
 *   - increments new_centers_len[cluster],
 *   - adds feature[i][*] into new_centers[cluster][*],
 *   - updates membership[i] when it changed.
 * new_centers_len / new_centers are only added to, never cleared here.
 *
 * Returns the number of points whose membership changed, or -1 when an
 * OpenCL call fails (after printing a message).
 */
int kmeansOCL(float **feature, /* in: [npoints][nfeatures] */
              int n_features, int n_points, int n_clusters, int *membership,
              float **clusters, int *new_centers_len, float **new_centers) {
  int delta = 0;
  int i, j;
  cl_int err = 0;

  // One work-item per point, rounded up to a whole number of work-groups.
  // The work-group size comes from BLOCK_SIZE2 (RD_WG_SIZE_1 / RD_WG_SIZE_1_0).
  size_t global_work[3] = {(size_t)n_points, 1, 1};
  size_t local_work_size = BLOCK_SIZE2;
  if (global_work[0] % local_work_size != 0)
    global_work[0] = (global_work[0] / local_work_size + 1) * local_work_size;

  err = clEnqueueWriteBuffer(cmd_queue, d_cluster, 1, 0,
                             n_clusters * n_features * sizeof(float),
                             clusters[0], 0, 0, 0);
  if (err != CL_SUCCESS) {
    // Report the element count that was actually being written
    printf("ERROR: clEnqueueWriteBuffer d_cluster (size:%d) => %d\n",
           n_clusters * n_features, err);
    return -1;
  }

  int size = 0;
  int offset = 0;

  clSetKernelArg(kernel_s, 0, sizeof(void *), (void *)&d_feature_swap);
  clSetKernelArg(kernel_s, 1, sizeof(void *), (void *)&d_cluster);
  clSetKernelArg(kernel_s, 2, sizeof(void *), (void *)&d_membership);
  clSetKernelArg(kernel_s, 3, sizeof(cl_int), (void *)&n_points);
  clSetKernelArg(kernel_s, 4, sizeof(cl_int), (void *)&n_clusters);
  clSetKernelArg(kernel_s, 5, sizeof(cl_int), (void *)&n_features);
  clSetKernelArg(kernel_s, 6, sizeof(cl_int), (void *)&offset);
  clSetKernelArg(kernel_s, 7, sizeof(cl_int), (void *)&size);

  err = clEnqueueNDRangeKernel(cmd_queue, kernel_s, 1, NULL, global_work,
                               &local_work_size, 0, 0, 0);
  if (err != CL_SUCCESS) {
    printf("ERROR: clEnqueueNDRangeKernel()=>%d failed\n", err);
    return -1;
  }
  clFinish(cmd_queue);

  // Blocking read of the per-point assignment
  err = clEnqueueReadBuffer(cmd_queue, d_membership, 1, 0,
                            n_points * sizeof(int), membership_OCL, 0, 0, 0);
  if (err != CL_SUCCESS) {
    printf("ERROR: Memcopy Out\n");
    return -1;
  }

  delta = 0;
  for (i = 0; i < n_points; i++) {
    int cluster_id = membership_OCL[i];
    new_centers_len[cluster_id]++;
    if (membership_OCL[i] != membership[i]) {
      delta++;
      membership[i] = membership_OCL[i];
    }
    for (j = 0; j < n_features; j++) {
      new_centers[cluster_id][j] += feature[i][j];
    }
  }
  return delta;
}

View File

@@ -1 +0,0 @@
./kmeans -o -i ../../data/kmeans/kdd_cup

File diff suppressed because it is too large Load Diff

View File

@@ -1,68 +0,0 @@
# Build file for the "lbm" OpenCL benchmark: cross-compiles the host program
# with a RISC-V toolchain and links it against a kernel archive produced by
# poclcc.

# Tool and library locations. Each may be overridden from the environment or
# the command line (?=); $(wildcard ...) yields an empty value when the
# default path does not exist.
RISCV_TOOL_PATH ?= $(wildcard ../../../../riscv-gnu-toolchain/drops)
POCL_CC_PATH ?= $(wildcard ../../../../pocl/drops_riscv_cc)
POCL_INC_PATH ?= $(wildcard ../include)
POCL_LIB_PATH ?= $(wildcard ../lib)
VX_RT_PATH ?= $(wildcard ../../../runtime)
VX_SIMX_PATH ?= $(wildcard ../../../simX/obj_dir)
# Cross toolchain binaries
CC = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-gcc
CXX = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-g++
DMP = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-objdump
HEX = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-objcopy
GDB = $(RISCV_TOOL_PATH)/bin/riscv32-unknown-elf-gdb
# Runtime sources compiled into the .elf image together with $(SRCS)
VX_SRCS = $(VX_RT_PATH)/newlib/newlib.c
VX_SRCS += $(VX_RT_PATH)/startup/vx_start.S
VX_SRCS += $(VX_RT_PATH)/intrinsics/vx_intrinsics.s
VX_SRCS += $(VX_RT_PATH)/io/vx_io.s $(VX_RT_PATH)/io/vx_io.c
VX_SRCS += $(VX_RT_PATH)/fileio/fileio.s
VX_SRCS += $(VX_RT_PATH)/tests/tests.c
VX_SRCS += $(VX_RT_PATH)/vx_api/vx_api.c
# NOTE(review): none of the variables on the next line is assigned in this
# file, so they expand to nothing unless supplied from outside.
VX_SRCS += $(VX_STR) $(VX_FIO) $(VX_NEWLIB) $(VX_INT) $(VX_IO) $(VX_API) $(VX_TEST)
# Link options used only for the .elf image (custom linker script, static link)
VX_CFLAGS = -nostartfiles -Wl,-Bstatic,-T,$(VX_RT_PATH)/mains/vortex_link.ld
CXXFLAGS = -g -O0 -march=rv32im -mabi=ilp32
CXXFLAGS += -ffreestanding # program may not begin at main()
CXXFLAGS += -Wl,--gc-sections # enable garbage collection of unused input sections
CXXFLAGS += -fno-rtti -fno-non-call-exceptions # disable RTTI and exceptions
CXXFLAGS += -I$(POCL_INC_PATH) -I.
# The kernel archive is linked with --whole-archive so that all of its
# members are kept; the OpenCL library follows it.
VX_LIBS = -Wl,--whole-archive lib$(PROJECT).a -Wl,--no-whole-archive $(POCL_LIB_PATH)/libOpenCL.a
QEMU_LIBS = $(VX_RT_PATH)/qemu/vx_api.c -Wl,--whole-archive lib$(PROJECT).a -Wl,--no-whole-archive $(POCL_LIB_PATH)/libOpenCL.a
PROJECT = lbm
SRCS = main.cc args.c parboil_opencl.c gpu_info.c lbm.c ocl.c
# Default goal: disassembly listing and Intel-hex image
all: $(PROJECT).dump $(PROJECT).hex
# Compile kernel.cl into a static archive with poclcc
lib$(PROJECT).a: kernel.cl
POCL_DEBUG=all POCL_DEBUG_LLVM_PASSES=1 LD_LIBRARY_PATH=$(RISCV_TOOL_PATH)/lib:$(POCL_CC_PATH)/lib $(POCL_CC_PATH)/bin/poclcc -o lib$(PROJECT).a kernel.cl
# Image linked with the runtime sources and linker script
$(PROJECT).elf: $(SRCS) lib$(PROJECT).a
$(CXX) $(CXXFLAGS) $(VX_CFLAGS) $(VX_SRCS) $(SRCS) $(VX_LIBS) -o $(PROJECT).elf
# Variant linked against the qemu vx_api.c instead of the runtime sources
$(PROJECT).qemu: $(SRCS) lib$(PROJECT).a
$(CXX) $(CXXFLAGS) $(SRCS) $(QEMU_LIBS) -o $(PROJECT).qemu
$(PROJECT).hex: $(PROJECT).elf
$(HEX) -O ihex $(PROJECT).elf $(PROJECT).hex
$(PROJECT).dump: $(PROJECT).elf
$(DMP) -D $(PROJECT).elf > $(PROJECT).dump
# Run the hex image in the simX emulator; stdout goes to emulator.debug
run: $(PROJECT).hex
POCL_DEBUG=all $(VX_SIMX_PATH)/Vcache_simX -E -a rv32i --core $(PROJECT).hex -s -b 1> emulator.debug
# Run the qemu build under qemu-riscv32, logging translated code to debug.log
qemu: $(PROJECT).qemu
POCL_DEBUG=all $(RISCV_TOOL_PATH)/bin/qemu-riscv32 -d in_asm -D debug.log $(PROJECT).qemu
# gdb-s starts qemu with a gdb stub on port 1234; gdb-c starts the debugger
gdb-s: $(PROJECT).qemu
POCL_DEBUG=all $(RISCV_TOOL_PATH)/bin/qemu-riscv32 -g 1234 -d in_asm -D debug.log $(PROJECT).qemu
gdb-c: $(PROJECT).qemu
$(GDB) $(PROJECT).qemu
# NOTE(review): lib$(PROJECT).a is not matched by any pattern below
clean:
rm -rf *.o *.elf *.dump *.hex *.qemu *.log *.debug

View File

@@ -1,617 +0,0 @@
#include <parboil.h>
#include <errno.h>
#include <limits.h>
#include <stdlib.h>
#include <string.h>
#include <stdio.h>
/*****************************************************************************/
/* Memory management routines */
/* Release a NULL-terminated array of heap-allocated strings: every string
 * first, then the array itself.  A NULL array is accepted and ignored. */
void
pb_FreeStringArray(char **string_array)
{
  size_t i;

  if (string_array == NULL)
    return;

  /* Walk up to (not including) the NULL sentinel */
  for (i = 0; string_array[i] != NULL; i++)
    free(string_array[i]);

  free(string_array);
}
/* Create a platform parameter that takes ownership of 'name' and 'version'
 * (both are later released with free() by pb_FreePlatformParam).
 * 'name' must not be NULL; 'version' may be NULL.
 * Prints a message and exits on an invalid argument or when memory cannot be
 * allocated. */
struct pb_PlatformParam *
pb_PlatformParam(char *name, char *version)
{
  if (name == NULL) {
    fprintf(stderr, "pb_PlatformParam: Invalid argument\n");
    exit(-1);
  }

  struct pb_PlatformParam *ret =
    (struct pb_PlatformParam *)malloc(sizeof (struct pb_PlatformParam));
  if (ret == NULL) {
    /* Previously a failed allocation was dereferenced */
    fprintf(stderr, "pb_PlatformParam: Out of memory\n");
    exit(-1);
  }

  ret->name = name;
  ret->version = version;
  return ret;
}
/* Release a platform parameter together with the name and version strings it
 * owns.  NULL is accepted and ignored. */
void
pb_FreePlatformParam(struct pb_PlatformParam *p)
{
  if (p != NULL) {
    free(p->name);
    free(p->version);
    free(p);
  }
}
/* Create a device parameter that selects a device by index.
 * Prints a message and exits when memory cannot be allocated. */
struct pb_DeviceParam *
pb_DeviceParam_index(int index)
{
  struct pb_DeviceParam *ret =
    (struct pb_DeviceParam *)malloc(sizeof (struct pb_DeviceParam));
  if (ret == NULL) {
    /* Previously a failed allocation was dereferenced */
    fprintf(stderr, "pb_DeviceParam_index: Out of memory\n");
    exit(-1);
  }
  ret->criterion = pb_Device_INDEX;
  ret->index = index;
  return ret;
}
/* Create a device parameter with the pb_Device_CPU criterion.
 * Prints a message and exits when memory cannot be allocated. */
struct pb_DeviceParam *
pb_DeviceParam_cpu(void)
{
  struct pb_DeviceParam *ret =
    (struct pb_DeviceParam *)malloc(sizeof (struct pb_DeviceParam));
  if (ret == NULL) {
    /* Previously a failed allocation was dereferenced */
    fprintf(stderr, "pb_DeviceParam_cpu: Out of memory\n");
    exit(-1);
  }
  ret->criterion = pb_Device_CPU;
  return ret;
}
/* Create a device parameter with the pb_Device_GPU criterion.
 * Prints a message and exits when memory cannot be allocated. */
struct pb_DeviceParam *
pb_DeviceParam_gpu(void)
{
  struct pb_DeviceParam *ret =
    (struct pb_DeviceParam *)malloc(sizeof (struct pb_DeviceParam));
  if (ret == NULL) {
    /* Previously a failed allocation was dereferenced */
    fprintf(stderr, "pb_DeviceParam_gpu: Out of memory\n");
    exit(-1);
  }
  ret->criterion = pb_Device_GPU;
  return ret;
}
/* Create a device parameter with the pb_Device_ACCELERATOR criterion.
 * Prints a message and exits when memory cannot be allocated. */
struct pb_DeviceParam *
pb_DeviceParam_accelerator(void)
{
  struct pb_DeviceParam *ret =
    (struct pb_DeviceParam *)malloc(sizeof (struct pb_DeviceParam));
  if (ret == NULL) {
    /* Previously a failed allocation was dereferenced */
    fprintf(stderr, "pb_DeviceParam_accelerator: Out of memory\n");
    exit(-1);
  }
  ret->criterion = pb_Device_ACCELERATOR;
  return ret;
}
/* Create a device parameter that selects a device by name.  The parameter
 * takes ownership of 'name' (pb_FreeDeviceParam releases it with free()).
 * Prints a message and exits when memory cannot be allocated. */
struct pb_DeviceParam *
pb_DeviceParam_name(char *name)
{
  struct pb_DeviceParam *ret =
    (struct pb_DeviceParam *)malloc(sizeof (struct pb_DeviceParam));
  if (ret == NULL) {
    /* Previously a failed allocation was dereferenced */
    fprintf(stderr, "pb_DeviceParam_name: Out of memory\n");
    exit(-1);
  }
  ret->criterion = pb_Device_NAME;
  ret->name = name;
  return ret;
}
void
pb_FreeDeviceParam(struct pb_DeviceParam *p)
{
if (p == NULL) return;
switch(p->criterion) {
case pb_Device_NAME:
free(p->name);
break;
case pb_Device_INDEX:
case pb_Device_CPU:
case pb_Device_ACCELERATOR:
break;
default:
fprintf(stderr, "pb_FreeDeviceParam: Invalid argument\n");
exit(-1);
}
}
/* Release a parameters structure and everything it owns: the output file
 * name, the input file list, and the platform and device parameters.
 * NULL is accepted and ignored (pb_ReadParameters returns NULL on a parse
 * error, so callers may end up passing it here). */
void
pb_FreeParameters(struct pb_Parameters *p)
{
  if (p == NULL) return;

  free(p->outFile);
  pb_FreeStringArray(p->inpFiles);
  pb_FreePlatformParam(p->platform);
  pb_FreeDeviceParam(p->device);
  free(p);
}
/*****************************************************************************/
/* Parse a comma-delimited list of strings into a NULL-terminated array of
 * newly allocated strings.
 *
 * An input with N commas always yields N + 1 items; empty items (including
 * the single item of an empty input) are returned as empty strings.  The
 * caller releases the result with pb_FreeStringArray().
 * Prints a message and exits when memory cannot be allocated. */
static char **
read_string_array(char *in)
{
  char **ret;
  size_t count = 1;             /* Number of items in the input */
  size_t i;
  const char *cursor;           /* Start of the current item within 'in' */

  /* Count the number of items in the string */
  for (cursor = in; *cursor; cursor++)
    if (*cursor == ',') count++;

  /* Allocate storage, including one slot for the sentinel */
  ret = (char **)malloc((count + 1) * sizeof(char *));
  if (ret == NULL) {
    fprintf(stderr, "read_string_array: Out of memory\n");
    exit(-1);
  }

  /* Create copies of the strings from the list */
  cursor = in;
  for (i = 0; i < count; i++) {
    /* Length of the item: everything up to the next ',' or the terminator */
    size_t length = strcspn(cursor, ",");

    ret[i] = (char *)malloc(length + 1);
    if (ret[i] == NULL) {
      fprintf(stderr, "read_string_array: Out of memory\n");
      exit(-1);
    }
    memcpy(ret[i], cursor, length);
    ret[i][length] = 0;

    /* Step over the separator, but never past the terminator */
    cursor += length;
    if (*cursor == ',') cursor++;
  }
  ret[count] = NULL;            /* Write the sentinel value */

  return ret;
}
/* Write a parse-error message to stderr exactly as given (no newline is
 * appended). */
static void
report_parse_error(const char *str)
{
  fprintf(stderr, "%s", str);
}
/* Interpret a string as a 'pb_DeviceParam' value.
 * Return a pointer to a new value, or NULL on failure.
 *
 * Interpretations are tried in this order:
 *   1. a decimal integer  -> device index (negative or out-of-range values
 *                            are rejected with NULL)
 *   2. "CPU", "GPU", "ACCELERATOR" -> the matching device class
 *   3. anything else      -> device name (the string is duplicated)
 */
static struct pb_DeviceParam *
read_device_param(char *str)
{
  char *end;
  long device_int;

  /* strtol() does not report "no digits" through errno, so the end pointer
   * decides whether the whole string was a number.  Checking errno alone made
   * every non-numeric string parse as index 0. */
  errno = 0;
  device_int = strtol(str, &end, 10);
  if (end != str && *end == '\0') {
    /* Negative numbers and values that do not fit an int are not valid */
    if (errno == ERANGE || device_int < 0 || device_int > INT_MAX) return NULL;
    return pb_DeviceParam_index((int)device_int);
  }

  /* Match against predefined strings */
  if (strcmp(str, "CPU") == 0)
    return pb_DeviceParam_cpu();
  if (strcmp(str, "GPU") == 0)
    return pb_DeviceParam_gpu();
  if (strcmp(str, "ACCELERATOR") == 0)
    return pb_DeviceParam_accelerator();

  /* Assume any other string is a device name */
  return pb_DeviceParam_name(strdup(str));
}
/* Interpret a string as a 'pb_PlatformParam' value.
 * Return a pointer to a new value, or NULL on failure.
 *
 * The string has the form NAME or NAME-VERSION, split at the LAST '-'.
 * Without a '-' the whole string is the name and the version is NULL.
 */
static struct pb_PlatformParam *
read_platform_param(char *str)
{
  /* Last occurrence of '-' in 'str', or NULL if there is none */
  const char *separator = strrchr(str, '-');

  /* The platform name is either the entire string, or all characters before
   * the separator */
  size_t name_length = separator ? (size_t)(separator - str) : strlen(str);
  char *name_str = (char *)malloc(name_length + 1);
  if (name_str == NULL) return NULL;
  memcpy(name_str, str, name_length);
  name_str[name_length] = 0;

  /* The version is either NULL, or all characters after the separator */
  char *version_str = NULL;
  if (separator != NULL) {
    const char *version_input_str = separator + 1;
    size_t version_length = strlen(version_input_str);

    version_str = (char *)malloc(version_length + 1);
    if (version_str == NULL) {
      free(name_str);
      return NULL;
    }
    memcpy(version_str, version_input_str, version_length);
    version_str[version_length] = 0;
  }

  /* Create output structure; it takes ownership of both strings */
  return pb_PlatformParam(name_str, version_str);
}
/****************************************************************************/
/* Argument parsing state */
/* Argument parsing state.
*
* Arguments that are interpreted by the argument parser are removed from
* the list. Variables 'argc' and 'argn' do not count arguments that have
* been removed.
*
* During argument parsing, the array of arguments is compacted, overwriting
* the erased arguments. Variable 'argv_put' points to the array element
* where the next argument will be written. Variable 'argv_get' points to
* the array element where the next argument will be read from.
*/
struct argparse {
  int argc;                     /* Number of arguments not deleted so far.
                                 * Mutable: delete_argument() decrements it. */
  int argn;                     /* Current argument index; advanced by
                                 * next_argument(), compared with argc to
                                 * detect the end of the arguments. */
  char **argv_get;              /* Argument value being read. */
  char **argv_put;              /* Argument value being written.
                                 * argv_put <= argv_get. */
};
/* Start parsing 'argv': the read and write cursors both begin at its first
 * element and no argument has been visited yet. */
static void
initialize_argparse(struct argparse *ap, int argc, char **argv)
{
  ap->argv_put = argv;
  ap->argv_get = argv;
  ap->argn = 0;
  ap->argc = argc;
}
/* Finish argument parsing without interpreting the arguments that are left:
 * they are copied down to their final position, the surviving argument count
 * is stored in *_argc, and argv is terminated with NULL. */
static void
finalize_argparse(struct argparse *ap, int *_argc, char **argv)
{
  /* Compact the unread tail of the array */
  while (ap->argn < ap->argc) {
    *ap->argv_put = *ap->argv_get;
    ap->argv_put++;
    ap->argv_get++;
    ap->argn++;
  }

  /* Publish the new argument count and the terminating NULL */
  *_argc = ap->argc;
  argv[ap->argc] = NULL;
}
/* Drop the current argument so that it is not part of the argument list once
 * parsing is done.  Calling this past the end only prints a diagnostic; the
 * counters are still updated. */
static void
delete_argument(struct argparse *ap)
{
  if (!(ap->argn < ap->argc))
    fprintf(stderr, "delete_argument\n");

  /* Skip the argument on the read side; one fewer argument survives */
  ap->argv_get += 1;
  ap->argc -= 1;
}
/* Keep the current argument: copy it to its final slot in argv and advance
 * to the next one.  Calling this past the end only prints a diagnostic; the
 * copy still takes place. */
static void
next_argument(struct argparse *ap)
{
  if (!(ap->argn < ap->argc))
    fprintf(stderr, "next_argument\n");

  /* Copy, then advance both cursors and the index */
  *ap->argv_put = *ap->argv_get;
  ap->argv_put++;
  ap->argv_get++;
  ap->argn += 1;
}
/* Nonzero once the current index has reached the (mutable) argument count. */
static int
is_end_of_arguments(struct argparse *ap)
{
  return (ap->argc == ap->argn) ? 1 : 0;
}
/* Return the argument under the read cursor without consuming it */
static char *
get_argument(struct argparse *ap)
{
  return ap->argv_get[0];
}
/* Get the current argument, and also delete it */
static char *
consume_argument(struct argparse *ap)
{
char *ret = get_argument(ap);
delete_argument(ap);
return ret;
}
/****************************************************************************/
/* The result of parsing a command-line argument */
typedef enum {
  ARGPARSE_OK,                  /* Success */
  ARGPARSE_ERROR,               /* Error */
  ARGPARSE_DONE                 /* Success, and do not continue parsing */
} result;

/* Signature of an option handler.  pb_ParseParameters deletes the option
 * token itself before calling the handler; the handler may consume further
 * arguments from 'ap' and records what it parsed in 'params'. */
typedef result parse_action(struct argparse *ap, struct pb_Parameters *params);
/* A command-line option.  Short options are matched against the character
 * after a single '-', long options against the text after "--". */
struct option {
  char short_name;              /* If not 0, the one-character
                                 * name of this option */
  const char *long_name;        /* If not NULL, the long name of this option */
  parse_action *action;         /* What to do when this option occurs.
                                 * Sentinel value is NULL.
                                 */
};
/* Output file
 *
 * -o FILE
 *
 * Consumes FILE and stores a copy of it as the output file name, replacing
 * any previous value.
 */
static result
parse_output_file(struct argparse *ap, struct pb_Parameters *params)
{
  char *file_name;

  if (!is_end_of_arguments(ap)) {
    file_name = consume_argument(ap);

    /* Replace the output file name */
    free(params->outFile);
    params->outFile = strdup(file_name);
    return ARGPARSE_OK;
  }

  report_parse_error("Expecting file name after '-o'\n");
  return ARGPARSE_ERROR;
}
/* Input files
 *
 * -i FILE,FILE,...
 *
 * Consumes the comma-separated list and stores it as the input file list,
 * replacing any previous list.
 */
static result
parse_input_files(struct argparse *ap, struct pb_Parameters *params)
{
  char *file_list;

  if (!is_end_of_arguments(ap)) {
    file_list = consume_argument(ap);

    /* Replace the input file list */
    pb_FreeStringArray(params->inpFiles);
    params->inpFiles = read_string_array(file_list);
    return ARGPARSE_OK;
  }

  report_parse_error("Expecting file name after '-i'\n");
  return ARGPARSE_ERROR;
}
/* End of options
 *
 * --
 *
 * Neither argument is inspected; the result tells the parser to stop.
 */
static result
parse_end_options(struct argparse *ap, struct pb_Parameters *params)
{
  const result stop_parsing = ARGPARSE_DONE;

  return stop_parsing;
}
/* OpenCL device
 *
 * --device X
 *
 * Consumes X, converts it with read_device_param() and stores the result as
 * the device parameter, releasing any previous one.
 */
static result
parse_device(struct argparse *ap, struct pb_Parameters *params)
{
  char *device_string;
  struct pb_DeviceParam *device_param;

  /* The option needs a following argument that specifies a device */
  if (is_end_of_arguments(ap)) {
    report_parse_error("Expecting device specification after '--device'\n");
    return ARGPARSE_ERROR;
  }

  device_string = consume_argument(ap);
  device_param = read_device_param(device_string);

  if (device_param != NULL) {
    /* Save the result */
    pb_FreeDeviceParam(params->device);
    params->device = device_param;
    return ARGPARSE_OK;
  }

  report_parse_error("Unrecognized device specification format on command line\n");
  return ARGPARSE_ERROR;
}
/* OpenCL platform
 *
 * --platform X
 *
 * Consumes X, converts it with read_platform_param() and stores the result
 * as the platform parameter, releasing any previous one.
 */
static result
parse_platform(struct argparse *ap, struct pb_Parameters *params)
{
  /* Read the next argument, which specifies a platform */
  if (is_end_of_arguments(ap))
    {
      /* The message used to say "device specification" here */
      report_parse_error("Expecting platform specification after '--platform'\n");
      return ARGPARSE_ERROR;
    }

  char *platform_string = consume_argument(ap);
  struct pb_PlatformParam *platform_param = read_platform_param(platform_string);

  if (!platform_param) {
    report_parse_error("Unrecognized platform specification format on command line\n");
    return ARGPARSE_ERROR;
  }

  /* Save the result */
  pb_FreePlatformParam(params->platform);
  params->platform = platform_param;

  return ARGPARSE_OK;
}
/* Table of recognised options, terminated by an entry whose action is NULL.
 * The '-' entry handles the argument "--": pb_ParseParameters treats any
 * two-character argument starting with '-' as a short option, so "--" is
 * looked up by its second character. */
static struct option options[] = {
  { 'o', NULL, &parse_output_file },
  { 'i', NULL, &parse_input_files },
  { '-', NULL, &parse_end_options },
  { 0, "device", &parse_device },
  { 0, "platform", &parse_platform },
  { 0, NULL, NULL }
};
/* Nonzero for the sentinel entry of an option table (the one without an
 * action). */
static int
is_last_option(struct option *op)
{
  return !op->action;
}
/****************************************************************************/
/* Parse command-line parameters.
 * Return zero on error, nonzero otherwise.
 * On error, the other outputs may be invalid.
 *
 * The information collected from parameters is used to update
 * 'ret'.  'ret' should be initialized.
 *
 * '_argc' and 'argv' are updated to contain only the unprocessed arguments.
 *
 * Argument forms:
 *   -X      (exactly two characters, including "--")  short option X
 *   --NAME  long option NAME
 *   other   kept in argv untouched.  This includes a lone "-" and
 *           multi-character single-dash words such as "-abc"; those used to
 *           reach the result switch with an uninitialized result and without
 *           being consumed.
 * An option that matches no table entry is an error.
 */
static int
pb_ParseParameters (struct pb_Parameters *ret, int *_argc, char **argv)
{
  struct argparse ap;

  /* Each argument */
  initialize_argparse(&ap, *_argc, argv);
  while(!is_end_of_arguments(&ap)) {
    result arg_result = ARGPARSE_OK; /* Result of parsing this option */
    char *arg = get_argument(&ap);

    /* Classify the argument */
    int is_short = (arg[0] == '-') && (arg[1] != 0) && (arg[2] == 0);
    int is_long = (arg[0] == '-') && (arg[1] == '-') && !is_short;

    if (is_short || is_long) {
      struct option *op;

      delete_argument(&ap);     /* This argument is consumed here */

      /* Find a matching option */
      for (op = options; !is_last_option(op); op++) {
        int matches = is_short
          ? (op->short_name == arg[1])
          : (op->long_name && strcmp(&arg[2], op->long_name) == 0);
        if (matches) break;
      }

      if (is_last_option(op)) {
        /* No option matches */
        report_parse_error("Unexpected command-line parameter\n");
        arg_result = ARGPARSE_ERROR;
      }
      else {
        arg_result = (*op->action)(&ap, ret);
      }
    }
    else {
      /* Other arguments are ignored */
      next_argument(&ap);
    }

    /* Decide what to do next based on 'arg_result' */
    if (arg_result == ARGPARSE_ERROR)
      return 0;                 /* Error exit from the function */
    if (arg_result == ARGPARSE_DONE)
      break;                    /* Normal exit from the argument parsing loop */
  } /* end for each argument */

  /* Keep whatever was not processed and publish the new count */
  finalize_argparse(&ap, _argc, argv);
  return 1;
}
/*****************************************************************************/
/* Other exported functions */
/*
 * Allocates a pb_Parameters structure, fills it with defaults (no output
 * file, an empty NULL-terminated input-file list, no platform, no device)
 * and parses the command line into it.  '_argc' and 'argv' are updated by
 * pb_ParseParameters.
 *
 * Returns the new structure, or NULL when memory cannot be allocated or the
 * command line fails to parse.
 */
struct pb_Parameters *
pb_ReadParameters(int *_argc, char **argv)
{
  struct pb_Parameters *ret =
    (struct pb_Parameters *)malloc(sizeof(struct pb_Parameters));

  if (ret == NULL)
    return NULL;

  /* Initialize the parameters structure */
  ret->outFile = NULL;
  ret->platform = NULL;
  ret->device = NULL;
  ret->inpFiles = (char **)malloc(sizeof(char *));
  if (ret->inpFiles == NULL) {
    /* Only the structure itself has been allocated so far */
    free(ret);
    return NULL;
  }
  ret->inpFiles[0] = NULL;

  /* Read parameters and update _argc, argv */
  if (!pb_ParseParameters(ret, _argc, argv)) {
    /* Parse error */
    pb_FreeParameters(ret);
    return NULL;
  }

  return ret;
}
int
pb_Parameters_CountInputs(struct pb_Parameters *p)
{
  /* The input-file list is NULL-terminated: count entries up to the NULL. */
  int count = 0;

  while (p->inpFiles[count]) {
    count++;
  }
  return count;
}

View File

@@ -1,55 +0,0 @@
/***************************************************************************
*cr
*cr (C) Copyright 2010 The Board of Trustees of the
*cr University of Illinois
*cr All Rights Reserved
*cr
***************************************************************************/
//#include <endian.h>
#include <stdlib.h>
#include <malloc.h>
#include <stdio.h>
#include <inttypes.h>
#include "gpu_info.h"
/*
 * Picks a launch geometry for task*pad work items and stores it in
 * thread[0] (local size) and grid[0] (global size).
 *
 * A per-unit thread limit is chosen from (major, minor): 768 for 1.0/1.1,
 * 1024 for 1.2 and later 1.x, 1536 otherwise.  When task*pad exceeds
 * sm times that limit, the local size is limit/8 and the global size is
 * task*pad rounded up to a multiple of it; otherwise the local size is pad
 * and the global size is task*pad.
 */
void compute_active_thread(size_t *thread,
                           size_t *grid,
                           int task,
                           int pad,
                           int major,
                           int minor,
                           int sm)
{
    const int block_div = 8;
    int thread_cap = 1536;          /* major == 2 and anything newer */

    if (major == 1) {
        thread_cap = (minor >= 2) ? 1024 : 768;
    }

    int local_size = pad;
    int global_size = task * pad;

    if (task * pad > sm * thread_cap) {
        local_size = thread_cap / block_div;
        /* round the work-item count up to a whole number of groups */
        global_size = ((task * pad + local_size - 1) / local_size) * local_size;
    }

    thread[0] = local_size;
    grid[0] = global_size;
}

View File

@@ -1,20 +0,0 @@
/***************************************************************************
*cr
*cr (C) Copyright 2010 The Board of Trustees of the
*cr University of Illinois
*cr All Rights Reserved
*cr
***************************************************************************/
#ifndef __GPUINFOH__
#define __GPUINFOH__
/*
 * Computes a launch geometry for task*pad work items and writes it to
 * thread[0] and grid[0].  major/minor select a thread limit and sm scales it.
 * NOTE(review): this header uses size_t but includes no header that declares
 * it; includers must pull in <stddef.h> or <stdlib.h> first.
 */
void compute_active_thread(size_t *thread,
size_t *grid,
int task,
int pad,
int major,
int minor,
int sm);
#endif

View File

@@ -1,424 +0,0 @@
/***************************************************************************
*cr
*cr (C) Copyright 2010 The Board of Trustees of the
*cr University of Illinois
*cr All Rights Reserved
*cr
***************************************************************************/
#ifndef LBM_KERNEL_CL
#define LBM_KERNEL_CL
/***************************************************************************
*cr
*cr (C) Copyright 2010 The Board of Trustees of the
*cr University of Illinois
*cr All Rights Reserved
*cr
***************************************************************************/
/***************************************************************************
*cr
*cr (C) Copyright 2010 The Board of Trustees of the
*cr University of Illinois
*cr All Rights Reserved
*cr
***************************************************************************/
/*############################################################################*/
#ifndef _LAYOUT_CONFIG_H_
#define _LAYOUT_CONFIG_H_
/*############################################################################*/
//Unchangeable settings: volume simulation size for the given example
#define SIZE_X (32)
#define SIZE_Y (32)
#define SIZE_Z (32)
//Changeable settings
//Padding in each dimension
#define PADDING_X (8)
#define PADDING_Y (0)
#define PADDING_Z (4)
//Pitch in each dimension
#define PADDED_X (SIZE_X+PADDING_X)
#define PADDED_Y (SIZE_Y+PADDING_Y)
#define PADDED_Z (SIZE_Z+PADDING_Z)
#define TOTAL_CELLS (SIZE_X*SIZE_Y*SIZE_Z)
#define TOTAL_PADDED_CELLS (PADDED_X*PADDED_Y*PADDED_Z)
//Flattening function
// This macro will be used to map a 3-D index and element to a value
// The macro below implements the equivalent of a 3-D array of
// 20-element structures in C standard layout.
// CALC_INDEX(x,y,z,e): flat offset of entry e of cell (x,y,z); x varies
// fastest, then y (pitch PADDED_X), then z (pitch PADDED_X*PADDED_Y).
// Every argument is parenthesized so that expression arguments expand safely.
#define CALC_INDEX(x,y,z,e) ( (e) + N_CELL_ENTRIES*\
((x)+(y)*PADDED_X+(z)*PADDED_X*PADDED_Y) )
// MARGIN: number of floats in two full z-planes.
#define MARGIN (CALC_INDEX(0, 0, 2, 0) - CALC_INDEX(0,0,0,0))
// Set this value to 1 for GATHER, or 0 for SCATTER
#if 1
#define GATHER
#else
#define SCATTER
#endif
//OpenCL block size (not trivially changeable here)
#define BLOCK_SIZE SIZE_X
/*############################################################################*/
// Slots of one cell: 19 distribution-function entries followed by the flags word.
typedef enum {
    C  = 0,
    N  = 1,  S  = 2,  E  = 3,  W  = 4,  T  = 5,  B  = 6,
    NE = 7,  NW = 8,  SE = 9,  SW = 10,
    NT = 11, NB = 12, ST = 13, SB = 14,
    ET = 15, EB = 16, WT = 17, WB = 18,
    FLAGS = 19,
    N_CELL_ENTRIES = 20
} CELL_ENTRIES;
// Entries before FLAGS are the distribution functions.
#define N_DISTR_FUNCS FLAGS
// Bit flags kept in a cell's FLAGS slot.
typedef enum {
    OBSTACLE    = 0x1,
    ACCEL       = 0x2,
    IN_OUT_FLOW = 0x4
} CELL_FLAGS;
#endif /* _LAYOUT_CONFIG_H_ */
// Include guard: the tested and defined names must match. The original tested
// the misspelled _LBM_MARCOS_H, so the guard never took effect.
#ifndef _LBM_MACROS_H_
#define _LBM_MACROS_H_
// Factor applied to the equilibrium term in the collision step
// (the old value is weighted by 1-OMEGA).
#define OMEGA (1.95f)
// Element type written by the velocity-field output routines.
#define OUTPUT_PRECISION float
#define BOOL int
#define TRUE (-1)
#define FALSE (0)
// Weights for the centre entry (DFL1), the six single-letter directions (DFL2)
// and the twelve two-letter directions (DFL3).
#define DFL1 (1.0f/ 3.0f)
#define DFL2 (1.0f/18.0f)
#define DFL3 (1.0f/36.0f)
/*############################################################################*/
typedef float* LBM_Grid;//float LBM_Grid[PADDED_Z*PADDED_Y*PADDED_X*N_CELL_ENTRIES];
typedef LBM_Grid* LBM_GridPtr;
/*############################################################################*/
// Sweep helpers: SWEEP_VAR declares the three loop counters, SWEEP_START opens
// a z/y/x loop nest over them and SWEEP_END closes it.
// Only z1 and z2 are used as bounds; x always runs over [0,SIZE_X) and y over
// [0,SIZE_Y), and the x1, y1, x2, y2 arguments are ignored.
#define SWEEP_X __temp_x__
#define SWEEP_Y __temp_y__
#define SWEEP_Z __temp_z__
#define SWEEP_VAR int __temp_x__, __temp_y__, __temp_z__;
// The last line carries no trailing backslash, so the macro ends there and
// cannot absorb the directive that follows it.
#define SWEEP_START(x1,y1,z1,x2,y2,z2) \
for( __temp_z__ = (z1); \
__temp_z__ < (z2); \
__temp_z__++) { \
for( __temp_y__ = 0; \
__temp_y__ < SIZE_Y; \
__temp_y__++) { \
for(__temp_x__ = 0; \
__temp_x__ < SIZE_X; \
__temp_x__++) {
#define SWEEP_END }}}
// GRID_ENTRY: entry e of the cell at absolute coordinates (x,y,z) of grid g.
// GRID_ENTRY_SWEEP: entry e of the cell at offset (dx,dy,dz) from the current
// sweep position (SWEEP_X, SWEEP_Y, SWEEP_Z).
#define GRID_ENTRY(g,x,y,z,e) ((g)[CALC_INDEX( x, y, z, e)])
#define GRID_ENTRY_SWEEP(g,dx,dy,dz,e) ((g)[CALC_INDEX((dx)+SWEEP_X, (dy)+SWEEP_Y, (dz)+SWEEP_Z, e)])
// LOCAL: entry e of the cell at the sweep position itself.
#define LOCAL(g,e) (GRID_ENTRY_SWEEP( g, 0, 0, 0, e ))
// NEIGHBOR_<dir>: entry e of an adjacent cell. E/W step +1/-1 in x, N/S step
// +1/-1 in y, T/B step +1/-1 in z; two-letter names combine two steps.
#define NEIGHBOR_C(g,e) (GRID_ENTRY_SWEEP( g, 0, 0, 0, e ))
#define NEIGHBOR_N(g,e) (GRID_ENTRY_SWEEP( g, 0, +1, 0, e ))
#define NEIGHBOR_S(g,e) (GRID_ENTRY_SWEEP( g, 0, -1, 0, e ))
#define NEIGHBOR_E(g,e) (GRID_ENTRY_SWEEP( g, +1, 0, 0, e ))
#define NEIGHBOR_W(g,e) (GRID_ENTRY_SWEEP( g, -1, 0, 0, e ))
#define NEIGHBOR_T(g,e) (GRID_ENTRY_SWEEP( g, 0, 0, +1, e ))
#define NEIGHBOR_B(g,e) (GRID_ENTRY_SWEEP( g, 0, 0, -1, e ))
#define NEIGHBOR_NE(g,e) (GRID_ENTRY_SWEEP( g, +1, +1, 0, e ))
#define NEIGHBOR_NW(g,e) (GRID_ENTRY_SWEEP( g, -1, +1, 0, e ))
#define NEIGHBOR_SE(g,e) (GRID_ENTRY_SWEEP( g, +1, -1, 0, e ))
#define NEIGHBOR_SW(g,e) (GRID_ENTRY_SWEEP( g, -1, -1, 0, e ))
#define NEIGHBOR_NT(g,e) (GRID_ENTRY_SWEEP( g, 0, +1, +1, e ))
#define NEIGHBOR_NB(g,e) (GRID_ENTRY_SWEEP( g, 0, +1, -1, e ))
#define NEIGHBOR_ST(g,e) (GRID_ENTRY_SWEEP( g, 0, -1, +1, e ))
#define NEIGHBOR_SB(g,e) (GRID_ENTRY_SWEEP( g, 0, -1, -1, e ))
#define NEIGHBOR_ET(g,e) (GRID_ENTRY_SWEEP( g, +1, 0, +1, e ))
#define NEIGHBOR_EB(g,e) (GRID_ENTRY_SWEEP( g, +1, 0, -1, e ))
#define NEIGHBOR_WT(g,e) (GRID_ENTRY_SWEEP( g, -1, 0, +1, e ))
#define NEIGHBOR_WB(g,e) (GRID_ENTRY_SWEEP( g, -1, 0, -1, e ))
// SRC_<dir>(g) names where entry <dir> is read from and DST_<dir>(g) where it
// is written to, relative to the current sweep position.
// With SCATTER defined, reads are local and each write goes to the neighbor
// lying in direction <dir>.
#ifdef SCATTER
#define SRC_C(g) (LOCAL( g, C ))
#define SRC_N(g) (LOCAL( g, N ))
#define SRC_S(g) (LOCAL( g, S ))
#define SRC_E(g) (LOCAL( g, E ))
#define SRC_W(g) (LOCAL( g, W ))
#define SRC_T(g) (LOCAL( g, T ))
#define SRC_B(g) (LOCAL( g, B ))
#define SRC_NE(g) (LOCAL( g, NE ))
#define SRC_NW(g) (LOCAL( g, NW ))
#define SRC_SE(g) (LOCAL( g, SE ))
#define SRC_SW(g) (LOCAL( g, SW ))
#define SRC_NT(g) (LOCAL( g, NT ))
#define SRC_NB(g) (LOCAL( g, NB ))
#define SRC_ST(g) (LOCAL( g, ST ))
#define SRC_SB(g) (LOCAL( g, SB ))
#define SRC_ET(g) (LOCAL( g, ET ))
#define SRC_EB(g) (LOCAL( g, EB ))
#define SRC_WT(g) (LOCAL( g, WT ))
#define SRC_WB(g) (LOCAL( g, WB ))
#define DST_C(g) (NEIGHBOR_C ( g, C ))
#define DST_N(g) (NEIGHBOR_N ( g, N ))
#define DST_S(g) (NEIGHBOR_S ( g, S ))
#define DST_E(g) (NEIGHBOR_E ( g, E ))
#define DST_W(g) (NEIGHBOR_W ( g, W ))
#define DST_T(g) (NEIGHBOR_T ( g, T ))
#define DST_B(g) (NEIGHBOR_B ( g, B ))
#define DST_NE(g) (NEIGHBOR_NE( g, NE ))
#define DST_NW(g) (NEIGHBOR_NW( g, NW ))
#define DST_SE(g) (NEIGHBOR_SE( g, SE ))
#define DST_SW(g) (NEIGHBOR_SW( g, SW ))
#define DST_NT(g) (NEIGHBOR_NT( g, NT ))
#define DST_NB(g) (NEIGHBOR_NB( g, NB ))
#define DST_ST(g) (NEIGHBOR_ST( g, ST ))
#define DST_SB(g) (NEIGHBOR_SB( g, SB ))
#define DST_ET(g) (NEIGHBOR_ET( g, ET ))
#define DST_EB(g) (NEIGHBOR_EB( g, EB ))
#define DST_WT(g) (NEIGHBOR_WT( g, WT ))
#define DST_WB(g) (NEIGHBOR_WB( g, WB ))
#else /* GATHER */
// Otherwise each read of entry <dir> comes from the neighbor lying in the
// opposite direction (e.g. SRC_N reads the N entry of the S neighbor) and all
// writes are local.
#define SRC_C(g) (NEIGHBOR_C ( g, C ))
#define SRC_N(g) (NEIGHBOR_S ( g, N ))
#define SRC_S(g) (NEIGHBOR_N ( g, S ))
#define SRC_E(g) (NEIGHBOR_W ( g, E ))
#define SRC_W(g) (NEIGHBOR_E ( g, W ))
#define SRC_T(g) (NEIGHBOR_B ( g, T ))
#define SRC_B(g) (NEIGHBOR_T ( g, B ))
#define SRC_NE(g) (NEIGHBOR_SW( g, NE ))
#define SRC_NW(g) (NEIGHBOR_SE( g, NW ))
#define SRC_SE(g) (NEIGHBOR_NW( g, SE ))
#define SRC_SW(g) (NEIGHBOR_NE( g, SW ))
#define SRC_NT(g) (NEIGHBOR_SB( g, NT ))
#define SRC_NB(g) (NEIGHBOR_ST( g, NB ))
#define SRC_ST(g) (NEIGHBOR_NB( g, ST ))
#define SRC_SB(g) (NEIGHBOR_NT( g, SB ))
#define SRC_ET(g) (NEIGHBOR_WB( g, ET ))
#define SRC_EB(g) (NEIGHBOR_WT( g, EB ))
#define SRC_WT(g) (NEIGHBOR_EB( g, WT ))
#define SRC_WB(g) (NEIGHBOR_ET( g, WB ))
#define DST_C(g) (LOCAL( g, C ))
#define DST_N(g) (LOCAL( g, N ))
#define DST_S(g) (LOCAL( g, S ))
#define DST_E(g) (LOCAL( g, E ))
#define DST_W(g) (LOCAL( g, W ))
#define DST_T(g) (LOCAL( g, T ))
#define DST_B(g) (LOCAL( g, B ))
#define DST_NE(g) (LOCAL( g, NE ))
#define DST_NW(g) (LOCAL( g, NW ))
#define DST_SE(g) (LOCAL( g, SE ))
#define DST_SW(g) (LOCAL( g, SW ))
#define DST_NT(g) (LOCAL( g, NT ))
#define DST_NB(g) (LOCAL( g, NB ))
#define DST_ST(g) (LOCAL( g, ST ))
#define DST_SB(g) (LOCAL( g, SB ))
#define DST_ET(g) (LOCAL( g, ET ))
#define DST_EB(g) (LOCAL( g, EB ))
#define DST_WT(g) (LOCAL( g, WT ))
#define DST_WB(g) (LOCAL( g, WB ))
#endif /* GATHER */
// Flag helpers. A cell's FLAGS slot is a float whose storage is reinterpreted
// as an unsigned int bit field.
// NOTE(review): MAGIC_CAST accesses a float object through an unsigned int
// pointer; confirm the host build tolerates this with respect to strict aliasing.
#define MAGIC_CAST(v) ((unsigned int*) ((void*) (&(v))))
// Declares the local pointer _aux_ used by the SET/CLEAR macros below.
#define FLAG_VAR(v) unsigned int* _aux_ = MAGIC_CAST(v)
// *_SWEEP variants act on the cell at the current sweep position.
// The SET/CLEAR macros expand to a brace-enclosed block, not to an expression.
#define TEST_FLAG_SWEEP(g,f) ((*MAGIC_CAST(LOCAL(g, FLAGS))) & (f))
#define SET_FLAG_SWEEP(g,f) {FLAG_VAR(LOCAL(g, FLAGS)); (*_aux_) |= (f);}
#define CLEAR_FLAG_SWEEP(g,f) {FLAG_VAR(LOCAL(g, FLAGS)); (*_aux_) &= ~(f);}
#define CLEAR_ALL_FLAGS_SWEEP(g) {FLAG_VAR(LOCAL(g, FLAGS)); (*_aux_) = 0;}
// The remaining variants act on the cell at absolute coordinates (x,y,z).
#define TEST_FLAG(g,x,y,z,f) ((*MAGIC_CAST(GRID_ENTRY(g, x, y, z, FLAGS))) & (f))
#define SET_FLAG(g,x,y,z,f) {FLAG_VAR(GRID_ENTRY(g, x, y, z, FLAGS)); (*_aux_) |= (f);}
#define CLEAR_FLAG(g,x,y,z,f) {FLAG_VAR(GRID_ENTRY(g, x, y, z, FLAGS)); (*_aux_) &= ~(f);}
#define CLEAR_ALL_FLAGS(g,x,y,z) {FLAG_VAR(GRID_ENTRY(g, x, y, z, FLAGS)); (*_aux_) = 0;}
/*############################################################################*/
#endif /* _LBM_MACROS_H_ */
/******************************************************************************/
// One stream-and-collide step for a single cell per work-item.
//   srcGrid: grid read by this step (via the SRC_* macros)
//   dstGrid: grid written by this step (via the DST_* macros)
// The cell handled is x = get_local_id(0), y = get_group_id(0),
// z = get_group_id(1).
__kernel void performStreamCollide_kernel( __global float* srcGrid, __global float* dstGrid )
{
    // Both pointers are advanced by MARGIN floats before any indexing.
    srcGrid += MARGIN;
    dstGrid += MARGIN;

    //Using some predefined macros here.  Consider this the declaration
    //  and initialization of the variables SWEEP_X, SWEEP_Y and SWEEP_Z
    SWEEP_VAR
    SWEEP_X = get_local_id(0);
    SWEEP_Y = get_group_id(0);
    SWEEP_Z = get_group_id(1);

    float temp_swp, tempC, tempN, tempS, tempE, tempW, tempT, tempB;
    float tempNE, tempNW, tempSE, tempSW, tempNT, tempNB, tempST ;
    float tempSB, tempET, tempEB, tempWT, tempWB ;

    //Load all of the input fields
    //This is a gather operation if the SCATTER preprocessor variable
    // is undefined, or a "local" read otherwise
    tempC = SRC_C(srcGrid);
    tempN = SRC_N(srcGrid);
    tempS = SRC_S(srcGrid);
    tempE = SRC_E(srcGrid);
    tempW = SRC_W(srcGrid);
    tempT = SRC_T(srcGrid);
    tempB = SRC_B(srcGrid);
    tempNE = SRC_NE(srcGrid);
    tempNW = SRC_NW(srcGrid);
    tempSE = SRC_SE(srcGrid);
    tempSW = SRC_SW(srcGrid);
    tempNT = SRC_NT(srcGrid);
    tempNB = SRC_NB(srcGrid);
    tempST = SRC_ST(srcGrid);
    tempSB = SRC_SB(srcGrid);
    tempET = SRC_ET(srcGrid);
    tempEB = SRC_EB(srcGrid);
    tempWT = SRC_WT(srcGrid);
    tempWB = SRC_WB(srcGrid);

    //Test whether the cell is fluid or obstacle
    if(as_uint(LOCAL(srcGrid,FLAGS)) & (OBSTACLE)) {
        //Swizzle the inputs: reflect any fluid coming into this cell
        // back to where it came from
        temp_swp = tempN ; tempN = tempS ; tempS = temp_swp ;
        temp_swp = tempE ; tempE = tempW ; tempW = temp_swp;
        temp_swp = tempT ; tempT = tempB ; tempB = temp_swp;
        temp_swp = tempNE; tempNE = tempSW ; tempSW = temp_swp;
        temp_swp = tempNW; tempNW = tempSE ; tempSE = temp_swp;
        temp_swp = tempNT ; tempNT = tempSB ; tempSB = temp_swp;
        temp_swp = tempNB ; tempNB = tempST ; tempST = temp_swp;
        temp_swp = tempET ; tempET= tempWB ; tempWB = temp_swp;
        temp_swp = tempEB ; tempEB = tempWT ; tempWT = temp_swp;
    }
    else {
        //The math meat of LBM: ignore for optimization
        float ux, uy, uz, rho, u2;
        float temp1, temp2, temp_base;

        // rho: sum of all 19 distribution entries
        rho = tempC + tempN
            + tempS + tempE
            + tempW + tempT
            + tempB + tempNE
            + tempNW + tempSE
            + tempSW + tempNT
            + tempNB + tempST
            + tempSB + tempET
            + tempEB + tempWT
            + tempWB;

        // (ux,uy,uz): signed sums along each axis, divided by rho below
        ux = + tempE - tempW
            + tempNE - tempNW
            + tempSE - tempSW
            + tempET + tempEB
            - tempWT - tempWB;
        uy = + tempN - tempS
            + tempNE + tempNW
            - tempSE - tempSW
            + tempNT + tempNB
            - tempST - tempSB;
        uz = + tempT - tempB
            + tempNT - tempNB
            + tempST - tempSB
            + tempET - tempEB
            + tempWT - tempWB;

        ux /= rho;
        uy /= rho;
        uz /= rho;

        // Cells flagged ACCEL use a fixed velocity instead of the computed one
        if(as_uint(LOCAL(srcGrid,FLAGS)) & (ACCEL)) {
            ux = 0.005f;
            uy = 0.002f;
            uz = 0.000f;
        }

        u2 = 1.5f * (ux*ux + uy*uy + uz*uz) - 1.0f;

        //Update the local temporaries with the output values for this cell.
        // temp_base and temp1 are computed once here (the original assigned
        // both twice in a row with identical values).
        temp_base = OMEGA*rho;
        temp1 = DFL1*temp_base;
        temp2 = 1.0f-OMEGA;
        tempC = temp2*tempC + temp1*( - u2);

        temp1 = DFL2*temp_base;
        tempN = temp2*tempN + temp1*( uy*(4.5f*uy + 3.0f) - u2);
        tempS = temp2*tempS + temp1*( uy*(4.5f*uy - 3.0f) - u2);
        tempT = temp2*tempT + temp1*( uz*(4.5f*uz + 3.0f) - u2);
        tempB = temp2*tempB + temp1*( uz*(4.5f*uz - 3.0f) - u2);
        tempE = temp2*tempE + temp1*( ux*(4.5f*ux + 3.0f) - u2);
        tempW = temp2*tempW + temp1*( ux*(4.5f*ux - 3.0f) - u2);

        temp1 = DFL3*temp_base;
        tempNT= temp2*tempNT + temp1 *( (+uy+uz)*(4.5f*(+uy+uz) + 3.0f) - u2);
        tempNB= temp2*tempNB + temp1 *( (+uy-uz)*(4.5f*(+uy-uz) + 3.0f) - u2);
        tempST= temp2*tempST + temp1 *( (-uy+uz)*(4.5f*(-uy+uz) + 3.0f) - u2);
        tempSB= temp2*tempSB + temp1 *( (-uy-uz)*(4.5f*(-uy-uz) + 3.0f) - u2);
        tempNE = temp2*tempNE + temp1 *( (+ux+uy)*(4.5f*(+ux+uy) + 3.0f) - u2);
        tempSE = temp2*tempSE + temp1 *((+ux-uy)*(4.5f*(+ux-uy) + 3.0f) - u2);
        tempET = temp2*tempET + temp1 *( (+ux+uz)*(4.5f*(+ux+uz) + 3.0f) - u2);
        tempEB = temp2*tempEB + temp1 *( (+ux-uz)*(4.5f*(+ux-uz) + 3.0f) - u2);
        tempNW = temp2*tempNW + temp1 *( (-ux+uy)*(4.5f*(-ux+uy) + 3.0f) - u2);
        tempSW = temp2*tempSW + temp1 *( (-ux-uy)*(4.5f*(-ux-uy) + 3.0f) - u2);
        tempWT = temp2*tempWT + temp1 *( (-ux+uz)*(4.5f*(-ux+uz) + 3.0f) - u2);
        tempWB = temp2*tempWB + temp1 *( (-ux-uz)*(4.5f*(-ux-uz) + 3.0f) - u2);
    }

    //Write the results computed above
    //This is a scatter operation if the SCATTER preprocessor variable
    // is defined, or a "local" write otherwise
    DST_C ( dstGrid ) = tempC;
    DST_N ( dstGrid ) = tempN;
    DST_S ( dstGrid ) = tempS;
    DST_E ( dstGrid ) = tempE;
    DST_W ( dstGrid ) = tempW;
    DST_T ( dstGrid ) = tempT;
    DST_B ( dstGrid ) = tempB;
    DST_NE( dstGrid ) = tempNE;
    DST_NW( dstGrid ) = tempNW;
    DST_SE( dstGrid ) = tempSE;
    DST_SW( dstGrid ) = tempSW;
    DST_NT( dstGrid ) = tempNT;
    DST_NB( dstGrid ) = tempNB;
    DST_ST( dstGrid ) = tempST;
    DST_SB( dstGrid ) = tempSB;
    DST_ET( dstGrid ) = tempET;
    DST_EB( dstGrid ) = tempEB;
    DST_WT( dstGrid ) = tempWT;
    DST_WB( dstGrid ) = tempWB;
}
#endif // LBM_KERNEL_CL

View File

@@ -1,69 +0,0 @@
/***************************************************************************
*cr
*cr (C) Copyright 2010 The Board of Trustees of the
*cr University of Illinois
*cr All Rights Reserved
*cr
***************************************************************************/
/*############################################################################*/
#ifndef _LAYOUT_CONFIG_H_
#define _LAYOUT_CONFIG_H_
/*############################################################################*/
//Unchangeable settings: volume simulation size for the given example
#define SIZE_X (32)
#define SIZE_Y (16)
#define SIZE_Z (8)
//Changeable settings
//Padding in each dimension
#define PADDING_X (8)
#define PADDING_Y (0)
#define PADDING_Z (4)
//Pitch in each dimension
#define PADDED_X (SIZE_X+PADDING_X)
#define PADDED_Y (SIZE_Y+PADDING_Y)
#define PADDED_Z (SIZE_Z+PADDING_Z)
#define TOTAL_CELLS (SIZE_X*SIZE_Y*SIZE_Z)
#define TOTAL_PADDED_CELLS (PADDED_X*PADDED_Y*PADDED_Z)
//Flattening function
// This macro will be used to map a 3-D index and element to a value
// The macro below implements the equivalent of a 3-D array of
// 20-element structures in C standard layout.
// CALC_INDEX(x,y,z,e): flat offset of entry e of cell (x,y,z); x varies
// fastest, then y (pitch PADDED_X), then z (pitch PADDED_X*PADDED_Y).
// Every argument is parenthesized so that expression arguments expand safely.
#define CALC_INDEX(x,y,z,e) ( (e) + N_CELL_ENTRIES*\
((x)+(y)*PADDED_X+(z)*PADDED_X*PADDED_Y) )
// MARGIN: number of floats in two full z-planes.
#define MARGIN (CALC_INDEX(0, 0, 2, 0) - CALC_INDEX(0,0,0,0))
// Set this value to 1 for GATHER, or 0 for SCATTER
#if 1
#define GATHER
#else
#define SCATTER
#endif
//OpenCL block size (not trivially changeable here)
#define BLOCK_SIZE SIZE_X
/*############################################################################*/
// Slots of one cell: 19 distribution-function entries followed by the flags word.
typedef enum {
    C  = 0,
    N  = 1,  S  = 2,  E  = 3,  W  = 4,  T  = 5,  B  = 6,
    NE = 7,  NW = 8,  SE = 9,  SW = 10,
    NT = 11, NB = 12, ST = 13, SB = 14,
    ET = 15, EB = 16, WT = 17, WB = 18,
    FLAGS = 19,
    N_CELL_ENTRIES = 20
} CELL_ENTRIES;
// Entries before FLAGS are the distribution functions.
#define N_DISTR_FUNCS FLAGS
// Bit flags kept in a cell's FLAGS slot.
typedef enum {
    OBSTACLE    = 0x1,
    ACCEL       = 0x2,
    IN_OUT_FLOW = 0x4
} CELL_FLAGS;
#endif /* _LAYOUT_CONFIG_H_ */

View File

@@ -1,356 +0,0 @@
/***************************************************************************
*cr
*cr (C) Copyright 2010 The Board of Trustees of the
*cr University of Illinois
*cr All Rights Reserved
*cr
***************************************************************************/
/*############################################################################*/
// includes, system
#include <CL/cl.h>
#include <math.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <float.h>
// includes, project
#include "layout_config.h"
#include "lbm_macros.h"
#include "ocl.h"
#include "lbm.h"
#include "parboil.h"
/******************************************************************************/
/*
 * Binds srcGrid and dstGrid as kernel arguments 0 and 1, enqueues the kernel
 * in prm over a 3-D range and waits for the command queue to finish.
 */
void OpenCL_LBM_performStreamCollide( const OpenCL_Param* prm, cl_mem srcGrid, cl_mem dstGrid ) {
    /* Work-group of SIZE_X items; SIZE_Y groups along dimension 0 and
       SIZE_Z groups along dimension 1. */
    size_t localSize[3]  = {SIZE_X,1,1};
    size_t globalSize[3] = {SIZE_X*SIZE_Y,SIZE_Z,1};
    cl_int clStatus;

    clStatus = clSetKernelArg( prm->clKernel, 0, sizeof(cl_mem), (void*)&srcGrid );
    CHECK_ERROR("clSetKernelArg")

    clStatus = clSetKernelArg( prm->clKernel, 1, sizeof(cl_mem), (void*)&dstGrid );
    CHECK_ERROR("clSetKernelArg")

    clStatus = clEnqueueNDRangeKernel( prm->clCommandQueue, prm->clKernel, 3, NULL,
                                       globalSize, localSize, 0, NULL, NULL );
    CHECK_ERROR("clEnqueueNDRangeKernel")

    /* Block until the kernel has completed */
    clStatus = clFinish( prm->clCommandQueue );
    CHECK_ERROR("clFinish")
}
/*############################################################################*/
/*
 * Allocates and zeroes a host grid of TOTAL_PADDED_CELLS * N_CELL_ENTRIES
 * floats.  On success *ptr points MARGIN floats past the start of the
 * allocation; LBM_freeGrid undoes that offset before freeing.
 * Prints a message and exits with status 1 when the allocation fails.
 */
void LBM_allocateGrid( float** ptr ) {
    const size_t size = TOTAL_PADDED_CELLS * N_CELL_ENTRIES * sizeof(float);

    *ptr = (float*)malloc( size );
    /* Test the allocation result (*ptr), not the address of the caller's
       pointer: the original tested 'ptr', so a failed malloc went unnoticed. */
    if( !*ptr ) {
        printf( "LBM_allocateGrid: could not allocate %.1f MByte\n",
                size / (1024.0*1024.0) );
        exit( 1 );
    }

    memset( *ptr, 0, size );

    printf( "LBM_allocateGrid: allocated %.1f MByte\n",
            size / (1024.0*1024.0) );

    *ptr += MARGIN;
}
/******************************************************************************/
/*
 * Creates a read-write device buffer large enough for one padded grid in the
 * context of prm and stores its handle in *ptr.
 * No check against CL_DEVICE_MAX_MEM_ALLOC_SIZE is performed.
 */
void OpenCL_LBM_allocateGrid( const OpenCL_Param* prm, cl_mem* ptr ) {
    cl_int clStatus;
    const size_t nBytes = TOTAL_PADDED_CELLS*N_CELL_ENTRIES*sizeof( float );

    *ptr = clCreateBuffer( prm->clContext, CL_MEM_READ_WRITE, nBytes, NULL, &clStatus );
    CHECK_ERROR("clCreateBuffer")
}
/*############################################################################*/
/*
 * Releases a grid obtained from LBM_allocateGrid and resets *ptr to NULL.
 * *ptr is offset by MARGIN from the start of the allocation, so the offset
 * is removed before calling free.  A NULL *ptr (for example on a second
 * call) is left alone: subtracting MARGIN from NULL would hand free() an
 * invalid address.
 */
void LBM_freeGrid( float** ptr ) {
    if( *ptr != NULL ) {
        free( *ptr-MARGIN );
    }
    *ptr = NULL;
}
/******************************************************************************/
/* Releases the device buffer ptr; the status returned by OpenCL is ignored. */
void OpenCL_LBM_freeGrid(cl_mem ptr) {
    (void) clReleaseMemObject( ptr );
}
/*############################################################################*/
/*
 * Fills the grid with the initial distribution: DFL1 for C, DFL2 for the six
 * single-letter directions, DFL3 for the twelve two-letter directions, and
 * clears the flags of every swept cell.
 * The values are stored through the SRC_* macros, so in GATHER mode each
 * store lands in a neighboring cell rather than the swept cell itself.
 */
void LBM_initializeGrid( LBM_Grid grid ) {
SWEEP_VAR
/* SWEEP_START only honors the z bounds: z in [0,SIZE_Z), full x and y range */
SWEEP_START( 0, 0, 0, 0, 0, SIZE_Z )
SRC_C( grid ) = DFL1;
SRC_N( grid ) = DFL2;
SRC_S( grid ) = DFL2;
SRC_E( grid ) = DFL2;
SRC_W( grid ) = DFL2;
SRC_T( grid ) = DFL2;
SRC_B( grid ) = DFL2;
SRC_NE( grid ) = DFL3;
SRC_NW( grid ) = DFL3;
SRC_SE( grid ) = DFL3;
SRC_SW( grid ) = DFL3;
SRC_NT( grid ) = DFL3;
SRC_NB( grid ) = DFL3;
SRC_ST( grid ) = DFL3;
SRC_SB( grid ) = DFL3;
SRC_ET( grid ) = DFL3;
SRC_EB( grid ) = DFL3;
SRC_WT( grid ) = DFL3;
SRC_WB( grid ) = DFL3;
/* the flags word of the swept cell itself is zeroed */
CLEAR_ALL_FLAGS_SWEEP( grid );
SWEEP_END
}
/******************************************************************************/
/*
 * Copies the whole host grid to the device buffer d_grid with a blocking
 * write.  h_grid is the MARGIN-offset pointer handed out by
 * LBM_allocateGrid, so the copy starts MARGIN floats before it.
 */
void OpenCL_LBM_initializeGrid( const OpenCL_Param* prm, cl_mem d_grid, LBM_Grid h_grid ) {
    cl_int clStatus;
    const size_t nBytes = TOTAL_PADDED_CELLS*N_CELL_ENTRIES*sizeof( float );

    clStatus = clEnqueueWriteBuffer( prm->clCommandQueue, d_grid, CL_TRUE,
                                     0, nBytes, h_grid-MARGIN,
                                     0, NULL, NULL );
    CHECK_ERROR("clEnqueueWriteBuffer")
}
/*
 * Copies the whole device buffer d_grid back into the host grid with a
 * blocking read.  h_grid is the MARGIN-offset pointer handed out by
 * LBM_allocateGrid, so the copy starts MARGIN floats before it.
 */
void OpenCL_LBM_getDeviceGrid( const OpenCL_Param* prm, cl_mem d_grid, LBM_Grid h_grid ) {
    cl_int clStatus;
    const size_t nBytes = TOTAL_PADDED_CELLS*N_CELL_ENTRIES*sizeof( float );

    clStatus = clEnqueueReadBuffer( prm->clCommandQueue, d_grid, CL_TRUE,
                                    0, nBytes, h_grid-MARGIN,
                                    0, NULL, NULL );
    CHECK_ERROR("clEnqueueReadBuffer")
}
/*############################################################################*/
/* Exchanges the two buffer handles that grid1 and grid2 point to. */
void LBM_swapGrids( cl_mem* grid1, cl_mem* grid2 ) {
    cl_mem first  = *grid1;
    cl_mem second = *grid2;

    *grid1 = second;
    *grid2 = first;
}
/*############################################################################*/
/*
 * Reads an obstacle map from filename and sets the OBSTACLE flag on every
 * cell whose character is not '.'.
 * The file holds SIZE_X characters per row, one extra character after each
 * row and one more after each z-plane; the extra characters are skipped.
 * Prints a message to stderr and exits with status 1 when the file cannot be
 * opened (the original passed the NULL stream on to fgetc).
 */
void LBM_loadObstacleFile( LBM_Grid grid, const char* filename ) {
    int x, y, z;

    FILE* file = fopen( filename, "rb" );
    if( file == NULL ) {
        fprintf( stderr, "LBM_loadObstacleFile: could not open \"%s\"\n", filename );
        exit( 1 );
    }

    for( z = 0; z < SIZE_Z; z++ ) {
        for( y = 0; y < SIZE_Y; y++ ) {
            for( x = 0; x < SIZE_X; x++ ) {
                if( fgetc( file ) != '.' ) SET_FLAG( grid, x, y, z, OBSTACLE );
            }
            fgetc( file );  /* skip the character that ends the row */
        }
        fgetc( file );      /* skip the character that ends the plane */
    }

    fclose( file );
}
/*############################################################################*/
/*
 * Marks the special cells of the grid.
 * Cells on any face of the SIZE_X x SIZE_Y x SIZE_Z box get OBSTACLE; of the
 * remaining cells, those in planes z == 1 and z == SIZE_Z-2 that lie more
 * than one cell away from the x and y faces get ACCEL.
 * The z loop also covers two planes below and two planes above the box.
 */
void LBM_initializeSpecialCellsForLDC( LBM_Grid grid ) {
    int x, y, z;

    for( z = -2; z < SIZE_Z+2; z++ ) {
        for( y = 0; y < SIZE_Y; y++ ) {
            for( x = 0; x < SIZE_X; x++ ) {
                const int onFace = ( x == 0 || x == SIZE_X-1 ||
                                     y == 0 || y == SIZE_Y-1 ||
                                     z == 0 || z == SIZE_Z-1 );

                if( onFace ) {
                    SET_FLAG( grid, x, y, z, OBSTACLE );
                    continue;
                }

                if( (z == 1 || z == SIZE_Z-2) &&
                    x > 1 && x < SIZE_X-2 &&
                    y > 1 && y < SIZE_Y-2 ) {
                    SET_FLAG( grid, x, y, z, ACCEL );
                }
            }
        }
    }
}
/*############################################################################*/
/*
 * Prints summary statistics of the grid to stdout: the number of obstacle,
 * accelerated and plain fluid cells, the minimum/maximum/total of rho, and
 * the minimum/maximum velocity magnitude over the non-obstacle cells.
 */
void LBM_showGridStatistics( LBM_Grid grid ) {
int nObstacleCells = 0,
nAccelCells = 0,
nFluidCells = 0;
float ux, uy, uz;
float minU2 = 1e+30, maxU2 = -1e+30, u2;
float minRho = 1e+30, maxRho = -1e+30, rho;
float mass = 0;
SWEEP_VAR
/* SWEEP_START only honors the z bounds: z in [0,SIZE_Z), full x and y range */
SWEEP_START( 0, 0, 0, 0, 0, SIZE_Z )
/* rho: sum of the 19 distribution entries of this cell */
rho = LOCAL( grid, C ) + LOCAL( grid, N )
+ LOCAL( grid, S ) + LOCAL( grid, E )
+ LOCAL( grid, W ) + LOCAL( grid, T )
+ LOCAL( grid, B ) + LOCAL( grid, NE )
+ LOCAL( grid, NW ) + LOCAL( grid, SE )
+ LOCAL( grid, SW ) + LOCAL( grid, NT )
+ LOCAL( grid, NB ) + LOCAL( grid, ST )
+ LOCAL( grid, SB ) + LOCAL( grid, ET )
+ LOCAL( grid, EB ) + LOCAL( grid, WT )
+ LOCAL( grid, WB );
if( rho < minRho ) minRho = rho;
if( rho > maxRho ) maxRho = rho;
/* mass accumulates rho over every swept cell, obstacles included */
mass += rho;
if( TEST_FLAG_SWEEP( grid, OBSTACLE )) {
nObstacleCells++;
}
else {
if( TEST_FLAG_SWEEP( grid, ACCEL ))
nAccelCells++;
else
nFluidCells++;
/* signed sums along each axis; normalized by rho inside u2 below */
ux = + LOCAL( grid, E ) - LOCAL( grid, W )
+ LOCAL( grid, NE ) - LOCAL( grid, NW )
+ LOCAL( grid, SE ) - LOCAL( grid, SW )
+ LOCAL( grid, ET ) + LOCAL( grid, EB )
- LOCAL( grid, WT ) - LOCAL( grid, WB );
uy = + LOCAL( grid, N ) - LOCAL( grid, S )
+ LOCAL( grid, NE ) + LOCAL( grid, NW )
- LOCAL( grid, SE ) - LOCAL( grid, SW )
+ LOCAL( grid, NT ) + LOCAL( grid, NB )
- LOCAL( grid, ST ) - LOCAL( grid, SB );
uz = + LOCAL( grid, T ) - LOCAL( grid, B )
+ LOCAL( grid, NT ) - LOCAL( grid, NB )
+ LOCAL( grid, ST ) - LOCAL( grid, SB )
+ LOCAL( grid, ET ) - LOCAL( grid, EB )
+ LOCAL( grid, WT ) - LOCAL( grid, WB );
/* squared velocity magnitude */
u2 = (ux*ux + uy*uy + uz*uz) / (rho*rho);
if( u2 < minU2 ) minU2 = u2;
if( u2 > maxU2 ) maxU2 = u2;
}
SWEEP_END
/* minU/maxU are reported as the square roots of the squared extremes */
printf( "LBM_showGridStatistics:\n"
"\tnObstacleCells: %7i nAccelCells: %7i nFluidCells: %7i\n"
"\tminRho: %8.4f maxRho: %8.4f mass: %e\n"
"\tminU: %e maxU: %e\n\n",
nObstacleCells, nAccelCells, nFluidCells,
minRho, maxRho, mass,
sqrt( minU2 ), sqrt( maxU2 ) );
}
/*############################################################################*/
/*
 * Writes *v to file in little-endian byte order: the bytes are reversed on a
 * big-endian host and written unchanged otherwise.
 */
static void storeValue( FILE* file, OUTPUT_PRECISION* v ) {
    const int probe = 1;
    /* the first byte of the int 1 is non-zero only on a little-endian host */
    const int littleEndianHost = ( *((const unsigned char*) &probe) != 0 );

    if( littleEndianHost ) {
        fwrite( v, sizeof( OUTPUT_PRECISION ), 1, file );
        return;
    }

    /* big endian: emit the bytes in reverse order */
    {
        const char* src = (const char*) v;
        char reversed[sizeof( OUTPUT_PRECISION )];
        size_t k;

        for( k = 0; k < sizeof( OUTPUT_PRECISION ); k++ ) {
            reversed[k] = src[sizeof( OUTPUT_PRECISION ) - k - 1];
        }
        fwrite( reversed, sizeof( OUTPUT_PRECISION ), 1, file );
    }
}
/*############################################################################*/
/*
 * Reads one little-endian value from file into *v: the bytes are reversed on
 * a big-endian host and stored unchanged otherwise.
 */
static void loadValue( FILE* file, OUTPUT_PRECISION* v ) {
    const int probe = 1;
    /* the first byte of the int 1 is non-zero only on a little-endian host */
    const int littleEndianHost = ( *((const unsigned char*) &probe) != 0 );

    if( littleEndianHost ) {
        fread( v, sizeof( OUTPUT_PRECISION ), 1, file );
        return;
    }

    /* big endian: read the raw bytes, then store them in reverse order */
    {
        char* dst = (char*) v;
        char raw[sizeof( OUTPUT_PRECISION )];
        size_t k;

        fread( raw, sizeof( OUTPUT_PRECISION ), 1, file );
        for( k = 0; k < sizeof( OUTPUT_PRECISION ); k++ ) {
            dst[k] = raw[sizeof( OUTPUT_PRECISION ) - k - 1];
        }
    }
}
/*############################################################################*/
/*
 * Writes the velocity (ux, uy, uz) of every cell with z in [0,SIZE_Z) to
 * filename.
 *   binary != 0: three little-endian OUTPUT_PRECISION values per cell
 *   binary == 0: one "%e %e %e" text line per cell
 * The cell values are read through the SRC_* macros.
 * Prints a message to stderr and exits with status 1 when the file cannot be
 * opened (the original passed the NULL stream on to the write calls).
 */
void LBM_storeVelocityField( LBM_Grid grid, const char* filename,
                             const int binary ) {
    OUTPUT_PRECISION rho, ux, uy, uz;

    FILE* file = fopen( filename, (binary ? "wb" : "w") );
    if( file == NULL ) {
        fprintf( stderr, "LBM_storeVelocityField: could not open \"%s\"\n", filename );
        exit( 1 );
    }

    SWEEP_VAR
    /* SWEEP_START only honors the z bounds; x and y always cover the full range */
    SWEEP_START(0,0,0,SIZE_X,SIZE_Y,SIZE_Z)
        rho = + SRC_C( grid ) + SRC_N( grid )
            + SRC_S( grid ) + SRC_E( grid )
            + SRC_W( grid ) + SRC_T( grid )
            + SRC_B( grid ) + SRC_NE( grid )
            + SRC_NW( grid ) + SRC_SE( grid )
            + SRC_SW( grid ) + SRC_NT( grid )
            + SRC_NB( grid ) + SRC_ST( grid )
            + SRC_SB( grid ) + SRC_ET( grid )
            + SRC_EB( grid ) + SRC_WT( grid )
            + SRC_WB( grid );
        ux = + SRC_E( grid ) - SRC_W( grid )
            + SRC_NE( grid ) - SRC_NW( grid )
            + SRC_SE( grid ) - SRC_SW( grid )
            + SRC_ET( grid ) + SRC_EB( grid )
            - SRC_WT( grid ) - SRC_WB( grid );
        uy = + SRC_N( grid ) - SRC_S( grid )
            + SRC_NE( grid ) + SRC_NW( grid )
            - SRC_SE( grid ) - SRC_SW( grid )
            + SRC_NT( grid ) + SRC_NB( grid )
            - SRC_ST( grid ) - SRC_SB( grid );
        uz = + SRC_T( grid ) - SRC_B( grid )
            + SRC_NT( grid ) - SRC_NB( grid )
            + SRC_ST( grid ) - SRC_SB( grid )
            + SRC_ET( grid ) - SRC_EB( grid )
            + SRC_WT( grid ) - SRC_WB( grid );
        ux /= rho;
        uy /= rho;
        uz /= rho;

        if( binary ) {
            /* storeValue fixes the byte order of the output */
            storeValue( file, &ux );
            storeValue( file, &uy );
            storeValue( file, &uz );
        } else
            fprintf( file, "%e %e %e\n", ux, uy, uz );
    SWEEP_END;

    fclose( file );
}

View File

@@ -1,39 +0,0 @@
/***************************************************************************
*cr
*cr (C) Copyright 2010 The Board of Trustees of the
*cr University of Illinois
*cr All Rights Reserved
*cr
***************************************************************************/
/*############################################################################*/
#ifndef _LBM_H_
#define _LBM_H_
/*############################################################################*/
#include "ocl.h"
#include "lbm_macros.h"
/* Host-side grid routines. */
/* Allocates a zeroed host grid; *ptr ends up offset by MARGIN from the allocation. */
void LBM_allocateGrid( float** ptr );
/* Frees a grid from LBM_allocateGrid and sets *ptr to NULL. */
void LBM_freeGrid( float** ptr );
/* Stores the initial distribution values and clears the cell flags. */
void LBM_initializeGrid( LBM_Grid grid );
/* Sets OBSTACLE on the faces of the box and ACCEL on selected interior cells. */
void LBM_initializeSpecialCellsForLDC( LBM_Grid grid );
/* Sets OBSTACLE on every cell whose character in the file is not '.'. */
void LBM_loadObstacleFile( LBM_Grid grid, const char* filename );
/* Exchanges the two device buffer handles. */
void LBM_swapGrids( cl_mem* grid1, cl_mem* grid2 );
/* Prints cell counts and rho/velocity extremes to stdout. */
void LBM_showGridStatistics( LBM_Grid Grid );
/* Writes per-cell velocities to filename, as binary or as text lines. */
void LBM_storeVelocityField( LBM_Grid grid, const char* filename,
const BOOL binary );
/* OpenCL *********************************************************************/
/* Creates a read-write device buffer sized for one padded grid. */
void OpenCL_LBM_allocateGrid( const OpenCL_Param* prm, cl_mem* ptr );
/* Releases a device buffer. */
void OpenCL_LBM_freeGrid( cl_mem ptr );
/* Blocking copy of the whole host grid to the device buffer. */
void OpenCL_LBM_initializeGrid( const OpenCL_Param* prm, cl_mem d_grid, LBM_Grid h_grid );
/* Blocking copy of the whole device buffer back to the host grid. */
void OpenCL_LBM_getDeviceGrid( const OpenCL_Param* prm, cl_mem d_grid, LBM_Grid h_grid );
/* Runs one kernel step reading srcGrid and writing dstGrid, then waits for completion. */
void OpenCL_LBM_performStreamCollide( const OpenCL_Param* prm, cl_mem srcGrid, cl_mem dstGrid );
/*############################################################################*/
#endif /* _LBM_H_ */

View File

@@ -1,177 +0,0 @@
/***************************************************************************
*cr
*cr (C) Copyright 2010 The Board of Trustees of the
*cr University of Illinois
*cr All Rights Reserved
*cr
***************************************************************************/
/* Include guard: the tested and defined names must match. The original tested
   the misspelled _LBM_MARCOS_H, so the guard never took effect. */
#ifndef _LBM_MACROS_H_
#define _LBM_MACROS_H_
/* Factor applied to the equilibrium term in the collision step
   (the old value is weighted by 1-OMEGA). */
#define OMEGA (1.95f)
/* Element type written by the velocity-field output routines. */
#define OUTPUT_PRECISION float
#define BOOL int
#define TRUE (-1)
#define FALSE (0)
/* Weights for the centre entry (DFL1), the six single-letter directions (DFL2)
   and the twelve two-letter directions (DFL3). */
#define DFL1 (1.0f/ 3.0f)
#define DFL2 (1.0f/18.0f)
#define DFL3 (1.0f/36.0f)
/*############################################################################*/
typedef float* LBM_Grid;//float LBM_Grid[PADDED_Z*PADDED_Y*PADDED_X*N_CELL_ENTRIES];
typedef LBM_Grid* LBM_GridPtr;
/*############################################################################*/
/* Sweep helpers: SWEEP_VAR declares the three loop counters, SWEEP_START opens
   a z/y/x loop nest over them and SWEEP_END closes it.
   Only z1 and z2 are used as bounds; x always runs over [0,SIZE_X) and y over
   [0,SIZE_Y), and the x1, y1, x2, y2 arguments are ignored. */
#define SWEEP_X __temp_x__
#define SWEEP_Y __temp_y__
#define SWEEP_Z __temp_z__
#define SWEEP_VAR int __temp_x__, __temp_y__, __temp_z__;
/* The last line carries no trailing backslash, so the macro ends there and
   cannot absorb the directive that follows it. */
#define SWEEP_START(x1,y1,z1,x2,y2,z2) \
for( __temp_z__ = (z1); \
__temp_z__ < (z2); \
__temp_z__++) { \
for( __temp_y__ = 0; \
__temp_y__ < SIZE_Y; \
__temp_y__++) { \
for(__temp_x__ = 0; \
__temp_x__ < SIZE_X; \
__temp_x__++) {
#define SWEEP_END }}}
/* GRID_ENTRY: entry e of the cell at absolute coordinates (x,y,z) of grid g.
   GRID_ENTRY_SWEEP: entry e of the cell at offset (dx,dy,dz) from the current
   sweep position (SWEEP_X, SWEEP_Y, SWEEP_Z). */
#define GRID_ENTRY(g,x,y,z,e) ((g)[CALC_INDEX( x, y, z, e)])
#define GRID_ENTRY_SWEEP(g,dx,dy,dz,e) ((g)[CALC_INDEX((dx)+SWEEP_X, (dy)+SWEEP_Y, (dz)+SWEEP_Z, e)])
/* LOCAL: entry e of the cell at the sweep position itself. */
#define LOCAL(g,e) (GRID_ENTRY_SWEEP( g, 0, 0, 0, e ))
/* NEIGHBOR_<dir>: entry e of an adjacent cell. E/W step +1/-1 in x, N/S step
   +1/-1 in y, T/B step +1/-1 in z; two-letter names combine two steps. */
#define NEIGHBOR_C(g,e) (GRID_ENTRY_SWEEP( g, 0, 0, 0, e ))
#define NEIGHBOR_N(g,e) (GRID_ENTRY_SWEEP( g, 0, +1, 0, e ))
#define NEIGHBOR_S(g,e) (GRID_ENTRY_SWEEP( g, 0, -1, 0, e ))
#define NEIGHBOR_E(g,e) (GRID_ENTRY_SWEEP( g, +1, 0, 0, e ))
#define NEIGHBOR_W(g,e) (GRID_ENTRY_SWEEP( g, -1, 0, 0, e ))
#define NEIGHBOR_T(g,e) (GRID_ENTRY_SWEEP( g, 0, 0, +1, e ))
#define NEIGHBOR_B(g,e) (GRID_ENTRY_SWEEP( g, 0, 0, -1, e ))
#define NEIGHBOR_NE(g,e) (GRID_ENTRY_SWEEP( g, +1, +1, 0, e ))
#define NEIGHBOR_NW(g,e) (GRID_ENTRY_SWEEP( g, -1, +1, 0, e ))
#define NEIGHBOR_SE(g,e) (GRID_ENTRY_SWEEP( g, +1, -1, 0, e ))
#define NEIGHBOR_SW(g,e) (GRID_ENTRY_SWEEP( g, -1, -1, 0, e ))
#define NEIGHBOR_NT(g,e) (GRID_ENTRY_SWEEP( g, 0, +1, +1, e ))
#define NEIGHBOR_NB(g,e) (GRID_ENTRY_SWEEP( g, 0, +1, -1, e ))
#define NEIGHBOR_ST(g,e) (GRID_ENTRY_SWEEP( g, 0, -1, +1, e ))
#define NEIGHBOR_SB(g,e) (GRID_ENTRY_SWEEP( g, 0, -1, -1, e ))
#define NEIGHBOR_ET(g,e) (GRID_ENTRY_SWEEP( g, +1, 0, +1, e ))
#define NEIGHBOR_EB(g,e) (GRID_ENTRY_SWEEP( g, +1, 0, -1, e ))
#define NEIGHBOR_WT(g,e) (GRID_ENTRY_SWEEP( g, -1, 0, +1, e ))
#define NEIGHBOR_WB(g,e) (GRID_ENTRY_SWEEP( g, -1, 0, -1, e ))
/* SRC_<dir>(g) names where entry <dir> is read from and DST_<dir>(g) where it
   is written to, relative to the current sweep position.
   With SCATTER defined, reads are local and each write goes to the neighbor
   lying in direction <dir>. */
#ifdef SCATTER
#define SRC_C(g) (LOCAL( g, C ))
#define SRC_N(g) (LOCAL( g, N ))
#define SRC_S(g) (LOCAL( g, S ))
#define SRC_E(g) (LOCAL( g, E ))
#define SRC_W(g) (LOCAL( g, W ))
#define SRC_T(g) (LOCAL( g, T ))
#define SRC_B(g) (LOCAL( g, B ))
#define SRC_NE(g) (LOCAL( g, NE ))
#define SRC_NW(g) (LOCAL( g, NW ))
#define SRC_SE(g) (LOCAL( g, SE ))
#define SRC_SW(g) (LOCAL( g, SW ))
#define SRC_NT(g) (LOCAL( g, NT ))
#define SRC_NB(g) (LOCAL( g, NB ))
#define SRC_ST(g) (LOCAL( g, ST ))
#define SRC_SB(g) (LOCAL( g, SB ))
#define SRC_ET(g) (LOCAL( g, ET ))
#define SRC_EB(g) (LOCAL( g, EB ))
#define SRC_WT(g) (LOCAL( g, WT ))
#define SRC_WB(g) (LOCAL( g, WB ))
#define DST_C(g) (NEIGHBOR_C ( g, C ))
#define DST_N(g) (NEIGHBOR_N ( g, N ))
#define DST_S(g) (NEIGHBOR_S ( g, S ))
#define DST_E(g) (NEIGHBOR_E ( g, E ))
#define DST_W(g) (NEIGHBOR_W ( g, W ))
#define DST_T(g) (NEIGHBOR_T ( g, T ))
#define DST_B(g) (NEIGHBOR_B ( g, B ))
#define DST_NE(g) (NEIGHBOR_NE( g, NE ))
#define DST_NW(g) (NEIGHBOR_NW( g, NW ))
#define DST_SE(g) (NEIGHBOR_SE( g, SE ))
#define DST_SW(g) (NEIGHBOR_SW( g, SW ))
#define DST_NT(g) (NEIGHBOR_NT( g, NT ))
#define DST_NB(g) (NEIGHBOR_NB( g, NB ))
#define DST_ST(g) (NEIGHBOR_ST( g, ST ))
#define DST_SB(g) (NEIGHBOR_SB( g, SB ))
#define DST_ET(g) (NEIGHBOR_ET( g, ET ))
#define DST_EB(g) (NEIGHBOR_EB( g, EB ))
#define DST_WT(g) (NEIGHBOR_WT( g, WT ))
#define DST_WB(g) (NEIGHBOR_WB( g, WB ))
#else /* GATHER */
/* Otherwise each read of entry <dir> comes from the neighbor lying in the
   opposite direction (e.g. SRC_N reads the N entry of the S neighbor) and all
   writes are local. */
#define SRC_C(g) (NEIGHBOR_C ( g, C ))
#define SRC_N(g) (NEIGHBOR_S ( g, N ))
#define SRC_S(g) (NEIGHBOR_N ( g, S ))
#define SRC_E(g) (NEIGHBOR_W ( g, E ))
#define SRC_W(g) (NEIGHBOR_E ( g, W ))
#define SRC_T(g) (NEIGHBOR_B ( g, T ))
#define SRC_B(g) (NEIGHBOR_T ( g, B ))
#define SRC_NE(g) (NEIGHBOR_SW( g, NE ))
#define SRC_NW(g) (NEIGHBOR_SE( g, NW ))
#define SRC_SE(g) (NEIGHBOR_NW( g, SE ))
#define SRC_SW(g) (NEIGHBOR_NE( g, SW ))
#define SRC_NT(g) (NEIGHBOR_SB( g, NT ))
#define SRC_NB(g) (NEIGHBOR_ST( g, NB ))
#define SRC_ST(g) (NEIGHBOR_NB( g, ST ))
#define SRC_SB(g) (NEIGHBOR_NT( g, SB ))
#define SRC_ET(g) (NEIGHBOR_WB( g, ET ))
#define SRC_EB(g) (NEIGHBOR_WT( g, EB ))
#define SRC_WT(g) (NEIGHBOR_EB( g, WT ))
#define SRC_WB(g) (NEIGHBOR_ET( g, WB ))
#define DST_C(g) (LOCAL( g, C ))
#define DST_N(g) (LOCAL( g, N ))
#define DST_S(g) (LOCAL( g, S ))
#define DST_E(g) (LOCAL( g, E ))
#define DST_W(g) (LOCAL( g, W ))
#define DST_T(g) (LOCAL( g, T ))
#define DST_B(g) (LOCAL( g, B ))
#define DST_NE(g) (LOCAL( g, NE ))
#define DST_NW(g) (LOCAL( g, NW ))
#define DST_SE(g) (LOCAL( g, SE ))
#define DST_SW(g) (LOCAL( g, SW ))
#define DST_NT(g) (LOCAL( g, NT ))
#define DST_NB(g) (LOCAL( g, NB ))
#define DST_ST(g) (LOCAL( g, ST ))
#define DST_SB(g) (LOCAL( g, SB ))
#define DST_ET(g) (LOCAL( g, ET ))
#define DST_EB(g) (LOCAL( g, EB ))
#define DST_WT(g) (LOCAL( g, WT ))
#define DST_WB(g) (LOCAL( g, WB ))
#endif /* GATHER */
/* Flag helpers: the FLAGS entry of a cell is accessed as an unsigned int bit
 * field by reinterpreting its address.
 *
 * MAGIC_CAST(v)  takes the address of v and casts it (via void*) to
 *                unsigned int*.
 * FLAG_VAR(v)    declares a local pointer named _aux_ holding MAGIC_CAST(v);
 *                it is only usable inside a fresh scope, which is why the
 *                SET_/CLEAR_ macros wrap their bodies in braces.
 *
 * The *_SWEEP variants address the cell through LOCAL(g, FLAGS); the others
 * address it through GRID_ENTRY(g, x, y, z, FLAGS).
 *
 * NOTE(review): the statement macros expand to a bare { ... } block rather
 * than do { ... } while (0), so "if (c) SET_FLAG(...); else ..." would not
 * parse -- confirm no call site relies on that form.
 * NOTE(review): the pointer cast reads a non-unsigned object (presumably a
 * float grid entry -- confirm) through unsigned int*, which is type punning. */
#define MAGIC_CAST(v) ((unsigned int*) ((void*) (&(v))))
#define FLAG_VAR(v) unsigned int* _aux_ = MAGIC_CAST(v)
/* Non-zero when any bit of f is set in the local cell's flags. */
#define TEST_FLAG_SWEEP(g,f) ((*MAGIC_CAST(LOCAL(g, FLAGS))) & (f))
/* Set / clear the bits of f, or clear every bit, in the local cell's flags. */
#define SET_FLAG_SWEEP(g,f) {FLAG_VAR(LOCAL(g, FLAGS)); (*_aux_) |= (f);}
#define CLEAR_FLAG_SWEEP(g,f) {FLAG_VAR(LOCAL(g, FLAGS)); (*_aux_) &= ~(f);}
#define CLEAR_ALL_FLAGS_SWEEP(g) {FLAG_VAR(LOCAL(g, FLAGS)); (*_aux_) = 0;}
/* Same operations on the cell addressed by explicit x, y, z. */
#define TEST_FLAG(g,x,y,z,f) ((*MAGIC_CAST(GRID_ENTRY(g, x, y, z, FLAGS))) & (f))
#define SET_FLAG(g,x,y,z,f) {FLAG_VAR(GRID_ENTRY(g, x, y, z, FLAGS)); (*_aux_) |= (f);}
#define CLEAR_FLAG(g,x,y,z,f) {FLAG_VAR(GRID_ENTRY(g, x, y, z, FLAGS)); (*_aux_) &= ~(f);}
#define CLEAR_ALL_FLAGS(g,x,y,z) {FLAG_VAR(GRID_ENTRY(g, x, y, z, FLAGS)); (*_aux_) = 0;}
/*############################################################################*/
#endif /* _CONFIG_H_ */

Binary file not shown.

View File

@@ -1,238 +0,0 @@
/***************************************************************************
*cr
*cr (C) Copyright 2010 The Board of Trustees of the
*cr University of Illinois
*cr All Rights Reserved
*cr
***************************************************************************/
/*############################################################################*/
#include <CL/cl.h>
#include <parboil.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/stat.h>
#include "layout_config.h"
#include "lbm.h"
#include "lbm_macros.h"
#include "main.h"
#include "ocl.h"
/*############################################################################*/
/* Device-side grid buffers. They are allocated and filled in MAIN_initialize,
 * passed to the kernel and swapped after every time step in main, and
 * released in MAIN_finalize. */
static cl_mem OpenCL_srcGrid, OpenCL_dstGrid;
/*############################################################################*/
/* Timer set shared by every function in this file; initialised at the start
 * of main and printed at its end. */
struct pb_TimerSet timers;
/*
 * Program entry point.
 *
 * Reads the parboil parameters, forces the input file list to the built-in
 * obstacle file "120_120_150_ldc.of", sets up OpenCL and the grids, runs
 * param.nTimeSteps stream/collide steps (swapping the source and destination
 * device grids after each one), then stores the result and prints the timers.
 *
 * Returns 0 on success and EXIT_FAILURE when the parameters cannot be read or
 * the input file list cannot be allocated.
 *
 * NOTE(review): TEMP_srcGrid is declared as LBM_GridPtr here, while
 * MAIN_initialize/MAIN_finalize declare their host grids as LBM_Grid and use
 * the same (float **)& cast -- confirm which type is intended. The grid is
 * only referenced by the disabled statistics block below.
 */
int main(int nArgs, char *arg[]) {
  /* Capacity of the buffer that holds the hard-coded input file name. */
  enum { INPUT_NAME_CAPACITY = 100 };
  MAIN_Param param;
  int t;
  OpenCL_Param prm;
  struct pb_Parameters *params;
  static LBM_GridPtr TEMP_srcGrid;

  pb_InitializeTimerSet(&timers);

  params = pb_ReadParameters(&nArgs, arg);
  if (params == NULL) {
    fprintf(stderr, "Error: could not read the program parameters.\n");
    return EXIT_FAILURE;
  }

  /* Replace the parsed input list with a NULL-terminated list that holds
   * only the built-in obstacle file. */
  params->inpFiles = (char **)malloc(sizeof(char *) * 2);
  if (params->inpFiles == NULL) {
    fprintf(stderr, "Error: out of memory for the input file list.\n");
    return EXIT_FAILURE;
  }
  params->inpFiles[0] = (char *)malloc(INPUT_NAME_CAPACITY);
  params->inpFiles[1] = NULL;
  if (params->inpFiles[0] == NULL) {
    fprintf(stderr, "Error: out of memory for the input file name.\n");
    return EXIT_FAILURE;
  }
  /* snprintf always terminates the string and is declared by <stdio.h>,
   * which this file includes; <string.h> is not included directly here. */
  snprintf(params->inpFiles[0], INPUT_NAME_CAPACITY, "%s",
           "120_120_150_ldc.of");

  // Setup TEMP datastructures
  LBM_allocateGrid((float **)&TEMP_srcGrid);

  MAIN_parseCommandLine(nArgs, arg, &param, params);
  MAIN_printInfo(&param);
  OpenCL_initialize(params, &prm);
  MAIN_initialize(&param, &prm);

  for (t = 1; t <= param.nTimeSteps; t++) {
    pb_SwitchToTimer(&timers, pb_TimerID_KERNEL);
    OpenCL_LBM_performStreamCollide(&prm, OpenCL_srcGrid, OpenCL_dstGrid);
    pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
    /* The grid just written becomes the source of the next step. */
    LBM_swapGrids(&OpenCL_srcGrid, &OpenCL_dstGrid);
    /* Progress report every 64th step. */
    if ((t & 63) == 0) {
      printf("timestep: %i\n", t);
#if 0
      CUDA_LBM_getDeviceGrid((float**)&CUDA_srcGrid, (float**)&TEMP_srcGrid);
      LBM_showGridStatistics( *TEMP_srcGrid );
#endif
    }
  }

  MAIN_finalize(&param, &prm);
  LBM_freeGrid((float **)&TEMP_srcGrid);

  pb_SwitchToTimer(&timers, pb_TimerID_NONE);
  pb_PrintTimerSet(&timers);
  pb_FreeParameters(params);
  return 0;
}
/*############################################################################*/
/*
 * Fills *param from the already-parsed parboil parameters.
 *
 * nArgs and arg are accepted but not consulted: the number of time steps is
 * fixed at 4 instead of being read from arg[1], and the obstacle file is not
 * checked with stat() for existence or size.
 *
 *   param->nTimeSteps        always 4
 *   param->obstacleFilename  params->inpFiles[0] (NULL when there is none)
 *   param->resultFilename    params->outFile
 */
void MAIN_parseCommandLine(int nArgs, char *arg[], MAIN_Param *param,
                           struct pb_Parameters *params) {
  char *firstInput = params->inpFiles[0];

  param->nTimeSteps = 4;
  /* A NULL first entry simply carries over as "no obstacle file". */
  param->obstacleFilename = firstInput;
  param->resultFilename = params->outFile;
}
/*############################################################################*/
/*
 * Prints a summary of the run configuration to stdout: grid dimensions
 * (SIZE_X x SIZE_Y x SIZE_Z and the cell count in millions), the number of
 * time steps, the result file, the fixed action "store", the fixed simulation
 * type "lid-driven cavity" and the obstacle file.
 *
 * Either file name may be NULL; "<none>" is printed in that case, because
 * passing a NULL pointer for a %s conversion is undefined behaviour.
 */
void MAIN_printInfo(const MAIN_Param *param) {
  const char *resultName =
      (param->resultFilename == NULL) ? "<none>" : param->resultFilename;
  const char *obstacleName =
      (param->obstacleFilename == NULL) ? "<none>" : param->obstacleFilename;

  printf("MAIN_printInfo:\n"
         "\tgrid size : %i x %i x %i = %.2f * 10^6 Cells\n"
         "\tnTimeSteps : %i\n"
         "\tresult file : %s\n"
         "\taction : %s\n"
         "\tsimulation type: %s\n"
         "\tobstacle file : %s\n\n",
         SIZE_X, SIZE_Y, SIZE_Z, 1e-6 * SIZE_X * SIZE_Y * SIZE_Z,
         param->nTimeSteps, resultName, "store", "lid-driven cavity",
         obstacleName);
}
/*############################################################################*/
/*
 * Builds the initial simulation state on the host and hands it to the device.
 *
 * Two temporary host grids are allocated and initialised, the obstacle file
 * (when param->obstacleFilename is not NULL) is loaded into both, and the
 * lid-driven-cavity special cells are set up in both. The file-scope device
 * grids OpenCL_srcGrid/OpenCL_dstGrid are then allocated and initialised from
 * the host grids, statistics of the source grid are shown, and the host grids
 * are freed again.
 *
 * The timer is switched between COMPUTE, IO and COPY around the respective
 * phases. The "OK+", "OK-" and "OK" lines are progress markers on stdout.
 */
void MAIN_initialize(const MAIN_Param *param, const OpenCL_Param *prm) {
static LBM_Grid TEMP_srcGrid, TEMP_dstGrid;
pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
// Setup TEMP datastructures
LBM_allocateGrid((float **)&TEMP_srcGrid);
LBM_allocateGrid((float **)&TEMP_dstGrid);
LBM_initializeGrid(TEMP_srcGrid);
LBM_initializeGrid(TEMP_dstGrid);
pb_SwitchToTimer(&timers, pb_TimerID_IO);
// The obstacle file is optional; both grids receive the same obstacles
if (param->obstacleFilename != NULL) {
LBM_loadObstacleFile(TEMP_srcGrid, param->obstacleFilename);
LBM_loadObstacleFile(TEMP_dstGrid, param->obstacleFilename);
}
pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
LBM_initializeSpecialCellsForLDC(TEMP_srcGrid);
LBM_initializeSpecialCellsForLDC(TEMP_dstGrid);
pb_SwitchToTimer(&timers, pb_TimerID_COPY);
printf("OK+\n");
// Setup DEVICE datastructures
OpenCL_LBM_allocateGrid(prm, &OpenCL_srcGrid);
OpenCL_LBM_allocateGrid(prm, &OpenCL_dstGrid);
printf("OK-\n");
// Initialize DEVICE datastructures
// (presumably a host-to-device copy of each TEMP grid -- confirm in lbm.c)
OpenCL_LBM_initializeGrid(prm, OpenCL_srcGrid, TEMP_srcGrid);
OpenCL_LBM_initializeGrid(prm, OpenCL_dstGrid, TEMP_dstGrid);
pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
LBM_showGridStatistics(TEMP_srcGrid);
// The host copies are no longer needed once the device grids are set up
LBM_freeGrid((float **)&TEMP_srcGrid);
LBM_freeGrid((float **)&TEMP_dstGrid);
printf("OK\n");
}
/*############################################################################*/
/*
 * Retrieves the final state from the device, reports it and tears down the
 * OpenCL objects.
 *
 * A temporary host grid is allocated and filled from OpenCL_srcGrid via
 * OpenCL_LBM_getDeviceGrid; its statistics are shown and its velocity field
 * is stored to param->resultFilename (third argument TRUE). The host grid,
 * both device grids, and the program, kernel, command queue and context held
 * in *prm are then released.
 *
 * NOTE(review): param->resultFilename is passed on unchecked; it is set from
 * params->outFile in MAIN_parseCommandLine -- confirm it cannot be NULL here.
 */
void MAIN_finalize(const MAIN_Param *param, const OpenCL_Param *prm) {
LBM_Grid TEMP_srcGrid;
// Setup TEMP datastructures
LBM_allocateGrid((float **)&TEMP_srcGrid);
pb_SwitchToTimer(&timers, pb_TimerID_COPY);
OpenCL_LBM_getDeviceGrid(prm, OpenCL_srcGrid, TEMP_srcGrid);
pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
LBM_showGridStatistics(TEMP_srcGrid);
LBM_storeVelocityField(TEMP_srcGrid, param->resultFilename, TRUE);
LBM_freeGrid((float **)&TEMP_srcGrid);
// Release the device grids, then the OpenCL objects created in OpenCL_initialize
OpenCL_LBM_freeGrid(OpenCL_srcGrid);
OpenCL_LBM_freeGrid(OpenCL_dstGrid);
clReleaseProgram(prm->clProgram);
clReleaseKernel(prm->clKernel);
clReleaseCommandQueue(prm->clCommandQueue);
clReleaseContext(prm->clContext);
}
/*
 * Creates the OpenCL objects used by the simulation and stores them in *prm:
 * device, platform and context (taken from the parboil OpenCL context), a
 * profiling-enabled command queue, the program and the kernel
 * "performStreamCollide_kernel".
 *
 * The program is requested as a device built-in kernel instead of being
 * compiled from the kernel source file.
 *
 * p    parboil parameters used to select the platform/device
 * prm  output; every field listed above is written
 *
 * The function returns void, so a caller cannot detect a failed setup. When
 * no platform/device is found the process therefore exits instead of
 * returning with *prm unset. Each OpenCL call is followed by CHECK_ERROR.
 */
void OpenCL_initialize(struct pb_Parameters *p, OpenCL_Param *prm) {
  static const char kernelName[] = "performStreamCollide_kernel";
  cl_int clStatus;
  pb_Context *pb_context;

  pb_context = pb_InitOpenCLContext(p);
  if (pb_context == NULL) {
    fprintf(stderr, "Error: No OpenCL platform/device can be found.\n");
    exit(EXIT_FAILURE);
  }

  prm->clDevice = (cl_device_id)pb_context->clDeviceId;
  prm->clPlatform = (cl_platform_id)pb_context->clPlatformId;
  prm->clContext = (cl_context)pb_context->clContext;

  prm->clCommandQueue = clCreateCommandQueue(
      prm->clContext, prm->clDevice, CL_QUEUE_PROFILING_ENABLE, &clStatus);
  CHECK_ERROR("clCreateCommandQueue")

  pb_SetOpenCL(&(prm->clContext), &(prm->clCommandQueue));

  prm->clProgram = clCreateProgramWithBuiltInKernels(
      prm->clContext, 1, &prm->clDevice, kernelName, &clStatus);
  /* Label names the call that was actually made, so a failure is reported
   * against the right API. */
  CHECK_ERROR("clCreateProgramWithBuiltInKernels")

  clStatus =
      clBuildProgram(prm->clProgram, 1, &prm->clDevice, NULL, NULL, NULL);
  CHECK_ERROR("clBuildProgram")

  prm->clKernel = clCreateKernel(prm->clProgram, kernelName, &clStatus);
  CHECK_ERROR("clCreateKernel")
}

View File

@@ -1,31 +0,0 @@
/***************************************************************************
*cr
*cr (C) Copyright 2010 The Board of Trustees of the
*cr University of Illinois
*cr All Rights Reserved
*cr
***************************************************************************/
#ifndef _MAIN_H_
#define _MAIN_H_
/*############################################################################*/
/* Run configuration filled in by MAIN_parseCommandLine. */
typedef struct {
/* Number of simulation steps to run. */
int nTimeSteps;
/* Path the velocity field is stored to at the end of the run. */
char* resultFilename;
/* Path of the obstacle file, or NULL when none is used. */
char* obstacleFilename;
} MAIN_Param;
/*############################################################################*/
/* NOTE(review): this header names struct pb_Parameters and OpenCL_Param but
 * includes nothing itself; it appears to rely on the including file to have
 * declared them first -- confirm the include order. */
/* Fills *param from the parsed parboil parameters. */
void MAIN_parseCommandLine( int nArgs, char* arg[], MAIN_Param* param, struct pb_Parameters* );
/* Prints the run configuration to stdout. */
void MAIN_printInfo( const MAIN_Param* param );
/* Builds the initial grids on the host and sets up the device grids. */
void MAIN_initialize( const MAIN_Param* param, const OpenCL_Param* prm );
/* Stores the result and releases the grids and OpenCL objects. */
void MAIN_finalize( const MAIN_Param* param, const OpenCL_Param* prm );
/* Creates the OpenCL queue, program and kernel and stores them in *prm. */
void OpenCL_initialize(struct pb_Parameters*, OpenCL_Param* prm);
/*############################################################################*/
#endif /* _MAIN_H_ */

View File

@@ -1,40 +0,0 @@
#include <CL/cl.h>
#include <stdio.h>
#include <stdlib.h>
#include "ocl.h"
/*
 * Reads a whole file into a newly allocated, NUL-terminated buffer.
 *
 * fileName  path of the file to read
 *
 * Returns the buffer, which the caller must release with free(), or NULL on
 * failure. A diagnostic is printed to stdout on failure:
 *   "Error 1!"  the file could not be opened
 *   "Error 2!"  the buffer could not be allocated
 *   "Error 3!"  the size could not be determined or the read came up short
 */
char* readFile(char* fileName)
{
    /* Binary mode: the size reported by ftell is only a byte count for
     * binary streams, and it must match what fread delivers. */
    FILE* fp = fopen(fileName, "rb");
    if (fp == NULL)
    {
        printf("Error 1!\n");
        return NULL;
    }

    long size = -1;
    if (fseek(fp, 0, SEEK_END) == 0)
    {
        size = ftell(fp);
    }
    if (size < 0)
    {
        /* Without a valid size neither the allocation nor the read length
         * would be meaningful. */
        printf("Error 3!\n");
        fclose(fp);
        return NULL;
    }
    rewind(fp);

    size_t length = (size_t)size;
    char* buffer = (char*)malloc(length + 1);
    if (buffer == NULL)
    {
        printf("Error 2!\n");
        fclose(fp);
        return NULL;
    }

    if (fread(buffer, 1, length, fp) != length)
    {
        printf("Error 3!\n");
        free(buffer); /* do not leak the buffer on a short read */
        fclose(fp);
        return NULL;
    }

    buffer[length] = '\0';
    fclose(fp);
    return buffer;
}

Some files were not shown because too many files have changed in this diff Show More