Files
kernels/tests/opencl/blackscholes/main.cpp
Blaise Tine c1e168fdbe Vortex 2.0 changes:
+ Microarchitecture optimizations
+ 64-bit support
+ Xilinx FPGA support
+ LLVM-16 support
+ Refactoring and quality control fixes

minor update

minor update

minor update

minor update

minor update

minor update

cleanup

cleanup

cache bindings and memory perf refactoring

minor update

minor update

hw unit tests fixes

minor update

minor update

minor update

minor update

minor update

minor update

minor update

minor update

minor update

minor update

minor update

minor update

minor update

minor updates

minor updates

minor update

minor update

minor update

minor update

minor update

minor update

minor updates

minor updates

minor updates

minor updates

minor update

minor update
2023-11-10 02:47:05 -08:00

249 lines
9.9 KiB
C++

/*
* Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*
*/
// standard utilities and systems includes
#include <oclUtils.h>
#include <shrQATest.h>
#include "oclBlackScholes_common.h"
////////////////////////////////////////////////////////////////////////////////
// Helper functions
////////////////////////////////////////////////////////////////////////////////
/*
 * Returns the elapsed time of a profiled OpenCL event, in seconds.
 *
 * Queries CL_PROFILING_COMMAND_END and CL_PROFILING_COMMAND_START for the
 * event and converts the nanosecond difference to seconds.
 *
 * event: the event to query.
 * NOTE(review): profiling data is only available when the owning queue was
 * created with CL_QUEUE_PROFILING_ENABLE and the command has completed --
 * confirm at call sites.
 *
 * Returns 0.0 if either profiling query fails, instead of computing a
 * duration from uninitialized timestamps.
 */
double executionTime(cl_event &event){
    cl_ulong start = 0;
    cl_ulong end = 0;

    if (clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &end, NULL) != CL_SUCCESS ||
        clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &start, NULL) != CL_SUCCESS) {
        return 0.0;
    }

    return (double)1.0e-9 * (end - start); // convert nanoseconds to seconds on return
}
////////////////////////////////////////////////////////////////////////////////
// Random float helper
////////////////////////////////////////////////////////////////////////////////
/*
 * Draws one value from rand() and uses it to blend between low and high.
 *
 * The blend weight is rand() / RAND_MAX, so a weight of 0 yields low and a
 * weight of 1 yields high. The sequence depends on the current srand() seed.
 */
float randFloat(float low, float high)
{
    const float blend = (float)rand() / (float)RAND_MAX;

    return (1.0f - blend) * low + blend * high;
}
////////////////////////////////////////////////////////////////////////////////
// Main program
////////////////////////////////////////////////////////////////////////////////
int main(int argc, char **argv)
{
cl_platform_id cpPlatform; //OpenCL platform
cl_device_id* cdDevices = NULL; //OpenCL devices list (array)
cl_context cxGPUContext; //OpenCL context
cl_command_queue cqCommandQueue; //OpenCL command que
cl_mem //OpenCL memory buffer objects
d_Call,
d_Put,
d_S,
d_X,
d_T;
cl_int ciErrNum;
float
*h_CallCPU,
*h_PutCPU,
*h_CallGPU,
*h_PutGPU,
*h_S,
*h_X,
*h_T;
const unsigned int optionCount = 64;
const float R = 0.02f;
const float V = 0.30f;
shrQAStart(argc, argv);
// Get the NVIDIA platform
ciErrNum = oclGetPlatformID(&cpPlatform);
oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL);
shrLog("clGetPlatformID...\n");
//Get all the devices
cl_uint uiNumDevices = 0; // Number of devices available
cl_uint uiTargetDevice = 0; // Default Device to compute on
cl_uint uiNumComputeUnits; // Number of compute units (SM's on NV GPU)
shrLog("Get the Device info and select Device...\n");
ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_DEFAULT, 0, NULL, &uiNumDevices);
oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL);
cdDevices = (cl_device_id *)malloc(uiNumDevices * sizeof(cl_device_id) );
ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_DEFAULT, uiNumDevices, cdDevices, NULL);
oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL);
// Get command line device options and config accordingly
shrLog(" # of Devices Available = %u\n", uiNumDevices);
if(shrGetCmdLineArgumentu(argc, (const char**)argv, "device", &uiTargetDevice)== shrTRUE)
{
uiTargetDevice = CLAMP(uiTargetDevice, 0, (uiNumDevices - 1));
}
shrLog(" Using Device %u: ", uiTargetDevice);
oclPrintDevName(LOGBOTH, cdDevices[uiTargetDevice]);
ciErrNum = clGetDeviceInfo(cdDevices[uiTargetDevice], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(uiNumComputeUnits), &uiNumComputeUnits, NULL);
oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL);
shrLog("\n # of Compute Units = %u\n", uiNumComputeUnits);
// set logfile name and start logs
shrSetLogFileName ("oclBlackScholes.txt");
shrLog("%s Starting...\n\n", argv[0]);
shrLog("Allocating and initializing host memory...\n");
h_CallCPU = (float *)malloc(optionCount * sizeof(float));
h_PutCPU = (float *)malloc(optionCount * sizeof(float));
h_CallGPU = (float *)malloc(optionCount * sizeof(float));
h_PutGPU = (float *)malloc(optionCount * sizeof(float));
h_S = (float *)malloc(optionCount * sizeof(float));
h_X = (float *)malloc(optionCount * sizeof(float));
h_T = (float *)malloc(optionCount * sizeof(float));
srand(2009);
for(unsigned int i = 0; i < optionCount; i++){
h_CallCPU[i] = -1.0f;
h_PutCPU[i] = -1.0f;
h_S[i] = randFloat(5.0f, 30.0f);
h_X[i] = randFloat(1.0f, 100.0f);
h_T[i] = randFloat(0.25f, 10.0f);
}
shrLog("Initializing OpenCL...\n");
// Get the NVIDIA platform
ciErrNum = oclGetPlatformID(&cpPlatform);
oclCheckError(ciErrNum, CL_SUCCESS);
// Get a GPU device
ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_DEFAULT, 1, &cdDevices[uiTargetDevice], NULL);
oclCheckError(ciErrNum, CL_SUCCESS);
// Create the context
cxGPUContext = clCreateContext(0, 1, &cdDevices[uiTargetDevice], NULL, NULL, &ciErrNum);
oclCheckError(ciErrNum, CL_SUCCESS);
//Create a command-queue
cqCommandQueue = clCreateCommandQueue(cxGPUContext, cdDevices[uiTargetDevice], CL_QUEUE_PROFILING_ENABLE, &ciErrNum);
oclCheckError(ciErrNum, CL_SUCCESS);
shrLog("Creating OpenCL memory objects...\n");
d_Call = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE, optionCount * sizeof(float), NULL, &ciErrNum);
oclCheckError(ciErrNum, CL_SUCCESS);
d_Put = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE, optionCount * sizeof(float), NULL, &ciErrNum);
oclCheckError(ciErrNum, CL_SUCCESS);
d_S = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, optionCount * sizeof(float), h_S, &ciErrNum);
oclCheckError(ciErrNum, CL_SUCCESS);
d_X = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, optionCount * sizeof(float), h_X, &ciErrNum);
oclCheckError(ciErrNum, CL_SUCCESS);
d_T = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, optionCount * sizeof(float), h_T, &ciErrNum);
oclCheckError(ciErrNum, CL_SUCCESS);
shrLog("Starting up BlackScholes...\n");
initBlackScholes(cxGPUContext, cqCommandQueue, (const char **)argv);
shrLog("Running OpenCL BlackScholes...\n\n");
//Just a single run or a warmup iteration
BlackScholes(
NULL,
d_Call,
d_Put,
d_S,
d_X,
d_T,
R,
V,
optionCount
);
#ifdef GPU_PROFILING
const int numIterations = 16;
cl_event startMark, endMark;
ciErrNum = clEnqueueMarker(cqCommandQueue, &startMark);
ciErrNum |= clFinish(cqCommandQueue);
shrCheckError(ciErrNum, CL_SUCCESS);
shrDeltaT(0);
for(int i = 0; i < numIterations; i++){
BlackScholes(
cqCommandQueue,
d_Call,
d_Put,
d_S,
d_X,
d_T,
R,
V,
optionCount
);
}
ciErrNum = clEnqueueMarker(cqCommandQueue, &endMark);
ciErrNum |= clFinish(cqCommandQueue);
shrCheckError(ciErrNum, CL_SUCCESS);
//Calculate performance metrics by wallclock time
double gpuTime = shrDeltaT(0) / numIterations;
shrLogEx(LOGBOTH | MASTER, 0, "oclBlackScholes, Throughput = %.4f GOptions/s, Time = %.5f s, Size = %u options, NumDevsUsed = %i, Workgroup = %u\n",
(double)(2.0 * optionCount * 1.0e-9)/gpuTime, gpuTime, (2 * optionCount), 1, 0);
//Get profiling info
cl_ulong startTime = 0, endTime = 0;
ciErrNum = clGetEventProfilingInfo(startMark, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &startTime, NULL);
ciErrNum |= clGetEventProfilingInfo(endMark, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &endTime, NULL);
shrCheckError(ciErrNum, CL_SUCCESS);
shrLog("\nOpenCL time: %.5f s\n\n", 1.0e-9 * ((double)endTime - (double)startTime) / (double)numIterations);
#endif
shrLog("\nReading back OpenCL BlackScholes results...\n");
ciErrNum = clEnqueueReadBuffer(cqCommandQueue, d_Call, CL_TRUE, 0, optionCount * sizeof(float), h_CallGPU, 0, NULL, NULL);
oclCheckError(ciErrNum, CL_SUCCESS);
ciErrNum = clEnqueueReadBuffer(cqCommandQueue, d_Put, CL_TRUE, 0, optionCount * sizeof(float), h_PutGPU, 0, NULL, NULL);
oclCheckError(ciErrNum, CL_SUCCESS);
shrLog("Comparing against Host/C++ computation...\n");
BlackScholesCPU(h_CallCPU, h_PutCPU, h_S, h_X, h_T, R, V, optionCount);
double deltaCall = 0, deltaPut = 0, sumCall = 0, sumPut = 0;
double L1call, L1put;
for(unsigned int i = 0; i < optionCount; i++)
{
sumCall += fabs(h_CallCPU[i]);
sumPut += fabs(h_PutCPU[i]);
deltaCall += fabs(h_CallCPU[i] - h_CallGPU[i]);
deltaPut += fabs(h_PutCPU[i] - h_PutGPU[i]);
}
L1call = deltaCall / sumCall;
L1put = deltaPut / sumPut;
shrLog("Relative L1 (call, put) = (%.3e, %.3e)\n\n", L1call, L1put);
shrLog("Shutting down...\n");
closeBlackScholes();
ciErrNum = clReleaseMemObject(d_T);
ciErrNum |= clReleaseMemObject(d_X);
ciErrNum |= clReleaseMemObject(d_S);
ciErrNum |= clReleaseMemObject(d_Put);
ciErrNum |= clReleaseMemObject(d_Call);
ciErrNum |= clReleaseCommandQueue(cqCommandQueue);
ciErrNum |= clReleaseContext(cxGPUContext);
oclCheckError(ciErrNum, CL_SUCCESS);
free(h_T);
free(h_X);
free(h_S);
free(h_PutGPU);
free(h_CallGPU);
free(h_PutCPU);
free(h_CallCPU);
if(cdDevices)free(cdDevices);
shrQAFinishExit(argc, (const char **)argv, ((L1call < 1E-6) && (L1put < 1E-6)) ? QA_PASSED : QA_FAILED );
}