+ Microarchitecture optimizations + 64-bit support + Xilinx FPGA support + LLVM-16 support + Refactoring and quality control fixes minor update minor update minor update minor update minor update minor update cleanup cleanup cache bindings and memory perf refactory minor update minor update hw unit tests fixes minor update minor update minor update minor update minor update minor udpate minor update minor update minor update minor update minor update minor update minor update minor updates minor updates minor update minor update minor update minor update minor update minor update minor updates minor updates minor updates minor updates minor update minor update
249 lines
9.9 KiB
C++
249 lines
9.9 KiB
C++
/*
|
|
* Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
|
|
*
|
|
* Please refer to the NVIDIA end user license agreement (EULA) associated
|
|
* with this source code for terms and conditions that govern your use of
|
|
* this software. Any use, reproduction, disclosure, or distribution of
|
|
* this software and related documentation outside the terms of the EULA
|
|
* is strictly prohibited.
|
|
*
|
|
*/
|
|
|
|
// standard utilities and systems includes
|
|
#include <oclUtils.h>
|
|
#include <shrQATest.h>
|
|
#include "oclBlackScholes_common.h"
|
|
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
// Helper functions
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
double executionTime(cl_event &event){
|
|
cl_ulong start, end;
|
|
|
|
clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &end, NULL);
|
|
clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &start, NULL);
|
|
|
|
return (double)1.0e-9 * (end - start); // convert nanoseconds to seconds on return
|
|
}
|
|
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
// Random float helper
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
float randFloat(float low, float high){
|
|
float t = (float)rand() / (float)RAND_MAX;
|
|
return (1.0f - t) * low + t * high;
|
|
}
|
|
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
// Main program
|
|
////////////////////////////////////////////////////////////////////////////////
|
|
int main(int argc, char **argv)
|
|
{
|
|
cl_platform_id cpPlatform; //OpenCL platform
|
|
cl_device_id* cdDevices = NULL; //OpenCL devices list (array)
|
|
cl_context cxGPUContext; //OpenCL context
|
|
cl_command_queue cqCommandQueue; //OpenCL command que
|
|
cl_mem //OpenCL memory buffer objects
|
|
d_Call,
|
|
d_Put,
|
|
d_S,
|
|
d_X,
|
|
d_T;
|
|
|
|
cl_int ciErrNum;
|
|
|
|
float
|
|
*h_CallCPU,
|
|
*h_PutCPU,
|
|
*h_CallGPU,
|
|
*h_PutGPU,
|
|
*h_S,
|
|
*h_X,
|
|
*h_T;
|
|
|
|
const unsigned int optionCount = 64;
|
|
const float R = 0.02f;
|
|
const float V = 0.30f;
|
|
|
|
shrQAStart(argc, argv);
|
|
|
|
// Get the NVIDIA platform
|
|
ciErrNum = oclGetPlatformID(&cpPlatform);
|
|
oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL);
|
|
shrLog("clGetPlatformID...\n");
|
|
|
|
//Get all the devices
|
|
cl_uint uiNumDevices = 0; // Number of devices available
|
|
cl_uint uiTargetDevice = 0; // Default Device to compute on
|
|
cl_uint uiNumComputeUnits; // Number of compute units (SM's on NV GPU)
|
|
shrLog("Get the Device info and select Device...\n");
|
|
ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_DEFAULT, 0, NULL, &uiNumDevices);
|
|
oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL);
|
|
cdDevices = (cl_device_id *)malloc(uiNumDevices * sizeof(cl_device_id) );
|
|
ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_DEFAULT, uiNumDevices, cdDevices, NULL);
|
|
oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL);
|
|
|
|
// Get command line device options and config accordingly
|
|
shrLog(" # of Devices Available = %u\n", uiNumDevices);
|
|
if(shrGetCmdLineArgumentu(argc, (const char**)argv, "device", &uiTargetDevice)== shrTRUE)
|
|
{
|
|
uiTargetDevice = CLAMP(uiTargetDevice, 0, (uiNumDevices - 1));
|
|
}
|
|
shrLog(" Using Device %u: ", uiTargetDevice);
|
|
oclPrintDevName(LOGBOTH, cdDevices[uiTargetDevice]);
|
|
ciErrNum = clGetDeviceInfo(cdDevices[uiTargetDevice], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(uiNumComputeUnits), &uiNumComputeUnits, NULL);
|
|
oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL);
|
|
shrLog("\n # of Compute Units = %u\n", uiNumComputeUnits);
|
|
|
|
// set logfile name and start logs
|
|
shrSetLogFileName ("oclBlackScholes.txt");
|
|
shrLog("%s Starting...\n\n", argv[0]);
|
|
|
|
shrLog("Allocating and initializing host memory...\n");
|
|
h_CallCPU = (float *)malloc(optionCount * sizeof(float));
|
|
h_PutCPU = (float *)malloc(optionCount * sizeof(float));
|
|
h_CallGPU = (float *)malloc(optionCount * sizeof(float));
|
|
h_PutGPU = (float *)malloc(optionCount * sizeof(float));
|
|
h_S = (float *)malloc(optionCount * sizeof(float));
|
|
h_X = (float *)malloc(optionCount * sizeof(float));
|
|
h_T = (float *)malloc(optionCount * sizeof(float));
|
|
|
|
srand(2009);
|
|
for(unsigned int i = 0; i < optionCount; i++){
|
|
h_CallCPU[i] = -1.0f;
|
|
h_PutCPU[i] = -1.0f;
|
|
h_S[i] = randFloat(5.0f, 30.0f);
|
|
h_X[i] = randFloat(1.0f, 100.0f);
|
|
h_T[i] = randFloat(0.25f, 10.0f);
|
|
}
|
|
|
|
shrLog("Initializing OpenCL...\n");
|
|
// Get the NVIDIA platform
|
|
ciErrNum = oclGetPlatformID(&cpPlatform);
|
|
oclCheckError(ciErrNum, CL_SUCCESS);
|
|
|
|
// Get a GPU device
|
|
ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_DEFAULT, 1, &cdDevices[uiTargetDevice], NULL);
|
|
oclCheckError(ciErrNum, CL_SUCCESS);
|
|
|
|
// Create the context
|
|
cxGPUContext = clCreateContext(0, 1, &cdDevices[uiTargetDevice], NULL, NULL, &ciErrNum);
|
|
oclCheckError(ciErrNum, CL_SUCCESS);
|
|
|
|
//Create a command-queue
|
|
cqCommandQueue = clCreateCommandQueue(cxGPUContext, cdDevices[uiTargetDevice], CL_QUEUE_PROFILING_ENABLE, &ciErrNum);
|
|
oclCheckError(ciErrNum, CL_SUCCESS);
|
|
|
|
shrLog("Creating OpenCL memory objects...\n");
|
|
d_Call = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE, optionCount * sizeof(float), NULL, &ciErrNum);
|
|
oclCheckError(ciErrNum, CL_SUCCESS);
|
|
d_Put = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE, optionCount * sizeof(float), NULL, &ciErrNum);
|
|
oclCheckError(ciErrNum, CL_SUCCESS);
|
|
d_S = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, optionCount * sizeof(float), h_S, &ciErrNum);
|
|
oclCheckError(ciErrNum, CL_SUCCESS);
|
|
d_X = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, optionCount * sizeof(float), h_X, &ciErrNum);
|
|
oclCheckError(ciErrNum, CL_SUCCESS);
|
|
d_T = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, optionCount * sizeof(float), h_T, &ciErrNum);
|
|
oclCheckError(ciErrNum, CL_SUCCESS);
|
|
|
|
shrLog("Starting up BlackScholes...\n");
|
|
initBlackScholes(cxGPUContext, cqCommandQueue, (const char **)argv);
|
|
|
|
shrLog("Running OpenCL BlackScholes...\n\n");
|
|
//Just a single run or a warmup iteration
|
|
BlackScholes(
|
|
NULL,
|
|
d_Call,
|
|
d_Put,
|
|
d_S,
|
|
d_X,
|
|
d_T,
|
|
R,
|
|
V,
|
|
optionCount
|
|
);
|
|
|
|
#ifdef GPU_PROFILING
|
|
const int numIterations = 16;
|
|
cl_event startMark, endMark;
|
|
ciErrNum = clEnqueueMarker(cqCommandQueue, &startMark);
|
|
ciErrNum |= clFinish(cqCommandQueue);
|
|
shrCheckError(ciErrNum, CL_SUCCESS);
|
|
shrDeltaT(0);
|
|
|
|
for(int i = 0; i < numIterations; i++){
|
|
BlackScholes(
|
|
cqCommandQueue,
|
|
d_Call,
|
|
d_Put,
|
|
d_S,
|
|
d_X,
|
|
d_T,
|
|
R,
|
|
V,
|
|
optionCount
|
|
);
|
|
}
|
|
|
|
ciErrNum = clEnqueueMarker(cqCommandQueue, &endMark);
|
|
ciErrNum |= clFinish(cqCommandQueue);
|
|
shrCheckError(ciErrNum, CL_SUCCESS);
|
|
|
|
//Calculate performance metrics by wallclock time
|
|
double gpuTime = shrDeltaT(0) / numIterations;
|
|
shrLogEx(LOGBOTH | MASTER, 0, "oclBlackScholes, Throughput = %.4f GOptions/s, Time = %.5f s, Size = %u options, NumDevsUsed = %i, Workgroup = %u\n",
|
|
(double)(2.0 * optionCount * 1.0e-9)/gpuTime, gpuTime, (2 * optionCount), 1, 0);
|
|
|
|
//Get profiling info
|
|
cl_ulong startTime = 0, endTime = 0;
|
|
ciErrNum = clGetEventProfilingInfo(startMark, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &startTime, NULL);
|
|
ciErrNum |= clGetEventProfilingInfo(endMark, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &endTime, NULL);
|
|
shrCheckError(ciErrNum, CL_SUCCESS);
|
|
shrLog("\nOpenCL time: %.5f s\n\n", 1.0e-9 * ((double)endTime - (double)startTime) / (double)numIterations);
|
|
#endif
|
|
|
|
shrLog("\nReading back OpenCL BlackScholes results...\n");
|
|
ciErrNum = clEnqueueReadBuffer(cqCommandQueue, d_Call, CL_TRUE, 0, optionCount * sizeof(float), h_CallGPU, 0, NULL, NULL);
|
|
oclCheckError(ciErrNum, CL_SUCCESS);
|
|
ciErrNum = clEnqueueReadBuffer(cqCommandQueue, d_Put, CL_TRUE, 0, optionCount * sizeof(float), h_PutGPU, 0, NULL, NULL);
|
|
oclCheckError(ciErrNum, CL_SUCCESS);
|
|
|
|
shrLog("Comparing against Host/C++ computation...\n");
|
|
BlackScholesCPU(h_CallCPU, h_PutCPU, h_S, h_X, h_T, R, V, optionCount);
|
|
double deltaCall = 0, deltaPut = 0, sumCall = 0, sumPut = 0;
|
|
double L1call, L1put;
|
|
for(unsigned int i = 0; i < optionCount; i++)
|
|
{
|
|
sumCall += fabs(h_CallCPU[i]);
|
|
sumPut += fabs(h_PutCPU[i]);
|
|
deltaCall += fabs(h_CallCPU[i] - h_CallGPU[i]);
|
|
deltaPut += fabs(h_PutCPU[i] - h_PutGPU[i]);
|
|
}
|
|
L1call = deltaCall / sumCall;
|
|
L1put = deltaPut / sumPut;
|
|
shrLog("Relative L1 (call, put) = (%.3e, %.3e)\n\n", L1call, L1put);
|
|
|
|
shrLog("Shutting down...\n");
|
|
closeBlackScholes();
|
|
ciErrNum = clReleaseMemObject(d_T);
|
|
ciErrNum |= clReleaseMemObject(d_X);
|
|
ciErrNum |= clReleaseMemObject(d_S);
|
|
ciErrNum |= clReleaseMemObject(d_Put);
|
|
ciErrNum |= clReleaseMemObject(d_Call);
|
|
ciErrNum |= clReleaseCommandQueue(cqCommandQueue);
|
|
ciErrNum |= clReleaseContext(cxGPUContext);
|
|
oclCheckError(ciErrNum, CL_SUCCESS);
|
|
|
|
free(h_T);
|
|
free(h_X);
|
|
free(h_S);
|
|
free(h_PutGPU);
|
|
free(h_CallGPU);
|
|
free(h_PutCPU);
|
|
free(h_CallCPU);
|
|
|
|
if(cdDevices)free(cdDevices);
|
|
|
|
shrQAFinishExit(argc, (const char **)argv, ((L1call < 1E-6) && (L1put < 1E-6)) ? QA_PASSED : QA_FAILED );
|
|
}
|