/* * Copyright 1993-2010 NVIDIA Corporation. All rights reserved. * * Please refer to the NVIDIA end user license agreement (EULA) associated * with this source code for terms and conditions that govern your use of * this software. Any use, reproduction, disclosure, or distribution of * this software and related documentation outside the terms of the EULA * is strictly prohibited. * */ // standard utilities and systems includes #include #include #include "oclBlackScholes_common.h" //////////////////////////////////////////////////////////////////////////////// // Helper functions //////////////////////////////////////////////////////////////////////////////// double executionTime(cl_event &event){ cl_ulong start, end; clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &end, NULL); clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &start, NULL); return (double)1.0e-9 * (end - start); // convert nanoseconds to seconds on return } //////////////////////////////////////////////////////////////////////////////// // Random float helper //////////////////////////////////////////////////////////////////////////////// float randFloat(float low, float high){ float t = (float)rand() / (float)RAND_MAX; return (1.0f - t) * low + t * high; } //////////////////////////////////////////////////////////////////////////////// // Main program //////////////////////////////////////////////////////////////////////////////// int main(int argc, char **argv) { cl_platform_id cpPlatform; //OpenCL platform cl_device_id* cdDevices = NULL; //OpenCL devices list (array) cl_context cxGPUContext; //OpenCL context cl_command_queue cqCommandQueue; //OpenCL command que cl_mem //OpenCL memory buffer objects d_Call, d_Put, d_S, d_X, d_T; cl_int ciErrNum; float *h_CallCPU, *h_PutCPU, *h_CallGPU, *h_PutGPU, *h_S, *h_X, *h_T; const unsigned int optionCount = 64; const float R = 0.02f; const float V = 0.30f; shrQAStart(argc, argv); // Get the NVIDIA platform ciErrNum = oclGetPlatformID(&cpPlatform); oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL); shrLog("clGetPlatformID...\n"); //Get all the devices cl_uint uiNumDevices = 0; // Number of devices available cl_uint uiTargetDevice = 0; // Default Device to compute on cl_uint uiNumComputeUnits; // Number of compute units (SM's on NV GPU) shrLog("Get the Device info and select Device...\n"); ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_DEFAULT, 0, NULL, &uiNumDevices); oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL); cdDevices = (cl_device_id *)malloc(uiNumDevices * sizeof(cl_device_id) ); ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_DEFAULT, uiNumDevices, cdDevices, NULL); oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL); // Get command line device options and config accordingly shrLog(" # of Devices Available = %u\n", uiNumDevices); if(shrGetCmdLineArgumentu(argc, (const char**)argv, "device", &uiTargetDevice)== shrTRUE) { uiTargetDevice = CLAMP(uiTargetDevice, 0, (uiNumDevices - 1)); } shrLog(" Using Device %u: ", uiTargetDevice); oclPrintDevName(LOGBOTH, cdDevices[uiTargetDevice]); ciErrNum = clGetDeviceInfo(cdDevices[uiTargetDevice], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(uiNumComputeUnits), &uiNumComputeUnits, NULL); oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL); shrLog("\n # of Compute Units = %u\n", uiNumComputeUnits); // set logfile name and start logs shrSetLogFileName ("oclBlackScholes.txt"); shrLog("%s Starting...\n\n", argv[0]); shrLog("Allocating and initializing host memory...\n"); h_CallCPU = (float *)malloc(optionCount * sizeof(float)); h_PutCPU = (float *)malloc(optionCount * sizeof(float)); h_CallGPU = (float *)malloc(optionCount * sizeof(float)); h_PutGPU = (float *)malloc(optionCount * sizeof(float)); h_S = (float *)malloc(optionCount * sizeof(float)); h_X = (float *)malloc(optionCount * sizeof(float)); h_T = (float *)malloc(optionCount * sizeof(float)); srand(2009); for(unsigned int i = 0; i < optionCount; i++){ h_CallCPU[i] = -1.0f; h_PutCPU[i] = -1.0f; h_S[i] = randFloat(5.0f, 30.0f); h_X[i] = randFloat(1.0f, 100.0f); h_T[i] = randFloat(0.25f, 10.0f); } shrLog("Initializing OpenCL...\n"); // Get the NVIDIA platform ciErrNum = oclGetPlatformID(&cpPlatform); oclCheckError(ciErrNum, CL_SUCCESS); // Get a GPU device ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_DEFAULT, 1, &cdDevices[uiTargetDevice], NULL); oclCheckError(ciErrNum, CL_SUCCESS); // Create the context cxGPUContext = clCreateContext(0, 1, &cdDevices[uiTargetDevice], NULL, NULL, &ciErrNum); oclCheckError(ciErrNum, CL_SUCCESS); //Create a command-queue cqCommandQueue = clCreateCommandQueue(cxGPUContext, cdDevices[uiTargetDevice], CL_QUEUE_PROFILING_ENABLE, &ciErrNum); oclCheckError(ciErrNum, CL_SUCCESS); shrLog("Creating OpenCL memory objects...\n"); d_Call = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE, optionCount * sizeof(float), NULL, &ciErrNum); oclCheckError(ciErrNum, CL_SUCCESS); d_Put = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE, optionCount * sizeof(float), NULL, &ciErrNum); oclCheckError(ciErrNum, CL_SUCCESS); d_S = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, optionCount * sizeof(float), h_S, &ciErrNum); oclCheckError(ciErrNum, CL_SUCCESS); d_X = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, optionCount * sizeof(float), h_X, &ciErrNum); oclCheckError(ciErrNum, CL_SUCCESS); d_T = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, optionCount * sizeof(float), h_T, &ciErrNum); oclCheckError(ciErrNum, CL_SUCCESS); shrLog("Starting up BlackScholes...\n"); initBlackScholes(cxGPUContext, cqCommandQueue, (const char **)argv); shrLog("Running OpenCL BlackScholes...\n\n"); //Just a single run or a warmup iteration BlackScholes( NULL, d_Call, d_Put, d_S, d_X, d_T, R, V, optionCount ); #ifdef GPU_PROFILING const int numIterations = 16; cl_event startMark, endMark; ciErrNum = clEnqueueMarker(cqCommandQueue, &startMark); ciErrNum |= clFinish(cqCommandQueue); shrCheckError(ciErrNum, CL_SUCCESS); shrDeltaT(0); for(int i = 0; i < numIterations; i++){ BlackScholes( cqCommandQueue, d_Call, d_Put, d_S, d_X, d_T, R, V, optionCount ); } ciErrNum = clEnqueueMarker(cqCommandQueue, &endMark); ciErrNum |= clFinish(cqCommandQueue); shrCheckError(ciErrNum, CL_SUCCESS); //Calculate performance metrics by wallclock time double gpuTime = shrDeltaT(0) / numIterations; shrLogEx(LOGBOTH | MASTER, 0, "oclBlackScholes, Throughput = %.4f GOptions/s, Time = %.5f s, Size = %u options, NumDevsUsed = %i, Workgroup = %u\n", (double)(2.0 * optionCount * 1.0e-9)/gpuTime, gpuTime, (2 * optionCount), 1, 0); //Get profiling info cl_ulong startTime = 0, endTime = 0; ciErrNum = clGetEventProfilingInfo(startMark, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &startTime, NULL); ciErrNum |= clGetEventProfilingInfo(endMark, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &endTime, NULL); shrCheckError(ciErrNum, CL_SUCCESS); shrLog("\nOpenCL time: %.5f s\n\n", 1.0e-9 * ((double)endTime - (double)startTime) / (double)numIterations); #endif shrLog("\nReading back OpenCL BlackScholes results...\n"); ciErrNum = clEnqueueReadBuffer(cqCommandQueue, d_Call, CL_TRUE, 0, optionCount * sizeof(float), h_CallGPU, 0, NULL, NULL); oclCheckError(ciErrNum, CL_SUCCESS); ciErrNum = clEnqueueReadBuffer(cqCommandQueue, d_Put, CL_TRUE, 0, optionCount * sizeof(float), h_PutGPU, 0, NULL, NULL); oclCheckError(ciErrNum, CL_SUCCESS); shrLog("Comparing against Host/C++ computation...\n"); BlackScholesCPU(h_CallCPU, h_PutCPU, h_S, h_X, h_T, R, V, optionCount); double deltaCall = 0, deltaPut = 0, sumCall = 0, sumPut = 0; double L1call, L1put; for(unsigned int i = 0; i < optionCount; i++) { sumCall += fabs(h_CallCPU[i]); sumPut += fabs(h_PutCPU[i]); deltaCall += fabs(h_CallCPU[i] - h_CallGPU[i]); deltaPut += fabs(h_PutCPU[i] - h_PutGPU[i]); } L1call = deltaCall / sumCall; L1put = deltaPut / sumPut; shrLog("Relative L1 (call, put) = (%.3e, %.3e)\n\n", L1call, L1put); shrLog("Shutting down...\n"); closeBlackScholes(); ciErrNum = clReleaseMemObject(d_T); ciErrNum |= clReleaseMemObject(d_X); ciErrNum |= clReleaseMemObject(d_S); ciErrNum |= clReleaseMemObject(d_Put); ciErrNum |= clReleaseMemObject(d_Call); ciErrNum |= clReleaseCommandQueue(cqCommandQueue); ciErrNum |= clReleaseContext(cxGPUContext); oclCheckError(ciErrNum, CL_SUCCESS); free(h_T); free(h_X); free(h_S); free(h_PutGPU); free(h_CallGPU); free(h_PutCPU); free(h_CallCPU); if(cdDevices)free(cdDevices); shrQAFinishExit(argc, (const char **)argv, ((L1call < 1E-6) && (L1put < 1E-6)) ? QA_PASSED : QA_FAILED ); }