Add files via upload

2019-11-25 00:50:08 -05:00
parent 56a076ff4a
commit ad9623b7dd
5 changed files with 1414 additions and 0 deletions
--- a/benchmarks/opencl/DotProduct/main.cc
+++ b/benchmarks/opencl/DotProduct/main.cc
@@ -0,0 +1,270 @@
+/*
+ * Copyright 1993-2010 NVIDIA Corporation.  All rights reserved.
+ *
+ * Please refer to the NVIDIA end user license agreement (EULA) associated
+ * with this source code for terms and conditions that govern your use of
+ * this software. Any use, reproduction, disclosure, or distribution of
+ * this software and related documentation outside the terms of the EULA
+ * is strictly prohibited.
+ *
+ */
+
+// *********************************************************************
+// oclDotProduct Notes:  
+//
+// A simple OpenCL API demo application that implements a
+// vector dot product computation between 2 float arrays. 
+//
+// Runs computations with OpenCL on the GPU device and then checks results 
+// against basic host CPU/C++ computation.
+//
+// Uses 'shr' and 'ocl' functions from oclUtils and shrUtils libraries for compactness. 
+// But these are NOT required libs for OpenCL developement in general.
+// *********************************************************************
+
+// standard utilities and systems includes
+#include <oclUtils.h>
+#include <shrQATest.h>
+
+// Name of the file with the source code for the computation kernel
+// *********************************************************************
+const char* cSourceFile = "DotProduct.cl";
+
+// Host buffers for demo
+// *********************************************************************
+void *srcA, *srcB, *dst;        // Host buffers for OpenCL test
+void* Golden;                   // Host buffer for host golden processing cross check
+
+// OpenCL Vars
+cl_platform_id cpPlatform;      // OpenCL platform
+cl_device_id   *cdDevices;      // OpenCL device
+cl_context cxGPUContext;        // OpenCL context
+cl_command_queue cqCommandQueue;// OpenCL command que
+cl_program program;           // OpenCL program
+cl_kernel ckKernel;             // OpenCL kernel
+cl_mem cmDevSrcA;               // OpenCL device source buffer A
+cl_mem cmDevSrcB;               // OpenCL device source buffer B 
+cl_mem cmDevDst;                // OpenCL device destination buffer 
+size_t szGlobalWorkSize;        // Total # of work items in the 1D range
+size_t szLocalWorkSize;		    // # of work items in the 1D work group	
+size_t szParmDataBytes;			// Byte size of context information
+size_t szKernelLength;			// Byte size of kernel code
+cl_int ciErrNum;			    // Error code var
+char* cPathAndName = NULL;      // var for full paths to data, src, etc.
+char* cSourceCL = NULL;         // Buffer to hold source for compilation 
+const char* cExecutableName = NULL;
+
+// demo config vars
+int iNumElements= 1277944;	    // Length of float arrays to process (odd # for illustration)
+shrBOOL bNoPrompt = shrFALSE;  
+
+// Forward Declarations
+// *********************************************************************
+void DotProductHost(const float* pfData1, const float* pfData2, float* pfResult, int iNumElements);
+void Cleanup (int iExitCode);
+void (*pCleanup)(int) = &Cleanup;
+
+int *gp_argc = NULL;
+char ***gp_argv = NULL;
+
+// Main function 
+// *********************************************************************
+int main(int argc, char **argv)
+{
+    gp_argc = &argc;
+    gp_argv = &argv;
+
+    shrQAStart(argc, argv);
+
+    // Get the NVIDIA platform
+    ciErrNum = oclGetPlatformID(&cpPlatform);
+    //oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL);
+    shrLog("clGetPlatformID...\n"); 
+
+    // Get the NVIDIA platform
+    ciErrNum = oclGetPlatformID(&cpPlatform);
+    //oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL);
+    shrLog("clGetPlatformID...\n"); 
+
+    //Get all the devices
+    cl_uint uiNumDevices = 0;           // Number of devices available
+    cl_uint uiTargetDevice = 0;	        // Default Device to compute on
+    cl_uint uiNumComputeUnits;          // Number of compute units (SM's on NV GPU)
+    shrLog("Get the Device info and select Device...\n");
+    ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_DEFAULT, 0, NULL, &uiNumDevices);
+    //oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL);
+    cdDevices = (cl_device_id *)malloc(uiNumDevices * sizeof(cl_device_id) );
+    ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_DEFAULT, uiNumDevices, cdDevices, NULL);
+    //oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL);
+
+    // Get command line device options and config accordingly
+    shrLog("  # of Devices Available = %u\n", uiNumDevices); 
+    if(shrGetCmdLineArgumentu(argc, (const char**)argv, "device", &uiTargetDevice)== shrTRUE) 
+    {
+        uiTargetDevice = CLAMP(uiTargetDevice, 0, (uiNumDevices - 1));
+    }
+    shrLog("  Using Device %u: ", uiTargetDevice); 
+    oclPrintDevName(LOGBOTH, cdDevices[uiTargetDevice]);
+    ciErrNum = clGetDeviceInfo(cdDevices[uiTargetDevice], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(uiNumComputeUnits), &uiNumComputeUnits, NULL);
+    //oclCheckErrorEX(ciErrNum, CL_SUCCESS, NULL);
+    shrLog("\n  # of Compute Units = %u\n", uiNumComputeUnits); 
+
+    // get command line arg for quick test, if provided
+    bNoPrompt = shrCheckCmdLineFlag(argc, (const char**)argv, "noprompt");
+
+    // start logs
+	cExecutableName = argv[0];
+    shrSetLogFileName ("oclDotProduct.txt");
+    shrLog("%s Starting...\n\n# of float elements per Array \t= %u\n", argv[0], iNumElements); 
+
+    // set and log Global and Local work size dimensions
+    szLocalWorkSize = 256;
+    szGlobalWorkSize = shrRoundUp((int)szLocalWorkSize, iNumElements);  // rounded up to the nearest multiple of the LocalWorkSize
+    shrLog("Global Work Size \t\t= %u\nLocal Work Size \t\t= %u\n# of Work Groups \t\t= %u\n\n", 
+           szGlobalWorkSize, szLocalWorkSize, (szGlobalWorkSize % szLocalWorkSize + szGlobalWorkSize/szLocalWorkSize)); 
+
+    // Allocate and initialize host arrays
+    shrLog( "Allocate and Init Host Mem...\n"); 
+    srcA = (void *)malloc(sizeof(cl_float4) * szGlobalWorkSize);
+    srcB = (void *)malloc(sizeof(cl_float4) * szGlobalWorkSize);
+    dst = (void *)malloc(sizeof(cl_float) * szGlobalWorkSize);
+    Golden = (void *)malloc(sizeof(cl_float) * iNumElements);
+    shrFillArray((float*)srcA, 4 * iNumElements);
+    shrFillArray((float*)srcB, 4 * iNumElements);
+
+    // Get the NVIDIA platform
+    ciErrNum = oclGetPlatformID(&cpPlatform);
+    //oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
+
+    // Get a GPU device
+    ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_DEFAULT, 1, &cdDevices[uiTargetDevice], NULL);
+    //oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
+
+    // Create the context
+    cxGPUContext = clCreateContext(0, 1, &cdDevices[uiTargetDevice], NULL, NULL, &ciErrNum);
+    //oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
+
+    // Create a command-queue
+    shrLog("clCreateCommandQueue...\n"); 
+    cqCommandQueue = clCreateCommandQueue(cxGPUContext, cdDevices[uiTargetDevice], 0, &ciErrNum);
+    //oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
+
+    // Allocate the OpenCL buffer memory objects for source and result on the device GMEM
+    shrLog("clCreateBuffer (SrcA, SrcB and Dst in Device GMEM)...\n"); 
+    cmDevSrcA = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY, sizeof(cl_float) * szGlobalWorkSize * 4, NULL, &ciErrNum);
+    //oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
+    cmDevSrcB = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY, sizeof(cl_float) * szGlobalWorkSize * 4, NULL, &ciErrNum);
+    //oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
+    cmDevDst = clCreateBuffer(cxGPUContext, CL_MEM_WRITE_ONLY, sizeof(cl_float) * szGlobalWorkSize, NULL, &ciErrNum);
+    //oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
+
+    // Read the OpenCL kernel in from source file
+    shrLog("oclLoadProgSource (%s)...\n", cSourceFile); 
+    cPathAndName = shrFindFilePath(cSourceFile, argv[0]);
+    //oclCheckErrorEX(cPathAndName != NULL, shrTRUE, pCleanup);
+    cSourceCL = oclLoadProgSource(cPathAndName, "", &szKernelLength);
+    //oclCheckErrorEX(cSourceCL != NULL, shrTRUE, pCleanup);
+
+    // Create the program
+    shrLog("clCreateProgramWithSource...\n"); 
+    //program = clCreateProgramWithSource(cxGPUContext, 1, (const char **)&cSourceCL, &szKernelLength, &ciErrNum);
+    cl_program program =
+      clCreateProgramWithBuiltInKernels(context, 1, &device_id, "sgemm", NULL);
+        // Build the program with 'mad' Optimization option
+    #ifdef MAC
+        char* flags = "-cl-fast-relaxed-math -DMAC";
+    #else
+        char* flags = "-cl-fast-relaxed-math";
+    #endif
+    shrLog("clBuildProgram...\n"); 
+    ciErrNum = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);
+    if (ciErrNum != CL_SUCCESS)
+    {
+        // write out standard error, Build Log and PTX, then cleanup and exit
+        shrLogEx(LOGBOTH | ERRORMSG, ciErrNum, STDERROR);
+        oclLogBuildInfo(program, oclGetFirstDev(cxGPUContext));
+        oclLogPtx(program, oclGetFirstDev(cxGPUContext), "oclDotProduct.ptx");
+        Cleanup(EXIT_FAILURE); 
+    }
+
+    // Create the kernel
+    shrLog("clCreateKernel (DotProduct)...\n"); 
+    ckKernel = clCreateKernel(program, "DotProduct", &ciErrNum);
+
+    // Set the Argument values
+    shrLog("clSetKernelArg 0 - 3...\n\n"); 
+    ciErrNum = clSetKernelArg(ckKernel, 0, sizeof(cl_mem), (void*)&cmDevSrcA);
+    ciErrNum |= clSetKernelArg(ckKernel, 1, sizeof(cl_mem), (void*)&cmDevSrcB);
+    ciErrNum |= clSetKernelArg(ckKernel, 2, sizeof(cl_mem), (void*)&cmDevDst);
+    ciErrNum |= clSetKernelArg(ckKernel, 3, sizeof(cl_int), (void*)&iNumElements);
+    //oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
+
+    // --------------------------------------------------------
+    // Core sequence... copy input data to GPU, compute, copy results back
+
+    // Asynchronous write of data to GPU device
+    shrLog("clEnqueueWriteBuffer (SrcA and SrcB)...\n"); 
+    ciErrNum = clEnqueueWriteBuffer(cqCommandQueue, cmDevSrcA, CL_FALSE, 0, sizeof(cl_float) * szGlobalWorkSize * 4, srcA, 0, NULL, NULL);
+    ciErrNum |= clEnqueueWriteBuffer(cqCommandQueue, cmDevSrcB, CL_FALSE, 0, sizeof(cl_float) * szGlobalWorkSize * 4, srcB, 0, NULL, NULL);
+    //oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
+
+    // Launch kernel
+    shrLog("clEnqueueNDRangeKernel (DotProduct)...\n"); 
+    ciErrNum = clEnqueueNDRangeKernel(cqCommandQueue, ckKernel, 1, NULL, &szGlobalWorkSize, &szLocalWorkSize, 0, NULL, NULL);
+    //oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
+
+    // Read back results and check accumulated errors
+    shrLog("clEnqueueReadBuffer (Dst)...\n\n"); 
+    ciErrNum = clEnqueueReadBuffer(cqCommandQueue, cmDevDst, CL_TRUE, 0, sizeof(cl_float) * szGlobalWorkSize, dst, 0, NULL, NULL);
+    //oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
+
+    // Compute and compare results for golden-host and report errors and pass/fail
+    shrLog("Comparing against Host/C++ computation...\n\n"); 
+    DotProductHost ((const float*)srcA, (const float*)srcB, (float*)Golden, iNumElements);
+    shrBOOL bMatch = shrComparefet((const float*)Golden, (const float*)dst, (unsigned int)iNumElements, 0.0f, 0);
+
+    // Cleanup and leave
+    Cleanup (EXIT_SUCCESS);
+}
+
+// "Golden" Host processing dot product function for comparison purposes
+// *********************************************************************
+void DotProductHost(const float* pfData1, const float* pfData2, float* pfResult, int iNumElements)
+{
+    int i, j, k;
+    for (i = 0, j = 0; i < iNumElements; i++) 
+    {
+        pfResult[i] = 0.0f;
+        for (k = 0; k < 4; k++, j++) 
+        {
+            pfResult[i] += pfData1[j] * pfData2[j]; 
+        } 
+    }
+}
+
+// Cleanup and exit code
+// *********************************************************************
+void Cleanup(int iExitCode)
+{
+    // Cleanup allocated objects
+    shrLog("Starting Cleanup...\n\n");
+    if(cPathAndName)free(cPathAndName);
+    if(cSourceCL)free(cSourceCL);
+	if(ckKernel)clReleaseKernel(ckKernel);  
+    if(program)clReleaseProgram(program);
+    if(cqCommandQueue)clReleaseCommandQueue(cqCommandQueue);
+    if(cxGPUContext)clReleaseContext(cxGPUContext);
+    if (cmDevSrcA)clReleaseMemObject(cmDevSrcA);
+    if (cmDevSrcB)clReleaseMemObject(cmDevSrcB);
+    if (cmDevDst)clReleaseMemObject(cmDevDst);
+
+    // Free host memory
+    free(srcA); 
+    free(srcB);
+    free (dst);
+    free(Golden);
+
+    if (cdDevices) free(cdDevices);
+
+    shrQAFinishExit(*gp_argc, (const char **)*gp_argv, (iExitCode == EXIT_SUCCESS) ? QA_PASSED : QA_FAILED);
+}