/*
 * Copyright 1993-2010 NVIDIA Corporation.  All rights reserved.
 *
 * Please refer to the NVIDIA end user license agreement (EULA) associated
 * with this source code for terms and conditions that govern your use of
 * this software. Any use, reproduction, disclosure, or distribution of
 * this software and related documentation outside the terms of the EULA
 * is strictly prohibited.
 *
 */

// *********************************************************************
// oclCopyComputeOverlap Notes:  
//
// OpenCL API demo application for NVIDIA CUDA GPU's that implements a
// element by element vector hyptenuse computation using 2 input float arrays
// and 1 output float array.
//
// Demonstrates host->GPU and GPU->host copies that are asynchronous/overlapped
// with respect to GPU computation (and with respect to host thread). 
//
// Because the overlap acheivable for this computation and data set on a given system depends upon the GPU being used and the
// GPU/Host bandwidth, the sample adjust the computation duration to test the most ideal case and test against a consistent standard.
// This sample should be able to achieve up to 30% overlap on GPU's arch 1.2 and 1.3, and up to 50% on arch 2.0+ (Fermi) GPU's.
//
// After setup, warmup and calibration to the system, the sample runs 4 scenarios:  
//      A) Computations with 2 command queues on GPU
//         A multiple-cycle sequence is executed, timed and compared against the host
//      B) Computations with 1 command queue on GPU
//         A multiple-cycle sequence is executed, timed and compared against the host
//
//      The 2-command queue approach ought to be substantially faster
//
// For developmental purposes, the "iInnerLoopCount" variable passes into kernel and independently 
// increases compute time without increasing data size (via a loop inside the kernel)
//
//      At some value of iInnerLoopCount, # of elements, workgroup size, etc the Overlap percentage should reach 30%:
//      (This ~naively assumes time H2D bandwidth is the same as D2H bandwidth, but this is close on most systems)
//
//      If we name the time to copy single input vector H2D (or outpute vector D2H) as "T", then the optimum comparison case is:
//        
//          Single Queue with all the data and all the work  
//             Ttot (serial)        = 4T + 4T + 2T      = 10T    
//
//          Dual Queue, where each queue has 1/2 the data and 1/2 the work 
//             Tq0  (overlap)       = 2T + 2T + T .... 
//             Tq1  (overlap)       = .... 2T + 2T + T
//
//             Ttot (elapsed, wall) = 2T + 2T + 2T + T  = 7T
//
//          Best Overlap % = 100.0 * (10T - 7T)/10T = 30.0 %	(Tesla arch 1.2 or 1.3, single copy engine)
//
//			For multiple independent cycles using arch >= 2.0 with 2 copy engines, input and output copies can also be overlapped.
//			This doesn't help for the first cycle, but theoretically can lead to 50% overlap over many independent cycles.			
// *********************************************************************

// common SDK header for standard utilities and system libs 
#include <oclUtils.h>
#include <shrQATest.h>

// Best possible and Min ratio of compute/copy overlap timing benefit to pass the test
// values greater than 0.0f represent a speed-up relative to non-overlapped
#define EXPECTED_OVERLAP 30.0f
#define EXPECTED_OVERLAP_FERMI 45.0f
#define PASS_FACTOR 0.60f
#define RETRIES_ON_FAILURE 1

// Base sizes for parameters manipulated dynamically or on the command line
#define BASE_WORK_ITEMS 64
#define BASE_ARRAY_LENGTH 40000
#define BASE_LOOP_COUNT 32

// Vars
// *********************************************************************
cl_platform_id cpPlatform;                          // OpenCL platform
cl_context cxGPUContext;                            // OpenCL context
cl_command_queue cqCommandQueue[2];                 // OpenCL command queues
cl_device_id* cdDevices;                            // OpenCL device list  
cl_program cpProgram;                               // OpenCL program
cl_kernel ckKernel[2];                              // OpenCL kernel, 1 per queue
cl_mem cmPinnedSrcA;                                // OpenCL pinned host source buffer A
cl_mem cmPinnedSrcB;                                // OpenCL pinned host source buffer B 
cl_mem cmPinnedResult;                              // OpenCL pinned host result buffer 
float* fSourceA = NULL;                             // Mapped pointer for pinned Host source A buffer
float* fSourceB = NULL;                             // Mapped pointer for pinned Host source B buffer
float* fResult = NULL;                              // Mapped pointer for pinned Host result buffer 
cl_mem cmDevSrcA;                                   // OpenCL device source buffer A
cl_mem cmDevSrcB;                                   // OpenCL device source buffer B 
cl_mem cmDevResult;                                 // OpenCL device result buffer 
size_t szBuffBytes;                                 // Size of main buffers
size_t szGlobalWorkSize;                            // 1D var for Total # of work items in the launched ND range
size_t szLocalWorkSize = BASE_WORK_ITEMS;           // initial # of work items in the work group	
cl_int ciErrNum;			                        // Error code var
char* cPathAndName = NULL;                          // Var for full paths to data, src, etc.
char* cSourceCL = NULL;                             // Buffer to hold source for compilation 
const char* cExecutableName = NULL;

// demo config vars
const char* cSourceFile = "VectorHypot.cl";         // OpenCL computation kernel source code
float* Golden = NULL;                               // temp buffer to hold golden results for cross check    
bool bNoPrompt = false;                             // Command line switch to skip exit prompt
bool bQATest = false;                               // Command line switch to test

// Forward Declarations
// *********************************************************************
double DualQueueSequence(int iCycles, unsigned int uiNumElements, bool bShowConfig);
double OneQueueSequence(int iCycles, unsigned int uiNumElements, bool bShowConfig);
int AdjustCompute(cl_device_id cdTargetDevice, unsigned int uiNumElements, int iInitialLoopCount, int iCycles); 
void VectorHypotHost(const float* pfData1, const float* pfData2, float* pfResult, unsigned int uiNumElements, int iInnerLoopCount);
void Cleanup (int iExitCode);
void (*pCleanup)(int) = &Cleanup;

int *gp_argc = 0;
const char *** gp_argv = NULL;

// Main function 
// *********************************************************************
int main(int argc, const char **argv)
{
    //Locals
    size_t szKernelLength;                      // Byte size of kernel code
    double dBuildTime;                          // Compile time
    cl_uint uiTargetDevice = 0;	                // Default Device to compute on
    cl_uint uiNumDevsUsed = 1;                  // Number of devices used in this sample   
    cl_uint uiNumDevices;                       // Number of devices available 
    int iDevCap = -1;                           // Capability of device
    int iInnerLoopCount = BASE_LOOP_COUNT;      // Varies "compute intensity" per data within the kernel 
    const int iTestCycles = 10;                 // How many times to run the external test loop 
    const int iWarmupCycles = 8;                // How many times to run the warmup sequence 
    cl_uint uiWorkGroupMultiple = 4;            // Command line var (using "workgroupmult=<n>") to optionally increase workgroup size
    cl_uint uiNumElements = BASE_ARRAY_LENGTH;  // initial # of elements per array to process (note: procesing 4 per work item)
    cl_uint uiSizeMultiple = 4;                 // Command line var (using "sizemult=<n>") to optionally increase vector sizes
    bool bPassFlag = false;                     // Var to accumulate test pass/fail
    shrBOOL bMatch = shrFALSE;                  // Cross check result
	shrBOOL bTestOverlap = shrFALSE;
	double dAvgGPUTime[2] = {0.0, 0.0};         // Average time of iTestCycles calls for 2-Queue and 1-Queue test
    double dHostTime[2] = {0.0, 0.0};           // Host computation time (2nd test is redundant but a good stability indicator)
    float fMinPassCriteria[2] = {0.0f, 0.0f};	// Test pass cireria, adjusted dependant on GPU arch

    gp_argc = &argc;
    gp_argv = &argv;

    shrQAStart(argc, (char **)argv);

    // start logs 
	cExecutableName = argv[0];
    shrSetLogFileName ("oclCopyComputeOverlap.txt");
    shrLog("%s Starting...\n\n", argv[0]); 

    // get basic command line args 
    bNoPrompt = (shrTRUE == shrCheckCmdLineFlag(argc, argv, "noprompt"));
    bQATest   = (shrTRUE == shrCheckCmdLineFlag(argc, argv, "qatest"));
    shrGetCmdLineArgumentu(argc, argv, "device", &uiTargetDevice);

    // Optional Command-line multiplier for vector size 
    //   Default val of 4 gives 10.24 million float elements per vector
    //   Range of 3 - 16 (7.68 to 40.96 million floats) is reasonable range (if system and GPU have enough memory)
    shrGetCmdLineArgumentu(argc, argv, "sizemult", &uiSizeMultiple);
    uiSizeMultiple = CLAMP(uiSizeMultiple, 1, 50);  
    uiNumElements = uiSizeMultiple * BASE_ARRAY_LENGTH * BASE_WORK_ITEMS;
    shrLog("Array sizes = %u float elements\n", uiNumElements); 

    // Optional Command-line multiplier for workgroup size (x 64 work items)
    //   Default val of 4 gives szLocalWorkSize of 256.
    //   Range of 1 - 8 (resulting in workgroup sizes of 64 to 512) is reasonable range
    shrGetCmdLineArgumentu(argc, argv, "workgroupmult", &uiWorkGroupMultiple);
    uiWorkGroupMultiple = CLAMP(uiWorkGroupMultiple, 1, 10); 
    szLocalWorkSize = uiWorkGroupMultiple * BASE_WORK_ITEMS;
    shrLog("Workgroup Size = %u\n\n", szLocalWorkSize); 

    // Get the NVIDIA platform if available, otherwise use default
    shrLog("Get the Platform ID...\n\n");
    ciErrNum = oclGetPlatformID(&cpPlatform);
    //oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);

    // Get OpenCL platform name and version
    char cBuffer[256];
    ciErrNum = clGetPlatformInfo (cpPlatform, CL_PLATFORM_NAME, sizeof(cBuffer), cBuffer, NULL);
    //oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
    shrLog("Platform Name = %s\n\n", cBuffer);

    // Get all the devices
    shrLog("Get the Device info and select Device...\n");
    ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_DEFAULT, 0, NULL, &uiNumDevices);
    //oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
    cdDevices = (cl_device_id*)malloc(uiNumDevices * sizeof(cl_device_id));

    // Ethans changes
    CL_CHECK(clGetPlatformIDs(1, &platform_id, NULL));
    CL_CHECK(clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_DEFAULT, 1, &device_id, NULL));
    //ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, uiNumDevices, cdDevices, NULL);
    //oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);

    // Set target device and check capabilities 
    shrLog(" # of Devices Available = %u\n", uiNumDevices); 
    uiTargetDevice = CLAMP(uiTargetDevice, 0, (uiNumDevices - 1));
    shrLog(" Using Device %u, ", uiTargetDevice); 
    oclPrintDevName(LOGBOTH, cdDevices[uiTargetDevice]);  
    iDevCap = oclGetDevCap(cdDevices[uiTargetDevice]);
    if (iDevCap > 0) {
       shrLog(", Capability = %d.%d\n\n", iDevCap/10, iDevCap%10);
    } else {
       shrLog("\n\n", iDevCap); 
    }
    if (strstr(cBuffer, "NVIDIA") != NULL)
    {
        if (iDevCap < 12)
        {
            shrLog("Device doesn't have overlap capability.  Skipping test...\n"); 
            Cleanup (EXIT_SUCCESS);
        }

		// Device and Platform eligible for overlap testing
		bTestOverlap = shrTRUE;

        // If device has overlap capability, proceed
        fMinPassCriteria[0] = PASS_FACTOR * EXPECTED_OVERLAP;               // 1st cycle overlap is same for 1 or 2 copy engines
        if (iDevCap != 20) 
        {
            // Single copy engine
            fMinPassCriteria[1] = PASS_FACTOR * EXPECTED_OVERLAP;            // avg of many cycles
        }
        else 
        {
            char cDevName[1024];
            clGetDeviceInfo(cdDevices[uiTargetDevice], CL_DEVICE_NAME, sizeof(cDevName), &cDevName, NULL);
            if(strstr(cDevName, "Quadro")!=0 || strstr(cDevName, "Tesla")!=0)
            {
                // Tesla or Quadro (arch = 2.0) ... Dual copy engine 
                fMinPassCriteria[1] = PASS_FACTOR * EXPECTED_OVERLAP_FERMI;  // average of many cycles
            }
            else
            {
                // Geforce ... Single copy engine
                fMinPassCriteria[1] = PASS_FACTOR * EXPECTED_OVERLAP;        // average of many cycles
            }
        }
    }   

    // Create the context
    shrLog("clCreateContext...\n"); 
    cxGPUContext = clCreateContext(0, uiNumDevsUsed, &cdDevices[uiTargetDevice], NULL, NULL, &ciErrNum);
    //oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
    
    // Create 2 command-queues
    cqCommandQueue[0] = clCreateCommandQueue(cxGPUContext, cdDevices[uiTargetDevice], 0, &ciErrNum);
    //oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
    shrLog("clCreateCommandQueue [0]...\n"); 
    cqCommandQueue[1] = clCreateCommandQueue(cxGPUContext, cdDevices[uiTargetDevice], 0, &ciErrNum);
    //oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
    shrLog("clCreateCommandQueue [1]...\n"); 

    // Allocate the OpenCL source and result buffer memory objects on GPU device GMEM
    szBuffBytes = sizeof(cl_float) * uiNumElements;
    cmDevSrcA = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY, szBuffBytes, NULL, &ciErrNum);
    //oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
    cmDevSrcB = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY, szBuffBytes, NULL, &ciErrNum);
    //oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
    cmDevResult = clCreateBuffer(cxGPUContext, CL_MEM_WRITE_ONLY, szBuffBytes, NULL, &ciErrNum);
    //oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
    shrLog("clCreateBuffer (Src A, Src B and Result GPU Device GMEM, 3 x %u floats) ...\n", uiNumElements); 

    // Allocate pinned source and result host buffers:  
    //   Note: Pinned (Page Locked) memory is needed for async host<->GPU memory copy operations ***
    cmPinnedSrcA = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, szBuffBytes, NULL, &ciErrNum);
    //oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
    cmPinnedSrcB = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, szBuffBytes, NULL, &ciErrNum);
    //oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
    cmPinnedResult = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, szBuffBytes, NULL, &ciErrNum);
    //oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
    shrLog("clCreateBuffer (Src A, Src B and Result Pinned Host buffers, 3 x %u floats)...\n\n", uiNumElements); 

    // Get mapped pointers to pinned input host buffers
    //   Note:  This allows general (non-OpenCL) host functions to access pinned buffers using standard pointers
    fSourceA = (cl_float*)clEnqueueMapBuffer(cqCommandQueue[0], cmPinnedSrcA, CL_TRUE, CL_MAP_WRITE, 0, szBuffBytes, 0, NULL, NULL, &ciErrNum);
    //oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
    fSourceB = (cl_float*)clEnqueueMapBuffer(cqCommandQueue[0], cmPinnedSrcB, CL_TRUE, CL_MAP_WRITE, 0, szBuffBytes, 0, NULL, NULL, &ciErrNum);
    //oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
    fResult = (cl_float*)clEnqueueMapBuffer(cqCommandQueue[0], cmPinnedResult, CL_TRUE, CL_MAP_READ, 0, szBuffBytes, 0, NULL, NULL, &ciErrNum);
    //oclCheckErrorEX (ciErrNum, CL_SUCCESS, pCleanup);
    shrLog("clEnqueueMapBuffer (Pointers to 3 pinned host buffers)...\n"); 

    // Alloc temp golden buffer for cross checks
    Golden = (float*)malloc(szBuffBytes);
    //oclCheckErrorEX(Golden != NULL, shrTRUE, pCleanup);

    // Read the OpenCL kernel in from source file
    cPathAndName = shrFindFilePath(cSourceFile, argv[0]);
    //oclCheckError(cPathAndName != NULL, shrTRUE);
    cSourceCL = oclLoadProgSource(cPathAndName, "", &szKernelLength);
   // oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
    shrLog("oclLoadProgSource (%s)...\n", cSourceFile); 

    // Create the program object
    //cpProgram = clCreateProgramWithSource(cxGPUContext, 1, (const char **)&cSourceCL, &szKernelLength, &ciErrNum);
    //oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
    shrLog("clCreateProgramWithSource...\n"); 
    cl_program program =
      clCreateProgramWithBuiltInKernels(context, 1, &device_id, "VectorHypot", NULL);
    // Build the program for the target device
    clFinish(cqCommandQueue[0]);
    shrDeltaT(0);
    ciErrNum = clBuildProgram(program, uiNumDevsUsed, &cdDevices[uiTargetDevice], "-cl-fast-relaxed-math", NULL, NULL);
    shrLog("clBuildProgram..."); 
    if (ciErrNum != CL_SUCCESS)
    {
        // write out standard error, Build Log and PTX, then cleanup and exit
        shrLogEx(LOGBOTH | ERRORMSG, (double)ciErrNum, STDERROR);
        oclLogBuildInfo(program, oclGetFirstDev(cxGPUContext));
        oclLogPtx(program, oclGetFirstDev(cxGPUContext), "VectorHypot.ptx");
        Cleanup(EXIT_FAILURE);
    }
    dBuildTime = shrDeltaT(0);

    // Ethan - Kernel Addition

    if (program == NULL) {
        std::cerr << "Failed to write program binary" << std::endl;
        Cleanup(context, queue, program, kernel, memObjects);
        return 1;
    } else {
        std::cout << "Read program from binary." << std::endl;
    }

    // Create the kernel
    ckKernel[0] = clCreateKernel(program, "VectorHypot", &ciErrNum);
    //oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
    ckKernel[1] = clCreateKernel(program, "VectorHypot", &ciErrNum);
    //oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
    shrLog("clCreateKernel (ckKernel[2])...\n"); 

    // Offsets for 2 queues
    cl_uint uiOffset[2] = {0, uiNumElements / (2 * 4)};

    // Set the Argument values for the 1st kernel instance (queue 0)
    ciErrNum = clSetKernelArg(ckKernel[0], 0, sizeof(cl_mem), (void*)&cmDevSrcA);
    ciErrNum |= clSetKernelArg(ckKernel[0], 1, sizeof(cl_mem), (void*)&cmDevSrcB);
    ciErrNum |= clSetKernelArg(ckKernel[0], 2, sizeof(cl_mem), (void*)&cmDevResult);
    ciErrNum |= clSetKernelArg(ckKernel[0], 3, sizeof(cl_uint), (void*)&uiOffset[0]);
    ciErrNum |= clSetKernelArg(ckKernel[0], 4, sizeof(cl_int), (void*)&iInnerLoopCount);
    ciErrNum |= clSetKernelArg(ckKernel[0], 5, sizeof(cl_uint), (void*)&uiNumElements);
    shrLog("clSetKernelArg ckKernel[0] args 0 - 5...\n"); 

    // Set the Argument values for the 2d kernel instance (queue 1)
    ciErrNum |= clSetKernelArg(ckKernel[1], 0, sizeof(cl_mem), (void*)&cmDevSrcA);
    ciErrNum |= clSetKernelArg(ckKernel[1], 1, sizeof(cl_mem), (void*)&cmDevSrcB);
    ciErrNum |= clSetKernelArg(ckKernel[1], 2, sizeof(cl_mem), (void*)&cmDevResult);
    ciErrNum |= clSetKernelArg(ckKernel[1], 3, sizeof(cl_uint), (void*)&uiOffset[1]);
    ciErrNum |= clSetKernelArg(ckKernel[1], 4, sizeof(cl_int), (void*)&iInnerLoopCount);
    ciErrNum |= clSetKernelArg(ckKernel[1], 5, sizeof(cl_uint), (void*)&uiNumElements);
    oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
    shrLog("clSetKernelArg ckKernel[1] args 0 - 5...\n\n"); 

    //*******************************************
    // Warmup the driver with dual queue sequence
    //*******************************************

    // Warmup with dual queue sequence for iTestCycles
    shrLog("Warmup with 2-Queue sequence, %d cycles...\n", iWarmupCycles);
    DualQueueSequence(iWarmupCycles, uiNumElements, false);

    // Use single queue config to adjust compute intensity 
    shrLog("Adjust compute for GPU / system...\n");
    iInnerLoopCount = AdjustCompute(cdDevices[uiTargetDevice], uiNumElements, iInnerLoopCount, iTestCycles); 
    shrLog("  Kernel inner loop count = %d\n", iInnerLoopCount); 

    //*******************************************
    // Run and time with 2 command-queues
    //*******************************************
	for( int iRun =0; iRun <= RETRIES_ON_FAILURE; ++iRun ) {
	
	// Run the sequence iTestCycles times
    dAvgGPUTime[0] = DualQueueSequence(iTestCycles, uiNumElements, false);

    // Warmup then Compute on host iTestCycles times (using mapped standard pointer to pinned host cl_mem buffer) 
    shrLog("  Device vs Host Result Comparison\t: "); 
    VectorHypotHost(fSourceA, fSourceB, Golden, uiNumElements, iInnerLoopCount);   
    shrDeltaT(0);
    for (int i = 0; i < iTestCycles; i++)
    {
        VectorHypotHost (fSourceA, fSourceB, Golden, uiNumElements, iInnerLoopCount);   
    }
    dHostTime[0] = shrDeltaT(0)/iTestCycles;

    // Compare host and GPU results (using mapped standard pointer to pinned host cl_mem buffer) 
    bMatch = shrComparefet(Golden, fResult, uiNumElements, 0.0f, 0);
    shrLog("gpu %s cpu\n", (bMatch == shrTRUE) ? "MATCHES" : "DOESN'T MATCH"); 
    bPassFlag = (bMatch == shrTRUE);

    //*******************************************
    // Run and time with 1 command queue
    //*******************************************
    // Run the sequence iTestCycles times
    dAvgGPUTime[1] = OneQueueSequence(iTestCycles, uiNumElements, false);

    // Compute on host iTestCycles times (using mapped standard pointer to pinned host cl_mem buffer) 
    shrLog("  Device vs Host Result Comparison\t: "); 
    shrDeltaT(0);
    for (int i = 0; i < iTestCycles; i++)
    {
        VectorHypotHost(fSourceA, fSourceB, Golden, (int)uiNumElements, iInnerLoopCount);   
    }
    dHostTime[1] = shrDeltaT(0)/iTestCycles;

    // Compare host and GPU results (using mapped standard pointer to pinned host cl_mem buffer) 
    bMatch = shrComparefet(Golden, fResult, uiNumElements, 0.0f, 0);
    shrLog("gpu %s cpu\n", (bMatch == shrTRUE) ? "MATCHES" : "DOESN'T MATCH"); 
    bPassFlag &= (bMatch == shrTRUE);

    //*******************************************

    // Compare Single and Dual queue timing 
    shrLog("\nResult Summary:\n"); 

    // Log GPU and CPU Time for 2-queue scenario
    shrLog("  Avg GPU Elapsed Time for 2-Queues\t= %.5f s\n", dAvgGPUTime[0]);
    shrLog("  Avg Host Elapsed Time\t\t\t= %.5f s\n\n", dHostTime[0]);

    // Log GPU and CPU Time for 1-queue scenario
    shrLog("  Avg GPU Elapsed Time for 1-Queue\t= %.5f s\n", dAvgGPUTime[1]);
    shrLog("  Avg Host Elapsed Time\t\t\t= %.5f s\n\n", dHostTime[1]);

    // Log overlap % for GPU (comparison of 2-queue and 1 queue scenarios) and status
    double dAvgOverlap = 100.0 * (1.0 - dAvgGPUTime[0]/dAvgGPUTime[1]);
    
	if( bTestOverlap ) {
		bool bAvgOverlapOK = (dAvgOverlap >= fMinPassCriteria[1]);
		if( iRun == RETRIES_ON_FAILURE || bAvgOverlapOK ) {
			shrLog("  Measured and (Acceptable) Avg Overlap\t= %.1f %% (%.1f %%)  -> Measured Overlap is %s\n\n", dAvgOverlap, fMinPassCriteria[1], bAvgOverlapOK ? "Acceptable" : "NOT Acceptable");

			// Log info to master log in standard format
			shrLogEx(LOGBOTH | MASTER, 0, "oclCopyComputeOverlap-Avg, Throughput = %.4f OverlapPercent, Time = %.5f s, Size = %u Elements, NumDevsUsed = %u, Workgroup = %u\n", 
			dAvgOverlap, dAvgGPUTime[0], uiNumElements, uiNumDevsUsed, szLocalWorkSize); 
			
			bPassFlag &= bAvgOverlapOK;
			break;
		}
	} 

		shrLog("  Measured and (Acceptable) Avg Overlap\t= %.1f %% (%.1f %%)  -> Retry %d more time(s)...\n\n", dAvgOverlap, fMinPassCriteria[1], RETRIES_ON_FAILURE - iRun);
	}


    //*******************************************
    // Report pass/fail, cleanup and exit
    Cleanup (bPassFlag ? EXIT_SUCCESS : EXIT_FAILURE);
}

// Run 1 queue sequence for n cycles
// *********************************************************************
double OneQueueSequence(int iCycles, unsigned int uiNumElements, bool bShowConfig)
{
    // Use fresh source Data: (re)initialize pinned host array buffers (using mapped standard pointer to pinned host cl_mem buffer) 
    shrFillArray(fSourceA, (int)uiNumElements);
    shrFillArray(fSourceB, (int)uiNumElements);

    // Reset Global work size for 1 command-queue, and log work sizes & dimensions
    szGlobalWorkSize = shrRoundUp((int)szLocalWorkSize, (int)(uiNumElements/4));

    // *** Make sure queues are empty and then start timer 
    double dAvgTime = 0.0;
    clFinish(cqCommandQueue[0]);
    clFinish(cqCommandQueue[1]);  
    shrDeltaT(0);

    // Run the sequence iCycles times
    for (int i = 0; i < iCycles; i++)
    {
        // Nonblocking Write of all of input data from host to device in command-queue 0 
        ciErrNum = clEnqueueWriteBuffer(cqCommandQueue[0], cmDevSrcA, CL_FALSE, 0, szBuffBytes, (void*)&fSourceA[0], 0, NULL, NULL);
        ciErrNum |= clEnqueueWriteBuffer(cqCommandQueue[0], cmDevSrcB, CL_FALSE, 0, szBuffBytes, (void*)&fSourceB[0], 0, NULL, NULL);
        shrCheckError(ciErrNum, CL_SUCCESS);

        // Launch kernel computation, command-queue 0 
        ciErrNum = clEnqueueNDRangeKernel(cqCommandQueue[0], ckKernel[0], 1, NULL, &szGlobalWorkSize, &szLocalWorkSize, 0, NULL, NULL);
        //oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);

        // Non Blocking Read of output data from device to host, command-queue 0 
        ciErrNum = clEnqueueReadBuffer(cqCommandQueue[0], cmDevResult, CL_FALSE, 0, szBuffBytes, (void*)&fResult[0], 0, NULL, NULL);
        shrCheckError(ciErrNum, CL_SUCCESS);

        // Flush sequence to device (may not be necessary on Linux or WinXP or when using the NVIDIA Tesla Computing Cluster driver)
        clFlush(cqCommandQueue[0]);
    }
    
    // *** Assure sync to host and return average sequence time
    clFinish(cqCommandQueue[0]);
    dAvgTime = shrDeltaT(0)/(double)iCycles;

    // Log config if asked for
    if (bShowConfig)
    {
        shrLog("\n1-Queue sequence Configuration:\n");
        shrLog("  Global Work Size (per command-queue)\t= %u\n  Local Work Size \t\t\t= %u\n  # of Work Groups (per command-queue)\t= %u\n  # of command-queues\t\t\t= 1\n", 
           szGlobalWorkSize, szLocalWorkSize, szGlobalWorkSize/szLocalWorkSize); 
    }
    return dAvgTime;
}

// Run 2 queue sequence for n cycles
// *********************************************************************
double DualQueueSequence(int iCycles, unsigned int uiNumElements, bool bShowConfig)
{
    // Locals
    size_t szHalfBuffer = szBuffBytes / 2;
    size_t szHalfOffset = szHalfBuffer / sizeof(float);
    double dAvgTime = 0.0;

    // Use fresh source Data: (re)initialize pinned host array buffers (using mapped standard pointer to pinned host cl_mem buffer) 
    shrFillArray(fSourceA, (int)uiNumElements);
    shrFillArray(fSourceB, (int)uiNumElements);

    // Set Global work size for 2 command-queues, and log work sizes & dimensions
    szGlobalWorkSize = shrRoundUp((int)szLocalWorkSize, (int)(uiNumElements/(2 * 4)));

    // Make sure queues are empty and then start timer 
    clFinish(cqCommandQueue[0]);
    clFinish(cqCommandQueue[1]);
    shrDeltaT(0);

    for (int i = 0; i < iCycles; i++)
    {
        // Mid Phase 0 
        // Nonblocking Write of 1st half of input data from host to device in command-queue 0 
        ciErrNum = clEnqueueWriteBuffer(cqCommandQueue[0], cmDevSrcA, CL_FALSE, 0, szHalfBuffer, (void*)&fSourceA[0], 0, NULL, NULL);
        ciErrNum |= clEnqueueWriteBuffer(cqCommandQueue[0], cmDevSrcB, CL_FALSE, 0, szHalfBuffer, (void*)&fSourceB[0], 0, NULL, NULL);
        shrCheckError(ciErrNum, CL_SUCCESS);

        // Push out the write for queue 0 (and prior read from queue 1 at end of loop) to the driver 
        // (not necessary on Linux, Mac OSX or WinXP)
        clFlush(cqCommandQueue[0]);
        clFlush(cqCommandQueue[1]);

        // Start Phase 1 ***********************************

        // Launch kernel computation, command-queue 0
        // (Note:  The order MATTERS here on Fermi !  THE KERNEL IN THIS PHASE SHOULD BE LAUNCHED BEFORE THE WRITE)
        ciErrNum = clEnqueueNDRangeKernel(cqCommandQueue[0], ckKernel[0], 1, NULL, &szGlobalWorkSize, &szLocalWorkSize, 0, NULL, NULL);
        oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);

        // Nonblocking Write of 2nd half of input data from host to device in command-queue 1 
        // (Note:  The order MATTERS here on Fermi !  THE KERNEL IN THIS PHASE SHOULD BE LAUNCHED BEFORE THE WRITE)
        ciErrNum = clEnqueueWriteBuffer(cqCommandQueue[1], cmDevSrcA, CL_FALSE, szHalfBuffer, szHalfBuffer, (void*)&fSourceA[szHalfOffset], 0, NULL, NULL);
        ciErrNum |= clEnqueueWriteBuffer(cqCommandQueue[1], cmDevSrcB, CL_FALSE, szHalfBuffer, szHalfBuffer, (void*)&fSourceB[szHalfOffset], 0, NULL, NULL);
        shrCheckError(ciErrNum, CL_SUCCESS);

        // Push out the compute for queue 0 and write for queue 1 to the driver
        // (not necessary on Linux, Mac OSX or WinXP)
        clFlush(cqCommandQueue[0]);
        clFlush(cqCommandQueue[1]);

        // Start Phase 2 ***********************************

        // Launch kernel computation, command-queue 1 
        ciErrNum = clEnqueueNDRangeKernel(cqCommandQueue[1], ckKernel[1], 1, NULL, &szGlobalWorkSize, &szLocalWorkSize, 0, NULL, NULL);
        //oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);

        // Non Blocking Read of 1st half of output data from device to host, command-queue 0 
        ciErrNum = clEnqueueReadBuffer(cqCommandQueue[0], cmDevResult, CL_FALSE, 0, szHalfBuffer, (void*)&fResult[0], 0, NULL, NULL);
        shrCheckError(ciErrNum, CL_SUCCESS);

        // Push out the compute for queue 1 and the read for queue 0 to the driver 
        // (not necessary on Linux, Mac OSX or WinXP)
        clFlush(cqCommandQueue[0]);
        clFlush(cqCommandQueue[1]);

        // Start Phase 0 (Rolls over) ***********************************

        // Non Blocking Read of 2nd half of output data from device to host, command-queue 1 
        ciErrNum = clEnqueueReadBuffer(cqCommandQueue[1], cmDevResult, CL_FALSE, szHalfBuffer, szHalfBuffer, (void*)&fResult[szHalfOffset], 0, NULL, NULL);
        shrCheckError(ciErrNum, CL_SUCCESS);
    }

    // *** Sync to host and get average sequence time
    clFinish(cqCommandQueue[0]);
    clFinish(cqCommandQueue[1]);    
    dAvgTime = shrDeltaT(0)/(double)iCycles;

    // Log config if asked for
    if (bShowConfig)
    {
        shrLog("\n2-Queue sequence Configuration:\n");
        shrLog("  Global Work Size (per command-queue)\t= %u\n  Local Work Size \t\t\t= %u\n  # of Work Groups (per command-queue)\t= %u\n  # of command-queues\t\t\t= 2\n", 
           szGlobalWorkSize, szLocalWorkSize, szGlobalWorkSize/szLocalWorkSize); 
    }

    return dAvgTime;
}

// Function to adjust compute task according to device capability
// This allows a consistent overlap % across a wide variety of GPU's for test purposes
// It also implitly illustrates the relationship between compute capability and overlap at fixed work size
// *********************************************************************
int AdjustCompute(cl_device_id cdTargetDevice, unsigned int uiNumElements, int iInitLoopCount, int iCycles)
{
    // Locals
    double dCopyTime, dComputeTime;
    int iComputedLoopCount; 

    // Change Source Data
    shrFillArray(fSourceA, (int)uiNumElements);
    shrFillArray(fSourceB, (int)uiNumElements);

    // Reset Global work size for 1 command-queue, and log work sizes & dimensions
    szGlobalWorkSize = shrRoundUp((int)szLocalWorkSize, (int)(uiNumElements/4));

    // *** Make sure queues are empty and then start timer 
    clFinish(cqCommandQueue[0]);
    clFinish(cqCommandQueue[1]);  
    shrDeltaT(0);

    // Run the copy iCycles times and measure copy time on this system
    for (int i = 0; i < iCycles; i++)
    {
        // Nonblocking Write of all of input data from host to device in command-queue 0 
        ciErrNum = clEnqueueWriteBuffer(cqCommandQueue[0], cmDevSrcA, CL_FALSE, 0, szBuffBytes, (void*)&fSourceA[0], 0, NULL, NULL);
        ciErrNum |= clEnqueueWriteBuffer(cqCommandQueue[0], cmDevSrcB, CL_FALSE, 0, szBuffBytes, (void*)&fSourceB[0], 0, NULL, NULL);
        ciErrNum |= clFlush(cqCommandQueue[0]);
        shrCheckError(ciErrNum, CL_SUCCESS);
    }
    clFinish(cqCommandQueue[0]);
    dCopyTime = shrDeltaT(0);

    // Run the compute iCycles times and measure compute time on this system
    for (int i = 0; i < iCycles; i++)
    {
        // Launch kernel computation, command-queue 0 
        ciErrNum = clEnqueueNDRangeKernel(cqCommandQueue[0], ckKernel[0], 1, NULL, &szGlobalWorkSize, &szLocalWorkSize, 0, NULL, NULL);
        ciErrNum |= clFlush(cqCommandQueue[0]);
        oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
    }
    clFinish(cqCommandQueue[0]);
    dComputeTime = shrDeltaT(0);

    // Determine number of core loop cycles proportional to copy/compute time ratio
    dComputeTime = MAX(dComputeTime, 1.0e-6);
    iComputedLoopCount = CLAMP(2, (int)((dCopyTime/dComputeTime) * (double)iInitLoopCount), (iInitLoopCount * 4));
    ciErrNum |= clSetKernelArg(ckKernel[0], 4, sizeof(cl_int), (void*)&iComputedLoopCount);
    ciErrNum |= clSetKernelArg(ckKernel[1], 4, sizeof(cl_int), (void*)&iComputedLoopCount);
    oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
    return (iComputedLoopCount);
} 

// Cleanup/Exit function 
// *********************************************************************
void Cleanup (int iExitCode)
{
    // Cleanup allocated objects
    shrLog("Starting Cleanup...\n\n");
    if(cPathAndName)free(cPathAndName);
    if(cSourceCL)free(cSourceCL);
    if(Golden)free(Golden);
    if(ckKernel[0])clReleaseKernel(ckKernel[0]);
    if(ckKernel[1])clReleaseKernel(ckKernel[1]);
    if(program)clReleaseProgram(program);
    if(fSourceA)clEnqueueUnmapMemObject(cqCommandQueue[0], cmPinnedSrcA, (void*)fSourceA, 0, NULL, NULL);
    if(fSourceB)clEnqueueUnmapMemObject(cqCommandQueue[0], cmPinnedSrcB, (void*)fSourceB, 0, NULL, NULL);
    if(fResult)clEnqueueUnmapMemObject(cqCommandQueue[0], cmPinnedResult, (void*)fResult, 0, NULL, NULL);
    if(cmDevSrcA)clReleaseMemObject(cmDevSrcA);
    if(cmDevSrcB)clReleaseMemObject(cmDevSrcB);
    if(cmDevResult)clReleaseMemObject(cmDevResult);
    if(cmPinnedSrcA)clReleaseMemObject(cmPinnedSrcA);
    if(cmPinnedSrcB)clReleaseMemObject(cmPinnedSrcB);
    if(cmPinnedResult)clReleaseMemObject(cmPinnedResult);
    if(cqCommandQueue[0])clReleaseCommandQueue(cqCommandQueue[0]);
    if(cqCommandQueue[1])clReleaseCommandQueue(cqCommandQueue[1]);
    if(cxGPUContext)clReleaseContext(cxGPUContext);
    if(cdDevices)free(cdDevices);

    // Master status Pass/Fail (all tests)
    shrQAFinishExit( *gp_argc, (const char **)*gp_argv, (iExitCode == EXIT_SUCCESS) ? QA_PASSED : QA_FAILED );
}

// "Golden" Host processing vector hyptenuse function for comparison purposes
// *********************************************************************
void VectorHypotHost(const float* pfData1, const float* pfData2, float* pfResult, unsigned int uiNumElements, int iInnerLoopCount)
{
    for (unsigned int i = 0; i < uiNumElements; i++) 
    {
        float fA = pfData1[i];
        float fB = pfData2[i];
        float fC = sqrtf(fA * fA + fB * fB);

        pfResult[i] = fC;
    }
}