/*
 * Copyright 1993-2010 NVIDIA Corporation.  All rights reserved.
 *
 * Please refer to the NVIDIA end user license agreement (EULA) associated
 * with this source code for terms and conditions that govern your use of
 * this software. Any use, reproduction, disclosure, or distribution of
 * this software and related documentation outside the terms of the EULA
 * is strictly prohibited.
 *
 */
 
/* Matrix transpose with Cuda 
 * Host code.

 * This example transposes arbitrary-size matrices.  It compares a naive
 * transpose kernel that suffers from non-coalesced writes, to an optimized
 * transpose with fully coalesced memory access and no bank conflicts.  On 
 * a G80 GPU, the optimized transpose can be more than 10x faster for large
 * matrices.
 */

// standard utility and system includes
#include "oclUtils.h"
#include "shrQATest.h"

#define BLOCK_DIM 16

// max GPU's to manage for multi-GPU parallel compute
const unsigned int MAX_GPU_COUNT = 8;

// global variables
cl_platform_id cpPlatform;
cl_uint uiNumDevices;
cl_device_id* cdDevices;
cl_context cxGPUContext;
cl_kernel ckKernel[MAX_GPU_COUNT];
cl_command_queue commandQueue[MAX_GPU_COUNT];
cl_program rv_program;

// forward declarations
// *********************************************************************
int runTest( int argc, const char** argv);
extern "C" void computeGold( float* reference, float* idata, 
                         const unsigned int size_x, const unsigned int size_y );

// Main Program
// *********************************************************************
int main( int argc, const char** argv) 
{    
    shrQAStart(argc, (char **)argv);

    // set logfile name and start logs
    shrSetLogFileName ("oclTranspose.txt");
    shrLog("%s Starting...\n\n", argv[0]); 

    // run the main test
    int result = runTest(argc, argv);
    //oclCheckError(result, 0);
}

double transposeGPU(const char* kernelName, bool useLocalMem,  cl_uint ciDeviceCount, float* h_idata, float* h_odata, unsigned int size_x, unsigned int size_y)
{
    cl_mem d_odata[MAX_GPU_COUNT];
    cl_mem d_idata[MAX_GPU_COUNT];
    cl_kernel ckKernel[MAX_GPU_COUNT];

    size_t szGlobalWorkSize[2];
    size_t szLocalWorkSize[2];
    cl_int ciErrNum;
 
    // Create buffers for each GPU
    // Each GPU will compute sizePerGPU rows of the result
    size_t sizePerGPU = shrRoundUp(BLOCK_DIM, (size_x+ciDeviceCount-1) / ciDeviceCount);
    
    // size of memory required to store the matrix
    const size_t mem_size = sizeof(float) * size_x * size_y;

    for(unsigned int i = 0; i < ciDeviceCount; ++i){
        // allocate device memory and copy host to device memory
        d_idata[i] = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
                                    mem_size, h_idata, &ciErrNum);
        //oclCheckError(ciErrNum, CL_SUCCESS);

        // create buffer to store output
        d_odata[i] = clCreateBuffer(cxGPUContext, CL_MEM_WRITE_ONLY ,
                                    sizePerGPU*size_y*sizeof(float), NULL, &ciErrNum);
        //oclCheckError(ciErrNum, CL_SUCCESS);

        // create the naive transpose kernel
        ckKernel[i] = clCreateKernel(rv_program, kernelName, &ciErrNum);
        //oclCheckError(ciErrNum, CL_SUCCESS);
        
        // set the args values for the naive kernel
        size_t offset = i * sizePerGPU;
        ciErrNum  = clSetKernelArg(ckKernel[i], 0, sizeof(cl_mem), (void *) &d_odata[i]);
        ciErrNum |= clSetKernelArg(ckKernel[i], 1, sizeof(cl_mem), (void *) &d_idata[0]);
        ciErrNum |= clSetKernelArg(ckKernel[i], 2, sizeof(int), &offset);
        ciErrNum |= clSetKernelArg(ckKernel[i], 3, sizeof(int), &size_x);
        ciErrNum |= clSetKernelArg(ckKernel[i], 4, sizeof(int), &size_y);
        if(useLocalMem)
        {
            ciErrNum |= clSetKernelArg(ckKernel[i], 5, (BLOCK_DIM + 1) * BLOCK_DIM * sizeof(float), 0 );
        }
    }
    //oclCheckError(ciErrNum, CL_SUCCESS);

    // set up execution configuration
    szLocalWorkSize[0] = BLOCK_DIM;
    szLocalWorkSize[1] = BLOCK_DIM;
    szGlobalWorkSize[0] = sizePerGPU;
    szGlobalWorkSize[1] = shrRoundUp(BLOCK_DIM, size_y);
    
    // execute the kernel numIterations times
    int numIterations = 100;
    shrLog("\nProcessing a %d by %d matrix of floats...\n\n", size_x, size_y);
    for (int i = -1; i < numIterations; ++i)
    {
        // Start time measurement after warmup
        if( i == 0 ) shrDeltaT(0);

        for(unsigned int k=0; k < ciDeviceCount; ++k){
            ciErrNum |= clEnqueueNDRangeKernel(commandQueue[k], ckKernel[k], 2, NULL,                                           
                                szGlobalWorkSize, szLocalWorkSize, 0, NULL, NULL);
        }
        //oclCheckError(ciErrNum, CL_SUCCESS);
    }    

    // Block CPU till GPU is done
    for(unsigned int k=0; k < ciDeviceCount; ++k){ 
        ciErrNum |= clFinish(commandQueue[k]);
    }
    double time = shrDeltaT(0)/(double)numIterations;
    //oclCheckError(ciErrNum, CL_SUCCESS);

    // Copy back to host
    for(unsigned int i = 0; i < ciDeviceCount; ++i){
        size_t offset = i * sizePerGPU;
        size_t size = MIN(size_x - i * sizePerGPU, sizePerGPU);

        ciErrNum |= clEnqueueReadBuffer(commandQueue[i], d_odata[i], CL_TRUE, 0,
                                size * size_y * sizeof(float), &h_odata[offset * size_y], 
                                0, NULL, NULL);
    }
    //oclCheckError(ciErrNum, CL_SUCCESS);

    for(unsigned int i = 0; i < ciDeviceCount; ++i){
        ciErrNum |= clReleaseMemObject(d_idata[i]);
        ciErrNum |= clReleaseMemObject(d_odata[i]);
        ciErrNum |= clReleaseKernel(ckKernel[i]);
    }
    //oclCheckError(ciErrNum, CL_SUCCESS);

    return time;
}
uint8_t *kernel_bin = NULL;

static int read_kernel_file(const char* filename, uint8_t** data, size_t* size) {
  if (nullptr == filename || nullptr == data || 0 == size)
    return -1;

  FILE* fp = fopen(filename, "r");
  if (NULL == fp) {
    fprintf(stderr, "Failed to load kernel.");
    return -1;
  }
  fseek(fp , 0 , SEEK_END);
  long fsize = ftell(fp);
  rewind(fp);

  *data = (uint8_t*)malloc(fsize);
  *size = fread(*data, 1, fsize, fp);
  
  fclose(fp);
  
  return 0;
}
//! Run a simple test for CUDA
// *********************************************************************
int runTest( const int argc, const char** argv) 
{
    cl_int ciErrNum;
    cl_uint ciDeviceCount;
    unsigned int size_x = 2048;
    unsigned int size_y = 2048;

    int temp;
    if( shrGetCmdLineArgumenti( argc, argv,"width", &temp) ){
        size_x = temp;
    }

    if( shrGetCmdLineArgumenti( argc, argv,"height", &temp) ){
        size_y = temp;
    }

    // size of memory required to store the matrix
    const size_t mem_size = sizeof(float) * size_x * size_y;

    //Get the NVIDIA platform
    ciErrNum = oclGetPlatformID(&cpPlatform);
    //oclCheckError(ciErrNum, CL_SUCCESS);

    //Get the devices
    ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_DEFAULT, 0, NULL, &uiNumDevices);
    //oclCheckError(ciErrNum, CL_SUCCESS);
    cdDevices = (cl_device_id *)malloc(uiNumDevices * sizeof(cl_device_id) );
    ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_DEFAULT, uiNumDevices, cdDevices, NULL);
    //oclCheckError(ciErrNum, CL_SUCCESS);

    //Create the context
    cxGPUContext = clCreateContext(0, uiNumDevices, cdDevices, NULL, NULL, &ciErrNum);
    //oclCheckError(ciErrNum, CL_SUCCESS);
  
    if(shrCheckCmdLineFlag(argc, (const char**)argv, "device"))
    {
        ciDeviceCount = 0;
        // User specified GPUs
        char* deviceList;
        char* deviceStr;

        shrGetCmdLineArgumentstr(argc, (const char**)argv, "device", &deviceList);

        #ifdef WIN32
            char* next_token;
            deviceStr = strtok_s (deviceList," ,.-", &next_token);
        #else
            deviceStr = strtok (deviceList," ,.-");
        #endif   
        ciDeviceCount = 0;
        while(deviceStr != NULL) 
        {
            // get and print the device for this queue
            cl_device_id device = oclGetDev(cxGPUContext, atoi(deviceStr));
	    if( device == (cl_device_id)-1 ) {
                shrLog(" Invalid Device: %s\n\n", deviceStr);
                return -1;
	    }	

            shrLog("Device %d: ", atoi(deviceStr));
            oclPrintDevName(LOGBOTH, device);            
            shrLog("\n");
           
            // create command queue
            commandQueue[ciDeviceCount] = clCreateCommandQueue(cxGPUContext, device, CL_QUEUE_PROFILING_ENABLE, &ciErrNum);
            if (ciErrNum != CL_SUCCESS)
            {
                shrLog(" Error %i in clCreateCommandQueue call !!!\n\n", ciErrNum);
                return ciErrNum;
            }

            ++ciDeviceCount;

            #ifdef WIN32
                deviceStr = strtok_s (NULL," ,.-", &next_token);
            #else            
                deviceStr = strtok (NULL," ,.-");
            #endif
        }

        free(deviceList);
    } 
    else 
    {
        // Find out how many GPU's to compute on all available GPUs
        size_t nDeviceBytes;
        ciErrNum |= clGetContextInfo(cxGPUContext, CL_CONTEXT_DEVICES, 0, NULL, &nDeviceBytes);
        ciDeviceCount = (cl_uint)nDeviceBytes/sizeof(cl_device_id);

        if (ciErrNum != CL_SUCCESS)
        {
            shrLog(" Error %i in clGetDeviceIDs call !!!\n\n", ciErrNum);
            return ciErrNum;
        }
        else if (ciDeviceCount == 0)
        {
            shrLog(" There are no devices supporting OpenCL (return code %i)\n\n", ciErrNum);
            return -1;
        } 

        // create command-queues
        for(unsigned int i = 0; i < ciDeviceCount; ++i) 
        {
            // get and print the device for this queue
            cl_device_id device = oclGetDev(cxGPUContext, i);
            shrLog("Device %d: ", i);
            oclPrintDevName(LOGBOTH, device);            
            shrLog("\n");

            // create command queue
            commandQueue[i] = clCreateCommandQueue(cxGPUContext, device, CL_QUEUE_PROFILING_ENABLE, &ciErrNum);
            if (ciErrNum != CL_SUCCESS)
            {
                shrLog(" Error %i in clCreateCommandQueue call !!!\n\n", ciErrNum);
                return ciErrNum;
            }
        }
    }
 
    // allocate and initalize host memory
    float* h_idata = (float*)malloc(mem_size);
    float* h_odata = (float*) malloc(mem_size);
    srand(15235911);
    shrFillArray(h_idata, (size_x * size_y));

    // Program Setup
    size_t program_length;
    char* source_path = shrFindFilePath("transpose.cl", argv[0]);
    //oclCheckError(source_path != NULL, shrTRUE);
    char *source = oclLoadProgSource(source_path, "", &program_length);
    //oclCheckError(source != NULL, shrTRUE);
    size_t kernel_size;
    cl_int binary_status = 0;
    cl_device_id device_id;
    // create the program
    rv_program = clCreateProgramWithBinary(
        cxGPUContext, 1, &device_id, &kernel_size, (const uint8_t**)&kernel_bin, &binary_status, NULL);
    //rv_program = clCreateProgramWithSource(cxGPUContext, 1,
                     // (const char **)&source, &program_length, &ciErrNum);
    //oclCheckError(ciErrNum, CL_SUCCESS);
    
    // build the program
    ciErrNum = clBuildProgram(rv_program, 0, NULL, "-cl-fast-relaxed-math", NULL, NULL);
    if (ciErrNum != CL_SUCCESS)
    {
        // write out standard error, Build Log and PTX, then return error
        shrLogEx(LOGBOTH | ERRORMSG, ciErrNum, STDERROR);
        oclLogBuildInfo(rv_program, oclGetFirstDev(cxGPUContext));
        oclLogPtx(rv_program, oclGetFirstDev(cxGPUContext), "oclTranspose.ptx");
        return(EXIT_FAILURE); 
    }
    
    // Run Naive Kernel
#ifdef GPU_PROFILING
    // Matrix Copy kernel runs to measure reference performance.
    double uncoalescedCopyTime = transposeGPU("uncoalesced_copy", false, ciDeviceCount, h_idata, h_odata, size_x, size_y);
    double simpleCopyTime = transposeGPU("simple_copy", false, ciDeviceCount, h_idata, h_odata, size_x, size_y);
    double sharedCopyTime = transposeGPU("shared_copy", true, ciDeviceCount, h_idata, h_odata, size_x, size_y);
#endif

    double naiveTime = transposeGPU("transpose_naive", false, ciDeviceCount, h_idata, h_odata, size_x, size_y);
    double optimizedTime = transposeGPU("transpose", true, ciDeviceCount, h_idata, h_odata, size_x, size_y);

#ifdef GPU_PROFILING
    // log times

    shrLogEx(LOGBOTH | MASTER, 0, "oclTranspose-Outer-simple copy, Throughput = %.4f GB/s, Time = %.5f s, Size = %u fp32 elements, NumDevsUsed = %u, Workgroup = %u\n", 
           (1.0e-9 * double(size_x * size_y * sizeof(float))/simpleCopyTime), simpleCopyTime, (size_x * size_y), ciDeviceCount, BLOCK_DIM * BLOCK_DIM); 

    shrLogEx(LOGBOTH | MASTER, 0, "oclTranspose-Outer-shared memory copy, Throughput = %.4f GB/s, Time = %.5f s, Size = %u fp32 elements, NumDevsUsed = %u, Workgroup = %u\n", 
           (1.0e-9 * double(size_x * size_y * sizeof(float))/sharedCopyTime), sharedCopyTime, (size_x * size_y), ciDeviceCount, BLOCK_DIM * BLOCK_DIM); 

    shrLogEx(LOGBOTH | MASTER, 0, "oclTranspose-Outer-uncoalesced copy, Throughput = %.4f GB/s, Time = %.5f s, Size = %u fp32 elements, NumDevsUsed = %u, Workgroup = %u\n", 
           (1.0e-9 * double(size_x * size_y * sizeof(float))/uncoalescedCopyTime), uncoalescedCopyTime, (size_x * size_y), ciDeviceCount, BLOCK_DIM * BLOCK_DIM); 

    shrLogEx(LOGBOTH | MASTER, 0, "oclTranspose-Outer-naive, Throughput = %.4f GB/s, Time = %.5f s, Size = %u fp32 elements, NumDevsUsed = %u, Workgroup = %u\n", 
           (1.0e-9 * double(size_x * size_y * sizeof(float))/naiveTime), naiveTime, (size_x * size_y), ciDeviceCount, BLOCK_DIM * BLOCK_DIM); 
    
    shrLogEx(LOGBOTH | MASTER, 0, "oclTranspose-Outer-optimized, Throughput = %.4f GB/s, Time = %.5f s, Size = %u fp32 elements, NumDevsUsed = %u, Workgroup = %u\n", 
          (1.0e-9 * double(size_x * size_y * sizeof(float))/optimizedTime), optimizedTime, (size_x * size_y), ciDeviceCount, BLOCK_DIM * BLOCK_DIM); 

#endif
  
    // compute reference solution and cross check results
    float* reference = (float*)malloc( mem_size);
    computeGold( reference, h_idata, size_x, size_y);
    shrLog("\nComparing results with CPU computation... \n\n");
    shrBOOL res = shrComparef( reference, h_odata, size_x * size_y);

    // cleanup memory
    free(h_idata);
    free(h_odata);
    free(reference);
    free(source);
    free(source_path);

    // cleanup OpenCL
    ciErrNum = clReleaseProgram(rv_program);    
    for(unsigned int i = 0; i < ciDeviceCount; ++i) 
    {
        ciErrNum |= clReleaseCommandQueue(commandQueue[i]);
    }    
    ciErrNum |= clReleaseContext(cxGPUContext);
    //oclCheckError(ciErrNum, CL_SUCCESS);

    // pass or fail (cumulative... all tests in the loop)
    shrQAFinishExit(argc, (const char **)argv, (1 == res) ? QA_PASSED : QA_FAILED);

    return 0;
}