Vortex 2.0 changes:

+ Microarchitecture optimizations + 64-bit support + Xilinx FPGA support + LLVM-16 support + Refactoring and quality control fixes minor update minor update minor update minor update minor update minor update cleanup cleanup cache bindings and memory perf refactoring minor update minor update hw unit tests fixes minor update minor update minor update minor update minor update minor update minor update minor update minor update minor update minor update minor update minor update minor updates minor updates minor update minor update minor update minor update minor update minor update minor updates minor updates minor updates minor updates minor update minor update
2023-10-19 20:51:22 -07:00
parent d69a64c32c
commit c1e168fdbe
1309 changed files with 247412 additions and 311463 deletions
--- a/tests/opencl/transpose/main.cc
+++ b/tests/opencl/transpose/main.cc
@@ -55,10 +55,10 @@ int main( int argc, const char** argv)

    // run the main test
    int result = runTest(argc, argv);
-    //oclCheckError(result, 0);
+    oclCheckError(result, 0);
 }

-double transposeGPU(const char* kernelName, bool useLocalMem,  cl_uint ciDeviceCount, float* h_idata, float* h_odata, unsigned int size_x, unsigned int size_y)
+static double transposeGPU(const char* kernelName, bool useLocalMem,  cl_uint ciDeviceCount, float* h_idata, float* h_odata, unsigned int size_x, unsigned int size_y)
 {
    cl_mem d_odata[MAX_GPU_COUNT];
    cl_mem d_idata[MAX_GPU_COUNT];
@@ -79,16 +79,16 @@ double transposeGPU(const char* kernelName, bool useLocalMem,  cl_uint ciDeviceC
        // allocate device memory and copy host to device memory
        d_idata[i] = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
                                    mem_size, h_idata, &ciErrNum);
-        //oclCheckError(ciErrNum, CL_SUCCESS);
+        oclCheckError(ciErrNum, CL_SUCCESS);

        // create buffer to store output
        d_odata[i] = clCreateBuffer(cxGPUContext, CL_MEM_WRITE_ONLY ,
                                    sizePerGPU*size_y*sizeof(float), NULL, &ciErrNum);
-        //oclCheckError(ciErrNum, CL_SUCCESS);
+        oclCheckError(ciErrNum, CL_SUCCESS);

        // create the naive transpose kernel
        ckKernel[i] = clCreateKernel(rv_program, kernelName, &ciErrNum);
-        //oclCheckError(ciErrNum, CL_SUCCESS);
+        oclCheckError(ciErrNum, CL_SUCCESS);
        
        // set the args values for the naive kernel
        size_t offset = i * sizePerGPU;
@@ -97,12 +97,11 @@ double transposeGPU(const char* kernelName, bool useLocalMem,  cl_uint ciDeviceC
        ciErrNum |= clSetKernelArg(ckKernel[i], 2, sizeof(int), &offset);
        ciErrNum |= clSetKernelArg(ckKernel[i], 3, sizeof(int), &size_x);
        ciErrNum |= clSetKernelArg(ckKernel[i], 4, sizeof(int), &size_y);
-        if(useLocalMem)
-        {
+        if (useLocalMem) {
            ciErrNum |= clSetKernelArg(ckKernel[i], 5, (BLOCK_DIM + 1) * BLOCK_DIM * sizeof(float), 0 );
        }
    }
-    //oclCheckError(ciErrNum, CL_SUCCESS);
+    oclCheckError(ciErrNum, CL_SUCCESS);

    // set up execution configuration
    szLocalWorkSize[0] = BLOCK_DIM;
@@ -111,18 +110,16 @@ double transposeGPU(const char* kernelName, bool useLocalMem,  cl_uint ciDeviceC
    szGlobalWorkSize[1] = shrRoundUp(BLOCK_DIM, size_y);
    
    // execute the kernel numIterations times
-    int numIterations = 100;
+    //int numIterations = 100;
+    int numIterations = 1;
    shrLog("\nProcessing a %d by %d matrix of floats...\n\n", size_x, size_y);
-    for (int i = -1; i < numIterations; ++i)
-    {
-        // Start time measurement after warmup
-        if( i == 0 ) shrDeltaT(0);
-
-        for(unsigned int k=0; k < ciDeviceCount; ++k){
-            ciErrNum |= clEnqueueNDRangeKernel(commandQueue[k], ckKernel[k], 2, NULL,                                           
-                                szGlobalWorkSize, szLocalWorkSize, 0, NULL, NULL);
+    for (int i = -1; i < numIterations; ++i) {
+        if (i == 0) 
+            shrDeltaT(0);
+        for (unsigned int k=0; k < ciDeviceCount; ++k) {
+            ciErrNum |= clEnqueueNDRangeKernel(commandQueue[k], ckKernel[k], 2, NULL, szGlobalWorkSize, szLocalWorkSize, 0, NULL, NULL);
        }
-        //oclCheckError(ciErrNum, CL_SUCCESS);
+        oclCheckError(ciErrNum, CL_SUCCESS);
    }    

    // Block CPU till GPU is done
@@ -130,7 +127,7 @@ double transposeGPU(const char* kernelName, bool useLocalMem,  cl_uint ciDeviceC
        ciErrNum |= clFinish(commandQueue[k]);
    }
    double time = shrDeltaT(0)/(double)numIterations;
-    //oclCheckError(ciErrNum, CL_SUCCESS);
+    oclCheckError(ciErrNum, CL_SUCCESS);

    // Copy back to host
    for(unsigned int i = 0; i < ciDeviceCount; ++i){
@@ -141,17 +138,18 @@ double transposeGPU(const char* kernelName, bool useLocalMem,  cl_uint ciDeviceC
                                size * size_y * sizeof(float), &h_odata[offset * size_y], 
                                0, NULL, NULL);
    }
-    //oclCheckError(ciErrNum, CL_SUCCESS);
+    oclCheckError(ciErrNum, CL_SUCCESS);

    for(unsigned int i = 0; i < ciDeviceCount; ++i){
        ciErrNum |= clReleaseMemObject(d_idata[i]);
        ciErrNum |= clReleaseMemObject(d_odata[i]);
        ciErrNum |= clReleaseKernel(ckKernel[i]);
    }
-    //oclCheckError(ciErrNum, CL_SUCCESS);
+    oclCheckError(ciErrNum, CL_SUCCESS);

    return time;
 }
+
 uint8_t *kernel_bin = NULL;

 static int read_kernel_file(const char* filename, uint8_t** data, size_t* size) {
@@ -174,14 +172,17 @@ static int read_kernel_file(const char* filename, uint8_t** data, size_t* size)
  
  return 0;
 }
+
 //! Run a simple test for CUDA
 // *********************************************************************
 int runTest( const int argc, const char** argv) 
 {
    cl_int ciErrNum;
    cl_uint ciDeviceCount;
-    unsigned int size_x = 2048;
-    unsigned int size_y = 2048;
+    //unsigned int size_x = 2048;
+    //unsigned int size_y = 2048;
+    unsigned int size_x = 64;
+    unsigned int size_y = 64;

    int temp;
    if( shrGetCmdLineArgumenti( argc, argv,"width", &temp) ){
@@ -197,18 +198,18 @@ int runTest( const int argc, const char** argv)

    //Get the NVIDIA platform
    ciErrNum = oclGetPlatformID(&cpPlatform);
-    //oclCheckError(ciErrNum, CL_SUCCESS);
+    oclCheckError(ciErrNum, CL_SUCCESS);

    //Get the devices
    ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_DEFAULT, 0, NULL, &uiNumDevices);
-    //oclCheckError(ciErrNum, CL_SUCCESS);
+    oclCheckError(ciErrNum, CL_SUCCESS);
    cdDevices = (cl_device_id *)malloc(uiNumDevices * sizeof(cl_device_id) );
    ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_DEFAULT, uiNumDevices, cdDevices, NULL);
-    //oclCheckError(ciErrNum, CL_SUCCESS);
+    oclCheckError(ciErrNum, CL_SUCCESS);

    //Create the context
    cxGPUContext = clCreateContext(0, uiNumDevices, cdDevices, NULL, NULL, &ciErrNum);
-    //oclCheckError(ciErrNum, CL_SUCCESS);
+    oclCheckError(ciErrNum, CL_SUCCESS);
  
    if(shrCheckCmdLineFlag(argc, (const char**)argv, "device"))
    {
@@ -301,26 +302,27 @@ int runTest( const int argc, const char** argv)
    srand(15235911);
    shrFillArray(h_idata, (size_x * size_y));

-    // Program Setup
-    size_t program_length;
-    char* source_path = shrFindFilePath("transpose.cl", argv[0]);
-    //oclCheckError(source_path != NULL, shrTRUE);
-    char *source = oclLoadProgSource(source_path, "", &program_length);
-    //oclCheckError(source != NULL, shrTRUE);
-    size_t kernel_size;
-    cl_int binary_status = 0;
-    cl_device_id device_id;
    // create the program
-    rv_program = clCreateProgramWithBinary(
-        cxGPUContext, 1, &device_id, &kernel_size, (const uint8_t**)&kernel_bin, &binary_status, NULL);
    //rv_program = clCreateProgramWithSource(cxGPUContext, 1,
                     // (const char **)&source, &program_length, &ciErrNum);
-    //oclCheckError(ciErrNum, CL_SUCCESS);
+    uint8_t *kernel_bin = NULL;
+    size_t kernel_size;
+    cl_int binary_status = 0;  
+    ciErrNum = read_kernel_file("kernel.pocl", &kernel_bin, &kernel_size);
+    if (ciErrNum != CL_SUCCESS) {
+        shrLog(" Error %i in read_kernel_file call !!!\n\n", ciErrNum);
+        return ciErrNum;
+    }
+    rv_program = clCreateProgramWithBinary(
+        cxGPUContext, 1, cdDevices, &kernel_size, (const uint8_t**)&kernel_bin, &binary_status, &ciErrNum);
+    if (ciErrNum != CL_SUCCESS) {
+        shrLog(" Error %i in clCreateProgramWithBinary call !!!\n\n", ciErrNum);
+        return ciErrNum;
+    }
    
    // build the program
    ciErrNum = clBuildProgram(rv_program, 0, NULL, "-cl-fast-relaxed-math", NULL, NULL);
-    if (ciErrNum != CL_SUCCESS)
-    {
+    if (ciErrNum != CL_SUCCESS) {
        // write out standard error, Build Log and PTX, then return error
        shrLogEx(LOGBOTH | ERRORMSG, ciErrNum, STDERROR);
        oclLogBuildInfo(rv_program, oclGetFirstDev(cxGPUContext));
@@ -331,13 +333,13 @@ int runTest( const int argc, const char** argv)
    // Run Naive Kernel
 #ifdef GPU_PROFILING
    // Matrix Copy kernel runs to measure reference performance.
-    double uncoalescedCopyTime = transposeGPU("uncoalesced_copy", false, ciDeviceCount, h_idata, h_odata, size_x, size_y);
-    double simpleCopyTime = transposeGPU("simple_copy", false, ciDeviceCount, h_idata, h_odata, size_x, size_y);
-    double sharedCopyTime = transposeGPU("shared_copy", true, ciDeviceCount, h_idata, h_odata, size_x, size_y);
+    //double uncoalescedCopyTime = transposeGPU("uncoalesced_copy", false, ciDeviceCount, h_idata, h_odata, size_x, size_y);
+    //double simpleCopyTime = transposeGPU("simple_copy", false, ciDeviceCount, h_idata, h_odata, size_x, size_y);
+    //double sharedCopyTime = transposeGPU("shared_copy", true, ciDeviceCount, h_idata, h_odata, size_x, size_y);
 #endif

    double naiveTime = transposeGPU("transpose_naive", false, ciDeviceCount, h_idata, h_odata, size_x, size_y);
-    double optimizedTime = transposeGPU("transpose", true, ciDeviceCount, h_idata, h_odata, size_x, size_y);
+    //double optimizedTime = transposeGPU("transpose", true, ciDeviceCount, h_idata, h_odata, size_x, size_y);

 #ifdef GPU_PROFILING
    // log times
@@ -369,8 +371,8 @@ int runTest( const int argc, const char** argv)
    free(h_idata);
    free(h_odata);
    free(reference);
-    free(source);
-    free(source_path);
+    //free(source);
+    //free(source_path);

    // cleanup OpenCL
    ciErrNum = clReleaseProgram(rv_program);    
@@ -379,7 +381,7 @@ int runTest( const int argc, const char** argv)
        ciErrNum |= clReleaseCommandQueue(commandQueue[i]);
    }    
    ciErrNum |= clReleaseContext(cxGPUContext);
-    //oclCheckError(ciErrNum, CL_SUCCESS);
+    oclCheckError(ciErrNum, CL_SUCCESS);

    // pass or fail (cumulative... all tests in the loop)
    shrQAFinishExit(argc, (const char **)argv, (1 == res) ? QA_PASSED : QA_FAILED);