+ Microarchitecture optimizations + 64-bit support + Xilinx FPGA support + LLVM-16 support + Refactoring and quality control fixes minor update minor update minor update minor update minor update minor update cleanup cleanup cache bindings and memory perf refactory minor update minor update hw unit tests fixes minor update minor update minor update minor update minor update minor udpate minor update minor update minor update minor update minor update minor update minor update minor updates minor updates minor update minor update minor update minor update minor update minor update minor updates minor updates minor updates minor updates minor update minor update
391 lines
15 KiB
C++
391 lines
15 KiB
C++
/*
|
|
* Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
|
|
*
|
|
* Please refer to the NVIDIA end user license agreement (EULA) associated
|
|
* with this source code for terms and conditions that govern your use of
|
|
* this software. Any use, reproduction, disclosure, or distribution of
|
|
* this software and related documentation outside the terms of the EULA
|
|
* is strictly prohibited.
|
|
*
|
|
*/
|
|
|
|
/* Matrix transpose with Cuda
|
|
* Host code.
|
|
|
|
* This example transposes arbitrary-size matrices. It compares a naive
|
|
* transpose kernel that suffers from non-coalesced writes, to an optimized
|
|
* transpose with fully coalesced memory access and no bank conflicts. On
|
|
* a G80 GPU, the optimized transpose can be more than 10x faster for large
|
|
* matrices.
|
|
*/
|
|
|
|
// standard utility and system includes
|
|
#include "oclUtils.h"
|
|
#include "shrQATest.h"
|
|
|
|
#define BLOCK_DIM 16
|
|
|
|
// max GPU's to manage for multi-GPU parallel compute
|
|
const unsigned int MAX_GPU_COUNT = 8;
|
|
|
|
// global variables
|
|
cl_platform_id cpPlatform;
|
|
cl_uint uiNumDevices;
|
|
cl_device_id* cdDevices;
|
|
cl_context cxGPUContext;
|
|
cl_kernel ckKernel[MAX_GPU_COUNT];
|
|
cl_command_queue commandQueue[MAX_GPU_COUNT];
|
|
cl_program rv_program;
|
|
|
|
// forward declarations
|
|
// *********************************************************************
|
|
int runTest( int argc, const char** argv);
|
|
extern "C" void computeGold( float* reference, float* idata,
|
|
const unsigned int size_x, const unsigned int size_y );
|
|
|
|
// Main Program
|
|
// *********************************************************************
|
|
int main( int argc, const char** argv)
|
|
{
|
|
shrQAStart(argc, (char **)argv);
|
|
|
|
// set logfile name and start logs
|
|
shrSetLogFileName ("oclTranspose.txt");
|
|
shrLog("%s Starting...\n\n", argv[0]);
|
|
|
|
// run the main test
|
|
int result = runTest(argc, argv);
|
|
oclCheckError(result, 0);
|
|
}
|
|
|
|
static double transposeGPU(const char* kernelName, bool useLocalMem, cl_uint ciDeviceCount, float* h_idata, float* h_odata, unsigned int size_x, unsigned int size_y)
|
|
{
|
|
cl_mem d_odata[MAX_GPU_COUNT];
|
|
cl_mem d_idata[MAX_GPU_COUNT];
|
|
cl_kernel ckKernel[MAX_GPU_COUNT];
|
|
|
|
size_t szGlobalWorkSize[2];
|
|
size_t szLocalWorkSize[2];
|
|
cl_int ciErrNum;
|
|
|
|
// Create buffers for each GPU
|
|
// Each GPU will compute sizePerGPU rows of the result
|
|
size_t sizePerGPU = shrRoundUp(BLOCK_DIM, (size_x+ciDeviceCount-1) / ciDeviceCount);
|
|
|
|
// size of memory required to store the matrix
|
|
const size_t mem_size = sizeof(float) * size_x * size_y;
|
|
|
|
for(unsigned int i = 0; i < ciDeviceCount; ++i){
|
|
// allocate device memory and copy host to device memory
|
|
d_idata[i] = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
|
|
mem_size, h_idata, &ciErrNum);
|
|
oclCheckError(ciErrNum, CL_SUCCESS);
|
|
|
|
// create buffer to store output
|
|
d_odata[i] = clCreateBuffer(cxGPUContext, CL_MEM_WRITE_ONLY ,
|
|
sizePerGPU*size_y*sizeof(float), NULL, &ciErrNum);
|
|
oclCheckError(ciErrNum, CL_SUCCESS);
|
|
|
|
// create the naive transpose kernel
|
|
ckKernel[i] = clCreateKernel(rv_program, kernelName, &ciErrNum);
|
|
oclCheckError(ciErrNum, CL_SUCCESS);
|
|
|
|
// set the args values for the naive kernel
|
|
size_t offset = i * sizePerGPU;
|
|
ciErrNum = clSetKernelArg(ckKernel[i], 0, sizeof(cl_mem), (void *) &d_odata[i]);
|
|
ciErrNum |= clSetKernelArg(ckKernel[i], 1, sizeof(cl_mem), (void *) &d_idata[0]);
|
|
ciErrNum |= clSetKernelArg(ckKernel[i], 2, sizeof(int), &offset);
|
|
ciErrNum |= clSetKernelArg(ckKernel[i], 3, sizeof(int), &size_x);
|
|
ciErrNum |= clSetKernelArg(ckKernel[i], 4, sizeof(int), &size_y);
|
|
if (useLocalMem) {
|
|
ciErrNum |= clSetKernelArg(ckKernel[i], 5, (BLOCK_DIM + 1) * BLOCK_DIM * sizeof(float), 0 );
|
|
}
|
|
}
|
|
oclCheckError(ciErrNum, CL_SUCCESS);
|
|
|
|
// set up execution configuration
|
|
szLocalWorkSize[0] = BLOCK_DIM;
|
|
szLocalWorkSize[1] = BLOCK_DIM;
|
|
szGlobalWorkSize[0] = sizePerGPU;
|
|
szGlobalWorkSize[1] = shrRoundUp(BLOCK_DIM, size_y);
|
|
|
|
// execute the kernel numIterations times
|
|
//int numIterations = 100;
|
|
int numIterations = 1;
|
|
shrLog("\nProcessing a %d by %d matrix of floats...\n\n", size_x, size_y);
|
|
for (int i = -1; i < numIterations; ++i) {
|
|
if (i == 0)
|
|
shrDeltaT(0);
|
|
for (unsigned int k=0; k < ciDeviceCount; ++k) {
|
|
ciErrNum |= clEnqueueNDRangeKernel(commandQueue[k], ckKernel[k], 2, NULL, szGlobalWorkSize, szLocalWorkSize, 0, NULL, NULL);
|
|
}
|
|
oclCheckError(ciErrNum, CL_SUCCESS);
|
|
}
|
|
|
|
// Block CPU till GPU is done
|
|
for(unsigned int k=0; k < ciDeviceCount; ++k){
|
|
ciErrNum |= clFinish(commandQueue[k]);
|
|
}
|
|
double time = shrDeltaT(0)/(double)numIterations;
|
|
oclCheckError(ciErrNum, CL_SUCCESS);
|
|
|
|
// Copy back to host
|
|
for(unsigned int i = 0; i < ciDeviceCount; ++i){
|
|
size_t offset = i * sizePerGPU;
|
|
size_t size = MIN(size_x - i * sizePerGPU, sizePerGPU);
|
|
|
|
ciErrNum |= clEnqueueReadBuffer(commandQueue[i], d_odata[i], CL_TRUE, 0,
|
|
size * size_y * sizeof(float), &h_odata[offset * size_y],
|
|
0, NULL, NULL);
|
|
}
|
|
oclCheckError(ciErrNum, CL_SUCCESS);
|
|
|
|
for(unsigned int i = 0; i < ciDeviceCount; ++i){
|
|
ciErrNum |= clReleaseMemObject(d_idata[i]);
|
|
ciErrNum |= clReleaseMemObject(d_odata[i]);
|
|
ciErrNum |= clReleaseKernel(ckKernel[i]);
|
|
}
|
|
oclCheckError(ciErrNum, CL_SUCCESS);
|
|
|
|
return time;
|
|
}
|
|
|
|
uint8_t *kernel_bin = NULL;
|
|
|
|
static int read_kernel_file(const char* filename, uint8_t** data, size_t* size) {
|
|
if (nullptr == filename || nullptr == data || 0 == size)
|
|
return -1;
|
|
|
|
FILE* fp = fopen(filename, "r");
|
|
if (NULL == fp) {
|
|
fprintf(stderr, "Failed to load kernel.");
|
|
return -1;
|
|
}
|
|
fseek(fp , 0 , SEEK_END);
|
|
long fsize = ftell(fp);
|
|
rewind(fp);
|
|
|
|
*data = (uint8_t*)malloc(fsize);
|
|
*size = fread(*data, 1, fsize, fp);
|
|
|
|
fclose(fp);
|
|
|
|
return 0;
|
|
}
|
|
|
|
//! Run a simple test for CUDA
|
|
// *********************************************************************
|
|
int runTest( const int argc, const char** argv)
|
|
{
|
|
cl_int ciErrNum;
|
|
cl_uint ciDeviceCount;
|
|
//unsigned int size_x = 2048;
|
|
//unsigned int size_y = 2048;
|
|
unsigned int size_x = 64;
|
|
unsigned int size_y = 64;
|
|
|
|
int temp;
|
|
if( shrGetCmdLineArgumenti( argc, argv,"width", &temp) ){
|
|
size_x = temp;
|
|
}
|
|
|
|
if( shrGetCmdLineArgumenti( argc, argv,"height", &temp) ){
|
|
size_y = temp;
|
|
}
|
|
|
|
// size of memory required to store the matrix
|
|
const size_t mem_size = sizeof(float) * size_x * size_y;
|
|
|
|
//Get the NVIDIA platform
|
|
ciErrNum = oclGetPlatformID(&cpPlatform);
|
|
oclCheckError(ciErrNum, CL_SUCCESS);
|
|
|
|
//Get the devices
|
|
ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_DEFAULT, 0, NULL, &uiNumDevices);
|
|
oclCheckError(ciErrNum, CL_SUCCESS);
|
|
cdDevices = (cl_device_id *)malloc(uiNumDevices * sizeof(cl_device_id) );
|
|
ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_DEFAULT, uiNumDevices, cdDevices, NULL);
|
|
oclCheckError(ciErrNum, CL_SUCCESS);
|
|
|
|
//Create the context
|
|
cxGPUContext = clCreateContext(0, uiNumDevices, cdDevices, NULL, NULL, &ciErrNum);
|
|
oclCheckError(ciErrNum, CL_SUCCESS);
|
|
|
|
if(shrCheckCmdLineFlag(argc, (const char**)argv, "device"))
|
|
{
|
|
ciDeviceCount = 0;
|
|
// User specified GPUs
|
|
char* deviceList;
|
|
char* deviceStr;
|
|
|
|
shrGetCmdLineArgumentstr(argc, (const char**)argv, "device", &deviceList);
|
|
|
|
#ifdef WIN32
|
|
char* next_token;
|
|
deviceStr = strtok_s (deviceList," ,.-", &next_token);
|
|
#else
|
|
deviceStr = strtok (deviceList," ,.-");
|
|
#endif
|
|
ciDeviceCount = 0;
|
|
while(deviceStr != NULL)
|
|
{
|
|
// get and print the device for this queue
|
|
cl_device_id device = oclGetDev(cxGPUContext, atoi(deviceStr));
|
|
if( device == (cl_device_id)-1 ) {
|
|
shrLog(" Invalid Device: %s\n\n", deviceStr);
|
|
return -1;
|
|
}
|
|
|
|
shrLog("Device %d: ", atoi(deviceStr));
|
|
oclPrintDevName(LOGBOTH, device);
|
|
shrLog("\n");
|
|
|
|
// create command queue
|
|
commandQueue[ciDeviceCount] = clCreateCommandQueue(cxGPUContext, device, CL_QUEUE_PROFILING_ENABLE, &ciErrNum);
|
|
if (ciErrNum != CL_SUCCESS)
|
|
{
|
|
shrLog(" Error %i in clCreateCommandQueue call !!!\n\n", ciErrNum);
|
|
return ciErrNum;
|
|
}
|
|
|
|
++ciDeviceCount;
|
|
|
|
#ifdef WIN32
|
|
deviceStr = strtok_s (NULL," ,.-", &next_token);
|
|
#else
|
|
deviceStr = strtok (NULL," ,.-");
|
|
#endif
|
|
}
|
|
|
|
free(deviceList);
|
|
}
|
|
else
|
|
{
|
|
// Find out how many GPU's to compute on all available GPUs
|
|
size_t nDeviceBytes;
|
|
ciErrNum |= clGetContextInfo(cxGPUContext, CL_CONTEXT_DEVICES, 0, NULL, &nDeviceBytes);
|
|
ciDeviceCount = (cl_uint)nDeviceBytes/sizeof(cl_device_id);
|
|
|
|
if (ciErrNum != CL_SUCCESS)
|
|
{
|
|
shrLog(" Error %i in clGetDeviceIDs call !!!\n\n", ciErrNum);
|
|
return ciErrNum;
|
|
}
|
|
else if (ciDeviceCount == 0)
|
|
{
|
|
shrLog(" There are no devices supporting OpenCL (return code %i)\n\n", ciErrNum);
|
|
return -1;
|
|
}
|
|
|
|
// create command-queues
|
|
for(unsigned int i = 0; i < ciDeviceCount; ++i)
|
|
{
|
|
// get and print the device for this queue
|
|
cl_device_id device = oclGetDev(cxGPUContext, i);
|
|
shrLog("Device %d: ", i);
|
|
oclPrintDevName(LOGBOTH, device);
|
|
shrLog("\n");
|
|
|
|
// create command queue
|
|
commandQueue[i] = clCreateCommandQueue(cxGPUContext, device, CL_QUEUE_PROFILING_ENABLE, &ciErrNum);
|
|
if (ciErrNum != CL_SUCCESS)
|
|
{
|
|
shrLog(" Error %i in clCreateCommandQueue call !!!\n\n", ciErrNum);
|
|
return ciErrNum;
|
|
}
|
|
}
|
|
}
|
|
|
|
// allocate and initalize host memory
|
|
float* h_idata = (float*)malloc(mem_size);
|
|
float* h_odata = (float*) malloc(mem_size);
|
|
srand(15235911);
|
|
shrFillArray(h_idata, (size_x * size_y));
|
|
|
|
// create the program
|
|
//rv_program = clCreateProgramWithSource(cxGPUContext, 1,
|
|
// (const char **)&source, &program_length, &ciErrNum);
|
|
uint8_t *kernel_bin = NULL;
|
|
size_t kernel_size;
|
|
cl_int binary_status = 0;
|
|
ciErrNum = read_kernel_file("kernel.pocl", &kernel_bin, &kernel_size);
|
|
if (ciErrNum != CL_SUCCESS) {
|
|
shrLog(" Error %i in read_kernel_file call !!!\n\n", ciErrNum);
|
|
return ciErrNum;
|
|
}
|
|
rv_program = clCreateProgramWithBinary(
|
|
cxGPUContext, 1, cdDevices, &kernel_size, (const uint8_t**)&kernel_bin, &binary_status, &ciErrNum);
|
|
if (ciErrNum != CL_SUCCESS) {
|
|
shrLog(" Error %i in clCreateProgramWithBinary call !!!\n\n", ciErrNum);
|
|
return ciErrNum;
|
|
}
|
|
|
|
// build the program
|
|
ciErrNum = clBuildProgram(rv_program, 0, NULL, "-cl-fast-relaxed-math", NULL, NULL);
|
|
if (ciErrNum != CL_SUCCESS) {
|
|
// write out standard error, Build Log and PTX, then return error
|
|
shrLogEx(LOGBOTH | ERRORMSG, ciErrNum, STDERROR);
|
|
oclLogBuildInfo(rv_program, oclGetFirstDev(cxGPUContext));
|
|
oclLogPtx(rv_program, oclGetFirstDev(cxGPUContext), "oclTranspose.ptx");
|
|
return(EXIT_FAILURE);
|
|
}
|
|
|
|
// Run Naive Kernel
|
|
#ifdef GPU_PROFILING
|
|
// Matrix Copy kernel runs to measure reference performance.
|
|
//double uncoalescedCopyTime = transposeGPU("uncoalesced_copy", false, ciDeviceCount, h_idata, h_odata, size_x, size_y);
|
|
//double simpleCopyTime = transposeGPU("simple_copy", false, ciDeviceCount, h_idata, h_odata, size_x, size_y);
|
|
//double sharedCopyTime = transposeGPU("shared_copy", true, ciDeviceCount, h_idata, h_odata, size_x, size_y);
|
|
#endif
|
|
|
|
double naiveTime = transposeGPU("transpose_naive", false, ciDeviceCount, h_idata, h_odata, size_x, size_y);
|
|
//double optimizedTime = transposeGPU("transpose", true, ciDeviceCount, h_idata, h_odata, size_x, size_y);
|
|
|
|
#ifdef GPU_PROFILING
|
|
// log times
|
|
|
|
shrLogEx(LOGBOTH | MASTER, 0, "oclTranspose-Outer-simple copy, Throughput = %.4f GB/s, Time = %.5f s, Size = %u fp32 elements, NumDevsUsed = %u, Workgroup = %u\n",
|
|
(1.0e-9 * double(size_x * size_y * sizeof(float))/simpleCopyTime), simpleCopyTime, (size_x * size_y), ciDeviceCount, BLOCK_DIM * BLOCK_DIM);
|
|
|
|
shrLogEx(LOGBOTH | MASTER, 0, "oclTranspose-Outer-shared memory copy, Throughput = %.4f GB/s, Time = %.5f s, Size = %u fp32 elements, NumDevsUsed = %u, Workgroup = %u\n",
|
|
(1.0e-9 * double(size_x * size_y * sizeof(float))/sharedCopyTime), sharedCopyTime, (size_x * size_y), ciDeviceCount, BLOCK_DIM * BLOCK_DIM);
|
|
|
|
shrLogEx(LOGBOTH | MASTER, 0, "oclTranspose-Outer-uncoalesced copy, Throughput = %.4f GB/s, Time = %.5f s, Size = %u fp32 elements, NumDevsUsed = %u, Workgroup = %u\n",
|
|
(1.0e-9 * double(size_x * size_y * sizeof(float))/uncoalescedCopyTime), uncoalescedCopyTime, (size_x * size_y), ciDeviceCount, BLOCK_DIM * BLOCK_DIM);
|
|
|
|
shrLogEx(LOGBOTH | MASTER, 0, "oclTranspose-Outer-naive, Throughput = %.4f GB/s, Time = %.5f s, Size = %u fp32 elements, NumDevsUsed = %u, Workgroup = %u\n",
|
|
(1.0e-9 * double(size_x * size_y * sizeof(float))/naiveTime), naiveTime, (size_x * size_y), ciDeviceCount, BLOCK_DIM * BLOCK_DIM);
|
|
|
|
shrLogEx(LOGBOTH | MASTER, 0, "oclTranspose-Outer-optimized, Throughput = %.4f GB/s, Time = %.5f s, Size = %u fp32 elements, NumDevsUsed = %u, Workgroup = %u\n",
|
|
(1.0e-9 * double(size_x * size_y * sizeof(float))/optimizedTime), optimizedTime, (size_x * size_y), ciDeviceCount, BLOCK_DIM * BLOCK_DIM);
|
|
|
|
#endif
|
|
|
|
// compute reference solution and cross check results
|
|
float* reference = (float*)malloc( mem_size);
|
|
computeGold( reference, h_idata, size_x, size_y);
|
|
shrLog("\nComparing results with CPU computation... \n\n");
|
|
shrBOOL res = shrComparef( reference, h_odata, size_x * size_y);
|
|
|
|
// cleanup memory
|
|
free(h_idata);
|
|
free(h_odata);
|
|
free(reference);
|
|
//free(source);
|
|
//free(source_path);
|
|
|
|
// cleanup OpenCL
|
|
ciErrNum = clReleaseProgram(rv_program);
|
|
for(unsigned int i = 0; i < ciDeviceCount; ++i)
|
|
{
|
|
ciErrNum |= clReleaseCommandQueue(commandQueue[i]);
|
|
}
|
|
ciErrNum |= clReleaseContext(cxGPUContext);
|
|
oclCheckError(ciErrNum, CL_SUCCESS);
|
|
|
|
// pass or fail (cumulative... all tests in the loop)
|
|
shrQAFinishExit(argc, (const char **)argv, (1 == res) ? QA_PASSED : QA_FAILED);
|
|
|
|
return 0;
|
|
}
|