transpose
This commit is contained in:
committed by
GitHub Enterprise
parent
59a5d45cbf
commit
80e290c500
365
benchmarks/opencl/transpose/main.cc
Normal file
365
benchmarks/opencl/transpose/main.cc
Normal file
@@ -0,0 +1,365 @@
|
||||
/*
|
||||
* Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
|
||||
*
|
||||
* Please refer to the NVIDIA end user license agreement (EULA) associated
|
||||
* with this source code for terms and conditions that govern your use of
|
||||
* this software. Any use, reproduction, disclosure, or distribution of
|
||||
* this software and related documentation outside the terms of the EULA
|
||||
* is strictly prohibited.
|
||||
*
|
||||
*/
|
||||
|
||||
/* Matrix transpose with OpenCL
|
||||
* Host code.
|
||||
|
||||
* This example transposes arbitrary-size matrices. It compares a naive
|
||||
* transpose kernel that suffers from non-coalesced writes, to an optimized
|
||||
* transpose with fully coalesced memory access and no bank conflicts. On
|
||||
* a G80 GPU, the optimized transpose can be more than 10x faster for large
|
||||
* matrices.
|
||||
*/
|
||||
|
||||
// standard utility and system includes
|
||||
#include <oclUtils.h>
|
||||
#include <shrQATest.h>
|
||||
|
||||
#define BLOCK_DIM 16
|
||||
|
||||
// max GPU's to manage for multi-GPU parallel compute
|
||||
const unsigned int MAX_GPU_COUNT = 8;
|
||||
|
||||
// global variables
|
||||
cl_platform_id cpPlatform;
|
||||
cl_uint uiNumDevices;
|
||||
cl_device_id* cdDevices;
|
||||
cl_context cxGPUContext;
|
||||
cl_kernel ckKernel[MAX_GPU_COUNT];
|
||||
cl_command_queue commandQueue[MAX_GPU_COUNT];
|
||||
cl_program rv_program;
|
||||
|
||||
// forward declarations
|
||||
// *********************************************************************
|
||||
int runTest( int argc, const char** argv);
|
||||
extern "C" void computeGold( float* reference, float* idata,
|
||||
const unsigned int size_x, const unsigned int size_y );
|
||||
|
||||
// Main Program
|
||||
// *********************************************************************
|
||||
int main( int argc, const char** argv)
|
||||
{
|
||||
shrQAStart(argc, (char **)argv);
|
||||
|
||||
// set logfile name and start logs
|
||||
shrSetLogFileName ("oclTranspose.txt");
|
||||
shrLog("%s Starting...\n\n", argv[0]);
|
||||
|
||||
// run the main test
|
||||
int result = runTest(argc, argv);
|
||||
//oclCheckError(result, 0);
|
||||
}
|
||||
|
||||
double transposeGPU(const char* kernelName, bool useLocalMem, cl_uint ciDeviceCount, float* h_idata, float* h_odata, unsigned int size_x, unsigned int size_y)
|
||||
{
|
||||
cl_mem d_odata[MAX_GPU_COUNT];
|
||||
cl_mem d_idata[MAX_GPU_COUNT];
|
||||
cl_kernel ckKernel[MAX_GPU_COUNT];
|
||||
|
||||
size_t szGlobalWorkSize[2];
|
||||
size_t szLocalWorkSize[2];
|
||||
cl_int ciErrNum;
|
||||
|
||||
// Create buffers for each GPU
|
||||
// Each GPU will compute sizePerGPU rows of the result
|
||||
size_t sizePerGPU = shrRoundUp(BLOCK_DIM, (size_x+ciDeviceCount-1) / ciDeviceCount);
|
||||
|
||||
// size of memory required to store the matrix
|
||||
const size_t mem_size = sizeof(float) * size_x * size_y;
|
||||
|
||||
for(unsigned int i = 0; i < ciDeviceCount; ++i){
|
||||
// allocate device memory and copy host to device memory
|
||||
d_idata[i] = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
|
||||
mem_size, h_idata, &ciErrNum);
|
||||
//oclCheckError(ciErrNum, CL_SUCCESS);
|
||||
|
||||
// create buffer to store output
|
||||
d_odata[i] = clCreateBuffer(cxGPUContext, CL_MEM_WRITE_ONLY ,
|
||||
sizePerGPU*size_y*sizeof(float), NULL, &ciErrNum);
|
||||
//oclCheckError(ciErrNum, CL_SUCCESS);
|
||||
|
||||
// create the naive transpose kernel
|
||||
ckKernel[i] = clCreateKernel(rv_program, kernelName, &ciErrNum);
|
||||
//oclCheckError(ciErrNum, CL_SUCCESS);
|
||||
|
||||
// set the args values for the naive kernel
|
||||
size_t offset = i * sizePerGPU;
|
||||
ciErrNum = clSetKernelArg(ckKernel[i], 0, sizeof(cl_mem), (void *) &d_odata[i]);
|
||||
ciErrNum |= clSetKernelArg(ckKernel[i], 1, sizeof(cl_mem), (void *) &d_idata[0]);
|
||||
ciErrNum |= clSetKernelArg(ckKernel[i], 2, sizeof(int), &offset);
|
||||
ciErrNum |= clSetKernelArg(ckKernel[i], 3, sizeof(int), &size_x);
|
||||
ciErrNum |= clSetKernelArg(ckKernel[i], 4, sizeof(int), &size_y);
|
||||
if(useLocalMem)
|
||||
{
|
||||
ciErrNum |= clSetKernelArg(ckKernel[i], 5, (BLOCK_DIM + 1) * BLOCK_DIM * sizeof(float), 0 );
|
||||
}
|
||||
}
|
||||
//oclCheckError(ciErrNum, CL_SUCCESS);
|
||||
|
||||
// set up execution configuration
|
||||
szLocalWorkSize[0] = BLOCK_DIM;
|
||||
szLocalWorkSize[1] = BLOCK_DIM;
|
||||
szGlobalWorkSize[0] = sizePerGPU;
|
||||
szGlobalWorkSize[1] = shrRoundUp(BLOCK_DIM, size_y);
|
||||
|
||||
// execute the kernel numIterations times
|
||||
int numIterations = 100;
|
||||
shrLog("\nProcessing a %d by %d matrix of floats...\n\n", size_x, size_y);
|
||||
for (int i = -1; i < numIterations; ++i)
|
||||
{
|
||||
// Start time measurement after warmup
|
||||
if( i == 0 ) shrDeltaT(0);
|
||||
|
||||
for(unsigned int k=0; k < ciDeviceCount; ++k){
|
||||
ciErrNum |= clEnqueueNDRangeKernel(commandQueue[k], ckKernel[k], 2, NULL,
|
||||
szGlobalWorkSize, szLocalWorkSize, 0, NULL, NULL);
|
||||
}
|
||||
//oclCheckError(ciErrNum, CL_SUCCESS);
|
||||
}
|
||||
|
||||
// Block CPU till GPU is done
|
||||
for(unsigned int k=0; k < ciDeviceCount; ++k){
|
||||
ciErrNum |= clFinish(commandQueue[k]);
|
||||
}
|
||||
double time = shrDeltaT(0)/(double)numIterations;
|
||||
//oclCheckError(ciErrNum, CL_SUCCESS);
|
||||
|
||||
// Copy back to host
|
||||
for(unsigned int i = 0; i < ciDeviceCount; ++i){
|
||||
size_t offset = i * sizePerGPU;
|
||||
size_t size = MIN(size_x - i * sizePerGPU, sizePerGPU);
|
||||
|
||||
ciErrNum |= clEnqueueReadBuffer(commandQueue[i], d_odata[i], CL_TRUE, 0,
|
||||
size * size_y * sizeof(float), &h_odata[offset * size_y],
|
||||
0, NULL, NULL);
|
||||
}
|
||||
//oclCheckError(ciErrNum, CL_SUCCESS);
|
||||
|
||||
for(unsigned int i = 0; i < ciDeviceCount; ++i){
|
||||
ciErrNum |= clReleaseMemObject(d_idata[i]);
|
||||
ciErrNum |= clReleaseMemObject(d_odata[i]);
|
||||
ciErrNum |= clReleaseKernel(ckKernel[i]);
|
||||
}
|
||||
//oclCheckError(ciErrNum, CL_SUCCESS);
|
||||
|
||||
return time;
|
||||
}
|
||||
|
||||
//! Run a simple test for OpenCL
|
||||
// *********************************************************************
|
||||
int runTest( const int argc, const char** argv)
|
||||
{
|
||||
cl_int ciErrNum;
|
||||
cl_uint ciDeviceCount;
|
||||
unsigned int size_x = 2048;
|
||||
unsigned int size_y = 2048;
|
||||
|
||||
int temp;
|
||||
if( shrGetCmdLineArgumenti( argc, argv,"width", &temp) ){
|
||||
size_x = temp;
|
||||
}
|
||||
|
||||
if( shrGetCmdLineArgumenti( argc, argv,"height", &temp) ){
|
||||
size_y = temp;
|
||||
}
|
||||
|
||||
// size of memory required to store the matrix
|
||||
const size_t mem_size = sizeof(float) * size_x * size_y;
|
||||
|
||||
//Get the NVIDIA platform
|
||||
ciErrNum = oclGetPlatformID(&cpPlatform);
|
||||
//oclCheckError(ciErrNum, CL_SUCCESS);
|
||||
|
||||
//Get the devices
|
||||
ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_DEFAULT, 0, NULL, &uiNumDevices);
|
||||
//oclCheckError(ciErrNum, CL_SUCCESS);
|
||||
cdDevices = (cl_device_id *)malloc(uiNumDevices * sizeof(cl_device_id) );
|
||||
ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_DEFAULT, uiNumDevices, cdDevices, NULL);
|
||||
//oclCheckError(ciErrNum, CL_SUCCESS);
|
||||
|
||||
//Create the context
|
||||
cxGPUContext = clCreateContext(0, uiNumDevices, cdDevices, NULL, NULL, &ciErrNum);
|
||||
//oclCheckError(ciErrNum, CL_SUCCESS);
|
||||
|
||||
if(shrCheckCmdLineFlag(argc, (const char**)argv, "device"))
|
||||
{
|
||||
ciDeviceCount = 0;
|
||||
// User specified GPUs
|
||||
char* deviceList;
|
||||
char* deviceStr;
|
||||
|
||||
shrGetCmdLineArgumentstr(argc, (const char**)argv, "device", &deviceList);
|
||||
|
||||
#ifdef WIN32
|
||||
char* next_token;
|
||||
deviceStr = strtok_s (deviceList," ,.-", &next_token);
|
||||
#else
|
||||
deviceStr = strtok (deviceList," ,.-");
|
||||
#endif
|
||||
ciDeviceCount = 0;
|
||||
while(deviceStr != NULL)
|
||||
{
|
||||
// get and print the device for this queue
|
||||
cl_device_id device = oclGetDev(cxGPUContext, atoi(deviceStr));
|
||||
if( device == (cl_device_id)-1 ) {
|
||||
shrLog(" Invalid Device: %s\n\n", deviceStr);
|
||||
return -1;
|
||||
}
|
||||
|
||||
shrLog("Device %d: ", atoi(deviceStr));
|
||||
oclPrintDevName(LOGBOTH, device);
|
||||
shrLog("\n");
|
||||
|
||||
// create command queue
|
||||
commandQueue[ciDeviceCount] = clCreateCommandQueue(cxGPUContext, device, CL_QUEUE_PROFILING_ENABLE, &ciErrNum);
|
||||
if (ciErrNum != CL_SUCCESS)
|
||||
{
|
||||
shrLog(" Error %i in clCreateCommandQueue call !!!\n\n", ciErrNum);
|
||||
return ciErrNum;
|
||||
}
|
||||
|
||||
++ciDeviceCount;
|
||||
|
||||
#ifdef WIN32
|
||||
deviceStr = strtok_s (NULL," ,.-", &next_token);
|
||||
#else
|
||||
deviceStr = strtok (NULL," ,.-");
|
||||
#endif
|
||||
}
|
||||
|
||||
free(deviceList);
|
||||
}
|
||||
else
|
||||
{
|
||||
// Find out how many GPU's to compute on all available GPUs
|
||||
size_t nDeviceBytes;
|
||||
ciErrNum |= clGetContextInfo(cxGPUContext, CL_CONTEXT_DEVICES, 0, NULL, &nDeviceBytes);
|
||||
ciDeviceCount = (cl_uint)nDeviceBytes/sizeof(cl_device_id);
|
||||
|
||||
if (ciErrNum != CL_SUCCESS)
|
||||
{
|
||||
shrLog(" Error %i in clGetDeviceIDs call !!!\n\n", ciErrNum);
|
||||
return ciErrNum;
|
||||
}
|
||||
else if (ciDeviceCount == 0)
|
||||
{
|
||||
shrLog(" There are no devices supporting OpenCL (return code %i)\n\n", ciErrNum);
|
||||
return -1;
|
||||
}
|
||||
|
||||
// create command-queues
|
||||
for(unsigned int i = 0; i < ciDeviceCount; ++i)
|
||||
{
|
||||
// get and print the device for this queue
|
||||
cl_device_id device = oclGetDev(cxGPUContext, i);
|
||||
shrLog("Device %d: ", i);
|
||||
oclPrintDevName(LOGBOTH, device);
|
||||
shrLog("\n");
|
||||
|
||||
// create command queue
|
||||
commandQueue[i] = clCreateCommandQueue(cxGPUContext, device, CL_QUEUE_PROFILING_ENABLE, &ciErrNum);
|
||||
if (ciErrNum != CL_SUCCESS)
|
||||
{
|
||||
shrLog(" Error %i in clCreateCommandQueue call !!!\n\n", ciErrNum);
|
||||
return ciErrNum;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// allocate and initalize host memory
|
||||
float* h_idata = (float*)malloc(mem_size);
|
||||
float* h_odata = (float*) malloc(mem_size);
|
||||
srand(15235911);
|
||||
shrFillArray(h_idata, (size_x * size_y));
|
||||
|
||||
// Program Setup
|
||||
size_t program_length;
|
||||
char* source_path = shrFindFilePath("transpose.cl", argv[0]);
|
||||
//oclCheckError(source_path != NULL, shrTRUE);
|
||||
char *source = oclLoadProgSource(source_path, "", &program_length);
|
||||
//oclCheckError(source != NULL, shrTRUE);
|
||||
|
||||
// create the program
|
||||
rv_program =
|
||||
clCreateProgramWithBuiltInKernels(context, 1, &device_id, "transpose", NULL);
|
||||
//rv_program = clCreateProgramWithSource(cxGPUContext, 1,
|
||||
// (const char **)&source, &program_length, &ciErrNum);
|
||||
//oclCheckError(ciErrNum, CL_SUCCESS);
|
||||
|
||||
// build the program
|
||||
ciErrNum = clBuildProgram(rv_program, 0, NULL, "-cl-fast-relaxed-math", NULL, NULL);
|
||||
if (ciErrNum != CL_SUCCESS)
|
||||
{
|
||||
// write out standard error, Build Log and PTX, then return error
|
||||
shrLogEx(LOGBOTH | ERRORMSG, ciErrNum, STDERROR);
|
||||
oclLogBuildInfo(rv_program, oclGetFirstDev(cxGPUContext));
|
||||
oclLogPtx(rv_program, oclGetFirstDev(cxGPUContext), "oclTranspose.ptx");
|
||||
return(EXIT_FAILURE);
|
||||
}
|
||||
|
||||
// Run Naive Kernel
|
||||
#ifdef GPU_PROFILING
|
||||
// Matrix Copy kernel runs to measure reference performance.
|
||||
double uncoalescedCopyTime = transposeGPU("uncoalesced_copy", false, ciDeviceCount, h_idata, h_odata, size_x, size_y);
|
||||
double simpleCopyTime = transposeGPU("simple_copy", false, ciDeviceCount, h_idata, h_odata, size_x, size_y);
|
||||
double sharedCopyTime = transposeGPU("shared_copy", true, ciDeviceCount, h_idata, h_odata, size_x, size_y);
|
||||
#endif
|
||||
|
||||
double naiveTime = transposeGPU("transpose_naive", false, ciDeviceCount, h_idata, h_odata, size_x, size_y);
|
||||
double optimizedTime = transposeGPU("transpose", true, ciDeviceCount, h_idata, h_odata, size_x, size_y);
|
||||
|
||||
#ifdef GPU_PROFILING
|
||||
// log times
|
||||
|
||||
shrLogEx(LOGBOTH | MASTER, 0, "oclTranspose-Outer-simple copy, Throughput = %.4f GB/s, Time = %.5f s, Size = %u fp32 elements, NumDevsUsed = %u, Workgroup = %u\n",
|
||||
(1.0e-9 * double(size_x * size_y * sizeof(float))/simpleCopyTime), simpleCopyTime, (size_x * size_y), ciDeviceCount, BLOCK_DIM * BLOCK_DIM);
|
||||
|
||||
shrLogEx(LOGBOTH | MASTER, 0, "oclTranspose-Outer-shared memory copy, Throughput = %.4f GB/s, Time = %.5f s, Size = %u fp32 elements, NumDevsUsed = %u, Workgroup = %u\n",
|
||||
(1.0e-9 * double(size_x * size_y * sizeof(float))/sharedCopyTime), sharedCopyTime, (size_x * size_y), ciDeviceCount, BLOCK_DIM * BLOCK_DIM);
|
||||
|
||||
shrLogEx(LOGBOTH | MASTER, 0, "oclTranspose-Outer-uncoalesced copy, Throughput = %.4f GB/s, Time = %.5f s, Size = %u fp32 elements, NumDevsUsed = %u, Workgroup = %u\n",
|
||||
(1.0e-9 * double(size_x * size_y * sizeof(float))/uncoalescedCopyTime), uncoalescedCopyTime, (size_x * size_y), ciDeviceCount, BLOCK_DIM * BLOCK_DIM);
|
||||
|
||||
shrLogEx(LOGBOTH | MASTER, 0, "oclTranspose-Outer-naive, Throughput = %.4f GB/s, Time = %.5f s, Size = %u fp32 elements, NumDevsUsed = %u, Workgroup = %u\n",
|
||||
(1.0e-9 * double(size_x * size_y * sizeof(float))/naiveTime), naiveTime, (size_x * size_y), ciDeviceCount, BLOCK_DIM * BLOCK_DIM);
|
||||
|
||||
shrLogEx(LOGBOTH | MASTER, 0, "oclTranspose-Outer-optimized, Throughput = %.4f GB/s, Time = %.5f s, Size = %u fp32 elements, NumDevsUsed = %u, Workgroup = %u\n",
|
||||
(1.0e-9 * double(size_x * size_y * sizeof(float))/optimizedTime), optimizedTime, (size_x * size_y), ciDeviceCount, BLOCK_DIM * BLOCK_DIM);
|
||||
|
||||
#endif
|
||||
|
||||
// compute reference solution and cross check results
|
||||
float* reference = (float*)malloc( mem_size);
|
||||
computeGold( reference, h_idata, size_x, size_y);
|
||||
shrLog("\nComparing results with CPU computation... \n\n");
|
||||
shrBOOL res = shrComparef( reference, h_odata, size_x * size_y);
|
||||
|
||||
// cleanup memory
|
||||
free(h_idata);
|
||||
free(h_odata);
|
||||
free(reference);
|
||||
free(source);
|
||||
free(source_path);
|
||||
|
||||
// cleanup OpenCL
|
||||
ciErrNum = clReleaseProgram(rv_program);
|
||||
for(unsigned int i = 0; i < ciDeviceCount; ++i)
|
||||
{
|
||||
ciErrNum |= clReleaseCommandQueue(commandQueue[i]);
|
||||
}
|
||||
ciErrNum |= clReleaseContext(cxGPUContext);
|
||||
//oclCheckError(ciErrNum, CL_SUCCESS);
|
||||
|
||||
// pass or fail (cumulative... all tests in the loop)
|
||||
shrQAFinishExit(argc, (const char **)argv, (1 == res) ? QA_PASSED : QA_FAILED);
|
||||
|
||||
return 0;
|
||||
}
|
||||
Reference in New Issue
Block a user