Vortex 2.0 changes:
+ Microarchitecture optimizations + 64-bit support + Xilinx FPGA support + LLVM-16 support + Refactoring and quality control fixes minor update minor update minor update minor update minor update minor update cleanup cleanup cache bindings and memory perf refactory minor update minor update hw unit tests fixes minor update minor update minor update minor update minor update minor udpate minor update minor update minor update minor update minor update minor update minor update minor updates minor updates minor update minor update minor update minor update minor update minor update minor updates minor updates minor updates minor updates minor update minor update
This commit is contained in:
@@ -55,10 +55,10 @@ int main( int argc, const char** argv)
|
||||
|
||||
// run the main test
|
||||
int result = runTest(argc, argv);
|
||||
//oclCheckError(result, 0);
|
||||
oclCheckError(result, 0);
|
||||
}
|
||||
|
||||
double transposeGPU(const char* kernelName, bool useLocalMem, cl_uint ciDeviceCount, float* h_idata, float* h_odata, unsigned int size_x, unsigned int size_y)
|
||||
static double transposeGPU(const char* kernelName, bool useLocalMem, cl_uint ciDeviceCount, float* h_idata, float* h_odata, unsigned int size_x, unsigned int size_y)
|
||||
{
|
||||
cl_mem d_odata[MAX_GPU_COUNT];
|
||||
cl_mem d_idata[MAX_GPU_COUNT];
|
||||
@@ -79,16 +79,16 @@ double transposeGPU(const char* kernelName, bool useLocalMem, cl_uint ciDeviceC
|
||||
// allocate device memory and copy host to device memory
|
||||
d_idata[i] = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
|
||||
mem_size, h_idata, &ciErrNum);
|
||||
//oclCheckError(ciErrNum, CL_SUCCESS);
|
||||
oclCheckError(ciErrNum, CL_SUCCESS);
|
||||
|
||||
// create buffer to store output
|
||||
d_odata[i] = clCreateBuffer(cxGPUContext, CL_MEM_WRITE_ONLY ,
|
||||
sizePerGPU*size_y*sizeof(float), NULL, &ciErrNum);
|
||||
//oclCheckError(ciErrNum, CL_SUCCESS);
|
||||
oclCheckError(ciErrNum, CL_SUCCESS);
|
||||
|
||||
// create the naive transpose kernel
|
||||
ckKernel[i] = clCreateKernel(rv_program, kernelName, &ciErrNum);
|
||||
//oclCheckError(ciErrNum, CL_SUCCESS);
|
||||
oclCheckError(ciErrNum, CL_SUCCESS);
|
||||
|
||||
// set the args values for the naive kernel
|
||||
size_t offset = i * sizePerGPU;
|
||||
@@ -97,12 +97,11 @@ double transposeGPU(const char* kernelName, bool useLocalMem, cl_uint ciDeviceC
|
||||
ciErrNum |= clSetKernelArg(ckKernel[i], 2, sizeof(int), &offset);
|
||||
ciErrNum |= clSetKernelArg(ckKernel[i], 3, sizeof(int), &size_x);
|
||||
ciErrNum |= clSetKernelArg(ckKernel[i], 4, sizeof(int), &size_y);
|
||||
if(useLocalMem)
|
||||
{
|
||||
if (useLocalMem) {
|
||||
ciErrNum |= clSetKernelArg(ckKernel[i], 5, (BLOCK_DIM + 1) * BLOCK_DIM * sizeof(float), 0 );
|
||||
}
|
||||
}
|
||||
//oclCheckError(ciErrNum, CL_SUCCESS);
|
||||
oclCheckError(ciErrNum, CL_SUCCESS);
|
||||
|
||||
// set up execution configuration
|
||||
szLocalWorkSize[0] = BLOCK_DIM;
|
||||
@@ -111,18 +110,16 @@ double transposeGPU(const char* kernelName, bool useLocalMem, cl_uint ciDeviceC
|
||||
szGlobalWorkSize[1] = shrRoundUp(BLOCK_DIM, size_y);
|
||||
|
||||
// execute the kernel numIterations times
|
||||
int numIterations = 100;
|
||||
//int numIterations = 100;
|
||||
int numIterations = 1;
|
||||
shrLog("\nProcessing a %d by %d matrix of floats...\n\n", size_x, size_y);
|
||||
for (int i = -1; i < numIterations; ++i)
|
||||
{
|
||||
// Start time measurement after warmup
|
||||
if( i == 0 ) shrDeltaT(0);
|
||||
|
||||
for(unsigned int k=0; k < ciDeviceCount; ++k){
|
||||
ciErrNum |= clEnqueueNDRangeKernel(commandQueue[k], ckKernel[k], 2, NULL,
|
||||
szGlobalWorkSize, szLocalWorkSize, 0, NULL, NULL);
|
||||
for (int i = -1; i < numIterations; ++i) {
|
||||
if (i == 0)
|
||||
shrDeltaT(0);
|
||||
for (unsigned int k=0; k < ciDeviceCount; ++k) {
|
||||
ciErrNum |= clEnqueueNDRangeKernel(commandQueue[k], ckKernel[k], 2, NULL, szGlobalWorkSize, szLocalWorkSize, 0, NULL, NULL);
|
||||
}
|
||||
//oclCheckError(ciErrNum, CL_SUCCESS);
|
||||
oclCheckError(ciErrNum, CL_SUCCESS);
|
||||
}
|
||||
|
||||
// Block CPU till GPU is done
|
||||
@@ -130,7 +127,7 @@ double transposeGPU(const char* kernelName, bool useLocalMem, cl_uint ciDeviceC
|
||||
ciErrNum |= clFinish(commandQueue[k]);
|
||||
}
|
||||
double time = shrDeltaT(0)/(double)numIterations;
|
||||
//oclCheckError(ciErrNum, CL_SUCCESS);
|
||||
oclCheckError(ciErrNum, CL_SUCCESS);
|
||||
|
||||
// Copy back to host
|
||||
for(unsigned int i = 0; i < ciDeviceCount; ++i){
|
||||
@@ -141,17 +138,18 @@ double transposeGPU(const char* kernelName, bool useLocalMem, cl_uint ciDeviceC
|
||||
size * size_y * sizeof(float), &h_odata[offset * size_y],
|
||||
0, NULL, NULL);
|
||||
}
|
||||
//oclCheckError(ciErrNum, CL_SUCCESS);
|
||||
oclCheckError(ciErrNum, CL_SUCCESS);
|
||||
|
||||
for(unsigned int i = 0; i < ciDeviceCount; ++i){
|
||||
ciErrNum |= clReleaseMemObject(d_idata[i]);
|
||||
ciErrNum |= clReleaseMemObject(d_odata[i]);
|
||||
ciErrNum |= clReleaseKernel(ckKernel[i]);
|
||||
}
|
||||
//oclCheckError(ciErrNum, CL_SUCCESS);
|
||||
oclCheckError(ciErrNum, CL_SUCCESS);
|
||||
|
||||
return time;
|
||||
}
|
||||
|
||||
uint8_t *kernel_bin = NULL;
|
||||
|
||||
static int read_kernel_file(const char* filename, uint8_t** data, size_t* size) {
|
||||
@@ -174,14 +172,17 @@ static int read_kernel_file(const char* filename, uint8_t** data, size_t* size)
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
//! Run a simple test for CUDA
|
||||
// *********************************************************************
|
||||
int runTest( const int argc, const char** argv)
|
||||
{
|
||||
cl_int ciErrNum;
|
||||
cl_uint ciDeviceCount;
|
||||
unsigned int size_x = 2048;
|
||||
unsigned int size_y = 2048;
|
||||
//unsigned int size_x = 2048;
|
||||
//unsigned int size_y = 2048;
|
||||
unsigned int size_x = 64;
|
||||
unsigned int size_y = 64;
|
||||
|
||||
int temp;
|
||||
if( shrGetCmdLineArgumenti( argc, argv,"width", &temp) ){
|
||||
@@ -197,18 +198,18 @@ int runTest( const int argc, const char** argv)
|
||||
|
||||
//Get the NVIDIA platform
|
||||
ciErrNum = oclGetPlatformID(&cpPlatform);
|
||||
//oclCheckError(ciErrNum, CL_SUCCESS);
|
||||
oclCheckError(ciErrNum, CL_SUCCESS);
|
||||
|
||||
//Get the devices
|
||||
ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_DEFAULT, 0, NULL, &uiNumDevices);
|
||||
//oclCheckError(ciErrNum, CL_SUCCESS);
|
||||
oclCheckError(ciErrNum, CL_SUCCESS);
|
||||
cdDevices = (cl_device_id *)malloc(uiNumDevices * sizeof(cl_device_id) );
|
||||
ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_DEFAULT, uiNumDevices, cdDevices, NULL);
|
||||
//oclCheckError(ciErrNum, CL_SUCCESS);
|
||||
oclCheckError(ciErrNum, CL_SUCCESS);
|
||||
|
||||
//Create the context
|
||||
cxGPUContext = clCreateContext(0, uiNumDevices, cdDevices, NULL, NULL, &ciErrNum);
|
||||
//oclCheckError(ciErrNum, CL_SUCCESS);
|
||||
oclCheckError(ciErrNum, CL_SUCCESS);
|
||||
|
||||
if(shrCheckCmdLineFlag(argc, (const char**)argv, "device"))
|
||||
{
|
||||
@@ -301,26 +302,27 @@ int runTest( const int argc, const char** argv)
|
||||
srand(15235911);
|
||||
shrFillArray(h_idata, (size_x * size_y));
|
||||
|
||||
// Program Setup
|
||||
size_t program_length;
|
||||
char* source_path = shrFindFilePath("transpose.cl", argv[0]);
|
||||
//oclCheckError(source_path != NULL, shrTRUE);
|
||||
char *source = oclLoadProgSource(source_path, "", &program_length);
|
||||
//oclCheckError(source != NULL, shrTRUE);
|
||||
size_t kernel_size;
|
||||
cl_int binary_status = 0;
|
||||
cl_device_id device_id;
|
||||
// create the program
|
||||
rv_program = clCreateProgramWithBinary(
|
||||
cxGPUContext, 1, &device_id, &kernel_size, (const uint8_t**)&kernel_bin, &binary_status, NULL);
|
||||
//rv_program = clCreateProgramWithSource(cxGPUContext, 1,
|
||||
// (const char **)&source, &program_length, &ciErrNum);
|
||||
//oclCheckError(ciErrNum, CL_SUCCESS);
|
||||
uint8_t *kernel_bin = NULL;
|
||||
size_t kernel_size;
|
||||
cl_int binary_status = 0;
|
||||
ciErrNum = read_kernel_file("kernel.pocl", &kernel_bin, &kernel_size);
|
||||
if (ciErrNum != CL_SUCCESS) {
|
||||
shrLog(" Error %i in read_kernel_file call !!!\n\n", ciErrNum);
|
||||
return ciErrNum;
|
||||
}
|
||||
rv_program = clCreateProgramWithBinary(
|
||||
cxGPUContext, 1, cdDevices, &kernel_size, (const uint8_t**)&kernel_bin, &binary_status, &ciErrNum);
|
||||
if (ciErrNum != CL_SUCCESS) {
|
||||
shrLog(" Error %i in clCreateProgramWithBinary call !!!\n\n", ciErrNum);
|
||||
return ciErrNum;
|
||||
}
|
||||
|
||||
// build the program
|
||||
ciErrNum = clBuildProgram(rv_program, 0, NULL, "-cl-fast-relaxed-math", NULL, NULL);
|
||||
if (ciErrNum != CL_SUCCESS)
|
||||
{
|
||||
if (ciErrNum != CL_SUCCESS) {
|
||||
// write out standard error, Build Log and PTX, then return error
|
||||
shrLogEx(LOGBOTH | ERRORMSG, ciErrNum, STDERROR);
|
||||
oclLogBuildInfo(rv_program, oclGetFirstDev(cxGPUContext));
|
||||
@@ -331,13 +333,13 @@ int runTest( const int argc, const char** argv)
|
||||
// Run Naive Kernel
|
||||
#ifdef GPU_PROFILING
|
||||
// Matrix Copy kernel runs to measure reference performance.
|
||||
double uncoalescedCopyTime = transposeGPU("uncoalesced_copy", false, ciDeviceCount, h_idata, h_odata, size_x, size_y);
|
||||
double simpleCopyTime = transposeGPU("simple_copy", false, ciDeviceCount, h_idata, h_odata, size_x, size_y);
|
||||
double sharedCopyTime = transposeGPU("shared_copy", true, ciDeviceCount, h_idata, h_odata, size_x, size_y);
|
||||
//double uncoalescedCopyTime = transposeGPU("uncoalesced_copy", false, ciDeviceCount, h_idata, h_odata, size_x, size_y);
|
||||
//double simpleCopyTime = transposeGPU("simple_copy", false, ciDeviceCount, h_idata, h_odata, size_x, size_y);
|
||||
//double sharedCopyTime = transposeGPU("shared_copy", true, ciDeviceCount, h_idata, h_odata, size_x, size_y);
|
||||
#endif
|
||||
|
||||
double naiveTime = transposeGPU("transpose_naive", false, ciDeviceCount, h_idata, h_odata, size_x, size_y);
|
||||
double optimizedTime = transposeGPU("transpose", true, ciDeviceCount, h_idata, h_odata, size_x, size_y);
|
||||
//double optimizedTime = transposeGPU("transpose", true, ciDeviceCount, h_idata, h_odata, size_x, size_y);
|
||||
|
||||
#ifdef GPU_PROFILING
|
||||
// log times
|
||||
@@ -369,8 +371,8 @@ int runTest( const int argc, const char** argv)
|
||||
free(h_idata);
|
||||
free(h_odata);
|
||||
free(reference);
|
||||
free(source);
|
||||
free(source_path);
|
||||
//free(source);
|
||||
//free(source_path);
|
||||
|
||||
// cleanup OpenCL
|
||||
ciErrNum = clReleaseProgram(rv_program);
|
||||
@@ -379,7 +381,7 @@ int runTest( const int argc, const char** argv)
|
||||
ciErrNum |= clReleaseCommandQueue(commandQueue[i]);
|
||||
}
|
||||
ciErrNum |= clReleaseContext(cxGPUContext);
|
||||
//oclCheckError(ciErrNum, CL_SUCCESS);
|
||||
oclCheckError(ciErrNum, CL_SUCCESS);
|
||||
|
||||
// pass or fail (cumulative... all tests in the loop)
|
||||
shrQAFinishExit(argc, (const char **)argv, (1 == res) ? QA_PASSED : QA_FAILED);
|
||||
|
||||
Reference in New Issue
Block a user