Vortex 2.0 changes:

+ Microarchitecture optimizations
+ 64-bit support
+ Xilinx FPGA support
+ LLVM-16 support
+ Refactoring and quality control fixes
This commit is contained in:
Blaise Tine
2023-10-19 20:51:22 -07:00
parent d69a64c32c
commit d47cccc157
1300 changed files with 247321 additions and 311189 deletions

View File

@@ -55,10 +55,10 @@ int main( int argc, const char** argv)
// run the main test
int result = runTest(argc, argv);
//oclCheckError(result, 0);
oclCheckError(result, 0);
}
double transposeGPU(const char* kernelName, bool useLocalMem, cl_uint ciDeviceCount, float* h_idata, float* h_odata, unsigned int size_x, unsigned int size_y)
static double transposeGPU(const char* kernelName, bool useLocalMem, cl_uint ciDeviceCount, float* h_idata, float* h_odata, unsigned int size_x, unsigned int size_y)
{
cl_mem d_odata[MAX_GPU_COUNT];
cl_mem d_idata[MAX_GPU_COUNT];
@@ -79,16 +79,16 @@ double transposeGPU(const char* kernelName, bool useLocalMem, cl_uint ciDeviceC
// allocate device memory and copy host to device memory
d_idata[i] = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
mem_size, h_idata, &ciErrNum);
//oclCheckError(ciErrNum, CL_SUCCESS);
oclCheckError(ciErrNum, CL_SUCCESS);
// create buffer to store output
d_odata[i] = clCreateBuffer(cxGPUContext, CL_MEM_WRITE_ONLY ,
sizePerGPU*size_y*sizeof(float), NULL, &ciErrNum);
//oclCheckError(ciErrNum, CL_SUCCESS);
oclCheckError(ciErrNum, CL_SUCCESS);
// create the naive transpose kernel
ckKernel[i] = clCreateKernel(rv_program, kernelName, &ciErrNum);
//oclCheckError(ciErrNum, CL_SUCCESS);
oclCheckError(ciErrNum, CL_SUCCESS);
// set the args values for the naive kernel
size_t offset = i * sizePerGPU;
@@ -97,12 +97,11 @@ double transposeGPU(const char* kernelName, bool useLocalMem, cl_uint ciDeviceC
ciErrNum |= clSetKernelArg(ckKernel[i], 2, sizeof(int), &offset);
ciErrNum |= clSetKernelArg(ckKernel[i], 3, sizeof(int), &size_x);
ciErrNum |= clSetKernelArg(ckKernel[i], 4, sizeof(int), &size_y);
if(useLocalMem)
{
if (useLocalMem) {
ciErrNum |= clSetKernelArg(ckKernel[i], 5, (BLOCK_DIM + 1) * BLOCK_DIM * sizeof(float), 0 );
}
}
//oclCheckError(ciErrNum, CL_SUCCESS);
oclCheckError(ciErrNum, CL_SUCCESS);
// set up execution configuration
szLocalWorkSize[0] = BLOCK_DIM;
@@ -111,18 +110,16 @@ double transposeGPU(const char* kernelName, bool useLocalMem, cl_uint ciDeviceC
szGlobalWorkSize[1] = shrRoundUp(BLOCK_DIM, size_y);
// execute the kernel numIterations times
int numIterations = 100;
//int numIterations = 100;
int numIterations = 1;
shrLog("\nProcessing a %d by %d matrix of floats...\n\n", size_x, size_y);
for (int i = -1; i < numIterations; ++i)
{
// Start time measurement after warmup
if( i == 0 ) shrDeltaT(0);
for(unsigned int k=0; k < ciDeviceCount; ++k){
ciErrNum |= clEnqueueNDRangeKernel(commandQueue[k], ckKernel[k], 2, NULL,
szGlobalWorkSize, szLocalWorkSize, 0, NULL, NULL);
for (int i = -1; i < numIterations; ++i) {
if (i == 0)
shrDeltaT(0);
for (unsigned int k=0; k < ciDeviceCount; ++k) {
ciErrNum |= clEnqueueNDRangeKernel(commandQueue[k], ckKernel[k], 2, NULL, szGlobalWorkSize, szLocalWorkSize, 0, NULL, NULL);
}
//oclCheckError(ciErrNum, CL_SUCCESS);
oclCheckError(ciErrNum, CL_SUCCESS);
}
// Block CPU till GPU is done
@@ -130,7 +127,7 @@ double transposeGPU(const char* kernelName, bool useLocalMem, cl_uint ciDeviceC
ciErrNum |= clFinish(commandQueue[k]);
}
double time = shrDeltaT(0)/(double)numIterations;
//oclCheckError(ciErrNum, CL_SUCCESS);
oclCheckError(ciErrNum, CL_SUCCESS);
// Copy back to host
for(unsigned int i = 0; i < ciDeviceCount; ++i){
@@ -141,17 +138,18 @@ double transposeGPU(const char* kernelName, bool useLocalMem, cl_uint ciDeviceC
size * size_y * sizeof(float), &h_odata[offset * size_y],
0, NULL, NULL);
}
//oclCheckError(ciErrNum, CL_SUCCESS);
oclCheckError(ciErrNum, CL_SUCCESS);
for(unsigned int i = 0; i < ciDeviceCount; ++i){
ciErrNum |= clReleaseMemObject(d_idata[i]);
ciErrNum |= clReleaseMemObject(d_odata[i]);
ciErrNum |= clReleaseKernel(ckKernel[i]);
}
//oclCheckError(ciErrNum, CL_SUCCESS);
oclCheckError(ciErrNum, CL_SUCCESS);
return time;
}
uint8_t *kernel_bin = NULL;
static int read_kernel_file(const char* filename, uint8_t** data, size_t* size) {
@@ -174,14 +172,17 @@ static int read_kernel_file(const char* filename, uint8_t** data, size_t* size)
return 0;
}
//! Run a simple test for CUDA
// *********************************************************************
int runTest( const int argc, const char** argv)
{
cl_int ciErrNum;
cl_uint ciDeviceCount;
unsigned int size_x = 2048;
unsigned int size_y = 2048;
//unsigned int size_x = 2048;
//unsigned int size_y = 2048;
unsigned int size_x = 64;
unsigned int size_y = 64;
int temp;
if( shrGetCmdLineArgumenti( argc, argv,"width", &temp) ){
@@ -197,18 +198,18 @@ int runTest( const int argc, const char** argv)
//Get the NVIDIA platform
ciErrNum = oclGetPlatformID(&cpPlatform);
//oclCheckError(ciErrNum, CL_SUCCESS);
oclCheckError(ciErrNum, CL_SUCCESS);
//Get the devices
ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_DEFAULT, 0, NULL, &uiNumDevices);
//oclCheckError(ciErrNum, CL_SUCCESS);
oclCheckError(ciErrNum, CL_SUCCESS);
cdDevices = (cl_device_id *)malloc(uiNumDevices * sizeof(cl_device_id) );
ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_DEFAULT, uiNumDevices, cdDevices, NULL);
//oclCheckError(ciErrNum, CL_SUCCESS);
oclCheckError(ciErrNum, CL_SUCCESS);
//Create the context
cxGPUContext = clCreateContext(0, uiNumDevices, cdDevices, NULL, NULL, &ciErrNum);
//oclCheckError(ciErrNum, CL_SUCCESS);
oclCheckError(ciErrNum, CL_SUCCESS);
if(shrCheckCmdLineFlag(argc, (const char**)argv, "device"))
{
@@ -301,26 +302,27 @@ int runTest( const int argc, const char** argv)
srand(15235911);
shrFillArray(h_idata, (size_x * size_y));
// Program Setup
size_t program_length;
char* source_path = shrFindFilePath("transpose.cl", argv[0]);
//oclCheckError(source_path != NULL, shrTRUE);
char *source = oclLoadProgSource(source_path, "", &program_length);
//oclCheckError(source != NULL, shrTRUE);
size_t kernel_size;
cl_int binary_status = 0;
cl_device_id device_id;
// create the program
rv_program = clCreateProgramWithBinary(
cxGPUContext, 1, &device_id, &kernel_size, (const uint8_t**)&kernel_bin, &binary_status, NULL);
//rv_program = clCreateProgramWithSource(cxGPUContext, 1,
// (const char **)&source, &program_length, &ciErrNum);
//oclCheckError(ciErrNum, CL_SUCCESS);
uint8_t *kernel_bin = NULL;
size_t kernel_size;
cl_int binary_status = 0;
ciErrNum = read_kernel_file("kernel.pocl", &kernel_bin, &kernel_size);
if (ciErrNum != CL_SUCCESS) {
shrLog(" Error %i in read_kernel_file call !!!\n\n", ciErrNum);
return ciErrNum;
}
rv_program = clCreateProgramWithBinary(
cxGPUContext, 1, cdDevices, &kernel_size, (const uint8_t**)&kernel_bin, &binary_status, &ciErrNum);
if (ciErrNum != CL_SUCCESS) {
shrLog(" Error %i in clCreateProgramWithBinary call !!!\n\n", ciErrNum);
return ciErrNum;
}
// build the program
ciErrNum = clBuildProgram(rv_program, 0, NULL, "-cl-fast-relaxed-math", NULL, NULL);
if (ciErrNum != CL_SUCCESS)
{
if (ciErrNum != CL_SUCCESS) {
// write out standard error, Build Log and PTX, then return error
shrLogEx(LOGBOTH | ERRORMSG, ciErrNum, STDERROR);
oclLogBuildInfo(rv_program, oclGetFirstDev(cxGPUContext));
@@ -331,13 +333,13 @@ int runTest( const int argc, const char** argv)
// Run Naive Kernel
#ifdef GPU_PROFILING
// Matrix Copy kernel runs to measure reference performance.
double uncoalescedCopyTime = transposeGPU("uncoalesced_copy", false, ciDeviceCount, h_idata, h_odata, size_x, size_y);
double simpleCopyTime = transposeGPU("simple_copy", false, ciDeviceCount, h_idata, h_odata, size_x, size_y);
double sharedCopyTime = transposeGPU("shared_copy", true, ciDeviceCount, h_idata, h_odata, size_x, size_y);
//double uncoalescedCopyTime = transposeGPU("uncoalesced_copy", false, ciDeviceCount, h_idata, h_odata, size_x, size_y);
//double simpleCopyTime = transposeGPU("simple_copy", false, ciDeviceCount, h_idata, h_odata, size_x, size_y);
//double sharedCopyTime = transposeGPU("shared_copy", true, ciDeviceCount, h_idata, h_odata, size_x, size_y);
#endif
double naiveTime = transposeGPU("transpose_naive", false, ciDeviceCount, h_idata, h_odata, size_x, size_y);
double optimizedTime = transposeGPU("transpose", true, ciDeviceCount, h_idata, h_odata, size_x, size_y);
//double optimizedTime = transposeGPU("transpose", true, ciDeviceCount, h_idata, h_odata, size_x, size_y);
#ifdef GPU_PROFILING
// log times
@@ -369,8 +371,8 @@ int runTest( const int argc, const char** argv)
free(h_idata);
free(h_odata);
free(reference);
free(source);
free(source_path);
//free(source);
//free(source_path);
// cleanup OpenCL
ciErrNum = clReleaseProgram(rv_program);
@@ -379,7 +381,7 @@ int runTest( const int argc, const char** argv)
ciErrNum |= clReleaseCommandQueue(commandQueue[i]);
}
ciErrNum |= clReleaseContext(cxGPUContext);
//oclCheckError(ciErrNum, CL_SUCCESS);
oclCheckError(ciErrNum, CL_SUCCESS);
// pass or fail (cumulative... all tests in the loop)
shrQAFinishExit(argc, (const char **)argv, (1 == res) ? QA_PASSED : QA_FAILED);