+ Microarchitecture optimizations + 64-bit support + Xilinx FPGA support + LLVM-16 support + Refactoring and quality control fixes minor update minor update minor update minor update minor update minor update cleanup cleanup cache bindings and memory perf refactory minor update minor update hw unit tests fixes minor update minor update minor update minor update minor update minor udpate minor update minor update minor update minor update minor update minor update minor update minor updates minor updates minor update minor update minor update minor update minor update minor update minor updates minor updates minor updates minor updates minor update minor update
322 lines
9.2 KiB
C++
322 lines
9.2 KiB
C++
/***************************************************************************
|
|
*cr
|
|
*cr (C) Copyright 2007 The Board of Trustees of the
|
|
*cr University of Illinois
|
|
*cr All Rights Reserved
|
|
*cr
|
|
***************************************************************************/
|
|
|
|
/*
|
|
* C code for creating the Q data structure for fast convolution-based
|
|
* Hessian multiplication for arbitrary k-space trajectories.
|
|
*
|
|
* Inputs:
|
|
* kx - VECTOR of kx values, same length as ky and kz
|
|
* ky - VECTOR of ky values, same length as kx and kz
|
|
* kz - VECTOR of kz values, same length as kx and ky
|
|
* x - VECTOR of x values, same length as y and z
|
|
* y - VECTOR of y values, same length as x and z
|
|
* z - VECTOR of z values, same length as x and y
|
|
* phi - VECTOR of the Fourier transform of the spatial basis
|
|
* function, evaluated at [kx, ky, kz]. Same length as kx, ky, and kz.
|
|
*
|
|
* recommended g++ options:
|
|
* -O3 -lm -ffast-math -funroll-all-loops
|
|
*/
|
|
|
|
#include <stdio.h>
|
|
#include <sys/time.h>
|
|
#include <parboil.h>
|
|
#include <CL/cl.h>
|
|
|
|
#include "ocl.h"
|
|
#include "file.h"
|
|
#include "macros.h"
|
|
#include "computeQ.h"
|
|
|
|
static int read_kernel_file(const char* filename, uint8_t** data, size_t* size) {
|
|
if (nullptr == filename || nullptr == data || 0 == size)
|
|
return CL_INVALID_VALUE;
|
|
|
|
FILE* fp = fopen(filename, "r");
|
|
if (NULL == fp) {
|
|
fprintf(stderr, "Failed to load kernel.");
|
|
return CL_INVALID_VALUE;
|
|
}
|
|
fseek(fp , 0 , SEEK_END);
|
|
long fsize = ftell(fp);
|
|
rewind(fp);
|
|
|
|
*data = (uint8_t*)malloc(fsize);
|
|
*size = fread(*data, 1, fsize, fp);
|
|
|
|
fclose(fp);
|
|
|
|
return CL_SUCCESS;
|
|
}
|
|
|
|
static void
|
|
setupMemoryGPU(int num, int size, cl_mem* dev_ptr, float* host_ptr,clPrmtr* clPrm)
|
|
{
|
|
cl_int clStatus;
|
|
*dev_ptr = clCreateBuffer(clPrm->clContext,CL_MEM_READ_ONLY,num*size,NULL,&clStatus);
|
|
CHECK_ERROR("clCreateBuffer");
|
|
clStatus = clEnqueueWriteBuffer(clPrm->clCommandQueue,*dev_ptr,CL_TRUE,0,num*size,host_ptr,0,NULL,NULL);
|
|
CHECK_ERROR("clEnequeueWriteBuffer");
|
|
}
|
|
|
|
static void
|
|
cleanupMemoryGPU(int num, int size, cl_mem* dev_ptr, float* host_ptr, clPrmtr* clPrm)
|
|
{
|
|
cl_int clStatus;
|
|
clStatus = clEnqueueReadBuffer(clPrm->clCommandQueue,*dev_ptr,CL_TRUE,0,num*size,host_ptr,0,NULL,NULL);
|
|
CHECK_ERROR("clEnqueueReadBuffer")
|
|
clStatus = clReleaseMemObject(*dev_ptr);
|
|
CHECK_ERROR("clReleaseMemObject")
|
|
}
|
|
|
|
int
|
|
main (int argc, char *argv[]) {
|
|
int numX, numK; /* Number of X and K values */
|
|
int original_numK; /* Number of K values in input file */
|
|
float *kx, *ky, *kz; /* K trajectory (3D vectors) */
|
|
float *x, *y, *z; /* X coordinates (3D vectors) */
|
|
float *phiR, *phiI; /* Phi values (complex) */
|
|
float *phiMag; /* Magnitude of Phi */
|
|
float *Qr, *Qi; /* Q signal (complex) */
|
|
|
|
struct kValues* kVals;
|
|
|
|
struct pb_Parameters *params;
|
|
struct pb_TimerSet timers;
|
|
|
|
pb_InitializeTimerSet(&timers);
|
|
|
|
/* Read command line */
|
|
params = pb_ReadParameters(&argc, argv);
|
|
|
|
params->inpFiles = (char **)malloc(sizeof(char *) * 2);
|
|
params->inpFiles[0] = (char *)malloc(100);
|
|
params->inpFiles[1] = NULL;
|
|
strncpy(params->inpFiles[0], "32_32_32_dataset.bin", 100);
|
|
|
|
if ((params->inpFiles[0] == NULL) || (params->inpFiles[1] != NULL))
|
|
{
|
|
fprintf(stderr, "Expecting one input filename\n");
|
|
exit(-1);
|
|
}
|
|
|
|
/* Read in data */
|
|
pb_SwitchToTimer(&timers, pb_TimerID_IO);
|
|
inputData(params->inpFiles[0],
|
|
&original_numK, &numX,
|
|
&kx, &ky, &kz,
|
|
&x, &y, &z,
|
|
&phiR, &phiI);
|
|
|
|
/* Reduce the number of k-space samples if a number is given
|
|
* on the command line */
|
|
if (argc < 2)
|
|
numK = original_numK;
|
|
else
|
|
{
|
|
int inputK;
|
|
char *end;
|
|
inputK = strtol(argv[1], &end, 10);
|
|
if (end == argv[1])
|
|
{
|
|
fprintf(stderr, "Expecting an integer parameter\n");
|
|
exit(-1);
|
|
}
|
|
|
|
numK = MIN(inputK, original_numK);
|
|
}
|
|
|
|
printf("%d pixels in output; %d samples in trajectory; using %d samples\n",
|
|
numX, original_numK, numK);
|
|
|
|
pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
|
|
|
|
clPrmtr clPrm;
|
|
|
|
pb_Context* pb_context;
|
|
pb_context = pb_InitOpenCLContext(params);
|
|
if (pb_context == NULL) {
|
|
fprintf (stderr, "Error: No OpenCL platform/device can be found.");
|
|
return -1;
|
|
}
|
|
|
|
cl_int clStatus;
|
|
cl_device_id clDevice = (cl_device_id) pb_context->clDeviceId;
|
|
cl_platform_id clPlatform = (cl_platform_id) pb_context->clPlatformId;
|
|
clPrm.clContext = (cl_context) pb_context->clContext;
|
|
|
|
clPrm.clCommandQueue = clCreateCommandQueue(clPrm.clContext,clDevice,CL_QUEUE_PROFILING_ENABLE,&clStatus);
|
|
CHECK_ERROR("clCreateCommandQueue")
|
|
|
|
pb_SetOpenCL(&(clPrm.clContext), &(clPrm.clCommandQueue));
|
|
|
|
#ifdef HOSTGPU
|
|
const char* clSource[] = {readFile("kernel.cl")};
|
|
CHECK_ERROR("clCreateProgramWithSource")
|
|
cl_program clProgram = clCreateProgramWithSource(clPrm.clContext,1,clSource,NULL,&clStatus);
|
|
#else
|
|
uint8_t *kernel_bin = NULL;
|
|
size_t kernel_size;
|
|
cl_int binary_status = 0;
|
|
CHECK_ERROR("read_kernel_file")
|
|
clStatus = read_kernel_file("kernel.pocl", &kernel_bin, &kernel_size);
|
|
CHECK_ERROR("clCreateProgramWithSource")
|
|
cl_program clProgram = clCreateProgramWithBinary(
|
|
clPrm.clContext, 1, &clDevice, &kernel_size, (const uint8_t**)&kernel_bin, &binary_status, &clStatus);
|
|
#endif
|
|
|
|
char options[50];
|
|
sprintf(options,"-I src/opencl_nvidia");
|
|
clStatus = clBuildProgram(clProgram,0,NULL,options,NULL,NULL);
|
|
if (clStatus != CL_SUCCESS) {
|
|
char buf[4096];
|
|
clGetProgramBuildInfo(clProgram, clDevice, CL_PROGRAM_BUILD_LOG, 4096, buf, NULL);
|
|
printf ("%s\n", buf);
|
|
CHECK_ERROR("clBuildProgram")
|
|
}
|
|
|
|
/* Create CPU data structures */
|
|
createDataStructsCPU(numK, numX, &phiMag, &Qr, &Qi);
|
|
|
|
/* GPU section 1 (precompute PhiMag) */
|
|
{
|
|
clPrm.clKernel = clCreateKernel(clProgram,"ComputePhiMag_GPU",&clStatus);
|
|
CHECK_ERROR("clCreateKernel")
|
|
|
|
/* Mirror several data structures on the device */
|
|
cl_mem phiR_d;
|
|
cl_mem phiI_d;
|
|
cl_mem phiMag_d;
|
|
|
|
pb_SwitchToTimer(&timers, pb_TimerID_COPY);
|
|
|
|
setupMemoryGPU(numK,sizeof(float),&phiR_d,phiR,&clPrm);
|
|
setupMemoryGPU(numK,sizeof(float),&phiI_d,phiI,&clPrm);
|
|
phiMag_d = clCreateBuffer(clPrm.clContext,CL_MEM_WRITE_ONLY,numK*sizeof(float),NULL,&clStatus);
|
|
CHECK_ERROR("clCreateBuffer")
|
|
|
|
clStatus = clFinish(clPrm.clCommandQueue);
|
|
CHECK_ERROR("clFinish")
|
|
|
|
pb_SwitchToTimer(&timers, pb_TimerID_KERNEL);
|
|
|
|
computePhiMag_GPU(numK, phiR_d, phiI_d, phiMag_d, &clPrm);
|
|
|
|
clStatus = clFinish(clPrm.clCommandQueue);
|
|
CHECK_ERROR("clFinish")
|
|
|
|
pb_SwitchToTimer(&timers, pb_TimerID_COPY);
|
|
|
|
cleanupMemoryGPU(numK,sizeof(float),&phiMag_d,phiMag,&clPrm);
|
|
|
|
clStatus = clReleaseMemObject(phiR_d);
|
|
CHECK_ERROR("clReleaseMemObject")
|
|
clStatus = clReleaseMemObject(phiI_d);
|
|
CHECK_ERROR("clReleaseMemObject")
|
|
}
|
|
|
|
pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
|
|
|
|
kVals = (struct kValues*)calloc(numK, sizeof (struct kValues));
|
|
|
|
int k;
|
|
for (k = 0; k < numK; k++) {
|
|
kVals[k].Kx = kx[k];
|
|
kVals[k].Ky = ky[k];
|
|
kVals[k].Kz = kz[k];
|
|
kVals[k].PhiMag = phiMag[k];
|
|
}
|
|
|
|
free(phiMag);
|
|
|
|
clStatus = clReleaseKernel(clPrm.clKernel);
|
|
|
|
/* GPU section 2 */
|
|
{
|
|
clPrm.clKernel = clCreateKernel(clProgram,"ComputeQ_GPU",&clStatus);
|
|
CHECK_ERROR("clCreateKernel")
|
|
|
|
cl_mem x_d;
|
|
cl_mem y_d;
|
|
cl_mem z_d;
|
|
cl_mem Qr_d;
|
|
cl_mem Qi_d;
|
|
|
|
pb_SwitchToTimer(&timers, pb_TimerID_COPY);
|
|
|
|
setupMemoryGPU(numX,sizeof(float),&x_d,x,&clPrm);
|
|
setupMemoryGPU(numX,sizeof(float),&y_d,y,&clPrm);
|
|
setupMemoryGPU(numX,sizeof(float),&z_d,z,&clPrm);
|
|
|
|
Qr_d = clCreateBuffer(clPrm.clContext,CL_MEM_READ_WRITE,numX*sizeof(float),NULL,&clStatus);
|
|
CHECK_ERROR("clCreateBuffer")
|
|
clMemSet(&clPrm,Qr_d,0,numX*sizeof(float));
|
|
Qi_d = clCreateBuffer(clPrm.clContext,CL_MEM_READ_WRITE,numX*sizeof(float),NULL,&clStatus);
|
|
CHECK_ERROR("clCreateBuffer")
|
|
clMemSet(&clPrm,Qi_d,0,numX*sizeof(float));
|
|
|
|
clStatus = clFinish(clPrm.clCommandQueue);
|
|
CHECK_ERROR("clFinish")
|
|
|
|
pb_SwitchToTimer(&timers, pb_TimerID_KERNEL);
|
|
|
|
computeQ_GPU(numK, numX, x_d, y_d, z_d, kVals, Qr_d, Qi_d, &clPrm);
|
|
|
|
clStatus = clFinish(clPrm.clCommandQueue);
|
|
CHECK_ERROR("clFinish")
|
|
|
|
pb_SwitchToTimer(&timers, pb_TimerID_COPY);
|
|
|
|
clStatus = clReleaseMemObject(x_d);
|
|
CHECK_ERROR("clReleaseMemObject")
|
|
clStatus = clReleaseMemObject(y_d);
|
|
CHECK_ERROR("clReleaseMemObject")
|
|
clStatus = clReleaseMemObject(z_d);
|
|
CHECK_ERROR("clReleaseMemObject")
|
|
cleanupMemoryGPU(numX,sizeof(float),&Qr_d,Qr,&clPrm);
|
|
cleanupMemoryGPU(numX,sizeof(float),&Qi_d,Qi,&clPrm);
|
|
}
|
|
|
|
pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
|
|
|
|
if (params->outFile)
|
|
{
|
|
/* Write Q to file */
|
|
pb_SwitchToTimer(&timers, pb_TimerID_IO);
|
|
outputData(params->outFile, Qr, Qi, numX);
|
|
pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
|
|
}
|
|
|
|
free (kx);
|
|
free (ky);
|
|
free (kz);
|
|
free (x);
|
|
free (y);
|
|
free (z);
|
|
free (phiR);
|
|
free (phiI);
|
|
free (kVals);
|
|
free (Qr);
|
|
free (Qi);
|
|
|
|
//free((void*)clSource[0]);
|
|
|
|
clStatus = clReleaseKernel(clPrm.clKernel);
|
|
clStatus = clReleaseProgram(clProgram);
|
|
clStatus = clReleaseCommandQueue(clPrm.clCommandQueue);
|
|
clStatus = clReleaseContext(clPrm.clContext);
|
|
|
|
pb_SwitchToTimer(&timers, pb_TimerID_NONE);
|
|
pb_PrintTimerSet(&timers);
|
|
|
|
pb_FreeParameters(params);
|
|
|
|
return 0;
|
|
}
|