vortex/tests/opencl/mri-q/main.cc

/***************************************************************************
 *cr
 *cr            (C) Copyright 2007 The Board of Trustees of the
 *cr                        University of Illinois
 *cr                         All Rights Reserved
 *cr
 ***************************************************************************/

/*
 * C code for creating the Q data structure for fast convolution-based
 * Hessian multiplication for arbitrary k-space trajectories.
 *
 * Inputs:
 * kx - VECTOR of kx values, same length as ky and kz
 * ky - VECTOR of ky values, same length as kx and kz
 * kz - VECTOR of kz values, same length as kx and ky
 * x  - VECTOR of x values, same length as y and z
 * y  - VECTOR of y values, same length as x and z
 * z  - VECTOR of z values, same length as x and y
 * phi - VECTOR of the Fourier transform of the spatial basis
 *      function, evaluated at [kx, ky, kz].  Same length as kx, ky, and kz.
 *
 * recommended g++ options:
 *  -O3 -lm -ffast-math -funroll-all-loops
 */

#include <stdio.h>
#include <sys/time.h>
#include <parboil.h>
#include <CL/cl.h>

#include "ocl.h"
#include "file.h"
#include "macros.h"
#include "computeQ.h"

static int read_kernel_file(const char* filename, uint8_t** data, size_t* size) {
  if (nullptr == filename || nullptr == data || 0 == size)
    return CL_INVALID_VALUE;

  FILE* fp = fopen(filename, "r");
  if (NULL == fp) {
    fprintf(stderr, "Failed to load kernel.");
    return CL_INVALID_VALUE;
  }
  fseek(fp , 0 , SEEK_END);
  long fsize = ftell(fp);
  rewind(fp);

  *data = (uint8_t*)malloc(fsize);
  *size = fread(*data, 1, fsize, fp);

  fclose(fp);

  return CL_SUCCESS;
}

static void
setupMemoryGPU(int num, int size, cl_mem* dev_ptr, float* host_ptr,clPrmtr* clPrm)
{
  cl_int clStatus;
  *dev_ptr = clCreateBuffer(clPrm->clContext,CL_MEM_READ_ONLY,num*size,NULL,&clStatus);
  CHECK_ERROR("clCreateBuffer");
  clStatus = clEnqueueWriteBuffer(clPrm->clCommandQueue,*dev_ptr,CL_TRUE,0,num*size,host_ptr,0,NULL,NULL);
  CHECK_ERROR("clEnequeueWriteBuffer");
}

static void
cleanupMemoryGPU(int num, int size, cl_mem* dev_ptr, float* host_ptr, clPrmtr* clPrm)
{
  cl_int clStatus;
  clStatus = clEnqueueReadBuffer(clPrm->clCommandQueue,*dev_ptr,CL_TRUE,0,num*size,host_ptr,0,NULL,NULL);
  CHECK_ERROR("clEnqueueReadBuffer")
  clStatus = clReleaseMemObject(*dev_ptr);
  CHECK_ERROR("clReleaseMemObject")
}

int
main (int argc, char *argv[]) {
  int numX, numK;		/* Number of X and K values */
  int original_numK;		/* Number of K values in input file */
  float *kx, *ky, *kz;		/* K trajectory (3D vectors) */
  float *x, *y, *z;		/* X coordinates (3D vectors) */
  float *phiR, *phiI;		/* Phi values (complex) */
  float *phiMag;		/* Magnitude of Phi */
  float *Qr, *Qi;		/* Q signal (complex) */

  struct kValues* kVals;

  struct pb_Parameters *params;
  struct pb_TimerSet timers;

  pb_InitializeTimerSet(&timers);

  /* Read command line */
  params = pb_ReadParameters(&argc, argv);

  params->inpFiles = (char **)malloc(sizeof(char *) * 2);
  params->inpFiles[0] = (char *)malloc(100);
  params->inpFiles[1] = NULL;
  strncpy(params->inpFiles[0], "32_32_32_dataset.bin", 100);

  if ((params->inpFiles[0] == NULL) || (params->inpFiles[1] != NULL))
    {
      fprintf(stderr, "Expecting one input filename\n");
      exit(-1);
    }

  /* Read in data */
  pb_SwitchToTimer(&timers, pb_TimerID_IO);
  inputData(params->inpFiles[0],
	    &original_numK, &numX,
	    &kx, &ky, &kz,
	    &x, &y, &z,
	    &phiR, &phiI);

  /* Reduce the number of k-space samples if a number is given
   * on the command line */
  if (argc < 2)
    numK = original_numK;
  else
    {
      int inputK;
      char *end;
      inputK = strtol(argv[1], &end, 10);
      if (end == argv[1])
	{
	  fprintf(stderr, "Expecting an integer parameter\n");
	  exit(-1);
	}

      numK = MIN(inputK, original_numK);
    }

  printf("%d pixels in output; %d samples in trajectory; using %d samples\n",
         numX, original_numK, numK);

  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);

  clPrmtr clPrm;

  pb_Context* pb_context;
  pb_context = pb_InitOpenCLContext(params);
  if (pb_context == NULL) {
    fprintf (stderr, "Error: No OpenCL platform/device can be found.");
    return -1;
  }

  cl_int clStatus;
  cl_device_id clDevice = (cl_device_id) pb_context->clDeviceId;
  cl_platform_id clPlatform = (cl_platform_id) pb_context->clPlatformId;
  clPrm.clContext = (cl_context) pb_context->clContext;

  clPrm.clCommandQueue = clCreateCommandQueue(clPrm.clContext,clDevice,CL_QUEUE_PROFILING_ENABLE,&clStatus);
  CHECK_ERROR("clCreateCommandQueue")

  pb_SetOpenCL(&(clPrm.clContext), &(clPrm.clCommandQueue));

#ifdef HOSTGPU
  const char* clSource[] = {readFile("kernel.cl")};
  CHECK_ERROR("clCreateProgramWithSource")
  cl_program clProgram = clCreateProgramWithSource(clPrm.clContext,1,clSource,NULL,&clStatus);
#else
  uint8_t *kernel_bin = NULL;
  size_t kernel_size;
  cl_int binary_status = 0;
  CHECK_ERROR("read_kernel_file")
  clStatus = read_kernel_file("kernel.pocl", &kernel_bin, &kernel_size);
  CHECK_ERROR("clCreateProgramWithSource")
	cl_program clProgram = clCreateProgramWithBinary(
      clPrm.clContext, 1, &clDevice, &kernel_size, (const uint8_t**)&kernel_bin, &binary_status, &clStatus);
#endif

  char options[50];
  sprintf(options,"-I src/opencl_nvidia");
  clStatus = clBuildProgram(clProgram,0,NULL,options,NULL,NULL);
  if (clStatus != CL_SUCCESS) {
    char buf[4096];
    clGetProgramBuildInfo(clProgram, clDevice, CL_PROGRAM_BUILD_LOG, 4096, buf, NULL);
    printf ("%s\n", buf);
    CHECK_ERROR("clBuildProgram")
  }

  /* Create CPU data structures */
  createDataStructsCPU(numK, numX, &phiMag, &Qr, &Qi);

  /* GPU section 1 (precompute PhiMag) */
  {
    clPrm.clKernel = clCreateKernel(clProgram,"ComputePhiMag_GPU",&clStatus);
    CHECK_ERROR("clCreateKernel")

    /* Mirror several data structures on the device */
    cl_mem phiR_d;
    cl_mem phiI_d;
    cl_mem phiMag_d;

    pb_SwitchToTimer(&timers, pb_TimerID_COPY);

    setupMemoryGPU(numK,sizeof(float),&phiR_d,phiR,&clPrm);
    setupMemoryGPU(numK,sizeof(float),&phiI_d,phiI,&clPrm);
    phiMag_d = clCreateBuffer(clPrm.clContext,CL_MEM_WRITE_ONLY,numK*sizeof(float),NULL,&clStatus);
    CHECK_ERROR("clCreateBuffer")

    clStatus = clFinish(clPrm.clCommandQueue);
    CHECK_ERROR("clFinish")

    pb_SwitchToTimer(&timers, pb_TimerID_KERNEL);

    computePhiMag_GPU(numK, phiR_d, phiI_d, phiMag_d, &clPrm);

    clStatus = clFinish(clPrm.clCommandQueue);
    CHECK_ERROR("clFinish")

    pb_SwitchToTimer(&timers, pb_TimerID_COPY);

    cleanupMemoryGPU(numK,sizeof(float),&phiMag_d,phiMag,&clPrm);

    clStatus = clReleaseMemObject(phiR_d);
    CHECK_ERROR("clReleaseMemObject")
    clStatus = clReleaseMemObject(phiI_d);
    CHECK_ERROR("clReleaseMemObject")
  }

  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);

  kVals = (struct kValues*)calloc(numK, sizeof (struct kValues));

  int k;
  for (k = 0; k < numK; k++) {
    kVals[k].Kx = kx[k];
    kVals[k].Ky = ky[k];
    kVals[k].Kz = kz[k];
    kVals[k].PhiMag = phiMag[k];
  }

  free(phiMag);

  clStatus = clReleaseKernel(clPrm.clKernel);

  /* GPU section 2 */
  {
    clPrm.clKernel = clCreateKernel(clProgram,"ComputeQ_GPU",&clStatus);
    CHECK_ERROR("clCreateKernel")

    cl_mem x_d;
    cl_mem y_d;
    cl_mem z_d;
    cl_mem Qr_d;
    cl_mem Qi_d;

    pb_SwitchToTimer(&timers, pb_TimerID_COPY);

    setupMemoryGPU(numX,sizeof(float),&x_d,x,&clPrm);
    setupMemoryGPU(numX,sizeof(float),&y_d,y,&clPrm);
    setupMemoryGPU(numX,sizeof(float),&z_d,z,&clPrm);

    Qr_d = clCreateBuffer(clPrm.clContext,CL_MEM_READ_WRITE,numX*sizeof(float),NULL,&clStatus);
    CHECK_ERROR("clCreateBuffer")
    clMemSet(&clPrm,Qr_d,0,numX*sizeof(float));
    Qi_d = clCreateBuffer(clPrm.clContext,CL_MEM_READ_WRITE,numX*sizeof(float),NULL,&clStatus);
    CHECK_ERROR("clCreateBuffer")
    clMemSet(&clPrm,Qi_d,0,numX*sizeof(float));

    clStatus = clFinish(clPrm.clCommandQueue);
    CHECK_ERROR("clFinish")

    pb_SwitchToTimer(&timers, pb_TimerID_KERNEL);

    computeQ_GPU(numK, numX, x_d, y_d, z_d, kVals, Qr_d, Qi_d, &clPrm);

    clStatus = clFinish(clPrm.clCommandQueue);
    CHECK_ERROR("clFinish")

    pb_SwitchToTimer(&timers, pb_TimerID_COPY);

    clStatus = clReleaseMemObject(x_d);
    CHECK_ERROR("clReleaseMemObject")
    clStatus = clReleaseMemObject(y_d);
    CHECK_ERROR("clReleaseMemObject")
    clStatus = clReleaseMemObject(z_d);
    CHECK_ERROR("clReleaseMemObject")
    cleanupMemoryGPU(numX,sizeof(float),&Qr_d,Qr,&clPrm);
    cleanupMemoryGPU(numX,sizeof(float),&Qi_d,Qi,&clPrm);
  }

  pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);

  if (params->outFile)
    {
      /* Write Q to file */
      pb_SwitchToTimer(&timers, pb_TimerID_IO);
      outputData(params->outFile, Qr, Qi, numX);
      pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
    }

  free (kx);
  free (ky);
  free (kz);
  free (x);
  free (y);
  free (z);
  free (phiR);
  free (phiI);
  free (kVals);
  free (Qr);
  free (Qi);

  //free((void*)clSource[0]);

  clStatus = clReleaseKernel(clPrm.clKernel);
  clStatus = clReleaseProgram(clProgram);
  clStatus = clReleaseCommandQueue(clPrm.clCommandQueue);
  clStatus = clReleaseContext(clPrm.clContext);

  pb_SwitchToTimer(&timers, pb_TimerID_NONE);
  pb_PrintTimerSet(&timers);

  pb_FreeParameters(params);

  return 0;
}