Vortex 2.0 changes:

+ Microarchitecture optimizations + 64-bit support + Xilinx FPGA support + LLVM-16 support + Refactoring and quality control fixes minor update minor update minor update minor update minor update minor update cleanup cleanup cache bindings and memory perf refactoring minor update minor update hw unit tests fixes minor update minor update minor update minor update minor update minor update minor update minor update minor update minor update minor update minor update minor update minor updates minor updates minor update minor update minor update minor update minor update minor update minor updates minor updates minor updates minor updates minor update minor update
2023-11-10 02:47:05 -08:00
parent d69a64c32c
commit c1e168fdbe
1309 changed files with 247412 additions and 311463 deletions
@@ -0,0 +1,7 @@
+# Project name for this test; how it is consumed is defined by ../common.mk
+# (presumably it names the build outputs -- confirm there).
+PROJECT = matmul
+
+# Source file(s) handed to the shared rules.
+SRCS = main.cc
+
+# Default options, overridable from the command line or environment (?=).
+# Presumably forwarded to the test program by ../common.mk, in which case
+# -n16 selects a 16x16 matrix -- confirm there.
+OPTS ?= -n16
+
+# Pull in ../common.mk (presumably the shared build/run rules -- confirm).
+include ../common.mk
@@ -0,0 +1,73 @@
+// Tiled matrix multiply: C = A * B for N x N single-precision matrices.
+//
+// On every pass of the outer loop the work-group stages one tile of A and one
+// tile of B in local memory; each work-item then accumulates its partial dot
+// product from those two tiles.
+//
+//   A, B           input matrices, indexed as [row * N + col]
+//   C              output matrix, same indexing
+//   N              matrix dimension; no bounds checks are performed, so the
+//                  loop relies on N being a multiple of the work-group edge
+//   localA, localB local scratch tiles, each indexed up to
+//                  get_local_size(0) * get_local_size(0) floats
+__kernel void matmul(__global float *A,
+                     __global float *B,
+                     __global float *C,
+                     const unsigned int N,
+                     __local float *localA,
+                     __local float *localB)
+{
+    const int tileDim = get_local_size(0);  // work-group is assumed to be square
+    const int ty = get_local_id(1);
+    const int tx = get_local_id(0);
+    const int gRow = get_global_id(1);
+    const int gCol = get_global_id(0);
+
+    // Slot inside both local tiles that this work-item fills.
+    const int slot = ty * tileDim + tx;
+
+    float acc = 0.0f;
+
+    // Step across the shared dimension one tile at a time.
+    for (int tile = 0; tile < N; tile += tileDim) {
+        // Stage A[gRow][tile + tx] and B[tile + ty][gCol].
+        localA[slot] = A[gRow * N + tile + tx];
+        localB[slot] = B[(tile + ty) * N + gCol];
+
+        // Every load must be visible before any work-item reads the tiles.
+        barrier(CLK_LOCAL_MEM_FENCE);
+
+        // Partial dot product over the staged tiles.
+        for (int j = 0; j < tileDim; ++j) {
+            acc += localA[ty * tileDim + j] * localB[j * tileDim + tx];
+        }
+
+        // Nobody may overwrite the tiles until all work-items finished reading.
+        barrier(CLK_LOCAL_MEM_FENCE);
+    }
+
+    C[gRow * N + gCol] = acc;
+}
+
+/*__kernel void matmul(__global float *A, __global float *B, __global float *C, const unsigned int N)
+{
+    int globalRow = get_global_id(1);
+    int globalCol = get_global_id(0);
+    int localRow = get_local_id(1);
+    int localCol = get_local_id(0);
+
+    // Static local memory declaration
+    __local float localA[16][16];
+    __local float localB[16][16];
+
+    float sum = 0.0f;
+
+    // Iterate over blocks
+    for (int k = 0; k < N; k += 16) {
+        // Load a block of matrix A into local memory
+        localA[localRow][localCol] = A[globalRow * N + k + localCol];
+
+        // Load a block of matrix B into local memory
+        localB[localRow][localCol] = B[(k + localRow) * N + globalCol];
+
+        // Ensure the entire block is loaded
+        barrier(CLK_LOCAL_MEM_FENCE);
+
+        // Compute multiplication for this block
+        for (int j = 0; j < 16; j++) {
+            sum += localA[localRow][j] * localB[j][localCol];
+        }
+
+        // Wait until all threads have computed before loading the next block
+        barrier(CLK_LOCAL_MEM_FENCE);
+    }
+
+    C[globalRow * N + globalCol] = sum;
+}*/
@@ -0,0 +1,246 @@
+#include <assert.h>
+#include <limits.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+#include <unistd.h>
+
+#include <chrono>
+#include <vector>
+
+#include <CL/opencl.h>
+
+// Edge length of the square work-group. It is also the tile edge used to size
+// the kernel's local-memory arguments, and the matrix size must be a multiple
+// of it.
+#define LOCAL_SIZE 16
+
+// Kernel name looked up in the program with clCreateKernel.
+#define KERNEL_NAME "matmul"
+
+// Evaluates an OpenCL call that returns a cl_int status. For any status other
+// than CL_SUCCESS it prints the failing expression and the status code, calls
+// cleanup(), and terminates the process with exit(-1).
+#define CL_CHECK(_expr)                                                \
+   do {                                                                \
+     cl_int _err = _expr;                                              \
+     if (_err == CL_SUCCESS)                                           \
+       break;                                                          \
+     printf("OpenCL Error: '%s' returned %d!\n", #_expr, (int)_err);   \
+	 cleanup();			                                                     \
+     exit(-1);                                                         \
+   } while (0)
+
+// Variant of CL_CHECK for OpenCL calls that return a value and report their
+// status through an out-parameter. The macro declares a local `_err`, preset
+// to CL_INVALID_VALUE, so the wrapped expression has to pass `&_err` as its
+// error argument; if it does not, the preset value is treated as a failure.
+// On failure it prints the expression and status, calls cleanup(), and exits
+// with -1; otherwise the whole macro evaluates to the call's return value.
+// Implemented with a statement expression `({ ... })`, which is a GNU
+// extension rather than standard C++.
+#define CL_CHECK2(_expr)                                               \
+   ({                                                                  \
+     cl_int _err = CL_INVALID_VALUE;                                   \
+     decltype(_expr) _ret = _expr;                                     \
+     if (_err != CL_SUCCESS) {                                         \
+       printf("OpenCL Error: '%s' returned %d!\n", #_expr, (int)_err); \
+	   cleanup();			                                                   \
+       exit(-1);                                                       \
+     }                                                                 \
+     _ret;                                                             \
+   })
+
+// Reads the entire file `filename` into a freshly malloc'ed buffer.
+//
+// On success returns 0, stores the buffer in *data (the caller releases it
+// with free()) and the byte count in *size.
+// On failure returns -1. If any argument is null nothing is written;
+// otherwise *data is left as nullptr, *size as 0, and a diagnostic is
+// printed to stderr.
+static int read_kernel_file(const char* filename, uint8_t** data, size_t* size) {
+  if (nullptr == filename || nullptr == data || nullptr == size)
+    return -1;
+
+  *data = nullptr;
+  *size = 0;
+
+  // Binary mode: the file may be a precompiled kernel image rather than text,
+  // and the length reported by ftell must match what fread delivers.
+  FILE* fp = fopen(filename, "rb");
+  if (nullptr == fp) {
+    fprintf(stderr, "Failed to load kernel.\n");
+    return -1;
+  }
+
+  // Determine the file length by seeking to the end, then rewind.
+  long fsize = -1;
+  if (0 == fseek(fp, 0, SEEK_END))
+    fsize = ftell(fp);
+  if (fsize <= 0 || 0 != fseek(fp, 0, SEEK_SET)) {
+    fprintf(stderr, "Failed to determine kernel file size.\n");
+    fclose(fp);
+    return -1;
+  }
+
+  const size_t expected = static_cast<size_t>(fsize);
+  uint8_t* buffer = static_cast<uint8_t*>(malloc(expected));
+  if (nullptr == buffer) {
+    fprintf(stderr, "Failed to allocate kernel buffer.\n");
+    fclose(fp);
+    return -1;
+  }
+
+  const size_t nread = fread(buffer, 1, expected, fp);
+  fclose(fp);
+
+  // A short read would hand a truncated kernel to OpenCL; treat it as an error.
+  if (nread != expected) {
+    fprintf(stderr, "Failed to read kernel file.\n");
+    free(buffer);
+    return -1;
+  }
+
+  *data = buffer;
+  *size = nread;
+  return 0;
+}
+
+// Returns true when `a` and `b` are at most `ulp` representable float steps
+// (units in the last place) apart.
+//
+// The bit patterns are copied out with memcpy, which is well defined, unlike
+// reading the inactive member of a union. They are then remapped so that
+// integer order follows float order across the sign boundary; +0.0f and -0.0f
+// therefore compare equal. The distance is computed in 64 bits so operands of
+// opposite sign cannot overflow. NaNs get no special treatment: they are
+// compared by bit pattern like any other value.
+static bool compare_equal(float a, float b, int ulp = 21) {
+  static_assert(sizeof(float) == sizeof(int32_t), "float must be 32 bits wide");
+
+  int32_t bits_a;
+  int32_t bits_b;
+  memcpy(&bits_a, &a, sizeof(a));
+  memcpy(&bits_b, &b, sizeof(b));
+
+  // Sign-magnitude bit pattern -> monotonically ordered integer.
+  const auto to_ordered = [](int32_t bits) -> int64_t {
+    return (bits < 0) ? static_cast<int64_t>(INT32_MIN) - bits
+                      : static_cast<int64_t>(bits);
+  };
+
+  const int64_t delta = to_ordered(bits_a) - to_ordered(bits_b);
+  const int64_t distance = (delta < 0) ? -delta : delta;
+  return distance <= ulp;
+}
+
+// Host reference: C = A * B for dense N x N matrices stored as [row * N + col],
+// computed with a plain triple loop.
+static void matrix_multiply_cpu(float *A, float *B, float *C, int N) {
+    for (int row = 0; row < N; ++row) {
+        const float *a_row = A + row * N;
+        float *c_row = C + row * N;
+        for (int col = 0; col < N; ++col) {
+            float acc = 0.0f;
+            // Walk along row `row` of A and down column `col` of B.
+            for (int k = 0; k < N; ++k) {
+                acc += a_row[k] * B[k * N + col];
+            }
+            c_row[col] = acc;
+        }
+    }
+}
+
+// OpenCL handles and the kernel file buffer, kept at file scope so that
+// cleanup() can release them from any exit path. They start out NULL, which
+// lets cleanup() skip whatever has not been created yet.
+cl_device_id device_id = NULL;
+cl_context context = NULL;
+cl_command_queue commandQueue = NULL;
+cl_program program = NULL;
+cl_kernel kernel = NULL;
+cl_mem a_memobj = NULL;   // input matrix A
+cl_mem b_memobj = NULL;   // input matrix B
+cl_mem c_memobj = NULL;   // output matrix C
+uint8_t *kernel_bin = NULL;  // malloc'ed contents of the kernel file
+
+// Releases every OpenCL object and host buffer created so far.
+//
+// Each handle is reset after it is released, so calling cleanup() more than
+// once cannot release the same object twice or free kernel_bin twice.
+static void cleanup() {
+  if (commandQueue) {
+    clReleaseCommandQueue(commandQueue);
+    commandQueue = nullptr;
+  }
+  if (kernel) {
+    clReleaseKernel(kernel);
+    kernel = nullptr;
+  }
+  if (program) {
+    clReleaseProgram(program);
+    program = nullptr;
+  }
+  if (a_memobj) {
+    clReleaseMemObject(a_memobj);
+    a_memobj = nullptr;
+  }
+  if (b_memobj) {
+    clReleaseMemObject(b_memobj);
+    b_memobj = nullptr;
+  }
+  if (c_memobj) {
+    clReleaseMemObject(c_memobj);
+    c_memobj = nullptr;
+  }
+  if (context) {
+    clReleaseContext(context);
+    context = nullptr;
+  }
+  if (device_id) {
+    clReleaseDevice(device_id);
+    device_id = nullptr;
+  }
+  free(kernel_bin);  // free(nullptr) is a no-op
+  kernel_bin = nullptr;
+}
+
+// Matrix dimension (matrices are size x size); overridden by the -n option.
+int size = 64;
+
+// Writes the command-line synopsis to stdout.
+static void show_usage() {
+  fputs("Usage: [-n size] [-h: help]\n", stdout);
+}
+
+// Parses the command line and updates the global `size`.
+//
+//   -n <size>  matrix dimension; must be a positive decimal integer that
+//              fits in an int, otherwise an error and the usage text are
+//              printed and the process exits with -1
+//   -h, -?     print usage and exit with 0 (getopt also returns '?' for
+//              unrecognized options and for a missing -n argument, so those
+//              take this path too)
+//
+// "f" is accepted by the option string but has no handler, so -f reaches the
+// default case (usage, exit -1).
+static void parse_args(int argc, char **argv) {
+  int c;
+  while ((c = getopt(argc, argv, "fn:h?")) != -1) {
+    switch (c) {
+    case 'n': {
+      // strtol, unlike atoi, lets us reject empty, non-numeric, and
+      // out-of-range input instead of silently running with a bogus size.
+      char *end = nullptr;
+      const long value = strtol(optarg, &end, 10);
+      if (end == optarg || *end != '\0' || value <= 0 || value > INT_MAX) {
+        printf("Error: invalid matrix size '%s'\n", optarg);
+        show_usage();
+        exit(-1);
+      }
+      size = static_cast<int>(value);
+    } break;
+    case 'h':
+    case '?': {
+      show_usage();
+      exit(0);
+    } break;
+    default:
+      show_usage();
+      exit(-1);
+    }
+  }
+}
+
+// Host driver: multiplies two size x size matrices on the OpenCL device with
+// the KERNEL_NAME kernel and verifies the result against a CPU reference.
+//
+// Returns 0 when every element matches, 1 when mismatches were found, and -1
+// for an invalid size or an unreadable kernel file. OpenCL failures terminate
+// the process from inside CL_CHECK / CL_CHECK2.
+int main (int argc, char **argv) {
+  // parse command arguments
+  parse_args(argc, argv);
+
+  printf("Matrix size=%d\n", size);
+  // Work-groups are LOCAL_SIZE x LOCAL_SIZE, so the global size has to be a
+  // positive multiple of LOCAL_SIZE.
+  if (size <= 0 || (size % LOCAL_SIZE) != 0) {
+    printf("Error: matrix size must be a positive multiple of %d\n", LOCAL_SIZE);
+    return -1;
+  }
+
+  // Element and byte counts are computed in size_t so that large sizes do
+  // not overflow int arithmetic.
+  const size_t dim = static_cast<size_t>(size);
+  const size_t num_elems = dim * dim;
+  const size_t nbytes = num_elems * sizeof(float);
+
+  cl_platform_id platform_id;
+  size_t kernel_size;
+
+  // Getting platform and device information
+  CL_CHECK(clGetPlatformIDs(1, &platform_id, NULL));
+  CL_CHECK(clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_DEFAULT, 1, &device_id, NULL));
+
+  printf("Create context\n");
+  context = CL_CHECK2(clCreateContext(NULL, 1, &device_id, NULL, NULL,  &_err));
+
+  char device_string[1024];
+  CL_CHECK(clGetDeviceInfo(device_id, CL_DEVICE_NAME, sizeof(device_string), device_string, NULL));
+  printf("Using device: %s\n", device_string);
+
+  printf("Allocate device buffers\n");
+  a_memobj = CL_CHECK2(clCreateBuffer(context, CL_MEM_READ_ONLY, nbytes, NULL, &_err));
+  b_memobj = CL_CHECK2(clCreateBuffer(context, CL_MEM_READ_ONLY, nbytes, NULL, &_err));
+  c_memobj = CL_CHECK2(clCreateBuffer(context, CL_MEM_WRITE_ONLY, nbytes, NULL, &_err));
+
+  printf("Create program from kernel source\n");
+#ifdef HOSTGPU
+  if (0 != read_kernel_file("kernel.cl", &kernel_bin, &kernel_size)) {
+    cleanup();  // release the context and buffers created above
+    return -1;
+  }
+  program = CL_CHECK2(clCreateProgramWithSource(
+    context, 1, (const char**)&kernel_bin, &kernel_size, &_err));
+#else
+  if (0 != read_kernel_file("kernel.pocl", &kernel_bin, &kernel_size)) {
+    cleanup();  // release the context and buffers created above
+    return -1;
+  }
+  program = CL_CHECK2(clCreateProgramWithBinary(
+    context, 1, &device_id, &kernel_size, (const uint8_t**)&kernel_bin, NULL, &_err));
+#endif
+  if (program == NULL) {
+    cleanup();
+    return -1;
+  }
+
+  // Build program
+  CL_CHECK(clBuildProgram(program, 1, &device_id, NULL, NULL, NULL));
+
+  // Create kernel
+  kernel = CL_CHECK2(clCreateKernel(program, KERNEL_NAME, &_err));
+
+  size_t local_size[2] = {LOCAL_SIZE, LOCAL_SIZE};
+  size_t global_size[2] = {dim, dim};
+
+  // One LOCAL_SIZE x LOCAL_SIZE float tile per local-memory argument.
+  const size_t tile_bytes = local_size[0] * local_size[1] * sizeof(float);
+
+  // Argument 3 is passed as an explicit 32-bit unsigned value.
+  const cl_uint n_arg = static_cast<cl_uint>(size);
+
+  // Set kernel arguments
+  CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&a_memobj));
+  CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&b_memobj));
+  CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&c_memobj));
+  CL_CHECK(clSetKernelArg(kernel, 3, sizeof(n_arg), &n_arg));
+  CL_CHECK(clSetKernelArg(kernel, 4, tile_bytes, NULL));
+  CL_CHECK(clSetKernelArg(kernel, 5, tile_bytes, NULL));
+
+  // Allocate memories for input arrays and output arrays.
+  std::vector<float> h_a(num_elems);
+  std::vector<float> h_b(num_elems);
+  std::vector<float> h_c(num_elems);
+
+  // Initialize values for array members.
+  for (size_t i = 0; i < num_elems; ++i) {
+  #ifdef USE_FLOAT
+    h_a[i] = (float)rand() / (float)RAND_MAX;
+    h_b[i] = (float)rand() / (float)RAND_MAX;
+  #else
+    h_a[i] = static_cast<float>(rand());
+    h_b[i] = static_cast<float>(rand());
+  #endif
+    // Sentinel so output elements the kernel never wrote are easy to spot.
+    h_c[i] = static_cast<float>(0xdeadbeef);
+  }
+
+  // Creating command queue
+  commandQueue = CL_CHECK2(clCreateCommandQueue(context, device_id, 0, &_err));
+
+  printf("Upload source buffers\n");
+  CL_CHECK(clEnqueueWriteBuffer(commandQueue, a_memobj, CL_TRUE, 0, nbytes, h_a.data(), 0, NULL, NULL));
+  CL_CHECK(clEnqueueWriteBuffer(commandQueue, b_memobj, CL_TRUE, 0, nbytes, h_b.data(), 0, NULL, NULL));
+
+  printf("Execute the kernel\n");
+  auto time_start = std::chrono::high_resolution_clock::now();
+  CL_CHECK(clEnqueueNDRangeKernel(commandQueue, kernel, 2, NULL, global_size, local_size, 0, NULL, NULL));
+  CL_CHECK(clFinish(commandQueue));
+  auto time_end = std::chrono::high_resolution_clock::now();
+  // Floating-point milliseconds keep sub-millisecond runs from reporting 0.
+  const double elapsed = std::chrono::duration<double, std::milli>(time_end - time_start).count();
+  printf("Elapsed time: %lg ms\n", elapsed);
+
+  printf("Download destination buffer\n");
+  CL_CHECK(clEnqueueReadBuffer(commandQueue, c_memobj, CL_TRUE, 0, nbytes, h_c.data(), 0, NULL, NULL));
+
+  printf("Verify result\n");
+  std::vector<float> ref_vec(num_elems);
+  matrix_multiply_cpu(h_a.data(), h_b.data(), ref_vec.data(), size);
+  int errors = 0;
+  for (size_t i = 0; i < num_elems; ++i) {
+    if (!compare_equal(h_c[i], ref_vec[i])) {
+      // Only the first 100 mismatches are printed; all of them are counted.
+      if (errors < 100)
+        printf("*** error: [%zu] expected=%f, actual=%f\n", i, ref_vec[i], h_c[i]);
+      ++errors;
+    }
+  }
+  if (errors != 0) {
+    printf("FAILED! - %d errors\n", errors);
+  } else {
+    printf("PASSED!\n");
+  }
+
+  // Clean up
+  cleanup();
+
+  // Exit statuses are truncated to 8 bits on POSIX, so returning the raw
+  // count would report success for e.g. 256 errors.
+  return (errors != 0) ? 1 : 0;
+}