Add 'smemcoherence' kernel

This commit is contained in:
Hansung Kim
2024-01-02 20:28:40 -08:00
parent edb385f138
commit a5c50b60c8
12 changed files with 562 additions and 0 deletions

View File

@@ -0,0 +1,8 @@
main.o: main.cc \
/scratch/hansung/build/pocl-vortex2/runtime/include/CL/opencl.h \
/scratch/hansung/build/pocl-vortex2/runtime/include/CL/cl.h \
/scratch/hansung/build/pocl-vortex2/runtime/include/CL/cl_version.h \
/scratch/hansung/build/pocl-vortex2/runtime/include/CL/cl_platform.h \
/scratch/hansung/build/pocl-vortex2/runtime/include/CL/cl_gl.h \
/scratch/hansung/build/pocl-vortex2/runtime/include/CL/cl_ext.h \
/scratch/hansung/build/pocl-vortex2/runtime/include/CL/cl_ext_pocl.h

6
tests/opencl/flops/.gitignore vendored Normal file
View File

@@ -0,0 +1,6 @@
flops
*.o
*.bin*
*.pocl
*.dump
*.vcd

View File

@@ -0,0 +1,7 @@
PROJECT = flops
SRCS = main.cc
OPTS ?= -n64
include ../common.mk

View File

View File

@@ -0,0 +1,13 @@
__kernel void flops (__global volatile const float *src,
__global volatile float *dst,
__local volatile float *smem)
{
int gid = get_global_id(0);
float f = 0.0f;
float incr = src[0];
__attribute__((opencl_unroll_hint))
for (int i = 0; i < 5000; i++) {
f += incr;
}
dst[gid] = f;
}

237
tests/opencl/flops/main.cc Normal file
View File

@@ -0,0 +1,237 @@
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <math.h>
#include <CL/opencl.h>
#include <unistd.h>
#include <string.h>
#include <chrono>
#define KERNEL_NAME "flops"
#define CL_CHECK(_expr) \
do { \
cl_int _err = _expr; \
if (_err == CL_SUCCESS) \
break; \
printf("OpenCL Error: '%s' returned %d!\n", #_expr, (int)_err); \
cleanup(); \
exit(-1); \
} while (0)
#define CL_CHECK2(_expr) \
({ \
cl_int _err = CL_INVALID_VALUE; \
decltype(_expr) _ret = _expr; \
if (_err != CL_SUCCESS) { \
printf("OpenCL Error: '%s' returned %d!\n", #_expr, (int)_err); \
cleanup(); \
exit(-1); \
} \
_ret; \
})
static int read_kernel_file(const char* filename, uint8_t** data, size_t* size) {
if (nullptr == filename || nullptr == data || 0 == size)
return -1;
FILE* fp = fopen(filename, "r");
if (NULL == fp) {
fprintf(stderr, "Failed to load kernel.");
return -1;
}
fseek(fp , 0 , SEEK_END);
long fsize = ftell(fp);
rewind(fp);
*data = (uint8_t*)malloc(fsize);
*size = fread(*data, 1, fsize, fp);
fclose(fp);
return 0;
}
static int write_operand_file(const char* filename, void* data, size_t size) {
if (nullptr == filename || nullptr == data || 0 == size)
return -1;
FILE* fp = fopen(filename, "wb");
if (NULL == fp) {
fprintf(stderr, "Failed to write operand data.\n");
return -1;
}
size_t wsize = fwrite(data, size, 1, fp);
if (wsize != 1) {
fprintf(stderr, "Failed to write operand data.\n");
return -1;
}
fclose(fp);
return 0;
}
static bool almost_equal(float a, float b, int ulp = 4) {
union fi_t { int i; float f; };
fi_t fa, fb;
fa.f = a;
fb.f = b;
return std::abs(fa.i - fb.i) <= ulp;
}
cl_device_id device_id = NULL;
cl_context context = NULL;
cl_command_queue commandQueue = NULL;
cl_program program = NULL;
cl_kernel kernel = NULL;
cl_mem src_memobj = NULL;
cl_mem dst_memobj = NULL;
float *h_src = NULL;
float *h_dst = NULL;
uint8_t *kernel_bin = NULL;
static void cleanup() {
if (commandQueue) clReleaseCommandQueue(commandQueue);
if (kernel) clReleaseKernel(kernel);
if (program) clReleaseProgram(program);
if (src_memobj) clReleaseMemObject(src_memobj);
if (dst_memobj) clReleaseMemObject(dst_memobj);
if (context) clReleaseContext(context);
if (device_id) clReleaseDevice(device_id);
if (kernel_bin) free(kernel_bin);
if (h_src) free(h_src);
if (h_dst) free(h_dst);
}
int size = 64;
static void show_usage() {
printf("Usage: [-n size] [-h: help]\n");
}
static void parse_args(int argc, char **argv) {
int c;
while ((c = getopt(argc, argv, "n:h?")) != -1) {
switch (c) {
case 'n':
size = atoi(optarg);
break;
case 'h':
case '?': {
show_usage();
exit(0);
} break;
default:
show_usage();
exit(-1);
}
}
printf("Workload size=%d\n", size);
}
int main (int argc, char **argv) {
// parse command arguments
parse_args(argc, argv);
cl_platform_id platform_id;
size_t kernel_size;
cl_int binary_status;
// read kernel binary from file
if (0 != read_kernel_file("kernel.pocl", &kernel_bin, &kernel_size))
return -1;
// Getting platform and device information
CL_CHECK(clGetPlatformIDs(1, &platform_id, NULL));
CL_CHECK(clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_DEFAULT, 1, &device_id, NULL));
printf("Create context\n");
context = CL_CHECK2(clCreateContext(NULL, 1, &device_id, NULL, NULL, &_err));
printf("Allocate device buffers\n");
size_t nbytes = size * sizeof(float);
src_memobj = CL_CHECK2(clCreateBuffer(context, CL_MEM_READ_ONLY, nbytes, NULL, &_err));
dst_memobj = CL_CHECK2(clCreateBuffer(context, CL_MEM_WRITE_ONLY, nbytes, NULL, &_err));
printf("Create program from kernel source\n");
cl_int _err;
program = clCreateProgramWithBinary(
context, 1, &device_id, &kernel_size, (const uint8_t**)&kernel_bin, &binary_status, &_err);
if (program == NULL) {
cleanup();
return -1;
}
// Build program
CL_CHECK(clBuildProgram(program, 1, &device_id, NULL, NULL, NULL));
// Create kernel
kernel = CL_CHECK2(clCreateKernel(program, KERNEL_NAME, &_err));
// store entire array to sharedmem
size_t local_size = size;
// Set kernel arguments
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&src_memobj));
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&dst_memobj));
CL_CHECK(clSetKernelArg(kernel, 2, local_size*sizeof(float), NULL));
// Allocate memories for input arrays and output arrays.
h_src = (float*)malloc(nbytes);
h_dst = (float*)malloc(nbytes);
// Initialize values for array members.
for (int i = 0; i < size; ++i) {
h_src[i] = sinf(i)*sinf(i);
h_dst[i] = 0xdeadbeef;
//printf("*** [%d]: h_src=%f, h_dst=%f\n", i, h_src[i], h_dst[i]);
}
// NOTE(hansung): Dump operand buffer to a file
if (write_operand_file("flops.input.src.bin", h_src, nbytes) != 0)
return EXIT_FAILURE;
// Creating command queue
commandQueue = CL_CHECK2(clCreateCommandQueue(context, device_id, 0, &_err));
printf("Upload source buffers\n");
CL_CHECK(clEnqueueWriteBuffer(commandQueue, src_memobj, CL_TRUE, 0, nbytes, h_src, 0, NULL, NULL));
printf("Execute the kernel\n");
size_t global_work_size[1] = {size};
size_t local_work_size[1] = {1};
auto time_start = std::chrono::high_resolution_clock::now();
CL_CHECK(clEnqueueNDRangeKernel(commandQueue, kernel, 1, NULL, global_work_size, local_work_size, 0, NULL, NULL));
CL_CHECK(clFinish(commandQueue));
auto time_end = std::chrono::high_resolution_clock::now();
double elapsed = std::chrono::duration_cast<std::chrono::milliseconds>(time_end - time_start).count();
printf("Elapsed time: %lg ms\n", elapsed);
printf("Download destination buffer\n");
CL_CHECK(clEnqueueReadBuffer(commandQueue, dst_memobj, CL_TRUE, 0, nbytes, h_dst, 0, NULL, NULL));
printf("Verify result\n");
int errors = 0;
for (int i = 0; i < size; ++i) {
float ref = h_src[i];
if (!almost_equal(h_dst[i], ref)) {
if (errors < 100)
printf("*** error: [%d] expected=%f, actual=%f, src=%f\n", i, ref, h_dst[i], h_src[i]);
++errors;
}
}
if (0 == errors) {
printf("PASSED!\n");
} else {
printf("FAILED! - %d errors\n", errors);
}
// Clean up
cleanup();
return errors;
}

View File

@@ -0,0 +1,8 @@
main.o: main.cc \
/scratch/hansung/build/pocl-vortex2/runtime/include/CL/opencl.h \
/scratch/hansung/build/pocl-vortex2/runtime/include/CL/cl.h \
/scratch/hansung/build/pocl-vortex2/runtime/include/CL/cl_version.h \
/scratch/hansung/build/pocl-vortex2/runtime/include/CL/cl_platform.h \
/scratch/hansung/build/pocl-vortex2/runtime/include/CL/cl_gl.h \
/scratch/hansung/build/pocl-vortex2/runtime/include/CL/cl_ext.h \
/scratch/hansung/build/pocl-vortex2/runtime/include/CL/cl_ext_pocl.h

5
tests/opencl/smemcoherence/.gitignore vendored Normal file
View File

@@ -0,0 +1,5 @@
smemcoherence
*.bin*
*.pocl
*.dump
*.o

View File

@@ -0,0 +1,7 @@
PROJECT = smemcoherence
SRCS = main.cc
OPTS ?= -n64
include ../common.mk

View File

View File

@@ -0,0 +1,33 @@
__kernel void smemcoherence (__global volatile const int *src,
__global volatile int *dst,
__local volatile int *smem,
int n)
{
__local volatile int *markers = (__local int *)((__local unsigned char *)smem + 0x1000);
int gid = get_global_id(0);
// assumes total store ordering on smem
markers[gid] = 0;
smem[gid] = gid;
markers[gid] = 1;
// 0-th thread checks if all threads finished writing
if (gid == 0) {
int gridsize = get_global_size(0);
int retry = 0;
for (;; retry++) {
for (int i = 0; i < gridsize; i++) {
if (markers[i] != 1) {
goto try_again;
}
}
break;
try_again:;
}
for (int i = 0; i < n; i++) {
dst[i] = smem[i];
}
dst[n] = retry;
}
}

View File

@@ -0,0 +1,238 @@
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <math.h>
#include <CL/opencl.h>
#include <unistd.h>
#include <string.h>
#include <chrono>
#define KERNEL_NAME "smemcoherence"
#define CL_CHECK(_expr) \
do { \
cl_int _err = _expr; \
if (_err == CL_SUCCESS) \
break; \
printf("OpenCL Error: '%s' returned %d!\n", #_expr, (int)_err); \
cleanup(); \
exit(-1); \
} while (0)
#define CL_CHECK2(_expr) \
({ \
cl_int _err = CL_INVALID_VALUE; \
decltype(_expr) _ret = _expr; \
if (_err != CL_SUCCESS) { \
printf("OpenCL Error: '%s' returned %d!\n", #_expr, (int)_err); \
cleanup(); \
exit(-1); \
} \
_ret; \
})
static int read_kernel_file(const char* filename, uint8_t** data, size_t* size) {
if (nullptr == filename || nullptr == data || 0 == size)
return -1;
FILE* fp = fopen(filename, "r");
if (NULL == fp) {
fprintf(stderr, "Failed to load kernel.");
return -1;
}
fseek(fp , 0 , SEEK_END);
long fsize = ftell(fp);
rewind(fp);
*data = (uint8_t*)malloc(fsize);
*size = fread(*data, 1, fsize, fp);
fclose(fp);
return 0;
}
static int write_operand_file(const char* filename, void* data, size_t size) {
if (nullptr == filename || nullptr == data || 0 == size)
return -1;
FILE* fp = fopen(filename, "wb");
if (NULL == fp) {
fprintf(stderr, "Failed to write operand data.\n");
return -1;
}
size_t wsize = fwrite(data, size, 1, fp);
if (wsize != 1) {
fprintf(stderr, "Failed to write operand data.\n");
return -1;
}
fclose(fp);
return 0;
}
static bool almost_equal(float a, float b, int ulp = 4) {
union fi_t { int i; float f; };
fi_t fa, fb;
fa.f = a;
fb.f = b;
return std::abs(fa.i - fb.i) <= ulp;
}
cl_device_id device_id = NULL;
cl_context context = NULL;
cl_command_queue commandQueue = NULL;
cl_program program = NULL;
cl_kernel kernel = NULL;
cl_mem src_memobj = NULL;
cl_mem dst_memobj = NULL;
int *h_src = NULL;
int *h_dst = NULL;
uint8_t *kernel_bin = NULL;
static void cleanup() {
if (commandQueue) clReleaseCommandQueue(commandQueue);
if (kernel) clReleaseKernel(kernel);
if (program) clReleaseProgram(program);
if (src_memobj) clReleaseMemObject(src_memobj);
if (dst_memobj) clReleaseMemObject(dst_memobj);
if (context) clReleaseContext(context);
if (device_id) clReleaseDevice(device_id);
if (kernel_bin) free(kernel_bin);
if (h_src) free(h_src);
if (h_dst) free(h_dst);
}
int size = 64;
static void show_usage() {
printf("Usage: [-n size] [-h: help]\n");
}
static void parse_args(int argc, char **argv) {
int c;
while ((c = getopt(argc, argv, "n:h?")) != -1) {
switch (c) {
case 'n':
size = atoi(optarg);
break;
case 'h':
case '?': {
show_usage();
exit(0);
} break;
default:
show_usage();
exit(-1);
}
}
printf("Workload size=%d\n", size);
}
int main (int argc, char **argv) {
// parse command arguments
parse_args(argc, argv);
cl_platform_id platform_id;
size_t kernel_size;
cl_int binary_status;
// read kernel binary from file
if (0 != read_kernel_file("kernel.pocl", &kernel_bin, &kernel_size))
return -1;
// Getting platform and device information
CL_CHECK(clGetPlatformIDs(1, &platform_id, NULL));
CL_CHECK(clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_DEFAULT, 1, &device_id, NULL));
printf("Create context\n");
context = CL_CHECK2(clCreateContext(NULL, 1, &device_id, NULL, NULL, &_err));
printf("Allocate device buffers\n");
// + 1 for the trial value
size_t nbytes = (size + 1) * sizeof(int);
src_memobj = CL_CHECK2(clCreateBuffer(context, CL_MEM_READ_ONLY, nbytes, NULL, &_err));
dst_memobj = CL_CHECK2(clCreateBuffer(context, CL_MEM_WRITE_ONLY, nbytes, NULL, &_err));
printf("Create program from kernel source\n");
cl_int _err;
program = clCreateProgramWithBinary(
context, 1, &device_id, &kernel_size, (const uint8_t**)&kernel_bin, &binary_status, &_err);
if (program == NULL) {
cleanup();
return -1;
}
// Build program
CL_CHECK(clBuildProgram(program, 1, &device_id, NULL, NULL, NULL));
// Create kernel
kernel = CL_CHECK2(clCreateKernel(program, KERNEL_NAME, &_err));
size_t local_nbytes = 0x2000;
// Set kernel arguments
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&src_memobj));
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&dst_memobj));
CL_CHECK(clSetKernelArg(kernel, 2, local_nbytes, NULL));
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(uint32_t), &size));
// Allocate memories for input arrays and output arrays.
h_src = (int*)malloc(nbytes);
h_dst = (int*)malloc(nbytes);
// Initialize values for array members.
for (int i = 0; i < size; ++i) {
h_src[i] = i;
h_dst[i] = 0xdeadbeef;
//printf("*** [%d]: h_src=%f, h_dst=%f\n", i, h_src[i], h_dst[i]);
}
// NOTE(hansung): Dump operand buffer to a file
if (write_operand_file("smemcoherence.input.src.bin", h_src, nbytes) != 0)
return EXIT_FAILURE;
// Creating command queue
commandQueue = CL_CHECK2(clCreateCommandQueue(context, device_id, 0, &_err));
printf("Upload source buffers\n");
CL_CHECK(clEnqueueWriteBuffer(commandQueue, src_memobj, CL_TRUE, 0, nbytes, h_src, 0, NULL, NULL));
printf("Execute the kernel\n");
size_t global_work_size[1] = {size};
size_t local_work_size[1] = {1};
auto time_start = std::chrono::high_resolution_clock::now();
CL_CHECK(clEnqueueNDRangeKernel(commandQueue, kernel, 1, NULL, global_work_size, local_work_size, 0, NULL, NULL));
CL_CHECK(clFinish(commandQueue));
auto time_end = std::chrono::high_resolution_clock::now();
double elapsed = std::chrono::duration_cast<std::chrono::milliseconds>(time_end - time_start).count();
printf("Elapsed time: %lg ms\n", elapsed);
printf("Download destination buffer\n");
CL_CHECK(clEnqueueReadBuffer(commandQueue, dst_memobj, CL_TRUE, 0, nbytes, h_dst, 0, NULL, NULL));
printf("Verify result\n");
int errors = 0;
for (int i = 0; i < size; ++i) {
int ref = i;
if (h_dst[i] != ref) {
printf("*** error: [%d] expected=%d, actual=%d\n", i, ref, h_dst[i]);
++errors;
}
}
printf("smem check re-trial count: %d\n", h_dst[size]);
if (0 == errors) {
printf("PASSED!\n");
} else {
printf("FAILED! - %d errors\n", errors);
}
// Clean up
cleanup();
return errors;
}