Vortex 2.0 changes:
+ Microarchitecture optimizations + 64-bit support + Xilinx FPGA support + LLVM-16 support + Refactoring and quality control fixes minor update minor update minor update minor update minor update minor update cleanup cleanup cache bindings and memory perf refactory minor update minor update hw unit tests fixes minor update minor update minor update minor update minor update minor udpate minor update minor update minor update minor update minor update minor update minor update minor updates minor updates minor update minor update minor update minor update minor update minor update minor updates minor updates minor updates minor updates minor update minor update
This commit is contained in:
@@ -1,71 +1,7 @@
|
||||
XLEN ?= 32
|
||||
|
||||
LLVM_PREFIX ?= /opt/llvm-riscv
|
||||
RISCV_TOOLCHAIN_PATH ?= /opt/riscv-gnu-toolchain
|
||||
SYSROOT ?= $(RISCV_TOOLCHAIN_PATH)/riscv32-unknown-elf
|
||||
POCL_CC_PATH ?= /opt/pocl/compiler
|
||||
POCL_RT_PATH ?= /opt/pocl/runtime
|
||||
|
||||
VORTEX_DRV_PATH ?= $(realpath ../../../driver)
|
||||
VORTEX_RT_PATH ?= $(realpath ../../../runtime)
|
||||
|
||||
OPTS ?= -n16
|
||||
|
||||
K_LLCFLAGS += "-O3 -march=riscv32 -target-abi=ilp32f -mcpu=generic-rv32 -mattr=+m,+f -mattr=+vortex -float-abi=hard -code-model=small"
|
||||
K_CFLAGS += "-v -O3 --sysroot=$(SYSROOT) --gcc-toolchain=$(RISCV_TOOLCHAIN_PATH) -march=rv32imf -mabi=ilp32f -Xclang -target-feature -Xclang +vortex -I$(VORTEX_RT_PATH)/include -fno-rtti -fno-exceptions -ffreestanding -nostartfiles -fdata-sections -ffunction-sections"
|
||||
K_LDFLAGS += "-Wl,-Bstatic,-T$(VORTEX_RT_PATH)/linker/vx_link$(XLEN).ld -Wl,--gc-sections $(VORTEX_RT_PATH)/libvortexrt.a -lm"
|
||||
|
||||
CXXFLAGS += -std=c++11 -Wall -Wextra -Wfatal-errors
|
||||
|
||||
CXXFLAGS += -Wno-deprecated-declarations -Wno-unused-parameter -Wno-narrowing
|
||||
|
||||
CXXFLAGS += -I$(POCL_RT_PATH)/include
|
||||
|
||||
LDFLAGS += -L$(POCL_RT_PATH)/lib -L$(VORTEX_DRV_PATH)/stub -lOpenCL -lvortex
|
||||
|
||||
# Debugigng
|
||||
ifdef DEBUG
|
||||
CXXFLAGS += -g -O0
|
||||
else
|
||||
CXXFLAGS += -O2 -DNDEBUG
|
||||
endif
|
||||
|
||||
PROJECT = sfilter
|
||||
|
||||
SRCS = main.cc
|
||||
|
||||
all: $(PROJECT) kernel.pocl
|
||||
OPTS ?= -n16
|
||||
|
||||
kernel.pocl: kernel.cl
|
||||
LLVM_PREFIX=$(LLVM_PREFIX) POCL_DEBUG=all LD_LIBRARY_PATH=$(LLVM_PREFIX)/lib:$(POCL_CC_PATH)/lib $(POCL_CC_PATH)/bin/poclcc -LLCFLAGS $(K_LLCFLAGS) -CFLAGS $(K_CFLAGS) -LDFLAGS $(K_LDFLAGS) -o kernel.pocl kernel.cl
|
||||
|
||||
$(PROJECT): $(SRCS)
|
||||
$(CXX) $(CXXFLAGS) $^ $(LDFLAGS) -o $@
|
||||
|
||||
run-fpga: $(PROJECT) kernel.pocl
|
||||
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/fpga:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
|
||||
|
||||
run-asesim: $(PROJECT) kernel.pocl
|
||||
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/asesim:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
|
||||
|
||||
run-vlsim: $(PROJECT) kernel.pocl
|
||||
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/vlsim:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
|
||||
|
||||
run-simx: $(PROJECT) kernel.pocl
|
||||
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/simx:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
|
||||
|
||||
run-rtlsim: $(PROJECT) kernel.pocl
|
||||
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/rtlsim:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
|
||||
|
||||
.depend: $(SRCS)
|
||||
$(CXX) $(CXXFLAGS) -MM $^ > .depend;
|
||||
|
||||
clean:
|
||||
rm -rf $(PROJECT) *.o .depend
|
||||
|
||||
clean-all: clean
|
||||
rm -rf *.pocl *.dump
|
||||
|
||||
ifneq ($(MAKECMDGOALS),clean)
|
||||
-include .depend
|
||||
endif
|
||||
include ../common.mk
|
||||
|
||||
@@ -6,16 +6,18 @@ __kernel void sfilter(__global float *src, __global float *dst, long ldc,
|
||||
{
|
||||
long x = get_global_id(0);
|
||||
long y = get_global_id(1);
|
||||
|
||||
float i0 = src[(x-1)+(y-1)*ldc]*m0;
|
||||
float i1 = src[(x) +(y-1)*ldc]*m1;
|
||||
float i2 = src[(x+1)+(y-1)*ldc]*m2;
|
||||
float i3 = src[(x-1)+(y) *ldc]*m3;
|
||||
float i4 = src[(x) + y * ldc]*m4;
|
||||
float i5 = src[(x+1)+(y) *ldc]*m5;
|
||||
float i6 = src[(x-1)+(y+1)*ldc]*m6;
|
||||
float i7 = src[(x) +(y+1)*ldc]*m7;
|
||||
float i8 = src[(x+1)+(y+1)*ldc]*m8;
|
||||
|
||||
dst[x+y*ldc] = i0 + i1 + i2 + i3 + i4 + i5 + i6 + i7 + i8;
|
||||
int addr = x + y * ldc;
|
||||
|
||||
float i0 = src[addr-1-1*ldc]*m0;
|
||||
float i1 = src[addr+0-1*ldc]*m1;
|
||||
float i2 = src[addr+1-1*ldc]*m2;
|
||||
float i3 = src[addr-1+0*ldc]*m3;
|
||||
float i4 = src[addr+0+0*ldc]*m4;
|
||||
float i5 = src[addr+1+0*ldc]*m5;
|
||||
float i6 = src[addr-1+1*ldc]*m6;
|
||||
float i7 = src[addr+0+1*ldc]*m7;
|
||||
float i8 = src[addr+1+1*ldc]*m8;
|
||||
|
||||
dst[addr] = i0 + i1 + i2 + i3 + i4 + i5 + i6 + i7 + i8;
|
||||
}
|
||||
|
||||
Binary file not shown.
@@ -34,6 +34,7 @@
|
||||
#include <string.h>
|
||||
#include <unistd.h>
|
||||
#include <chrono>
|
||||
#include <vector>
|
||||
|
||||
#define CL_CHECK(_expr) \
|
||||
do { \
|
||||
@@ -47,7 +48,7 @@
|
||||
#define CL_CHECK_ERR(_expr) \
|
||||
({ \
|
||||
cl_int _err = CL_INVALID_VALUE; \
|
||||
decltype(_expr) _ret = _expr; \
|
||||
decltype(_expr) _ret = _expr; \
|
||||
if (_err != CL_SUCCESS) { \
|
||||
fprintf(stderr, "OpenCL Error: '%s' returned %d!\n", #_expr, (int)_err); \
|
||||
abort(); \
|
||||
@@ -81,85 +82,17 @@ static int read_kernel_file(const char* filename, uint8_t** data, size_t* size)
|
||||
return 0;
|
||||
}
|
||||
|
||||
uint8_t *kernel_bin = NULL;
|
||||
|
||||
// inlcude pocl float to half conversions
|
||||
typedef union {
|
||||
int32_t i;
|
||||
float f;
|
||||
} FloatConvUnion;
|
||||
cl_half poclu_float_to_cl_half(float value) {
|
||||
FloatConvUnion u;
|
||||
u.f = value;
|
||||
cl_half half = (u.i >> 16) & 0x8000; // sign
|
||||
cl_half fraction =
|
||||
(u.i >> 12) & 0x007ff; // fraction with extra bit for rounding
|
||||
cl_half exponent = (u.i >> 23) & 0xff; // exponent
|
||||
|
||||
if (exponent < 0x0067) // Return signed zero if zero or value is too small for
|
||||
// denormal half
|
||||
return half;
|
||||
|
||||
if (exponent > 0x008e) { // value was NaN or Inf
|
||||
half |= 0x7c00u; // Make into inf
|
||||
half |= exponent == 255 &&
|
||||
(u.i & 0x007fffffu); // If value was NaN make this into NaN
|
||||
return half;
|
||||
}
|
||||
|
||||
if (exponent < 0x0071) { // Denormal
|
||||
fraction |= 0x0800u;
|
||||
|
||||
// rounding
|
||||
half |= (fraction >> (0x0072 - exponent)) +
|
||||
((fraction >> (0x0071 - exponent)) & 1);
|
||||
return half;
|
||||
}
|
||||
|
||||
half |= ((exponent - 0x0070) << 10) | (fraction >> 1);
|
||||
half += fraction & 1; // rounding
|
||||
return half;
|
||||
}
|
||||
#ifndef INFINITY
|
||||
#define INFINITY 1.0 / 0.0
|
||||
#endif
|
||||
|
||||
#ifndef NAN
|
||||
#define NAN 0.0 / 0.0
|
||||
#endif
|
||||
|
||||
float poclu_cl_half_to_float(cl_half value) {
|
||||
if (value == 0xFC00) {
|
||||
return -INFINITY;
|
||||
}
|
||||
if (value == 0x7C00) {
|
||||
return INFINITY;
|
||||
}
|
||||
|
||||
int sgn = ((value & 0x8000) >> 15);
|
||||
int exp = (value & 0x7C00) >> 10;
|
||||
int mant = value & 0x03FF;
|
||||
|
||||
if (exp == 0x1F && mant != 0) {
|
||||
return NAN;
|
||||
}
|
||||
|
||||
float v = (exp == 0) ? mant : mant | 0x0400; // 1.x if not denormal
|
||||
v /= 0x400;
|
||||
float mul = exp2((float)exp - 15);
|
||||
v *= mul;
|
||||
if (sgn) {
|
||||
v *= -1;
|
||||
}
|
||||
return v;
|
||||
static bool almost_equal(float a, float b, int ulp = 4) {
|
||||
union fi_t { int i; float f; };
|
||||
fi_t fa, fb;
|
||||
fa.f = a;
|
||||
fb.f = b;
|
||||
return std::abs(fa.i - fb.i) <= ulp;
|
||||
}
|
||||
|
||||
///
|
||||
// Cleanup any created OpenCL resources
|
||||
//
|
||||
void Cleanup(cl_device_id device_id, cl_context context, cl_command_queue commandQueue,
|
||||
cl_program program, cl_kernel kernel, cl_mem memObjects[2]) {
|
||||
if (kernel_bin)
|
||||
void Cleanup(uint8_t *kernel_bin, cl_device_id device_id, cl_context context,
|
||||
cl_command_queue commandQueue, cl_program program, cl_kernel kernel, cl_mem memObjects[2]) {
|
||||
if (kernel_bin != NULL)
|
||||
free(kernel_bin);
|
||||
|
||||
if (commandQueue != 0)
|
||||
@@ -183,18 +116,18 @@ void Cleanup(cl_device_id device_id, cl_context context, cl_command_queue comman
|
||||
clReleaseDevice(device_id);
|
||||
}
|
||||
|
||||
int size = 16+2;
|
||||
|
||||
static void show_usage() {
|
||||
printf("Usage: [-n size] [-h: help]\n");
|
||||
}
|
||||
|
||||
int size = 16;
|
||||
|
||||
static void parse_args(int argc, char **argv) {
|
||||
int c;
|
||||
while ((c = getopt(argc, argv, "n:h?")) != -1) {
|
||||
switch (c) {
|
||||
case 'n':
|
||||
size = atoi(optarg)+2;
|
||||
size = atoi(optarg);
|
||||
break;
|
||||
case 'h':
|
||||
case '?': {
|
||||
@@ -218,6 +151,7 @@ int main(int argc, char **argv) {
|
||||
cl_device_id device_id;
|
||||
size_t kernel_size;
|
||||
cl_int binary_status = 0;
|
||||
uint8_t *kernel_bin = NULL;
|
||||
|
||||
// read kernel binary from file
|
||||
if (0 != read_kernel_file("kernel.pocl", &kernel_bin, &kernel_size))
|
||||
@@ -237,17 +171,17 @@ int main(int argc, char **argv) {
|
||||
cl_mem memObjects[2] = {0, 0};
|
||||
|
||||
// Create OpenCL program - first attempt to load cached binary.
|
||||
// If that is not available, then create the program from source
|
||||
// and store the binary for future use.
|
||||
std::cout << "Attempting to create program from binary..." << std::endl;
|
||||
// If that is not available, then create the program from source
|
||||
// and store the binary for future use.
|
||||
printf("create program from binary...\n");
|
||||
cl_program program = CL_CHECK_ERR(clCreateProgramWithBinary(
|
||||
context, 1, &device_id, &kernel_size, (const uint8_t**)&kernel_bin, &binary_status, &_err));
|
||||
if (program == NULL) {
|
||||
std::cerr << "Failed to write program binary" << std::endl;
|
||||
Cleanup(device_id, context, queue, program, kernel, memObjects);
|
||||
Cleanup(kernel_bin, device_id, context, queue, program, kernel, memObjects);
|
||||
return 1;
|
||||
} else {
|
||||
std::cout << "Read program from binary." << std::endl;
|
||||
printf("Read program from binary.");
|
||||
}
|
||||
|
||||
// Build program
|
||||
@@ -255,11 +189,11 @@ int main(int argc, char **argv) {
|
||||
|
||||
size_t nbytes = sizeof(float) * size * size;
|
||||
|
||||
printf("attempting to create input buffer\n");
|
||||
printf("create input buffer\n");
|
||||
cl_mem input_buffer;
|
||||
input_buffer = CL_CHECK_ERR(clCreateBuffer(context, CL_MEM_READ_ONLY, nbytes, NULL, &_err));
|
||||
|
||||
printf("attempting to create output buffer\n");
|
||||
printf("create output buffer\n");
|
||||
cl_mem output_buffer;
|
||||
output_buffer = CL_CHECK_ERR(clCreateBuffer(context, CL_MEM_WRITE_ONLY, nbytes, NULL, &_err));
|
||||
|
||||
@@ -267,7 +201,6 @@ int main(int argc, char **argv) {
|
||||
memObjects[1] = output_buffer;
|
||||
|
||||
long long ldc = size;
|
||||
|
||||
float m0 = 1.0;
|
||||
float m1 = 1.0;
|
||||
float m2 = 1.0;
|
||||
@@ -278,8 +211,9 @@ int main(int argc, char **argv) {
|
||||
float m7 = 1.0;
|
||||
float m8 = 1.0;
|
||||
|
||||
printf("attempting to create kernel\n");
|
||||
printf("create kernel\n");
|
||||
kernel = CL_CHECK_ERR(clCreateKernel(program, "sfilter", &_err));
|
||||
|
||||
printf("setting up kernel args\n");
|
||||
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(input_buffer), &input_buffer));
|
||||
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(output_buffer), &output_buffer));
|
||||
@@ -294,38 +228,84 @@ int main(int argc, char **argv) {
|
||||
CL_CHECK(clSetKernelArg(kernel, 10, sizeof(m7), (&m7)));
|
||||
CL_CHECK(clSetKernelArg(kernel, 11, sizeof(m8), (&m8)));
|
||||
|
||||
printf("attempting to enqueue write buffer\n");
|
||||
float* h_src = (float*)malloc(nbytes);
|
||||
for (int i = 0; i < size * size; i++) {
|
||||
h_src[i] = ((float)rand() / (float)(RAND_MAX)) * 100.0;
|
||||
}
|
||||
CL_CHECK(clEnqueueWriteBuffer(queue, input_buffer, CL_TRUE, 0, nbytes, h_src, 0, NULL, NULL));
|
||||
free(h_src);
|
||||
|
||||
size_t global_offset[2] = {1, 1};
|
||||
size_t global_work_size[2] = {size - 2, size - 2}; // avoid the edges
|
||||
const size_t local_work_size[2] = {size - 2, 1};
|
||||
printf("attempting to enqueue kernel\n");
|
||||
size_t global_work_size[2] = {size - 2, size - 2};
|
||||
size_t local_work_size[2] = {size - 2, 1};
|
||||
|
||||
printf("enqueue write buffer\n");
|
||||
std::vector<float> ref_vec(size * size);
|
||||
{
|
||||
std::vector<float> src_vec(size * size);
|
||||
std::vector<float> dst_vec(size * size, 0.0f);
|
||||
|
||||
for (int i = 0; i < size * size; ++i) {
|
||||
src_vec[i] = ((float)rand() / (float)(RAND_MAX)) * 100.0;
|
||||
}
|
||||
|
||||
CL_CHECK(clEnqueueWriteBuffer(queue, input_buffer, CL_TRUE, 0, nbytes, src_vec.data(), 0, NULL, NULL));
|
||||
CL_CHECK(clEnqueueWriteBuffer(queue, output_buffer, CL_TRUE, 0, nbytes, dst_vec.data(), 0, NULL, NULL));
|
||||
|
||||
// reference generation
|
||||
size_t num_groups_y = global_work_size[1] / local_work_size[1];
|
||||
size_t num_groups_x = global_work_size[0] / local_work_size[0];
|
||||
for (size_t workgroup_id_y = 0; workgroup_id_y < num_groups_y; ++workgroup_id_y) {
|
||||
for (size_t workgroup_id_x = 0; workgroup_id_x < num_groups_x; ++workgroup_id_x) {
|
||||
for (size_t local_id_y = 0; local_id_y < local_work_size[1]; ++local_id_y) {
|
||||
for (size_t local_id_x = 0; local_id_x < local_work_size[0]; ++local_id_x) {
|
||||
// calculate global ID for the work-item
|
||||
int global_id_x = global_offset[0] + local_work_size[0] * workgroup_id_x + local_id_x;
|
||||
int global_id_y = global_offset[1] + local_work_size[1] * workgroup_id_y + local_id_y;
|
||||
// kernel operation
|
||||
int x = global_id_x;
|
||||
int y = global_id_y;
|
||||
float i0 = src_vec.at((x-1) + (y-1) * ldc) * m0;
|
||||
float i1 = src_vec.at((x+0) + (y-1) * ldc) * m1;
|
||||
float i2 = src_vec.at((x+1) + (y-1) * ldc) * m2;
|
||||
float i3 = src_vec.at((x-1) + (y+0) * ldc) * m3;
|
||||
float i4 = src_vec.at((x+0) + (y+0) * ldc) * m4;
|
||||
float i5 = src_vec.at((x+1) + (y+0) * ldc) * m5;
|
||||
float i6 = src_vec.at((x-1) + (y+1) * ldc) * m6;
|
||||
float i7 = src_vec.at((x+0) + (y+1) * ldc) * m7;
|
||||
float i8 = src_vec.at((x+1) + (y+1) * ldc) * m8;
|
||||
float v = i0 + i1 + i2 + i3 + i4 + i5 + i6 + i7 + i8;
|
||||
//printf("*** x=%d, y=%d, v=%f\n", x, y, v);
|
||||
ref_vec.at(x + y * ldc) = v;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
printf("enqueue kernel\n");
|
||||
auto time_start = std::chrono::high_resolution_clock::now();
|
||||
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 2, global_offset,
|
||||
global_work_size, local_work_size, 0, NULL, NULL));
|
||||
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 2, global_offset, global_work_size, local_work_size, 0, NULL, NULL));
|
||||
CL_CHECK(clFinish(queue));
|
||||
auto time_end = std::chrono::high_resolution_clock::now();
|
||||
double elapsed = std::chrono::duration_cast<std::chrono::milliseconds>(time_end - time_start).count();
|
||||
printf("Elapsed time: %lg ms\n", elapsed);
|
||||
|
||||
printf("Download destination buffer\n");
|
||||
float* h_dst = (float*)malloc(nbytes);
|
||||
CL_CHECK(clEnqueueReadBuffer(queue, output_buffer, CL_TRUE, 0, nbytes, h_dst, 0, NULL, NULL));
|
||||
printf("Verify result\n");
|
||||
int errors = 0;
|
||||
{
|
||||
std::vector<float> dst_vec(size * size);
|
||||
CL_CHECK(clEnqueueReadBuffer(queue, output_buffer, CL_TRUE, 0, nbytes, dst_vec.data(), 0, NULL, NULL));
|
||||
|
||||
for (int i = 0; i < size * size; ++i) {
|
||||
if (!almost_equal(dst_vec[i], ref_vec[i])) {
|
||||
if (errors < 100)
|
||||
printf("*** error: [%d] expected=%f, actual=%f\n", i, ref_vec[i], dst_vec[i]);
|
||||
++errors;
|
||||
}
|
||||
}
|
||||
|
||||
/*printf("Result:");
|
||||
for (int i = 0; i < size; i++) {
|
||||
float data = h_dst[i];
|
||||
printf(" %f", data);
|
||||
}*/
|
||||
free(h_dst);
|
||||
if (0 == errors) {
|
||||
printf("PASSED!\n");
|
||||
} else {
|
||||
printf("FAILED! - %d errors\n", errors);
|
||||
}
|
||||
}
|
||||
|
||||
Cleanup(device_id, context, queue, program, kernel, memObjects);
|
||||
Cleanup(kernel_bin, device_id, context, queue, program, kernel, memObjects);
|
||||
|
||||
return 0;
|
||||
return errors;
|
||||
}
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
Reference in New Issue
Block a user