Vortex 2.0 changes:

+ Microarchitecture optimizations
+ 64-bit support
+ Xilinx FPGA support
+ LLVM-16 support
+ Refactoring and quality control fixes

minor update

minor update

minor update

minor update

minor update

minor update

cleanup

cleanup

cache bindings and memory perf refactory

minor update

minor update

hw unit tests fixes

minor update

minor update

minor update

minor update

minor update

minor udpate

minor update

minor update

minor update

minor update

minor update

minor update

minor update

minor updates

minor updates

minor update

minor update

minor update

minor update

minor update

minor update

minor updates

minor updates

minor updates

minor updates

minor update

minor update
This commit is contained in:
Blaise Tine
2023-10-19 20:51:22 -07:00
parent d69a64c32c
commit c1e168fdbe
1309 changed files with 247412 additions and 311463 deletions

View File

@@ -1,71 +1,7 @@
XLEN ?= 32
LLVM_PREFIX ?= /opt/llvm-riscv
RISCV_TOOLCHAIN_PATH ?= /opt/riscv-gnu-toolchain
SYSROOT ?= $(RISCV_TOOLCHAIN_PATH)/riscv32-unknown-elf
POCL_CC_PATH ?= /opt/pocl/compiler
POCL_RT_PATH ?= /opt/pocl/runtime
VORTEX_DRV_PATH ?= $(realpath ../../../driver)
VORTEX_RT_PATH ?= $(realpath ../../../runtime)
OPTS ?= -n16
K_LLCFLAGS += "-O3 -march=riscv32 -target-abi=ilp32f -mcpu=generic-rv32 -mattr=+m,+f -mattr=+vortex -float-abi=hard -code-model=small"
K_CFLAGS += "-v -O3 --sysroot=$(SYSROOT) --gcc-toolchain=$(RISCV_TOOLCHAIN_PATH) -march=rv32imf -mabi=ilp32f -Xclang -target-feature -Xclang +vortex -I$(VORTEX_RT_PATH)/include -fno-rtti -fno-exceptions -ffreestanding -nostartfiles -fdata-sections -ffunction-sections"
K_LDFLAGS += "-Wl,-Bstatic,-T$(VORTEX_RT_PATH)/linker/vx_link$(XLEN).ld -Wl,--gc-sections $(VORTEX_RT_PATH)/libvortexrt.a -lm"
CXXFLAGS += -std=c++11 -Wall -Wextra -Wfatal-errors
CXXFLAGS += -Wno-deprecated-declarations -Wno-unused-parameter -Wno-narrowing
CXXFLAGS += -I$(POCL_RT_PATH)/include
LDFLAGS += -L$(POCL_RT_PATH)/lib -L$(VORTEX_DRV_PATH)/stub -lOpenCL -lvortex
# Debugigng
ifdef DEBUG
CXXFLAGS += -g -O0
else
CXXFLAGS += -O2 -DNDEBUG
endif
PROJECT = sfilter
SRCS = main.cc
all: $(PROJECT) kernel.pocl
OPTS ?= -n16
kernel.pocl: kernel.cl
LLVM_PREFIX=$(LLVM_PREFIX) POCL_DEBUG=all LD_LIBRARY_PATH=$(LLVM_PREFIX)/lib:$(POCL_CC_PATH)/lib $(POCL_CC_PATH)/bin/poclcc -LLCFLAGS $(K_LLCFLAGS) -CFLAGS $(K_CFLAGS) -LDFLAGS $(K_LDFLAGS) -o kernel.pocl kernel.cl
$(PROJECT): $(SRCS)
$(CXX) $(CXXFLAGS) $^ $(LDFLAGS) -o $@
run-fpga: $(PROJECT) kernel.pocl
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/fpga:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
run-asesim: $(PROJECT) kernel.pocl
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/asesim:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
run-vlsim: $(PROJECT) kernel.pocl
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/vlsim:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
run-simx: $(PROJECT) kernel.pocl
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/simx:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
run-rtlsim: $(PROJECT) kernel.pocl
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/rtlsim:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
.depend: $(SRCS)
$(CXX) $(CXXFLAGS) -MM $^ > .depend;
clean:
rm -rf $(PROJECT) *.o .depend
clean-all: clean
rm -rf *.pocl *.dump
ifneq ($(MAKECMDGOALS),clean)
-include .depend
endif
include ../common.mk

View File

@@ -6,16 +6,18 @@ __kernel void sfilter(__global float *src, __global float *dst, long ldc,
{
long x = get_global_id(0);
long y = get_global_id(1);
float i0 = src[(x-1)+(y-1)*ldc]*m0;
float i1 = src[(x) +(y-1)*ldc]*m1;
float i2 = src[(x+1)+(y-1)*ldc]*m2;
float i3 = src[(x-1)+(y) *ldc]*m3;
float i4 = src[(x) + y * ldc]*m4;
float i5 = src[(x+1)+(y) *ldc]*m5;
float i6 = src[(x-1)+(y+1)*ldc]*m6;
float i7 = src[(x) +(y+1)*ldc]*m7;
float i8 = src[(x+1)+(y+1)*ldc]*m8;
dst[x+y*ldc] = i0 + i1 + i2 + i3 + i4 + i5 + i6 + i7 + i8;
int addr = x + y * ldc;
float i0 = src[addr-1-1*ldc]*m0;
float i1 = src[addr+0-1*ldc]*m1;
float i2 = src[addr+1-1*ldc]*m2;
float i3 = src[addr-1+0*ldc]*m3;
float i4 = src[addr+0+0*ldc]*m4;
float i5 = src[addr+1+0*ldc]*m5;
float i6 = src[addr-1+1*ldc]*m6;
float i7 = src[addr+0+1*ldc]*m7;
float i8 = src[addr+1+1*ldc]*m8;
dst[addr] = i0 + i1 + i2 + i3 + i4 + i5 + i6 + i7 + i8;
}

Binary file not shown.

View File

@@ -34,6 +34,7 @@
#include <string.h>
#include <unistd.h>
#include <chrono>
#include <vector>
#define CL_CHECK(_expr) \
do { \
@@ -47,7 +48,7 @@
#define CL_CHECK_ERR(_expr) \
({ \
cl_int _err = CL_INVALID_VALUE; \
decltype(_expr) _ret = _expr; \
decltype(_expr) _ret = _expr; \
if (_err != CL_SUCCESS) { \
fprintf(stderr, "OpenCL Error: '%s' returned %d!\n", #_expr, (int)_err); \
abort(); \
@@ -81,85 +82,17 @@ static int read_kernel_file(const char* filename, uint8_t** data, size_t* size)
return 0;
}
uint8_t *kernel_bin = NULL;
// inlcude pocl float to half conversions
typedef union {
int32_t i;
float f;
} FloatConvUnion;
cl_half poclu_float_to_cl_half(float value) {
FloatConvUnion u;
u.f = value;
cl_half half = (u.i >> 16) & 0x8000; // sign
cl_half fraction =
(u.i >> 12) & 0x007ff; // fraction with extra bit for rounding
cl_half exponent = (u.i >> 23) & 0xff; // exponent
if (exponent < 0x0067) // Return signed zero if zero or value is too small for
// denormal half
return half;
if (exponent > 0x008e) { // value was NaN or Inf
half |= 0x7c00u; // Make into inf
half |= exponent == 255 &&
(u.i & 0x007fffffu); // If value was NaN make this into NaN
return half;
}
if (exponent < 0x0071) { // Denormal
fraction |= 0x0800u;
// rounding
half |= (fraction >> (0x0072 - exponent)) +
((fraction >> (0x0071 - exponent)) & 1);
return half;
}
half |= ((exponent - 0x0070) << 10) | (fraction >> 1);
half += fraction & 1; // rounding
return half;
}
#ifndef INFINITY
#define INFINITY 1.0 / 0.0
#endif
#ifndef NAN
#define NAN 0.0 / 0.0
#endif
float poclu_cl_half_to_float(cl_half value) {
if (value == 0xFC00) {
return -INFINITY;
}
if (value == 0x7C00) {
return INFINITY;
}
int sgn = ((value & 0x8000) >> 15);
int exp = (value & 0x7C00) >> 10;
int mant = value & 0x03FF;
if (exp == 0x1F && mant != 0) {
return NAN;
}
float v = (exp == 0) ? mant : mant | 0x0400; // 1.x if not denormal
v /= 0x400;
float mul = exp2((float)exp - 15);
v *= mul;
if (sgn) {
v *= -1;
}
return v;
static bool almost_equal(float a, float b, int ulp = 4) {
union fi_t { int i; float f; };
fi_t fa, fb;
fa.f = a;
fb.f = b;
return std::abs(fa.i - fb.i) <= ulp;
}
///
// Cleanup any created OpenCL resources
//
void Cleanup(cl_device_id device_id, cl_context context, cl_command_queue commandQueue,
cl_program program, cl_kernel kernel, cl_mem memObjects[2]) {
if (kernel_bin)
void Cleanup(uint8_t *kernel_bin, cl_device_id device_id, cl_context context,
cl_command_queue commandQueue, cl_program program, cl_kernel kernel, cl_mem memObjects[2]) {
if (kernel_bin != NULL)
free(kernel_bin);
if (commandQueue != 0)
@@ -183,18 +116,18 @@ void Cleanup(cl_device_id device_id, cl_context context, cl_command_queue comman
clReleaseDevice(device_id);
}
int size = 16+2;
static void show_usage() {
printf("Usage: [-n size] [-h: help]\n");
}
int size = 16;
static void parse_args(int argc, char **argv) {
int c;
while ((c = getopt(argc, argv, "n:h?")) != -1) {
switch (c) {
case 'n':
size = atoi(optarg)+2;
size = atoi(optarg);
break;
case 'h':
case '?': {
@@ -218,6 +151,7 @@ int main(int argc, char **argv) {
cl_device_id device_id;
size_t kernel_size;
cl_int binary_status = 0;
uint8_t *kernel_bin = NULL;
// read kernel binary from file
if (0 != read_kernel_file("kernel.pocl", &kernel_bin, &kernel_size))
@@ -237,17 +171,17 @@ int main(int argc, char **argv) {
cl_mem memObjects[2] = {0, 0};
// Create OpenCL program - first attempt to load cached binary.
// If that is not available, then create the program from source
// and store the binary for future use.
std::cout << "Attempting to create program from binary..." << std::endl;
// If that is not available, then create the program from source
// and store the binary for future use.
printf("create program from binary...\n");
cl_program program = CL_CHECK_ERR(clCreateProgramWithBinary(
context, 1, &device_id, &kernel_size, (const uint8_t**)&kernel_bin, &binary_status, &_err));
if (program == NULL) {
std::cerr << "Failed to write program binary" << std::endl;
Cleanup(device_id, context, queue, program, kernel, memObjects);
Cleanup(kernel_bin, device_id, context, queue, program, kernel, memObjects);
return 1;
} else {
std::cout << "Read program from binary." << std::endl;
printf("Read program from binary.");
}
// Build program
@@ -255,11 +189,11 @@ int main(int argc, char **argv) {
size_t nbytes = sizeof(float) * size * size;
printf("attempting to create input buffer\n");
printf("create input buffer\n");
cl_mem input_buffer;
input_buffer = CL_CHECK_ERR(clCreateBuffer(context, CL_MEM_READ_ONLY, nbytes, NULL, &_err));
printf("attempting to create output buffer\n");
printf("create output buffer\n");
cl_mem output_buffer;
output_buffer = CL_CHECK_ERR(clCreateBuffer(context, CL_MEM_WRITE_ONLY, nbytes, NULL, &_err));
@@ -267,7 +201,6 @@ int main(int argc, char **argv) {
memObjects[1] = output_buffer;
long long ldc = size;
float m0 = 1.0;
float m1 = 1.0;
float m2 = 1.0;
@@ -278,8 +211,9 @@ int main(int argc, char **argv) {
float m7 = 1.0;
float m8 = 1.0;
printf("attempting to create kernel\n");
printf("create kernel\n");
kernel = CL_CHECK_ERR(clCreateKernel(program, "sfilter", &_err));
printf("setting up kernel args\n");
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(input_buffer), &input_buffer));
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(output_buffer), &output_buffer));
@@ -294,38 +228,84 @@ int main(int argc, char **argv) {
CL_CHECK(clSetKernelArg(kernel, 10, sizeof(m7), (&m7)));
CL_CHECK(clSetKernelArg(kernel, 11, sizeof(m8), (&m8)));
printf("attempting to enqueue write buffer\n");
float* h_src = (float*)malloc(nbytes);
for (int i = 0; i < size * size; i++) {
h_src[i] = ((float)rand() / (float)(RAND_MAX)) * 100.0;
}
CL_CHECK(clEnqueueWriteBuffer(queue, input_buffer, CL_TRUE, 0, nbytes, h_src, 0, NULL, NULL));
free(h_src);
size_t global_offset[2] = {1, 1};
size_t global_work_size[2] = {size - 2, size - 2}; // avoid the edges
const size_t local_work_size[2] = {size - 2, 1};
printf("attempting to enqueue kernel\n");
size_t global_work_size[2] = {size - 2, size - 2};
size_t local_work_size[2] = {size - 2, 1};
printf("enqueue write buffer\n");
std::vector<float> ref_vec(size * size);
{
std::vector<float> src_vec(size * size);
std::vector<float> dst_vec(size * size, 0.0f);
for (int i = 0; i < size * size; ++i) {
src_vec[i] = ((float)rand() / (float)(RAND_MAX)) * 100.0;
}
CL_CHECK(clEnqueueWriteBuffer(queue, input_buffer, CL_TRUE, 0, nbytes, src_vec.data(), 0, NULL, NULL));
CL_CHECK(clEnqueueWriteBuffer(queue, output_buffer, CL_TRUE, 0, nbytes, dst_vec.data(), 0, NULL, NULL));
// reference generation
size_t num_groups_y = global_work_size[1] / local_work_size[1];
size_t num_groups_x = global_work_size[0] / local_work_size[0];
for (size_t workgroup_id_y = 0; workgroup_id_y < num_groups_y; ++workgroup_id_y) {
for (size_t workgroup_id_x = 0; workgroup_id_x < num_groups_x; ++workgroup_id_x) {
for (size_t local_id_y = 0; local_id_y < local_work_size[1]; ++local_id_y) {
for (size_t local_id_x = 0; local_id_x < local_work_size[0]; ++local_id_x) {
// calculate global ID for the work-item
int global_id_x = global_offset[0] + local_work_size[0] * workgroup_id_x + local_id_x;
int global_id_y = global_offset[1] + local_work_size[1] * workgroup_id_y + local_id_y;
// kernel operation
int x = global_id_x;
int y = global_id_y;
float i0 = src_vec.at((x-1) + (y-1) * ldc) * m0;
float i1 = src_vec.at((x+0) + (y-1) * ldc) * m1;
float i2 = src_vec.at((x+1) + (y-1) * ldc) * m2;
float i3 = src_vec.at((x-1) + (y+0) * ldc) * m3;
float i4 = src_vec.at((x+0) + (y+0) * ldc) * m4;
float i5 = src_vec.at((x+1) + (y+0) * ldc) * m5;
float i6 = src_vec.at((x-1) + (y+1) * ldc) * m6;
float i7 = src_vec.at((x+0) + (y+1) * ldc) * m7;
float i8 = src_vec.at((x+1) + (y+1) * ldc) * m8;
float v = i0 + i1 + i2 + i3 + i4 + i5 + i6 + i7 + i8;
//printf("*** x=%d, y=%d, v=%f\n", x, y, v);
ref_vec.at(x + y * ldc) = v;
}
}
}
}
}
printf("enqueue kernel\n");
auto time_start = std::chrono::high_resolution_clock::now();
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 2, global_offset,
global_work_size, local_work_size, 0, NULL, NULL));
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 2, global_offset, global_work_size, local_work_size, 0, NULL, NULL));
CL_CHECK(clFinish(queue));
auto time_end = std::chrono::high_resolution_clock::now();
double elapsed = std::chrono::duration_cast<std::chrono::milliseconds>(time_end - time_start).count();
printf("Elapsed time: %lg ms\n", elapsed);
printf("Download destination buffer\n");
float* h_dst = (float*)malloc(nbytes);
CL_CHECK(clEnqueueReadBuffer(queue, output_buffer, CL_TRUE, 0, nbytes, h_dst, 0, NULL, NULL));
printf("Verify result\n");
int errors = 0;
{
std::vector<float> dst_vec(size * size);
CL_CHECK(clEnqueueReadBuffer(queue, output_buffer, CL_TRUE, 0, nbytes, dst_vec.data(), 0, NULL, NULL));
for (int i = 0; i < size * size; ++i) {
if (!almost_equal(dst_vec[i], ref_vec[i])) {
if (errors < 100)
printf("*** error: [%d] expected=%f, actual=%f\n", i, ref_vec[i], dst_vec[i]);
++errors;
}
}
/*printf("Result:");
for (int i = 0; i < size; i++) {
float data = h_dst[i];
printf(" %f", data);
}*/
free(h_dst);
if (0 == errors) {
printf("PASSED!\n");
} else {
printf("FAILED! - %d errors\n", errors);
}
}
Cleanup(device_id, context, queue, program, kernel, memObjects);
Cleanup(kernel_bin, device_id, context, queue, program, kernel, memObjects);
return 0;
return errors;
}

File diff suppressed because it is too large Load Diff