Vortex 2.0 changes:

+ Microarchitecture optimizations
+ 64-bit support
+ Xilinx FPGA support
+ LLVM-16 support
+ Refactoring and quality control fixes

minor update

minor update

minor update

minor update

minor update

minor update

cleanup

cleanup

cache bindings and memory perf refactoring

minor update

minor update

hw unit tests fixes

minor update

minor update

minor update

minor update

minor update

minor update

minor update

minor update

minor update

minor update

minor update

minor update

minor update

minor updates

minor updates

minor update

minor update

minor update

minor update

minor update

minor update

minor updates

minor updates

minor updates

minor updates

minor update

minor update
This commit is contained in:
Blaise Tine
2023-10-19 20:51:22 -07:00
parent d69a64c32c
commit c1e168fdbe
1309 changed files with 247412 additions and 311463 deletions

View File

@@ -1,71 +1,7 @@
XLEN ?= 32
LLVM_PREFIX ?= /opt/llvm-riscv
RISCV_TOOLCHAIN_PATH ?= /opt/riscv-gnu-toolchain
SYSROOT ?= $(RISCV_TOOLCHAIN_PATH)/riscv32-unknown-elf
POCL_CC_PATH ?= /opt/pocl/compiler
POCL_RT_PATH ?= /opt/pocl/runtime
VORTEX_DRV_PATH ?= $(realpath ../../../driver)
VORTEX_RT_PATH ?= $(realpath ../../../runtime)
OPTS ?= -n1024
K_LLCFLAGS += "-O3 -march=riscv32 -target-abi=ilp32f -mcpu=generic-rv32 -mattr=+m,+f -mattr=+vortex -float-abi=hard -code-model=small"
K_CFLAGS += "-v -O3 --sysroot=$(SYSROOT) --gcc-toolchain=$(RISCV_TOOLCHAIN_PATH) -march=rv32imf -mabi=ilp32f -Xclang -target-feature -Xclang +vortex -I$(VORTEX_RT_PATH)/include -fno-rtti -fno-exceptions -ffreestanding -nostartfiles -fdata-sections -ffunction-sections"
K_LDFLAGS += "-Wl,-Bstatic,-T$(VORTEX_RT_PATH)/linker/vx_link$(XLEN).ld -Wl,--gc-sections $(VORTEX_RT_PATH)/libvortexrt.a -lm"
CXXFLAGS += -std=c++11 -Wall -Wextra -Wfatal-errors
CXXFLAGS += -Wno-deprecated-declarations -Wno-unused-parameter -Wno-narrowing
CXXFLAGS += -I$(POCL_RT_PATH)/include
LDFLAGS += -L$(POCL_RT_PATH)/lib -L$(VORTEX_DRV_PATH)/stub -lOpenCL -lvortex
# Debugging
ifdef DEBUG
CXXFLAGS += -g -O0
else
CXXFLAGS += -O2 -DNDEBUG
endif
PROJECT = saxpy
SRCS = main.cc
all: $(PROJECT) kernel.pocl
OPTS ?= -n1024
kernel.pocl: kernel.cl
LLVM_PREFIX=$(LLVM_PREFIX) POCL_DEBUG=all LD_LIBRARY_PATH=$(LLVM_PREFIX)/lib:$(POCL_CC_PATH)/lib $(POCL_CC_PATH)/bin/poclcc -LLCFLAGS $(K_LLCFLAGS) -CFLAGS $(K_CFLAGS) -LDFLAGS $(K_LDFLAGS) -o kernel.pocl kernel.cl
$(PROJECT): $(SRCS)
$(CXX) $(CXXFLAGS) $^ $(LDFLAGS) -o $@
run-fpga: $(PROJECT) kernel.pocl
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/fpga:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
run-asesim: $(PROJECT) kernel.pocl
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/asesim:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
run-vlsim: $(PROJECT) kernel.pocl
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/vlsim:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
run-simx: $(PROJECT) kernel.pocl
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/simx:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
run-rtlsim: $(PROJECT) kernel.pocl
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/rtlsim:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
.depend: $(SRCS)
$(CXX) $(CXXFLAGS) -MM $^ > .depend;
clean:
rm -rf $(PROJECT) *.o .depend
clean-all: clean
rm -rf *.pocl *.dump
ifneq ($(MAKECMDGOALS),clean)
-include .depend
endif
include ../common.mk

Binary file not shown.

View File

@@ -31,6 +31,7 @@
#include <string.h>
#include <unistd.h>
#include <chrono>
#include <vector>
#define CL_CHECK(_expr) \
do { \
@@ -78,6 +79,14 @@ static int read_kernel_file(const char* filename, uint8_t** data, size_t* size)
return 0;
}
// Returns true when `a` and `b` are at most `ulp` representable floats apart
// (distance measured in units-in-the-last-place).
//
// a, b : values to compare.
// ulp  : maximum tolerated distance; a negative tolerance never matches.
//
// The IEEE-754 bit patterns are sign-magnitude, so they are first remapped to
// a signed scale that is monotonic across zero (+0.0 and -0.0 both map to 0).
// The distance is computed in a wider integer type so that operands of
// opposite sign cannot overflow. NaNs receive no special treatment: they are
// compared by bit pattern like any other value.
static bool almost_equal(float a, float b, int ulp = 4) {
  static_assert(sizeof(float) == sizeof(unsigned int),
                "almost_equal assumes a 32-bit float and a 32-bit unsigned int");

  // memcpy is the well-defined way to inspect an object representation;
  // reading the inactive member of a union is undefined behaviour in C++.
  unsigned int bits_a = 0;
  unsigned int bits_b = 0;
  memcpy(&bits_a, &a, sizeof(bits_a));
  memcpy(&bits_b, &b, sizeof(bits_b));

  // Split off the sign bit and negate the magnitude for negative values so
  // that adjacent floats always differ by exactly one on the ordered scale.
  const long long mag_a = static_cast<long long>(bits_a & 0x7fffffffu);
  const long long mag_b = static_cast<long long>(bits_b & 0x7fffffffu);
  const long long ord_a = (bits_a >> 31) ? -mag_a : mag_a;
  const long long ord_b = (bits_b >> 31) ? -mag_b : mag_b;

  // |ord_a - ord_b| is below 2^32, far inside the range of long long.
  const long long diff = ord_a - ord_b;
  const long long dist = (diff < 0) ? -diff : diff;
  return dist <= static_cast<long long>(ulp);
}
uint8_t *kernel_bin = NULL;
///
@@ -142,7 +151,11 @@ int main(int argc, char **argv) {
cl_platform_id platform_id;
cl_device_id device_id;
cl_mem input_buffer;
cl_mem output_buffer;
size_t kernel_size;
cl_context context;
cl_command_queue queue;
cl_int binary_status = 0;
// read kernel binary from file
@@ -153,10 +166,7 @@ int main(int argc, char **argv) {
CL_CHECK(clGetPlatformIDs(1, &platform_id, NULL));
CL_CHECK(clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_DEFAULT, 1, &device_id, NULL));
cl_context context;
context = CL_CHECK_ERR(clCreateContext(NULL, 1, &device_id, &pfn_notify, NULL, &_err));
cl_command_queue queue;
context = CL_CHECK_ERR(clCreateContext(NULL, 1, &device_id, &pfn_notify, NULL, &_err));
queue = CL_CHECK_ERR(clCreateCommandQueue(context, device_id, 0, &_err));
cl_kernel kernel = 0;
@@ -165,7 +175,7 @@ int main(int argc, char **argv) {
// Create OpenCL program - first attempt to load cached binary.
// If that is not available, then create the program from source
// and store the binary for future use.
std::cout << "Attempting to create program from binary..." << std::endl;
printf("create program from binary...\n");
cl_program program = CL_CHECK_ERR(clCreateProgramWithBinary(
context, 1, &device_id, &kernel_size, (const uint8_t**)&kernel_bin, &binary_status, &_err));
if (program == NULL) {
@@ -173,7 +183,7 @@ int main(int argc, char **argv) {
Cleanup(device_id, context, queue, program, kernel, memObjects);
return 1;
} else {
std::cout << "Read program from binary." << std::endl;
printf("Read program from binary.\n");
}
// Build program
@@ -181,22 +191,18 @@ int main(int argc, char **argv) {
size_t nbytes = sizeof(float) * size;
printf("attempting to create input buffer\n");
cl_mem input_buffer;
input_buffer = CL_CHECK_ERR(clCreateBuffer(
context, CL_MEM_READ_ONLY, nbytes, NULL, &_err));
printf("create input buffer\n");
input_buffer = CL_CHECK_ERR(clCreateBuffer(context, CL_MEM_READ_ONLY, nbytes, NULL, &_err));
printf("attempting to create output buffer\n");
cl_mem output_buffer;
output_buffer = CL_CHECK_ERR(clCreateBuffer(
context, CL_MEM_WRITE_ONLY, nbytes, NULL, &_err));
printf("create output buffer\n");
output_buffer = CL_CHECK_ERR(clCreateBuffer(context, CL_MEM_READ_WRITE, nbytes, NULL, &_err));
memObjects[0] = input_buffer;
memObjects[1] = output_buffer;
float factor = ((float)rand() / (float)(RAND_MAX)) * 100.0;
printf("attempting to create kernel\n");
printf("create kernel\n");
kernel = CL_CHECK_ERR(clCreateKernel(program, "saxpy", &_err));
printf("setting up kernel args\n");
@@ -204,36 +210,65 @@ int main(int argc, char **argv) {
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(output_buffer), &output_buffer));
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(factor), &factor));
printf("attempting to enqueue write buffer\n");
float* h_src = (float*)malloc(nbytes);
for (int i = 0; i < size; i++) {
h_src[i] = ((float)rand() / (float)(RAND_MAX)) * 100.0;
}
CL_CHECK(clEnqueueWriteBuffer(queue, input_buffer, CL_TRUE, 0, nbytes, h_src, 0, NULL, NULL));
free(h_src);
size_t global_offset[1] = {0};
size_t global_work_size[1] = {size};
size_t local_work_size[1] = {1};
size_t global_work_size[] = {size/2, size/2};
printf("attempting to enqueue kernel\n");
printf("initialize buffers\n");
std::vector<float> ref_vec(size, 0.0f);
{
std::vector<float> dst_vec(size, 0.0f);
std::vector<float> src_vec(size);
for (int i = 0; i < size; i++) {
src_vec[i] = ((float)rand() / (float)(RAND_MAX)) * 100.0;
}
CL_CHECK(clEnqueueWriteBuffer(queue, input_buffer, CL_TRUE, 0, nbytes, src_vec.data(), 0, NULL, NULL));
CL_CHECK(clEnqueueWriteBuffer(queue, output_buffer, CL_TRUE, 0, nbytes, dst_vec.data(), 0, NULL, NULL));
size_t num_groups_x = global_work_size[0] / local_work_size[0];
for (size_t workgroup_id_x = 0; workgroup_id_x < num_groups_x; ++workgroup_id_x) {
for (size_t local_id_x = 0; local_id_x < local_work_size[0]; ++local_id_x) {
// Calculate global ID for the work-item
int global_id_x = global_offset[0] + local_work_size[0] * workgroup_id_x + local_id_x;
// kernel operation
int i = global_id_x;
ref_vec[i] += src_vec[i] * factor;
}
}
}
printf("enqueue kernel\n");
auto time_start = std::chrono::high_resolution_clock::now();
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 1, NULL, global_work_size,
NULL, 0, NULL, NULL));
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 1, global_offset, global_work_size, local_work_size, 0, NULL, NULL));
CL_CHECK(clFinish(queue));
auto time_end = std::chrono::high_resolution_clock::now();
double elapsed = std::chrono::duration_cast<std::chrono::milliseconds>(time_end - time_start).count();
printf("Elapsed time: %lg ms\n", elapsed);
printf("Download destination buffer\n");
float* h_dst = (float*)malloc(nbytes);
CL_CHECK(clEnqueueReadBuffer(queue, output_buffer, CL_TRUE, 0, nbytes, h_dst, 0, NULL, NULL));
printf("Verify result\n");
int errors = 0;
{
std::vector<float> dst_vec(size);
CL_CHECK(clEnqueueReadBuffer(queue, output_buffer, CL_TRUE, 0, nbytes, dst_vec.data(), 0, NULL, NULL));
/*printf("Result:");
for (int i = 0; i < size; i++) {
float data = h_dst[i];
printf(" %f", data);
}*/
free(h_dst);
for (int i = 0; i < size; ++i) {
if (!almost_equal(dst_vec[i], ref_vec[i])) {
if (errors < 100)
printf("*** error: [%d] expected=%f, actual=%f\n", i, ref_vec[i], dst_vec[i]);
++errors;
}
}
if (0 == errors) {
printf("PASSED!\n");
} else {
printf("FAILED! - %d errors\n", errors);
}
}
Cleanup(device_id, context, queue, program, kernel, memObjects);
return 0;
return errors;
}

File diff suppressed because it is too large Load Diff