Vortex 2.0 changes:
+ Microarchitecture optimizations + 64-bit support + Xilinx FPGA support + LLVM-16 support + Refactoring and quality control fixes minor update minor update minor update minor update minor update minor update cleanup cleanup cache bindings and memory perf refactoring minor update minor update hw unit tests fixes minor update minor update minor update minor update minor update minor update minor update minor update minor update minor update minor update minor update minor update minor updates minor updates minor update minor update minor update minor update minor update minor update minor updates minor updates minor updates minor updates minor update minor update
This commit is contained in:
@@ -1,71 +1,7 @@
|
||||
XLEN ?= 32
|
||||
|
||||
LLVM_PREFIX ?= /opt/llvm-riscv
|
||||
RISCV_TOOLCHAIN_PATH ?= /opt/riscv-gnu-toolchain
|
||||
SYSROOT ?= $(RISCV_TOOLCHAIN_PATH)/riscv32-unknown-elf
|
||||
POCL_CC_PATH ?= /opt/pocl/compiler
|
||||
POCL_RT_PATH ?= /opt/pocl/runtime
|
||||
|
||||
VORTEX_DRV_PATH ?= $(realpath ../../../driver)
|
||||
VORTEX_RT_PATH ?= $(realpath ../../../runtime)
|
||||
|
||||
OPTS ?= -n1024
|
||||
|
||||
K_LLCFLAGS += "-O3 -march=riscv32 -target-abi=ilp32f -mcpu=generic-rv32 -mattr=+m,+f -mattr=+vortex -float-abi=hard -code-model=small"
|
||||
K_CFLAGS += "-v -O3 --sysroot=$(SYSROOT) --gcc-toolchain=$(RISCV_TOOLCHAIN_PATH) -march=rv32imf -mabi=ilp32f -Xclang -target-feature -Xclang +vortex -I$(VORTEX_RT_PATH)/include -fno-rtti -fno-exceptions -ffreestanding -nostartfiles -fdata-sections -ffunction-sections"
|
||||
K_LDFLAGS += "-Wl,-Bstatic,-T$(VORTEX_RT_PATH)/linker/vx_link$(XLEN).ld -Wl,--gc-sections $(VORTEX_RT_PATH)/libvortexrt.a -lm"
|
||||
|
||||
CXXFLAGS += -std=c++11 -Wall -Wextra -Wfatal-errors
|
||||
|
||||
CXXFLAGS += -Wno-deprecated-declarations -Wno-unused-parameter -Wno-narrowing
|
||||
|
||||
CXXFLAGS += -I$(POCL_RT_PATH)/include
|
||||
|
||||
LDFLAGS += -L$(POCL_RT_PATH)/lib -L$(VORTEX_DRV_PATH)/stub -lOpenCL -lvortex
|
||||
|
||||
# Debugging
|
||||
ifdef DEBUG
|
||||
CXXFLAGS += -g -O0
|
||||
else
|
||||
CXXFLAGS += -O2 -DNDEBUG
|
||||
endif
|
||||
|
||||
PROJECT = saxpy
|
||||
|
||||
SRCS = main.cc
|
||||
|
||||
all: $(PROJECT) kernel.pocl
|
||||
OPTS ?= -n1024
|
||||
|
||||
kernel.pocl: kernel.cl
|
||||
LLVM_PREFIX=$(LLVM_PREFIX) POCL_DEBUG=all LD_LIBRARY_PATH=$(LLVM_PREFIX)/lib:$(POCL_CC_PATH)/lib $(POCL_CC_PATH)/bin/poclcc -LLCFLAGS $(K_LLCFLAGS) -CFLAGS $(K_CFLAGS) -LDFLAGS $(K_LDFLAGS) -o kernel.pocl kernel.cl
|
||||
|
||||
$(PROJECT): $(SRCS)
|
||||
$(CXX) $(CXXFLAGS) $^ $(LDFLAGS) -o $@
|
||||
|
||||
run-fpga: $(PROJECT) kernel.pocl
|
||||
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/fpga:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
|
||||
|
||||
run-asesim: $(PROJECT) kernel.pocl
|
||||
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/asesim:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
|
||||
|
||||
run-vlsim: $(PROJECT) kernel.pocl
|
||||
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/vlsim:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
|
||||
|
||||
run-simx: $(PROJECT) kernel.pocl
|
||||
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/simx:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
|
||||
|
||||
run-rtlsim: $(PROJECT) kernel.pocl
|
||||
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/rtlsim:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
|
||||
|
||||
.depend: $(SRCS)
|
||||
$(CXX) $(CXXFLAGS) -MM $^ > .depend;
|
||||
|
||||
clean:
|
||||
rm -rf $(PROJECT) *.o .depend
|
||||
|
||||
clean-all: clean
|
||||
rm -rf *.pocl *.dump
|
||||
|
||||
ifneq ($(MAKECMDGOALS),clean)
|
||||
-include .depend
|
||||
endif
|
||||
include ../common.mk
|
||||
|
||||
Binary file not shown.
@@ -31,6 +31,7 @@
|
||||
#include <string.h>
|
||||
#include <unistd.h>
|
||||
#include <chrono>
|
||||
#include <vector>
|
||||
|
||||
#define CL_CHECK(_expr) \
|
||||
do { \
|
||||
@@ -78,6 +79,14 @@ static int read_kernel_file(const char* filename, uint8_t** data, size_t* size)
|
||||
return 0;
|
||||
}
|
||||
|
||||
// Returns true when `a` and `b` are "close enough": either they compare equal
// as floats, or the distance between their raw bit patterns (interpreted as
// signed integers) is at most `ulp`.
//
//   a, b : values to compare
//   ulp  : maximum accepted bit-pattern distance (defaults to 4)
//
// The distance is computed in a wider type so that operands of opposite sign
// cannot overflow; the narrower subtraction is undefined behaviour and in
// practice reported e.g. 1.0f and -1.0f as almost equal.
static bool almost_equal(float a, float b, int ulp = 4) {
  static_assert(sizeof(int) == sizeof(float),
                "bit-pattern comparison requires int and float of equal size");

  // Copy the object representation instead of reading an inactive union
  // member, which is not valid C++.
  int bits_a = 0;
  int bits_b = 0;
  memcpy(&bits_a, &a, sizeof(bits_a));
  memcpy(&bits_b, &b, sizeof(bits_b));

  // Widen before subtracting so the difference is always representable.
  long long distance = (long long)bits_a - (long long)bits_b;
  if (distance < 0)
    distance = -distance;

  // +0.0f and -0.0f have different bit patterns but are the same value.
  if (a == b)
    distance = 0;

  return distance <= ulp;
}
|
||||
|
||||
uint8_t *kernel_bin = NULL;
|
||||
|
||||
///
|
||||
@@ -142,7 +151,11 @@ int main(int argc, char **argv) {
|
||||
|
||||
cl_platform_id platform_id;
|
||||
cl_device_id device_id;
|
||||
cl_mem input_buffer;
|
||||
cl_mem output_buffer;
|
||||
size_t kernel_size;
|
||||
cl_context context;
|
||||
cl_command_queue queue;
|
||||
cl_int binary_status = 0;
|
||||
|
||||
// read kernel binary from file
|
||||
@@ -153,10 +166,7 @@ int main(int argc, char **argv) {
|
||||
CL_CHECK(clGetPlatformIDs(1, &platform_id, NULL));
|
||||
CL_CHECK(clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_DEFAULT, 1, &device_id, NULL));
|
||||
|
||||
cl_context context;
|
||||
context = CL_CHECK_ERR(clCreateContext(NULL, 1, &device_id, &pfn_notify, NULL, &_err));
|
||||
|
||||
cl_command_queue queue;
|
||||
context = CL_CHECK_ERR(clCreateContext(NULL, 1, &device_id, &pfn_notify, NULL, &_err));
|
||||
queue = CL_CHECK_ERR(clCreateCommandQueue(context, device_id, 0, &_err));
|
||||
|
||||
cl_kernel kernel = 0;
|
||||
@@ -165,7 +175,7 @@ int main(int argc, char **argv) {
|
||||
// Create OpenCL program - first attempt to load cached binary.
|
||||
// If that is not available, then create the program from source
|
||||
// and store the binary for future use.
|
||||
std::cout << "Attempting to create program from binary..." << std::endl;
|
||||
printf("create program from binary...\n");
|
||||
cl_program program = CL_CHECK_ERR(clCreateProgramWithBinary(
|
||||
context, 1, &device_id, &kernel_size, (const uint8_t**)&kernel_bin, &binary_status, &_err));
|
||||
if (program == NULL) {
|
||||
@@ -173,7 +183,7 @@ int main(int argc, char **argv) {
|
||||
Cleanup(device_id, context, queue, program, kernel, memObjects);
|
||||
return 1;
|
||||
} else {
|
||||
std::cout << "Read program from binary." << std::endl;
|
||||
printf("Read program from binary.\n");
|
||||
}
|
||||
|
||||
// Build program
|
||||
@@ -181,22 +191,18 @@ int main(int argc, char **argv) {
|
||||
|
||||
size_t nbytes = sizeof(float) * size;
|
||||
|
||||
printf("attempting to create input buffer\n");
|
||||
cl_mem input_buffer;
|
||||
input_buffer = CL_CHECK_ERR(clCreateBuffer(
|
||||
context, CL_MEM_READ_ONLY, nbytes, NULL, &_err));
|
||||
printf("create input buffer\n");
|
||||
input_buffer = CL_CHECK_ERR(clCreateBuffer(context, CL_MEM_READ_ONLY, nbytes, NULL, &_err));
|
||||
|
||||
printf("attempting to create output buffer\n");
|
||||
cl_mem output_buffer;
|
||||
output_buffer = CL_CHECK_ERR(clCreateBuffer(
|
||||
context, CL_MEM_WRITE_ONLY, nbytes, NULL, &_err));
|
||||
printf("create output buffer\n");
|
||||
output_buffer = CL_CHECK_ERR(clCreateBuffer(context, CL_MEM_READ_WRITE, nbytes, NULL, &_err));
|
||||
|
||||
memObjects[0] = input_buffer;
|
||||
memObjects[1] = output_buffer;
|
||||
|
||||
float factor = ((float)rand() / (float)(RAND_MAX)) * 100.0;
|
||||
|
||||
printf("attempting to create kernel\n");
|
||||
printf("create kernel\n");
|
||||
kernel = CL_CHECK_ERR(clCreateKernel(program, "saxpy", &_err));
|
||||
|
||||
printf("setting up kernel args\n");
|
||||
@@ -204,36 +210,65 @@ int main(int argc, char **argv) {
|
||||
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(output_buffer), &output_buffer));
|
||||
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(factor), &factor));
|
||||
|
||||
printf("attempting to enqueue write buffer\n");
|
||||
float* h_src = (float*)malloc(nbytes);
|
||||
for (int i = 0; i < size; i++) {
|
||||
h_src[i] = ((float)rand() / (float)(RAND_MAX)) * 100.0;
|
||||
}
|
||||
CL_CHECK(clEnqueueWriteBuffer(queue, input_buffer, CL_TRUE, 0, nbytes, h_src, 0, NULL, NULL));
|
||||
free(h_src);
|
||||
size_t global_offset[1] = {0};
|
||||
size_t global_work_size[1] = {size};
|
||||
size_t local_work_size[1] = {1};
|
||||
|
||||
size_t global_work_size[] = {size/2, size/2};
|
||||
printf("attempting to enqueue kernel\n");
|
||||
printf("initialize buffers\n");
|
||||
std::vector<float> ref_vec(size, 0.0f);
|
||||
{
|
||||
std::vector<float> dst_vec(size, 0.0f);
|
||||
std::vector<float> src_vec(size);
|
||||
|
||||
for (int i = 0; i < size; i++) {
|
||||
src_vec[i] = ((float)rand() / (float)(RAND_MAX)) * 100.0;
|
||||
}
|
||||
|
||||
CL_CHECK(clEnqueueWriteBuffer(queue, input_buffer, CL_TRUE, 0, nbytes, src_vec.data(), 0, NULL, NULL));
|
||||
CL_CHECK(clEnqueueWriteBuffer(queue, output_buffer, CL_TRUE, 0, nbytes, dst_vec.data(), 0, NULL, NULL));
|
||||
|
||||
size_t num_groups_x = global_work_size[0] / local_work_size[0];
|
||||
for (size_t workgroup_id_x = 0; workgroup_id_x < num_groups_x; ++workgroup_id_x) {
|
||||
for (size_t local_id_x = 0; local_id_x < local_work_size[0]; ++local_id_x) {
|
||||
// Calculate global ID for the work-item
|
||||
int global_id_x = global_offset[0] + local_work_size[0] * workgroup_id_x + local_id_x;
|
||||
// kernel operation
|
||||
int i = global_id_x;
|
||||
ref_vec[i] += src_vec[i] * factor;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
printf("enqueue kernel\n");
|
||||
auto time_start = std::chrono::high_resolution_clock::now();
|
||||
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 1, NULL, global_work_size,
|
||||
NULL, 0, NULL, NULL));
|
||||
CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 1, global_offset, global_work_size, local_work_size, 0, NULL, NULL));
|
||||
CL_CHECK(clFinish(queue));
|
||||
auto time_end = std::chrono::high_resolution_clock::now();
|
||||
double elapsed = std::chrono::duration_cast<std::chrono::milliseconds>(time_end - time_start).count();
|
||||
printf("Elapsed time: %lg ms\n", elapsed);
|
||||
|
||||
printf("Download destination buffer\n");
|
||||
float* h_dst = (float*)malloc(nbytes);
|
||||
CL_CHECK(clEnqueueReadBuffer(queue, output_buffer, CL_TRUE, 0, nbytes, h_dst, 0, NULL, NULL));
|
||||
printf("Verify result\n");
|
||||
int errors = 0;
|
||||
{
|
||||
std::vector<float> dst_vec(size);
|
||||
CL_CHECK(clEnqueueReadBuffer(queue, output_buffer, CL_TRUE, 0, nbytes, dst_vec.data(), 0, NULL, NULL));
|
||||
|
||||
/*printf("Result:");
|
||||
for (int i = 0; i < size; i++) {
|
||||
float data = h_dst[i];
|
||||
printf(" %f", data);
|
||||
}*/
|
||||
free(h_dst);
|
||||
for (int i = 0; i < size; ++i) {
|
||||
if (!almost_equal(dst_vec[i], ref_vec[i])) {
|
||||
if (errors < 100)
|
||||
printf("*** error: [%d] expected=%f, actual=%f\n", i, ref_vec[i], dst_vec[i]);
|
||||
++errors;
|
||||
}
|
||||
}
|
||||
|
||||
if (0 == errors) {
|
||||
printf("PASSED!\n");
|
||||
} else {
|
||||
printf("FAILED! - %d errors\n", errors);
|
||||
}
|
||||
}
|
||||
|
||||
Cleanup(device_id, context, queue, program, kernel, memObjects);
|
||||
|
||||
return 0;
|
||||
return errors;
|
||||
}
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
Reference in New Issue
Block a user