Vortex 2.0 changes:

+ Microarchitecture optimizations
+ 64-bit support
+ Xilinx FPGA support
+ LLVM-16 support
+ Refactoring and quality control fixes

minor update

minor update

minor update

minor update

minor update

minor update

cleanup

cleanup

cache bindings and memory perf refactoring

minor update

minor update

hw unit tests fixes

minor update

minor update

minor update

minor update

minor update

minor update

minor update

minor update

minor update

minor update

minor update

minor update

minor update

minor updates

minor updates

minor update

minor update

minor update

minor update

minor update

minor update

minor updates

minor updates

minor updates

minor updates

minor update

minor update
This commit is contained in:
Blaise Tine
2023-10-19 20:51:22 -07:00
parent d69a64c32c
commit c1e168fdbe
1309 changed files with 247412 additions and 311463 deletions

View File

@@ -1,78 +1,9 @@
XLEN ?= 32
RISCV_TOOLCHAIN_PATH ?= /opt/riscv-gnu-toolchain
VORTEX_DRV_PATH ?= $(realpath ../../../driver)
VORTEX_RT_PATH ?= $(realpath ../../../runtime)
OPTS ?= -n16
VX_CC = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-gcc
VX_CXX = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-g++
VX_DP = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-objdump
VX_CP = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-objcopy
VX_CFLAGS += -march=rv32imf -mabi=ilp32f -O1 -Wstack-usage=1024 -ffreestanding -nostartfiles -fdata-sections -ffunction-sections
VX_CFLAGS += -I$(VORTEX_RT_PATH)/include -I$(VORTEX_RT_PATH)/../hw
VX_LDFLAGS += -Wl,-Bstatic,-T,$(VORTEX_RT_PATH)/linker/vx_link$(XLEN).ld -Wl,--gc-sections $(VORTEX_RT_PATH)/libvortexrt.a
VX_SRCS = kernel.c
CXXFLAGS += -std=c++11 -Wall -Wextra -pedantic -Wfatal-errors
CXXFLAGS += -I$(VORTEX_DRV_PATH)/include
LDFLAGS += -L$(VORTEX_DRV_PATH)/stub -lvortex
# Debugging
ifdef DEBUG
CXXFLAGS += -g -O0
else
CXXFLAGS += -O2 -DNDEBUG
endif
PROJECT = diverge
SRCS = main.cpp
all: $(PROJECT) kernel.bin kernel.dump
kernel.dump: kernel.elf
$(VX_DP) -D kernel.elf > kernel.dump
VX_SRCS = kernel.cpp
kernel.bin: kernel.elf
$(VX_CP) -O binary kernel.elf kernel.bin
OPTS ?= -n16
kernel.elf: $(VX_SRCS)
$(VX_CC) $(VX_CFLAGS) $(VX_SRCS) $(VX_LDFLAGS) -o kernel.elf
$(PROJECT): $(SRCS)
$(CXX) $(CXXFLAGS) $^ $(LDFLAGS) -o $@
run-simx: $(PROJECT) kernel.bin
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/simx:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
run-fpga: $(PROJECT) kernel.bin
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/fpga:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
run-asesim: $(PROJECT) kernel.bin
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/asesim:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
run-vlsim: $(PROJECT) kernel.bin
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/vlsim:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
run-rtlsim: $(PROJECT) kernel.bin
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/rtlsim:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
.depend: $(SRCS)
$(CXX) $(CXXFLAGS) -MM $^ > .depend;
clean:
rm -rf $(PROJECT) *.o .depend
clean-all: clean
rm -rf *.elf *.bin *.dump
ifneq ($(MAKECMDGOALS),clean)
-include .depend
endif
include ../common.mk

View File

@@ -5,8 +5,8 @@
typedef struct {
uint32_t num_points;
uint32_t src_addr;
uint32_t dst_addr;
uint64_t src_addr;
uint64_t dst_addr;
} kernel_arg_t;
#endif

View File

@@ -1,49 +0,0 @@
#include <stdint.h>
#include <vx_intrinsics.h>
#include <vx_spawn.h>
#include "common.h"
// Control-flow divergence test kernel (this is not a sort, despite the
// previous "Parallel Selection sort" label). Each task loads
// src_ptr[task_id], adds constants selected by branches on task_id, and
// stores the result to dst_ptr[task_id].
//   task_id - index of the element this task reads and writes.
//   arg     - kernel arguments; only src_addr and dst_addr are read here.
// NOTE(review): __if/__else/__endif are macros defined outside this file
// (presumably in vx_intrinsics.h) - confirm their expansion before
// restructuring any of the branches below.
void kernel_body(int task_id, kernel_arg_t* arg) {
// The addresses arrive as integers in the argument block and are
// reinterpreted as int32_t arrays.
int32_t* src_ptr = (int32_t*)arg->src_addr;
int32_t* dst_ptr = (int32_t*)arg->dst_addr;
int value = src_ptr[task_id];
// none taken: the condition only holds for task_id == 0x7fffffff
// (assuming a 32-bit int), so tasks normally fall to the else arm and add 2.
__if (task_id >= 0x7fffffff) {
value = 0;
}__else {
value += 2;
}__endif
// diverge: nested branches add 3, 4, 5 or 6 for task_id <= 0, == 1, == 2
// and > 2 respectively, so different task ids follow different paths.
__if (task_id > 1) {
__if (task_id > 2) {
value += 6;
}__else {
value += 5;
}__endif
}__else {
__if (task_id > 0) {
value += 4;
}__else {
value += 3;
}__endif
}__endif
// all taken: true for every non-negative task_id, which adds 7; a negative
// task_id would reset value to 0 instead.
__if (task_id >= 0) {
value += 7;
}__else {
value = 0;
}__endif
dst_ptr[task_id] = value;
}
// Kernel entry point: reads the argument block located at
// KERNEL_ARG_DEV_MEM_ADDR and spawns arg->num_points tasks, each of which
// runs kernel_body with the same argument pointer.
// Returns 0; main is declared int (rather than void) so the entry point has
// the standard signature and a defined return value.
int main() {
  kernel_arg_t* arg = (kernel_arg_t*)KERNEL_ARG_DEV_MEM_ADDR;
  vx_spawn_tasks(arg->num_points, (vx_spawn_tasks_cb)kernel_body, arg);
  return 0;
}

View File

@@ -0,0 +1,83 @@
#include <stdint.h>
#include <assert.h>
#include <algorithm>
#include <vx_intrinsics.h>
#include <vx_spawn.h>
#include "common.h"
// Control-flow divergence test kernel (this is not a sort, despite the
// previous "Parallel Selection sort" label). Each task loads
// src_ptr[task_id], pushes the value through several branch shapes that
// depend on task_id (if/else, nested if, loop, switch, ternary select,
// min/max) and stores the result to dst_ptr[task_id].
//   task_id - index of the element this task reads and writes.
//   arg     - kernel arguments; src_addr, dst_addr and num_points are read.
void kernel_body(int task_id, kernel_arg_t* __UNIFORM__ arg) {
  // The addresses arrive as integers in the argument block and are
  // reinterpreted as int32_t arrays.
  int32_t* src_ptr = (int32_t*)arg->src_addr;
  int32_t* dst_ptr = (int32_t*)arg->dst_addr;

  int value = src_ptr[task_id];

  // none taken: the condition only holds for task_id == 0x7fffffff
  // (assuming a 32-bit int), so tasks normally fall to the else arm.
  if (task_id >= 0x7fffffff) {
    value = 0;
  } else {
    value += 2;
  }

  // diverge: adds 3, 4, 5 or 6 for task_id <= 0, == 1, == 2 and > 2.
  if (task_id > 1) {
    if (task_id > 2) {
      value += 6;
    } else {
      value += 5;
    }
  } else {
    if (task_id > 0) {
      value += 4;
    } else {
      value += 3;
    }
  }

  // all taken: true for every non-negative task_id.
  if (task_id >= 0) {
    value += 7;
  } else {
    value = 0;
  }

  // loop: the trip count equals task_id, so each task iterates a different
  // number of times, accumulating src_ptr[0 .. task_id-1].
  for (int i = 0, n = task_id; i < n; ++i) {
    value += src_ptr[i];
  }

  // switch
  switch (task_id) {
  case 0:
    value += 1;
    break;
  case 1:
    value -= 1;
    break;
  case 2:
    value *= 3;
    break;
  case 3:
    value *= 5;
    break;
  default:
    // Compare as signed ints so the check does not depend on an implicit
    // int-to-unsigned conversion of task_id (num_points is unsigned).
    assert(task_id < (int)arg->num_points);
    break;
  }

  // select
  value += (task_id >= 0) ? ((task_id > 5) ? src_ptr[0] : task_id) : ((task_id < 5) ? src_ptr[1] : -task_id);

  // min/max: the second call sees the value already updated by the first.
  value += std::min(src_ptr[task_id], value);
  value += std::max(src_ptr[task_id], value);

  dst_ptr[task_id] = value;
}
// Kernel entry point: fetches the argument block from
// KERNEL_ARG_DEV_MEM_ADDR and launches one kernel_body task per point.
int main() {
  auto* kargs = (kernel_arg_t*)KERNEL_ARG_DEV_MEM_ADDR;
  const auto task_fn = (vx_spawn_tasks_cb)kernel_body;
  vx_spawn_tasks(kargs->num_points, task_fn, kargs);
  return 0;
}

View File

@@ -3,6 +3,7 @@
#include <string.h>
#include <vortex.h>
#include <vector>
#include <assert.h>
#include "common.h"
#define RT_CHECK(_expr) \
@@ -24,8 +25,8 @@ std::vector<int> src_data;
std::vector<int> ref_data;
vx_device_h device = nullptr;
vx_buffer_h staging_buf = nullptr;
kernel_arg_t kernel_arg;
std::vector<uint8_t> staging_buf;
kernel_arg_t kernel_arg = {};
static void show_usage() {
std::cout << "Vortex Test." << std::endl;
@@ -55,9 +56,6 @@ static void parse_args(int argc, char **argv) {
}
void cleanup() {
if (staging_buf) {
vx_buf_free(staging_buf);
}
if (device) {
vx_mem_free(device, kernel_arg.src_addr);
vx_mem_free(device, kernel_arg.dst_addr);
@@ -110,8 +108,38 @@ void gen_ref_data(uint32_t num_points) {
value = 0;
}
// loop
for (int j = 0, n = i; j < n; ++j) {
value += src_data.at(j);
}
// switch
switch (i) {
case 0:
value += 1;
break;
case 1:
value -= 1;
break;
case 2:
value *= 3;
break;
case 3:
value *= 5;
break;
default:
assert(i < (int)num_points);
break;
}
// select
value += (i >= 0) ? ((i > 5) ? src_data.at(0) : i) : ((i < 5) ? src_data.at(1) : -i);
// min/max
value += std::min(src_data.at(i), value);
value += std::max(src_data.at(i), value);
ref_data[i] = value;
//std::cout << std::dec << i << ": result=0x" << std::hex << value << std::endl;
}
}
@@ -124,17 +152,17 @@ int run_test(const kernel_arg_t& kernel_arg,
// wait for completion
std::cout << "wait for completion" << std::endl;
RT_CHECK(vx_ready_wait(device, MAX_TIMEOUT));
RT_CHECK(vx_ready_wait(device, VX_MAX_TIMEOUT));
// download destination buffer
std::cout << "download destination buffer" << std::endl;
RT_CHECK(vx_copy_from_dev(staging_buf, kernel_arg.dst_addr, buf_size, 0));
RT_CHECK(vx_copy_from_dev(device, staging_buf.data(), kernel_arg.dst_addr, buf_size));
// verify result
std::cout << "verify result" << std::endl;
{
int errors = 0;
auto buf_ptr = (int32_t*)vx_host_ptr(staging_buf);
auto buf_ptr = (int32_t*)staging_buf.data();
for (uint32_t i = 0; i < num_points; ++i) {
int ref = ref_data.at(i);
int cur = buf_ptr[i];
@@ -154,9 +182,7 @@ int run_test(const kernel_arg_t& kernel_arg,
return 0;
}
int main(int argc, char *argv[]) {
size_t value;
int main(int argc, char *argv[]) {
// parse command arguments
parse_args(argc, argv);
@@ -190,51 +216,46 @@ int main(int argc, char *argv[]) {
// allocate device memory
std::cout << "allocate device memory" << std::endl;
RT_CHECK(vx_mem_alloc(device, src_buf_size, &value));
kernel_arg.src_addr = value;
RT_CHECK(vx_mem_alloc(device, dst_buf_size, &value));
kernel_arg.dst_addr = value;
RT_CHECK(vx_mem_alloc(device, src_buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.src_addr));
RT_CHECK(vx_mem_alloc(device, dst_buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.dst_addr));
kernel_arg.num_points = num_points;
std::cout << "dev_src=" << std::hex << kernel_arg.src_addr << std::endl;
std::cout << "dev_dst=" << std::hex << kernel_arg.dst_addr << std::endl;
std::cout << "dev_src=0x" << std::hex << kernel_arg.src_addr << std::endl;
std::cout << "dev_dst=0x" << std::hex << kernel_arg.dst_addr << std::endl;
// allocate shared memory
std::cout << "allocate shared memory" << std::endl;
// allocate staging buffer
std::cout << "allocate staging buffer" << std::endl;
uint32_t staging_buf_size = std::max<uint32_t>(src_buf_size,
std::max<uint32_t>(dst_buf_size,
sizeof(kernel_arg_t)));
RT_CHECK(vx_buf_alloc(device, staging_buf_size, &staging_buf));
staging_buf.resize(staging_buf_size);
// upload kernel argument
std::cout << "upload kernel argument" << std::endl;
{
auto buf_ptr = (int*)vx_host_ptr(staging_buf);
auto buf_ptr = (int*)staging_buf.data();
memcpy(buf_ptr, &kernel_arg, sizeof(kernel_arg_t));
RT_CHECK(vx_copy_to_dev(staging_buf, KERNEL_ARG_DEV_MEM_ADDR, sizeof(kernel_arg_t), 0));
RT_CHECK(vx_copy_to_dev(device, KERNEL_ARG_DEV_MEM_ADDR, staging_buf.data(), sizeof(kernel_arg_t)));
}
// upload source buffer
{
auto buf_ptr = (int32_t*)vx_host_ptr(staging_buf);
for (uint32_t i = 0; i < num_points; ++i) {
buf_ptr[i] = src_data.at(i);
}
std::cout << "upload source buffer" << std::endl;
auto buf_ptr = staging_buf.data();
memcpy(buf_ptr, src_data.data(), num_points * sizeof(int32_t));
RT_CHECK(vx_copy_to_dev(device, kernel_arg.src_addr, staging_buf.data(), src_buf_size));
}
std::cout << "upload source buffer" << std::endl;
RT_CHECK(vx_copy_to_dev(staging_buf, kernel_arg.src_addr, src_buf_size, 0));
// clear destination buffer
{
auto buf_ptr = (int32_t*)vx_host_ptr(staging_buf);
std::cout << "clear destination buffer" << std::endl;
auto buf_ptr = (int32_t*)staging_buf.data();
for (uint32_t i = 0; i < num_points; ++i) {
buf_ptr[i] = 0xdeadbeef;
}
}
RT_CHECK(vx_copy_to_dev(device, kernel_arg.dst_addr, staging_buf.data(), dst_buf_size));
}
std::cout << "clear destination buffer" << std::endl;
RT_CHECK(vx_copy_to_dev(staging_buf, kernel_arg.dst_addr, dst_buf_size, 0));
// run tests
std::cout << "run tests" << std::endl;