Vortex 2.0 changes:
+ Microarchitecture optimizations + 64-bit support + Xilinx FPGA support + LLVM-16 support + Refactoring and quality control fixes minor update minor update minor update minor update minor update minor update cleanup cleanup cache bindings and memory perf refactoring minor update minor update hw unit tests fixes minor update minor update minor update minor update minor update minor update minor update minor update minor update minor update minor update minor update minor update minor updates minor updates minor update minor update minor update minor update minor update minor update minor updates minor updates minor updates minor updates minor update minor update
This commit is contained in:
@@ -1,78 +1,9 @@
|
||||
XLEN ?= 32
|
||||
|
||||
RISCV_TOOLCHAIN_PATH ?= /opt/riscv-gnu-toolchain
|
||||
VORTEX_DRV_PATH ?= $(realpath ../../../driver)
|
||||
VORTEX_RT_PATH ?= $(realpath ../../../runtime)
|
||||
|
||||
OPTS ?= -n16
|
||||
|
||||
VX_CC = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-gcc
|
||||
VX_CXX = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-g++
|
||||
VX_DP = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-objdump
|
||||
VX_CP = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-objcopy
|
||||
|
||||
VX_CFLAGS += -march=rv32imf -mabi=ilp32f -O1 -Wstack-usage=1024 -ffreestanding -nostartfiles -fdata-sections -ffunction-sections
|
||||
VX_CFLAGS += -I$(VORTEX_RT_PATH)/include -I$(VORTEX_RT_PATH)/../hw
|
||||
|
||||
VX_LDFLAGS += -Wl,-Bstatic,-T,$(VORTEX_RT_PATH)/linker/vx_link$(XLEN).ld -Wl,--gc-sections $(VORTEX_RT_PATH)/libvortexrt.a
|
||||
|
||||
VX_SRCS = kernel.c
|
||||
|
||||
CXXFLAGS += -std=c++11 -Wall -Wextra -pedantic -Wfatal-errors
|
||||
|
||||
CXXFLAGS += -I$(VORTEX_DRV_PATH)/include
|
||||
|
||||
LDFLAGS += -L$(VORTEX_DRV_PATH)/stub -lvortex
|
||||
|
||||
# Debugging
|
||||
ifdef DEBUG
|
||||
CXXFLAGS += -g -O0
|
||||
else
|
||||
CXXFLAGS += -O2 -DNDEBUG
|
||||
endif
|
||||
|
||||
PROJECT = diverge
|
||||
|
||||
SRCS = main.cpp
|
||||
|
||||
all: $(PROJECT) kernel.bin kernel.dump
|
||||
|
||||
kernel.dump: kernel.elf
|
||||
$(VX_DP) -D kernel.elf > kernel.dump
|
||||
VX_SRCS = kernel.cpp
|
||||
|
||||
kernel.bin: kernel.elf
|
||||
$(VX_CP) -O binary kernel.elf kernel.bin
|
||||
OPTS ?= -n16
|
||||
|
||||
kernel.elf: $(VX_SRCS)
|
||||
$(VX_CC) $(VX_CFLAGS) $(VX_SRCS) $(VX_LDFLAGS) -o kernel.elf
|
||||
|
||||
$(PROJECT): $(SRCS)
|
||||
$(CXX) $(CXXFLAGS) $^ $(LDFLAGS) -o $@
|
||||
|
||||
run-simx: $(PROJECT) kernel.bin
|
||||
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/simx:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
|
||||
|
||||
run-fpga: $(PROJECT) kernel.bin
|
||||
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/fpga:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
|
||||
|
||||
run-asesim: $(PROJECT) kernel.bin
|
||||
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/asesim:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
|
||||
|
||||
run-vlsim: $(PROJECT) kernel.bin
|
||||
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/vlsim:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
|
||||
|
||||
run-rtlsim: $(PROJECT) kernel.bin
|
||||
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/rtlsim:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
|
||||
|
||||
.depend: $(SRCS)
|
||||
$(CXX) $(CXXFLAGS) -MM $^ > .depend;
|
||||
|
||||
clean:
|
||||
rm -rf $(PROJECT) *.o .depend
|
||||
|
||||
clean-all: clean
|
||||
rm -rf *.elf *.bin *.dump
|
||||
|
||||
ifneq ($(MAKECMDGOALS),clean)
|
||||
-include .depend
|
||||
endif
|
||||
include ../common.mk
|
||||
|
||||
@@ -5,8 +5,8 @@
|
||||
|
||||
typedef struct {
|
||||
uint32_t num_points;
|
||||
uint32_t src_addr;
|
||||
uint32_t dst_addr;
|
||||
uint64_t src_addr;
|
||||
uint64_t dst_addr;
|
||||
} kernel_arg_t;
|
||||
|
||||
#endif
|
||||
@@ -1,49 +0,0 @@
|
||||
#include <stdint.h>
|
||||
#include <vx_intrinsics.h>
|
||||
#include <vx_spawn.h>
|
||||
#include "common.h"
|
||||
|
||||
// Control-flow divergence test kernel
|
||||
|
||||
void kernel_body(int task_id, kernel_arg_t* arg) {
|
||||
int32_t* src_ptr = (int32_t*)arg->src_addr;
|
||||
int32_t* dst_ptr = (int32_t*)arg->dst_addr;
|
||||
|
||||
int value = src_ptr[task_id];
|
||||
|
||||
// none taken
|
||||
__if (task_id >= 0x7fffffff) {
|
||||
value = 0;
|
||||
}__else {
|
||||
value += 2;
|
||||
}__endif
|
||||
|
||||
// diverge
|
||||
__if (task_id > 1) {
|
||||
__if (task_id > 2) {
|
||||
value += 6;
|
||||
}__else {
|
||||
value += 5;
|
||||
}__endif
|
||||
}__else {
|
||||
__if (task_id > 0) {
|
||||
value += 4;
|
||||
}__else {
|
||||
value += 3;
|
||||
}__endif
|
||||
}__endif
|
||||
|
||||
// all taken
|
||||
__if (task_id >= 0) {
|
||||
value += 7;
|
||||
}__else {
|
||||
value = 0;
|
||||
}__endif
|
||||
|
||||
dst_ptr[task_id] = value;
|
||||
}
|
||||
|
||||
void main() {
|
||||
kernel_arg_t* arg = (kernel_arg_t*)KERNEL_ARG_DEV_MEM_ADDR;
|
||||
vx_spawn_tasks(arg->num_points, (vx_spawn_tasks_cb)kernel_body, arg);
|
||||
}
|
||||
83
tests/regression/diverge/kernel.cpp
Normal file
83
tests/regression/diverge/kernel.cpp
Normal file
@@ -0,0 +1,83 @@
|
||||
#include <stdint.h>
|
||||
#include <assert.h>
|
||||
#include <algorithm>
|
||||
#include <vx_intrinsics.h>
|
||||
#include <vx_spawn.h>
|
||||
#include "common.h"
|
||||
|
||||
// Control-flow divergence test kernel
|
||||
|
||||
/**
 * Per-task body of the control-flow divergence regression kernel.
 *
 * Loads src[task_id], runs the value through a fixed sequence of branch
 * shapes keyed on task_id (never-taken if, nested if/else, always-taken if,
 * task-dependent loop, switch, nested select, std::min/std::max) and stores
 * the result to dst[task_id].
 *
 * The host-side gen_ref_data() replays the same sequence to build the
 * expected output, so any change here must be mirrored there.
 *
 * @param task_id  index of this task; also the element index into both buffers
 * @param arg      kernel arguments (num_points, src_addr, dst_addr)
 */
void kernel_body(int task_id, kernel_arg_t* __UNIFORM__ arg) {
  int32_t* src_ptr = (int32_t*)arg->src_addr;
  int32_t* dst_ptr = (int32_t*)arg->dst_addr;

  int value = src_ptr[task_id];

  // none taken
  if (task_id >= 0x7fffffff) {
    value = 0;
  } else {
    value += 2;
  }

  // diverge
  if (task_id > 1) {
    if (task_id > 2) {
      value += 6;
    } else {
      value += 5;
    }
  } else {
    if (task_id > 0) {
      value += 4;
    } else {
      value += 3;
    }
  }

  // all taken
  if (task_id >= 0) {
    value += 7;
  } else {
    value = 0;
  }

  // loop: trip count equals task_id, so it differs from task to task
  for (int i = 0, n = task_id; i < n; ++i) {
    value += src_ptr[i];
  }

  // switch
  switch (task_id) {
  case 0:
    value += 1;
    break;
  case 1:
    value -= 1;
    break;
  case 2:
    value *= 3;
    break;
  case 3:
    value *= 5;
    break;
  default:
    // Cast keeps the comparison signed, matching the host reference's
    // `i < (int)num_points`; num_points is unsigned.
    assert(task_id < (int)arg->num_points);
    break;
  }

  // select
  value += (task_id >= 0) ? ((task_id > 5) ? src_ptr[0] : task_id)
                          : ((task_id < 5) ? src_ptr[1] : -task_id);

  // min/max
  value += std::min(src_ptr[task_id], value);
  value += std::max(src_ptr[task_id], value);

  dst_ptr[task_id] = value;
}
|
||||
|
||||
int main() {
|
||||
kernel_arg_t* arg = (kernel_arg_t*)KERNEL_ARG_DEV_MEM_ADDR;
|
||||
vx_spawn_tasks(arg->num_points, (vx_spawn_tasks_cb)kernel_body, arg);
|
||||
return 0;
|
||||
}
|
||||
@@ -3,6 +3,7 @@
|
||||
#include <string.h>
|
||||
#include <vortex.h>
|
||||
#include <vector>
|
||||
#include <assert.h>
|
||||
#include "common.h"
|
||||
|
||||
#define RT_CHECK(_expr) \
|
||||
@@ -24,8 +25,8 @@ std::vector<int> src_data;
|
||||
std::vector<int> ref_data;
|
||||
|
||||
vx_device_h device = nullptr;
|
||||
vx_buffer_h staging_buf = nullptr;
|
||||
kernel_arg_t kernel_arg;
|
||||
std::vector<uint8_t> staging_buf;
|
||||
kernel_arg_t kernel_arg = {};
|
||||
|
||||
static void show_usage() {
|
||||
std::cout << "Vortex Test." << std::endl;
|
||||
@@ -55,9 +56,6 @@ static void parse_args(int argc, char **argv) {
|
||||
}
|
||||
|
||||
void cleanup() {
|
||||
if (staging_buf) {
|
||||
vx_buf_free(staging_buf);
|
||||
}
|
||||
if (device) {
|
||||
vx_mem_free(device, kernel_arg.src_addr);
|
||||
vx_mem_free(device, kernel_arg.dst_addr);
|
||||
@@ -110,8 +108,38 @@ void gen_ref_data(uint32_t num_points) {
|
||||
value = 0;
|
||||
}
|
||||
|
||||
// loop
|
||||
for (int j = 0, n = i; j < n; ++j) {
|
||||
value += src_data.at(j);
|
||||
}
|
||||
|
||||
// switch
|
||||
switch (i) {
|
||||
case 0:
|
||||
value += 1;
|
||||
break;
|
||||
case 1:
|
||||
value -= 1;
|
||||
break;
|
||||
case 2:
|
||||
value *= 3;
|
||||
break;
|
||||
case 3:
|
||||
value *= 5;
|
||||
break;
|
||||
default:
|
||||
assert(i < (int)num_points);
|
||||
break;
|
||||
}
|
||||
|
||||
// select
|
||||
value += (i >= 0) ? ((i > 5) ? src_data.at(0) : i) : ((i < 5) ? src_data.at(1) : -i);
|
||||
|
||||
// min/max
|
||||
value += std::min(src_data.at(i), value);
|
||||
value += std::max(src_data.at(i), value);
|
||||
|
||||
ref_data[i] = value;
|
||||
//std::cout << std::dec << i << ": result=0x" << std::hex << value << std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -124,17 +152,17 @@ int run_test(const kernel_arg_t& kernel_arg,
|
||||
|
||||
// wait for completion
|
||||
std::cout << "wait for completion" << std::endl;
|
||||
RT_CHECK(vx_ready_wait(device, MAX_TIMEOUT));
|
||||
RT_CHECK(vx_ready_wait(device, VX_MAX_TIMEOUT));
|
||||
|
||||
// download destination buffer
|
||||
std::cout << "download destination buffer" << std::endl;
|
||||
RT_CHECK(vx_copy_from_dev(staging_buf, kernel_arg.dst_addr, buf_size, 0));
|
||||
RT_CHECK(vx_copy_from_dev(device, staging_buf.data(), kernel_arg.dst_addr, buf_size));
|
||||
|
||||
// verify result
|
||||
std::cout << "verify result" << std::endl;
|
||||
{
|
||||
int errors = 0;
|
||||
auto buf_ptr = (int32_t*)vx_host_ptr(staging_buf);
|
||||
auto buf_ptr = (int32_t*)staging_buf.data();
|
||||
for (uint32_t i = 0; i < num_points; ++i) {
|
||||
int ref = ref_data.at(i);
|
||||
int cur = buf_ptr[i];
|
||||
@@ -154,9 +182,7 @@ int run_test(const kernel_arg_t& kernel_arg,
|
||||
return 0;
|
||||
}
|
||||
|
||||
int main(int argc, char *argv[]) {
|
||||
size_t value;
|
||||
|
||||
int main(int argc, char *argv[]) {
|
||||
// parse command arguments
|
||||
parse_args(argc, argv);
|
||||
|
||||
@@ -190,51 +216,46 @@ int main(int argc, char *argv[]) {
|
||||
|
||||
// allocate device memory
|
||||
std::cout << "allocate device memory" << std::endl;
|
||||
|
||||
RT_CHECK(vx_mem_alloc(device, src_buf_size, &value));
|
||||
kernel_arg.src_addr = value;
|
||||
RT_CHECK(vx_mem_alloc(device, dst_buf_size, &value));
|
||||
kernel_arg.dst_addr = value;
|
||||
RT_CHECK(vx_mem_alloc(device, src_buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.src_addr));
|
||||
RT_CHECK(vx_mem_alloc(device, dst_buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.dst_addr));
|
||||
|
||||
kernel_arg.num_points = num_points;
|
||||
|
||||
std::cout << "dev_src=" << std::hex << kernel_arg.src_addr << std::endl;
|
||||
std::cout << "dev_dst=" << std::hex << kernel_arg.dst_addr << std::endl;
|
||||
std::cout << "dev_src=0x" << std::hex << kernel_arg.src_addr << std::endl;
|
||||
std::cout << "dev_dst=0x" << std::hex << kernel_arg.dst_addr << std::endl;
|
||||
|
||||
// allocate shared memory
|
||||
std::cout << "allocate shared memory" << std::endl;
|
||||
// allocate staging buffer
|
||||
std::cout << "allocate staging buffer" << std::endl;
|
||||
uint32_t staging_buf_size = std::max<uint32_t>(src_buf_size,
|
||||
std::max<uint32_t>(dst_buf_size,
|
||||
sizeof(kernel_arg_t)));
|
||||
RT_CHECK(vx_buf_alloc(device, staging_buf_size, &staging_buf));
|
||||
staging_buf.resize(staging_buf_size);
|
||||
|
||||
// upload kernel argument
|
||||
std::cout << "upload kernel argument" << std::endl;
|
||||
{
|
||||
auto buf_ptr = (int*)vx_host_ptr(staging_buf);
|
||||
auto buf_ptr = (int*)staging_buf.data();
|
||||
memcpy(buf_ptr, &kernel_arg, sizeof(kernel_arg_t));
|
||||
RT_CHECK(vx_copy_to_dev(staging_buf, KERNEL_ARG_DEV_MEM_ADDR, sizeof(kernel_arg_t), 0));
|
||||
RT_CHECK(vx_copy_to_dev(device, KERNEL_ARG_DEV_MEM_ADDR, staging_buf.data(), sizeof(kernel_arg_t)));
|
||||
}
|
||||
|
||||
// upload source buffer
|
||||
{
|
||||
auto buf_ptr = (int32_t*)vx_host_ptr(staging_buf);
|
||||
for (uint32_t i = 0; i < num_points; ++i) {
|
||||
buf_ptr[i] = src_data.at(i);
|
||||
}
|
||||
std::cout << "upload source buffer" << std::endl;
|
||||
auto buf_ptr = staging_buf.data();
|
||||
memcpy(buf_ptr, src_data.data(), num_points * sizeof(int32_t));
|
||||
RT_CHECK(vx_copy_to_dev(device, kernel_arg.src_addr, staging_buf.data(), src_buf_size));
|
||||
}
|
||||
std::cout << "upload source buffer" << std::endl;
|
||||
RT_CHECK(vx_copy_to_dev(staging_buf, kernel_arg.src_addr, src_buf_size, 0));
|
||||
|
||||
// clear destination buffer
|
||||
{
|
||||
auto buf_ptr = (int32_t*)vx_host_ptr(staging_buf);
|
||||
std::cout << "clear destination buffer" << std::endl;
|
||||
auto buf_ptr = (int32_t*)staging_buf.data();
|
||||
for (uint32_t i = 0; i < num_points; ++i) {
|
||||
buf_ptr[i] = 0xdeadbeef;
|
||||
}
|
||||
}
|
||||
RT_CHECK(vx_copy_to_dev(device, kernel_arg.dst_addr, staging_buf.data(), dst_buf_size));
|
||||
}
|
||||
std::cout << "clear destination buffer" << std::endl;
|
||||
RT_CHECK(vx_copy_to_dev(staging_buf, kernel_arg.dst_addr, dst_buf_size, 0));
|
||||
|
||||
// run tests
|
||||
std::cout << "run tests" << std::endl;
|
||||
|
||||
Reference in New Issue
Block a user