Vortex 2.0 changes:

+ Microarchitecture optimizations
+ 64-bit support
+ Xilinx FPGA support
+ LLVM-16 support
+ Refactoring and quality control fixes
This commit is contained in:
Blaise Tine
2023-10-19 20:51:22 -07:00
parent d69a64c32c
commit d47cccc157
1300 changed files with 247321 additions and 311189 deletions

View File

@@ -10,7 +10,6 @@ all:
$(MAKE) -C fence
$(MAKE) -C no_mf_ext
$(MAKE) -C no_smem
$(MAKE) -C prefetch
run-simx:
$(MAKE) -C basic run-simx
@@ -24,7 +23,6 @@ run-simx:
$(MAKE) -C fence run-simx
$(MAKE) -C no_mf_ext run-simx
$(MAKE) -C no_smem run-simx
$(MAKE) -C prefetch run-simx
run-rtlsim:
$(MAKE) -C basic run-rtlsim
@@ -38,21 +36,19 @@ run-rtlsim:
$(MAKE) -C fence run-rtlsim
$(MAKE) -C no_mf_ext run-rtlsim
$(MAKE) -C no_smem run-rtlsim
$(MAKE) -C prefetch run-rtlsim
run-vlsim:
$(MAKE) -C basic run-vlsim
$(MAKE) -C demo run-vlsim
$(MAKE) -C dogfood run-vlsim
$(MAKE) -C mstress run-vlsim
$(MAKE) -C io_addr run-vlsim
$(MAKE) -C printf run-vlsim
$(MAKE) -C diverge run-vlsim
$(MAKE) -C sort run-vlsim
$(MAKE) -C fence run-vlsim
$(MAKE) -C no_mf_ext run-vlsim
$(MAKE) -C no_smem run-vlsim
$(MAKE) -C prefetch run-vlsim
run-opae:
$(MAKE) -C basic run-opae
$(MAKE) -C demo run-opae
$(MAKE) -C dogfood run-opae
$(MAKE) -C mstress run-opae
$(MAKE) -C io_addr run-opae
$(MAKE) -C printf run-opae
$(MAKE) -C diverge run-opae
$(MAKE) -C sort run-opae
$(MAKE) -C fence run-opae
$(MAKE) -C no_mf_ext run-opae
$(MAKE) -C no_smem run-opae
clean:
$(MAKE) -C basic clean
@@ -66,7 +62,6 @@ clean:
$(MAKE) -C fence clean
$(MAKE) -C no_mf_ext clean
$(MAKE) -C no_smem clean
$(MAKE) -C prefetch clean
clean-all:
$(MAKE) -C basic clean-all
@@ -80,4 +75,3 @@ clean-all:
$(MAKE) -C fence clean-all
$(MAKE) -C no_mf_ext clean-all
$(MAKE) -C no_smem clean-all
$(MAKE) -C prefetch clean-all

View File

@@ -1,78 +1,16 @@
XLEN ?= 32
RISCV_TOOLCHAIN_PATH ?= /opt/riscv-gnu-toolchain
VORTEX_DRV_PATH ?= $(realpath ../../../driver)
VORTEX_RT_PATH ?= $(realpath ../../../runtime)
OPTS ?= -n256
VX_CC = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-gcc
VX_CXX = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-g++
VX_DP = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-objdump
VX_CP = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-objcopy
VX_CFLAGS += -march=rv32imf -mabi=ilp32f -O3 -Wstack-usage=1024 -ffreestanding -nostartfiles -fdata-sections -ffunction-sections
VX_CFLAGS += -I$(VORTEX_RT_PATH)/include -I$(VORTEX_RT_PATH)/../hw
VX_LDFLAGS += -Wl,-Bstatic,-T,$(VORTEX_RT_PATH)/linker/vx_link$(XLEN).ld -Wl,--gc-sections $(VORTEX_RT_PATH)/libvortexrt.a
VX_SRCS = kernel.c
CXXFLAGS += -std=c++11 -Wall -Wextra -pedantic -Wfatal-errors
CXXFLAGS += -I$(VORTEX_DRV_PATH)/include
LDFLAGS += -L$(VORTEX_DRV_PATH)/stub -lvortex
# Debugging
ifdef DEBUG
CXXFLAGS += -g -O0
else
CXXFLAGS += -O2 -DNDEBUG
endif
PROJECT = basic
SRCS = main.cpp
all: $(PROJECT) kernel.bin kernel.dump
VX_SRCS = kernel.cpp ../../../kernel/src/vx_perf.c start.S
kernel.dump: kernel.elf
$(VX_DP) -D kernel.elf > kernel.dump
OPTS ?= -n256
kernel.bin: kernel.elf
$(VX_CP) -O binary kernel.elf kernel.bin
include ../common.mk
kernel.elf: $(VX_SRCS)
$(VX_CC) $(VX_CFLAGS) $(VX_SRCS) $(VX_LDFLAGS) -o kernel.elf
VX_LDFLAGS = -Wl,-Bstatic,--gc-sections,-T,$(VORTEX_KN_PATH)/linker/vx_link$(XLEN).ld,--defsym=STARTUP_ADDR=$(STARTUP_ADDR)
$(PROJECT): $(SRCS)
$(CXX) $(CXXFLAGS) $^ $(LDFLAGS) -o $@
run-simx: $(PROJECT) kernel.bin
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/simx:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
run-fpga: $(PROJECT) kernel.bin
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/fpga:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
run-asesim: $(PROJECT) kernel.bin
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/asesim:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
run-vlsim: $(PROJECT) kernel.bin
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/vlsim:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
run-rtlsim: $(PROJECT) kernel.bin
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/rtlsim:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
.depend: $(SRCS)
$(CXX) $(CXXFLAGS) -MM $^ > .depend;
clean:
rm -rf $(PROJECT) *.o .depend
clean-all: clean
rm -rf *.elf *.bin *.dump
ifneq ($(MAKECMDGOALS),clean)
-include .depend
endif
VX_CC = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)-gcc
VX_CXX = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)-g++
VX_DP = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)-objdump
VX_CP = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)-objcopy

View File

@@ -5,8 +5,8 @@
typedef struct {
uint32_t count;
uint32_t src_addr;
uint32_t dst_addr;
uint64_t src_addr;
uint64_t dst_addr;
} kernel_arg_t;
#endif

View File

@@ -2,15 +2,17 @@
#include <vx_intrinsics.h>
#include "common.h"
void main() {
kernel_arg_t* arg = (kernel_arg_t*)KERNEL_ARG_DEV_MEM_ADDR;
int main() {
kernel_arg_t* __UNIFORM__ arg = (kernel_arg_t*)KERNEL_ARG_DEV_MEM_ADDR;
uint32_t count = arg->count;
int32_t* src_ptr = (int32_t*)arg->src_addr;
int32_t* dst_ptr = (int32_t*)arg->dst_addr;
uint32_t offset = vx_core_id() * count;
for (uint32_t i = 0; i < count; ++i) {
dst_ptr[offset + i] = src_ptr[offset + i];
}
}
return 0;
}

View File

@@ -3,6 +3,7 @@
#include <string.h>
#include <vortex.h>
#include <chrono>
#include <vector>
#include "common.h"
#define RT_CHECK(_expr) \
@@ -22,8 +23,8 @@ int test = -1;
uint32_t count = 0;
vx_device_h device = nullptr;
vx_buffer_h staging_buf = nullptr;
kernel_arg_t kernel_arg;
std::vector<uint8_t> staging_buf;
kernel_arg_t kernel_arg = {};
static void show_usage() {
std::cout << "Vortex Test." << std::endl;
@@ -56,9 +57,6 @@ static void parse_args(int argc, char **argv) {
}
void cleanup() {
if (staging_buf) {
vx_buf_free(staging_buf);
}
if (device) {
vx_mem_free(device, kernel_arg.src_addr);
vx_mem_free(device, kernel_arg.dst_addr);
@@ -77,15 +75,15 @@ int run_memcopy_test(uint32_t dev_addr, uint64_t value, int num_blocks) {
int num_blocks_8 = (64 * num_blocks) / 8;
// update source buffer
// update source buffer
for (int i = 0; i < num_blocks_8; ++i) {
((uint64_t*)vx_host_ptr(staging_buf))[i] = shuffle(i, value);
((uint64_t*)staging_buf.data())[i] = shuffle(i, value);
}
/*for (int i = 0; i < num_blocks; ++i) {
std::cout << "data[" << i << "]=0x";
for (int j = 7; j >= 0; --j) {
std::cout << std::hex << ((uint64_t*)vx_host_ptr(staging_buf))[i * 8 +j];
std::cout << std::hex << ((uint64_t*)staging_buf.data())[i * 8 +j];
}
std::cout << std::endl;
}*/
@@ -93,24 +91,24 @@ int run_memcopy_test(uint32_t dev_addr, uint64_t value, int num_blocks) {
// write source buffer to local memory
std::cout << "write source buffer to local memory" << std::endl;
auto t0 = std::chrono::high_resolution_clock::now();
RT_CHECK(vx_copy_to_dev(staging_buf, dev_addr, 64 * num_blocks, 0));
RT_CHECK(vx_copy_to_dev(device, dev_addr, staging_buf.data(), 64 * num_blocks));
auto t1 = std::chrono::high_resolution_clock::now();
// clear destination buffer
for (int i = 0; i < num_blocks_8; ++i) {
((uint64_t*)vx_host_ptr(staging_buf))[i] = 0;
((uint64_t*)staging_buf.data())[i] = 0;
}
// read destination buffer from local memory
std::cout << "read destination buffer from local memory" << std::endl;
auto t2 = std::chrono::high_resolution_clock::now();
RT_CHECK(vx_copy_from_dev(staging_buf, dev_addr, 64 * num_blocks, 0));
RT_CHECK(vx_copy_from_dev(device, staging_buf.data(), dev_addr, 64 * num_blocks));
auto t3 = std::chrono::high_resolution_clock::now();
// verify result
std::cout << "verify result" << std::endl;
for (int i = 0; i < num_blocks_8; ++i) {
auto curr = ((uint64_t*)vx_host_ptr(staging_buf))[i];
auto curr = ((uint64_t*)staging_buf.data())[i];
auto ref = shuffle(i, value);
if (curr != ref) {
std::cout << "error at 0x" << std::hex << (dev_addr + 8 * i)
@@ -147,44 +145,44 @@ int run_kernel_test(const kernel_arg_t& kernel_arg,
// update source buffer
{
auto buf_ptr = (int32_t*)vx_host_ptr(staging_buf);
std::cout << "upload source buffer" << std::endl;
auto buf_ptr = (int32_t*)staging_buf.data();
for (uint32_t i = 0; i < num_points; ++i) {
buf_ptr[i] = i;
}
}
std::cout << "upload source buffer" << std::endl;
}
auto t0 = std::chrono::high_resolution_clock::now();
RT_CHECK(vx_copy_to_dev(staging_buf, kernel_arg.src_addr, buf_size, 0));
RT_CHECK(vx_copy_to_dev(device, kernel_arg.src_addr, staging_buf.data(), buf_size));
auto t1 = std::chrono::high_resolution_clock::now();
// clear destination buffer
{
auto buf_ptr = (int32_t*)vx_host_ptr(staging_buf);
std::cout << "clear destination buffer" << std::endl;
auto buf_ptr = (int32_t*)staging_buf.data();
for (uint32_t i = 0; i < num_points; ++i) {
buf_ptr[i] = 0xdeadbeef;
}
}
std::cout << "clear destination buffer" << std::endl;
RT_CHECK(vx_copy_to_dev(staging_buf, kernel_arg.dst_addr, buf_size, 0));
RT_CHECK(vx_copy_to_dev(device, kernel_arg.dst_addr, staging_buf.data(), buf_size));
}
// start device
std::cout << "start execution" << std::endl;
auto t2 = std::chrono::high_resolution_clock::now();
RT_CHECK(vx_start(device));
RT_CHECK(vx_ready_wait(device, MAX_TIMEOUT));
RT_CHECK(vx_ready_wait(device, VX_MAX_TIMEOUT));
auto t3 = std::chrono::high_resolution_clock::now();
// read destination buffer from local memory
std::cout << "read destination buffer from local memory" << std::endl;
auto t4 = std::chrono::high_resolution_clock::now();
RT_CHECK(vx_copy_from_dev(staging_buf, kernel_arg.dst_addr, buf_size, 0));
RT_CHECK(vx_copy_from_dev(device, staging_buf.data(), kernel_arg.dst_addr, buf_size));
auto t5 = std::chrono::high_resolution_clock::now();
// verify result
std::cout << "verify result" << std::endl;
for (uint32_t i = 0; i < num_points; ++i) {
int32_t curr = ((int32_t*)vx_host_ptr(staging_buf))[i];
int32_t curr = ((int32_t*)staging_buf.data())[i];
int32_t ref = i;
if (curr != ref) {
std::cout << "error at result #" << std::dec << i
@@ -215,9 +213,6 @@ int run_kernel_test(const kernel_arg_t& kernel_arg,
}
int main(int argc, char *argv[]) {
size_t value;
// parse command arguments
parse_args(argc, argv);
@@ -228,10 +223,11 @@ int main(int argc, char *argv[]) {
// open device connection
std::cout << "open device connection" << std::endl;
RT_CHECK(vx_dev_open(&device));
uint64_t max_cores;
RT_CHECK(vx_dev_caps(device, VX_CAPS_MAX_CORES, &max_cores));
uint32_t num_points = count;
uint64_t num_cores;
RT_CHECK(vx_dev_caps(device, VX_CAPS_NUM_CORES, &num_cores));
uint32_t num_points = count * num_cores;
uint32_t num_blocks = (num_points * sizeof(int32_t) + 63) / 64;
uint32_t buf_size = num_blocks * 64;
@@ -239,20 +235,19 @@ int main(int argc, char *argv[]) {
std::cout << "buffer size: " << buf_size << " bytes" << std::endl;
// allocate device memory
RT_CHECK(vx_mem_alloc(device, buf_size, &value));
kernel_arg.src_addr = value;
RT_CHECK(vx_mem_alloc(device, buf_size, &value));
kernel_arg.dst_addr = value;
std::cout << "allocate device memory" << std::endl;
RT_CHECK(vx_mem_alloc(device, buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.src_addr));
RT_CHECK(vx_mem_alloc(device, buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.dst_addr));
kernel_arg.count = num_points;
std::cout << "dev_src=" << std::hex << kernel_arg.src_addr << std::endl;
std::cout << "dev_dst=" << std::hex << kernel_arg.dst_addr << std::endl;
std::cout << "dev_src=0x" << std::hex << kernel_arg.src_addr << std::endl;
std::cout << "dev_dst=0x" << std::hex << kernel_arg.dst_addr << std::endl;
// allocate shared memory
std::cout << "allocate shared memory" << std::endl;
// allocate staging buffer
std::cout << "allocate staging buffer" << std::endl;
uint32_t alloc_size = std::max<uint32_t>(buf_size, sizeof(kernel_arg_t));
RT_CHECK(vx_buf_alloc(device, alloc_size, &staging_buf));
staging_buf.resize(alloc_size);
// run tests
if (0 == test || -1 == test) {
@@ -268,9 +263,9 @@ int main(int argc, char *argv[]) {
// upload kernel argument
std::cout << "upload kernel argument" << std::endl;
{
auto buf_ptr = (void*)vx_host_ptr(staging_buf);
auto buf_ptr = (void*)staging_buf.data();
memcpy(buf_ptr, &kernel_arg, sizeof(kernel_arg_t));
RT_CHECK(vx_copy_to_dev(staging_buf, KERNEL_ARG_DEV_MEM_ADDR, sizeof(kernel_arg_t), 0));
RT_CHECK(vx_copy_to_dev(device, KERNEL_ARG_DEV_MEM_ADDR, staging_buf.data(), sizeof(kernel_arg_t)));
}
std::cout << "run kernel test" << std::endl;

View File

@@ -0,0 +1,13 @@
# Kernel entry stub: defines the global function symbol _start in the
# executable .init section. It calls main, then vx_perf_dump, then issues a
# raw custom instruction to stop. The value returned by main is not used.
# NOTE(review): no stack-pointer or global-pointer setup is visible in this
# stub - confirm that is handled elsewhere before main runs.
.section .init, "ax"
.global _start
.type _start, @function
_start:
# call main routine
call main
# dump perf counter
call vx_perf_dump
# end execution
# NOTE(review): hand-encoded R-type instruction on opcode 0x0b with every
# other field zero; presumably the Vortex thread-mask instruction with an
# empty mask - confirm against vx_intrinsics.h.
.insn r 0x0b, 0, 0, x0, x0, x0
# record the symbol size for _start
.size _start, .-_start

120
tests/regression/common.mk Normal file
View File

@@ -0,0 +1,120 @@
# Shared build/run rules for the regression tests.
# PROJECT, SRCS, VX_SRCS and OPTS are referenced below but not defined in this
# file; they are expected to be set by the Makefile that includes it.

# User-overridable defaults (all assigned with ?=).
# XLEN selects the 32- or 64-bit kernel toolchain settings below.
# TARGET selects the OPAE driver library and the XRT run mode further down.
XLEN ?= 32
TARGET ?= opaesim
XRT_SYN_DIR ?= ../../../hw/syn/xilinx/xrt
XRT_DEVICE_INDEX ?= 0
# Per-XLEN toolchain location, -march/-mabi flags and kernel startup address.
ifeq ($(XLEN),64)
RISCV_TOOLCHAIN_PATH ?= /opt/riscv64-gnu-toolchain
VX_CFLAGS += -march=rv64imafd -mabi=lp64d
STARTUP_ADDR ?= 0x180000000
else
RISCV_TOOLCHAIN_PATH ?= /opt/riscv-gnu-toolchain
VX_CFLAGS += -march=rv32imaf -mabi=ilp32f
STARTUP_ADDR ?= 0x80000000
endif
# Toolchain triple and sysroot derived from XLEN and the toolchain path.
RISCV_PREFIX ?= riscv$(XLEN)-unknown-elf
RISCV_SYSROOT ?= $(RISCV_TOOLCHAIN_PATH)/$(RISCV_PREFIX)
# Locations of the host runtime and the device kernel library, resolved
# relative to the directory make is run from.
VORTEX_RT_PATH ?= $(realpath ../../../runtime)
VORTEX_KN_PATH ?= $(realpath ../../../kernel)
FPGA_BIN_DIR ?= $(VORTEX_RT_PATH)/opae
LLVM_VORTEX ?= /opt/llvm-vortex
# Flags passed to every clang/clang++ invocation used for kernel code.
LLVM_CFLAGS += --sysroot=$(RISCV_SYSROOT)
LLVM_CFLAGS += --gcc-toolchain=$(RISCV_TOOLCHAIN_PATH)
LLVM_CFLAGS += -Xclang -target-feature -Xclang +vortex
#LLVM_CFLAGS += -mllvm -vortex-branch-divergence=2
#LLVM_CFLAGS += -mllvm -print-after-all
#LLVM_CFLAGS += -I$(RISCV_SYSROOT)/include/c++/9.2.0/$(RISCV_PREFIX)
#LLVM_CFLAGS += -I$(RISCV_SYSROOT)/include/c++/9.2.0
#LLVM_CFLAGS += -Wl,-L$(RISCV_TOOLCHAIN_PATH)/lib/gcc/$(RISCV_PREFIX)/9.2.0
#LLVM_CFLAGS += --rtlib=libgcc
# Kernel-side tools: the LLVM build under LLVM_VORTEX is the active choice;
# the GCC equivalents are kept below, commented out.
VX_CC = $(LLVM_VORTEX)/bin/clang $(LLVM_CFLAGS)
VX_CXX = $(LLVM_VORTEX)/bin/clang++ $(LLVM_CFLAGS)
VX_DP = $(LLVM_VORTEX)/bin/llvm-objdump
VX_CP = $(LLVM_VORTEX)/bin/llvm-objcopy
#VX_CC = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)-gcc
#VX_CXX = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)-g++
#VX_DP = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)-objdump
#VX_CP = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)-objcopy
# Kernel compile flags (appended to the -march/-mabi chosen above) and the
# kernel link line: static link against libvortexrt.a using the per-XLEN
# linker script, with STARTUP_ADDR passed to the linker as a symbol.
VX_CFLAGS += -v -O3 -std=c++17
VX_CFLAGS += -mcmodel=medany -fno-rtti -fno-exceptions -nostartfiles -fdata-sections -ffunction-sections
VX_CFLAGS += -I$(VORTEX_KN_PATH)/include -I$(VORTEX_KN_PATH)/../hw
VX_CFLAGS += -DNDEBUG -DLLVM_VORTEX
VX_LDFLAGS += -Wl,-Bstatic,--gc-sections,-T,$(VORTEX_KN_PATH)/linker/vx_link$(XLEN).ld,--defsym=STARTUP_ADDR=$(STARTUP_ADDR) $(VORTEX_KN_PATH)/libvortexrt.a
# Host-side compile/link flags; the host program links against the stub
# vortex library.
CXXFLAGS += -std=c++17 -Wall -Wextra -pedantic -Wfatal-errors
CXXFLAGS += -I$(VORTEX_RT_PATH)/include -I$(VORTEX_KN_PATH)/../hw
LDFLAGS += -L$(VORTEX_RT_PATH)/stub -lvortex
# Debugging: defining DEBUG builds the host program unoptimized with symbols;
# otherwise it is built with -O2 and NDEBUG.
ifdef DEBUG
CXXFLAGS += -g -O0
else
CXXFLAGS += -O2 -DNDEBUG
endif
# Pick the OPAE driver library name from TARGET (fpga / asesim / opaesim).
# Any other TARGET value leaves OPAE_DRV_PATHS unset here.
ifeq ($(TARGET), fpga)
OPAE_DRV_PATHS ?= libopae-c.so
else
ifeq ($(TARGET), asesim)
OPAE_DRV_PATHS ?= libopae-c-ase.so
else
ifeq ($(TARGET), opaesim)
OPAE_DRV_PATHS ?= libopae-c-sim.so
endif
endif
endif
# Default goal: host program plus the kernel binary and its disassembly.
all: $(PROJECT) kernel.bin kernel.dump
# Disassemble the linked kernel.
kernel.dump: kernel.elf
$(VX_DP) -D kernel.elf > kernel.dump
# Convert the linked kernel to a raw binary image.
kernel.bin: kernel.elf
$(VX_CP) -O binary kernel.elf kernel.bin
# Compile and link all kernel sources in a single VX_CXX invocation.
kernel.elf: $(VX_SRCS)
$(VX_CXX) $(VX_CFLAGS) $(VX_SRCS) $(VX_LDFLAGS) -o kernel.elf
# Build the host test program.
$(PROJECT): $(SRCS)
$(CXX) $(CXXFLAGS) $^ $(LDFLAGS) -o $@
# Run targets: each runs the host program with OPTS after prepending one
# backend directory under VORTEX_RT_PATH to LD_LIBRARY_PATH.
run-simx: $(PROJECT) kernel.bin
LD_LIBRARY_PATH=$(VORTEX_RT_PATH)/simx:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
run-opae: $(PROJECT) kernel.bin
SCOPE_JSON_PATH=$(FPGA_BIN_DIR)/scope.json OPAE_DRV_PATHS=$(OPAE_DRV_PATHS) LD_LIBRARY_PATH=$(VORTEX_RT_PATH)/opae:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
run-rtlsim: $(PROJECT) kernel.bin
LD_LIBRARY_PATH=$(VORTEX_RT_PATH)/rtlsim:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
# XRT run: TARGET=hw sets SCOPE_JSON_PATH and leaves XCL_EMULATION_MODE unset;
# any other TARGET value is passed through as XCL_EMULATION_MODE.
run-xrt: $(PROJECT) kernel.bin
ifeq ($(TARGET), hw)
SCOPE_JSON_PATH=$(FPGA_BIN_DIR)/scope.json XRT_INI_PATH=$(XRT_SYN_DIR)/xrt.ini EMCONFIG_PATH=$(FPGA_BIN_DIR) XRT_DEVICE_INDEX=$(XRT_DEVICE_INDEX) XRT_XCLBIN_PATH=$(FPGA_BIN_DIR)/vortex_afu.xclbin LD_LIBRARY_PATH=$(XILINX_XRT)/lib:$(VORTEX_RT_PATH)/xrt:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
else
XCL_EMULATION_MODE=$(TARGET) XRT_INI_PATH=$(XRT_SYN_DIR)/xrt.ini EMCONFIG_PATH=$(FPGA_BIN_DIR) XRT_DEVICE_INDEX=$(XRT_DEVICE_INDEX) XRT_XCLBIN_PATH=$(FPGA_BIN_DIR)/vortex_afu.xclbin LD_LIBRARY_PATH=$(XILINX_XRT)/lib:$(VORTEX_RT_PATH)/xrt:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
endif
# Header dependency list for the host sources, produced with -MM.
.depend: $(SRCS)
$(CXX) $(CXXFLAGS) -MM $^ > .depend;
# clean removes host build products; clean-all also removes kernel artifacts.
clean:
rm -rf $(PROJECT) *.o .depend
clean-all: clean
rm -rf *.elf *.bin *.dump
# Pull in the generated dependencies unless the goal is exactly "clean".
# NOTE(review): the comparison is against the literal goal "clean" only, so
# "clean-all" (or multiple goals) still includes .depend - confirm intended.
ifneq ($(MAKECMDGOALS),clean)
-include .depend
endif

View File

@@ -1,78 +1,9 @@
XLEN ?= 32
RISCV_TOOLCHAIN_PATH ?= /opt/riscv-gnu-toolchain
VORTEX_DRV_PATH ?= $(realpath ../../../driver)
VORTEX_RT_PATH ?= $(realpath ../../../runtime)
OPTS ?= -n64
VX_CC = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-gcc
VX_CXX = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-g++
VX_DP = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-objdump
VX_CP = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-objcopy
VX_CFLAGS += -march=rv32imf -mabi=ilp32f -O3 -Wstack-usage=1024 -ffreestanding -nostartfiles -fdata-sections -ffunction-sections
VX_CFLAGS += -I$(VORTEX_RT_PATH)/include -I$(VORTEX_RT_PATH)/../hw
VX_LDFLAGS += -Wl,-Bstatic,-T,$(VORTEX_RT_PATH)/linker/vx_link$(XLEN).ld -Wl,--gc-sections $(VORTEX_RT_PATH)/libvortexrt.a
VX_SRCS = kernel.c
CXXFLAGS += -std=c++11 -Wall -Wextra -pedantic -Wfatal-errors
CXXFLAGS += -I$(VORTEX_DRV_PATH)/include
LDFLAGS += -L$(VORTEX_DRV_PATH)/stub -lvortex
# Debugging
ifdef DEBUG
CXXFLAGS += -g -O0
else
CXXFLAGS += -O2 -DNDEBUG
endif
PROJECT = demo
SRCS = main.cpp
all: $(PROJECT) kernel.bin kernel.dump
kernel.dump: kernel.elf
$(VX_DP) -D kernel.elf > kernel.dump
VX_SRCS = kernel.cpp
kernel.bin: kernel.elf
$(VX_CP) -O binary kernel.elf kernel.bin
OPTS ?= -n64
kernel.elf: $(VX_SRCS)
$(VX_CC) $(VX_CFLAGS) $(VX_SRCS) $(VX_LDFLAGS) -o kernel.elf
$(PROJECT): $(SRCS)
$(CXX) $(CXXFLAGS) $^ $(LDFLAGS) -o $@
run-simx: $(PROJECT) kernel.bin
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/simx:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
run-fpga: $(PROJECT) kernel.bin
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/fpga:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
run-asesim: $(PROJECT) kernel.bin
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/asesim:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
run-vlsim: $(PROJECT) kernel.bin
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/vlsim:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
run-rtlsim: $(PROJECT) kernel.bin
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/rtlsim:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
.depend: $(SRCS)
$(CXX) $(CXXFLAGS) -MM $^ > .depend;
clean:
rm -rf $(PROJECT) *.o .depend
clean-all: clean
rm -rf *.elf *.bin *.dump
ifneq ($(MAKECMDGOALS),clean)
-include .depend
endif
include ../common.mk

View File

@@ -6,9 +6,9 @@
typedef struct {
uint32_t num_tasks;
uint32_t task_size;
uint32_t src0_addr;
uint32_t src1_addr;
uint32_t dst_addr;
uint64_t src0_addr;
uint64_t src1_addr;
uint64_t dst_addr;
} kernel_arg_t;
#endif

View File

@@ -3,7 +3,7 @@
#include <vx_spawn.h>
#include "common.h"
void kernel_body(int task_id, kernel_arg_t* arg) {
void kernel_body(int task_id, kernel_arg_t* __UNIFORM__ arg) {
uint32_t count = arg->task_size;
int32_t* src0_ptr = (int32_t*)arg->src0_addr;
int32_t* src1_ptr = (int32_t*)arg->src1_addr;
@@ -16,7 +16,8 @@ void kernel_body(int task_id, kernel_arg_t* arg) {
}
}
void main() {
int main() {
kernel_arg_t* arg = (kernel_arg_t*)KERNEL_ARG_DEV_MEM_ADDR;
vx_spawn_tasks(arg->num_tasks, (vx_spawn_tasks_cb)kernel_body, arg);
}
return 0;
}

View File

@@ -1,6 +1,7 @@
#include <iostream>
#include <unistd.h>
#include <string.h>
#include <vector>
#include <vortex.h>
#include "common.h"
@@ -20,8 +21,8 @@ const char* kernel_file = "kernel.bin";
uint32_t count = 0;
vx_device_h device = nullptr;
vx_buffer_h staging_buf = nullptr;
kernel_arg_t kernel_arg;
std::vector<uint8_t> staging_buf;
kernel_arg_t kernel_arg = {};
static void show_usage() {
std::cout << "Vortex Test." << std::endl;
@@ -51,9 +52,6 @@ static void parse_args(int argc, char **argv) {
}
void cleanup() {
if (staging_buf) {
vx_buf_free(staging_buf);
}
if (device) {
vx_mem_free(device, kernel_arg.src0_addr);
vx_mem_free(device, kernel_arg.src1_addr);
@@ -64,24 +62,24 @@ void cleanup() {
int run_test(const kernel_arg_t& kernel_arg,
uint32_t buf_size,
uint32_t num_points) {
uint32_t num_points) {
// start device
std::cout << "start device" << std::endl;
RT_CHECK(vx_start(device));
// wait for completion
std::cout << "wait for completion" << std::endl;
RT_CHECK(vx_ready_wait(device, MAX_TIMEOUT));
RT_CHECK(vx_ready_wait(device, VX_MAX_TIMEOUT));
// download destination buffer
std::cout << "download destination buffer" << std::endl;
RT_CHECK(vx_copy_from_dev(staging_buf, kernel_arg.dst_addr, buf_size, 0));
RT_CHECK(vx_copy_from_dev(device, staging_buf.data(), kernel_arg.dst_addr, buf_size));
// verify result
std::cout << "verify result" << std::endl;
{
int errors = 0;
auto buf_ptr = (int32_t*)vx_host_ptr(staging_buf);
auto buf_ptr = (int32_t*)staging_buf.data();
for (uint32_t i = 0; i < num_points; ++i) {
int ref = i + i;
int cur = buf_ptr[i];
@@ -101,9 +99,7 @@ int run_test(const kernel_arg_t& kernel_arg,
return 0;
}
int main(int argc, char *argv[]) {
size_t value;
int main(int argc, char *argv[]) {
// parse command arguments
parse_args(argc, argv);
@@ -115,12 +111,12 @@ int main(int argc, char *argv[]) {
std::cout << "open device connection" << std::endl;
RT_CHECK(vx_dev_open(&device));
uint64_t max_cores, max_warps, max_threads;
RT_CHECK(vx_dev_caps(device, VX_CAPS_MAX_CORES, &max_cores));
RT_CHECK(vx_dev_caps(device, VX_CAPS_MAX_WARPS, &max_warps));
RT_CHECK(vx_dev_caps(device, VX_CAPS_MAX_THREADS, &max_threads));
uint64_t num_cores, num_warps, num_threads;
RT_CHECK(vx_dev_caps(device, VX_CAPS_NUM_CORES, &num_cores));
RT_CHECK(vx_dev_caps(device, VX_CAPS_NUM_WARPS, &num_warps));
RT_CHECK(vx_dev_caps(device, VX_CAPS_NUM_THREADS, &num_threads));
uint32_t num_tasks = max_cores * max_warps * max_threads;
uint32_t num_tasks = num_cores * num_warps * num_threads;
uint32_t num_points = count * num_tasks;
uint32_t buf_size = num_points * sizeof(int32_t);
@@ -132,64 +128,60 @@ int main(int argc, char *argv[]) {
RT_CHECK(vx_upload_kernel_file(device, kernel_file));
// allocate device memory
std::cout << "allocate device memory" << std::endl;
RT_CHECK(vx_mem_alloc(device, buf_size, &value));
kernel_arg.src0_addr = value;
RT_CHECK(vx_mem_alloc(device, buf_size, &value));
kernel_arg.src1_addr = value;
RT_CHECK(vx_mem_alloc(device, buf_size, &value));
kernel_arg.dst_addr = value;
std::cout << "allocate device memory" << std::endl;
RT_CHECK(vx_mem_alloc(device, buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.src0_addr));
RT_CHECK(vx_mem_alloc(device, buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.src1_addr));
RT_CHECK(vx_mem_alloc(device, buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.dst_addr));
kernel_arg.num_tasks = num_tasks;
kernel_arg.task_size = count;
std::cout << "dev_src0=" << std::hex << kernel_arg.src0_addr << std::endl;
std::cout << "dev_src1=" << std::hex << kernel_arg.src1_addr << std::endl;
std::cout << "dev_dst=" << std::hex << kernel_arg.dst_addr << std::endl;
std::cout << "dev_src0=0x" << std::hex << kernel_arg.src0_addr << std::endl;
std::cout << "dev_src1=0x" << std::hex << kernel_arg.src1_addr << std::endl;
std::cout << "dev_dst=0x" << std::hex << kernel_arg.dst_addr << std::endl;
// allocate shared memory
std::cout << "allocate shared memory" << std::endl;
// allocate staging buffer
std::cout << "allocate staging buffer" << std::endl;
uint32_t alloc_size = std::max<uint32_t>(buf_size, sizeof(kernel_arg_t));
RT_CHECK(vx_buf_alloc(device, alloc_size, &staging_buf));
staging_buf.resize(alloc_size);
// upload kernel argument
std::cout << "upload kernel argument" << std::endl;
{
auto buf_ptr = (int*)vx_host_ptr(staging_buf);
auto buf_ptr = (int*)staging_buf.data();
memcpy(buf_ptr, &kernel_arg, sizeof(kernel_arg_t));
RT_CHECK(vx_copy_to_dev(staging_buf, KERNEL_ARG_DEV_MEM_ADDR, sizeof(kernel_arg_t), 0));
RT_CHECK(vx_copy_to_dev(device, KERNEL_ARG_DEV_MEM_ADDR, staging_buf.data(), sizeof(kernel_arg_t)));
}
// upload source buffer0
{
auto buf_ptr = (int32_t*)vx_host_ptr(staging_buf);
std::cout << "upload source buffer0" << std::endl;
auto buf_ptr = (int32_t*)staging_buf.data();
for (uint32_t i = 0; i < num_points; ++i) {
buf_ptr[i] = i-1;
}
RT_CHECK(vx_copy_to_dev(device, kernel_arg.src0_addr, staging_buf.data(), buf_size));
}
std::cout << "upload source buffer0" << std::endl;
RT_CHECK(vx_copy_to_dev(staging_buf, kernel_arg.src0_addr, buf_size, 0));
// upload source buffer1
{
auto buf_ptr = (int32_t*)vx_host_ptr(staging_buf);
std::cout << "upload source buffer1" << std::endl;
auto buf_ptr = (int32_t*)staging_buf.data();
for (uint32_t i = 0; i < num_points; ++i) {
buf_ptr[i] = i+1;
}
}
RT_CHECK(vx_copy_to_dev(device, kernel_arg.src1_addr, staging_buf.data(), buf_size));
}
std::cout << "upload source buffer1" << std::endl;
RT_CHECK(vx_copy_to_dev(staging_buf, kernel_arg.src1_addr, buf_size, 0));
// clear destination buffer
{
auto buf_ptr = (int32_t*)vx_host_ptr(staging_buf);
std::cout << "clear destination buffer" << std::endl;
auto buf_ptr = (int32_t*)staging_buf.data();
for (uint32_t i = 0; i < num_points; ++i) {
buf_ptr[i] = 0xdeadbeef;
}
}
RT_CHECK(vx_copy_to_dev(device, kernel_arg.dst_addr, staging_buf.data(), buf_size));
}
std::cout << "clear destination buffer" << std::endl;
RT_CHECK(vx_copy_to_dev(staging_buf, kernel_arg.dst_addr, buf_size, 0));
// run tests
std::cout << "run tests" << std::endl;

View File

@@ -1,78 +1,9 @@
XLEN ?= 32
RISCV_TOOLCHAIN_PATH ?= /opt/riscv-gnu-toolchain
VORTEX_DRV_PATH ?= $(realpath ../../../driver)
VORTEX_RT_PATH ?= $(realpath ../../../runtime)
OPTS ?= -n16
VX_CC = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-gcc
VX_CXX = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-g++
VX_DP = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-objdump
VX_CP = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-objcopy
VX_CFLAGS += -march=rv32imf -mabi=ilp32f -O1 -Wstack-usage=1024 -ffreestanding -nostartfiles -fdata-sections -ffunction-sections
VX_CFLAGS += -I$(VORTEX_RT_PATH)/include -I$(VORTEX_RT_PATH)/../hw
VX_LDFLAGS += -Wl,-Bstatic,-T,$(VORTEX_RT_PATH)/linker/vx_link$(XLEN).ld -Wl,--gc-sections $(VORTEX_RT_PATH)/libvortexrt.a
VX_SRCS = kernel.c
CXXFLAGS += -std=c++11 -Wall -Wextra -pedantic -Wfatal-errors
CXXFLAGS += -I$(VORTEX_DRV_PATH)/include
LDFLAGS += -L$(VORTEX_DRV_PATH)/stub -lvortex
# Debugging
ifdef DEBUG
CXXFLAGS += -g -O0
else
CXXFLAGS += -O2 -DNDEBUG
endif
PROJECT = diverge
SRCS = main.cpp
all: $(PROJECT) kernel.bin kernel.dump
kernel.dump: kernel.elf
$(VX_DP) -D kernel.elf > kernel.dump
VX_SRCS = kernel.cpp
kernel.bin: kernel.elf
$(VX_CP) -O binary kernel.elf kernel.bin
OPTS ?= -n16
kernel.elf: $(VX_SRCS)
$(VX_CC) $(VX_CFLAGS) $(VX_SRCS) $(VX_LDFLAGS) -o kernel.elf
$(PROJECT): $(SRCS)
$(CXX) $(CXXFLAGS) $^ $(LDFLAGS) -o $@
run-simx: $(PROJECT) kernel.bin
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/simx:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
run-fpga: $(PROJECT) kernel.bin
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/fpga:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
run-asesim: $(PROJECT) kernel.bin
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/asesim:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
run-vlsim: $(PROJECT) kernel.bin
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/vlsim:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
run-rtlsim: $(PROJECT) kernel.bin
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/rtlsim:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
.depend: $(SRCS)
$(CXX) $(CXXFLAGS) -MM $^ > .depend;
clean:
rm -rf $(PROJECT) *.o .depend
clean-all: clean
rm -rf *.elf *.bin *.dump
ifneq ($(MAKECMDGOALS),clean)
-include .depend
endif
include ../common.mk

View File

@@ -5,8 +5,8 @@
typedef struct {
uint32_t num_points;
uint32_t src_addr;
uint32_t dst_addr;
uint64_t src_addr;
uint64_t dst_addr;
} kernel_arg_t;
#endif

View File

@@ -1,49 +0,0 @@
#include <stdint.h>
#include <vx_intrinsics.h>
#include <vx_spawn.h>
#include "common.h"
// Control-flow divergence test kernel
void kernel_body(int task_id, kernel_arg_t* arg) {
int32_t* src_ptr = (int32_t*)arg->src_addr;
int32_t* dst_ptr = (int32_t*)arg->dst_addr;
int value = src_ptr[task_id];
// none taken
__if (task_id >= 0x7fffffff) {
value = 0;
}__else {
value += 2;
}__endif
// diverge
__if (task_id > 1) {
__if (task_id > 2) {
value += 6;
}__else {
value += 5;
}__endif
}__else {
__if (task_id > 0) {
value += 4;
}__else {
value += 3;
}__endif
}__endif
// all taken
__if (task_id >= 0) {
value += 7;
}__else {
value = 0;
}__endif
dst_ptr[task_id] = value;
}
void main() {
kernel_arg_t* arg = (kernel_arg_t*)KERNEL_ARG_DEV_MEM_ADDR;
vx_spawn_tasks(arg->num_points, (vx_spawn_tasks_cb)kernel_body, arg);
}

View File

@@ -0,0 +1,83 @@
#include <stdint.h>
#include <assert.h>
#include <algorithm>
#include <vx_intrinsics.h>
#include <vx_spawn.h>
#include "common.h"
// Control-flow divergence test kernel.
// For one task, loads src_ptr[task_id], pushes it through a fixed sequence of
// task_id-dependent constructs (if/else, nested if/else, loop, switch,
// conditional expression, std::min/std::max) and stores the result to
// dst_ptr[task_id]. The source and destination addresses come from *arg.
// NOTE(review): __UNIFORM__ is defined outside this file; presumably it marks
// arg as holding the same value for every thread - confirm in vx_intrinsics.h.
void kernel_body(int task_id, kernel_arg_t* __UNIFORM__ arg) {
int32_t* src_ptr = (int32_t*)arg->src_addr;
int32_t* dst_ptr = (int32_t*)arg->dst_addr;
int value = src_ptr[task_id];
// none taken
// (the if-branch only runs when task_id equals 0x7fffffff)
if (task_id >= 0x7fffffff) {
value = 0;
} else {
value += 2;
}
// diverge
// (nested branches: task_id 0, 1, 2 and >2 each take a different path)
if (task_id > 1) {
if (task_id > 2) {
value += 6;
} else {
value += 5;
}
} else {
if (task_id > 0) {
value += 4;
} else {
value += 3;
}
}
// all taken
// (the if-branch runs for every non-negative task_id)
if (task_id >= 0) {
value += 7;
} else {
value = 0;
}
// loop
// (trip count equals task_id; accumulates src_ptr[0] .. src_ptr[task_id-1])
for (int i = 0, n = task_id; i < n; ++i) {
value += src_ptr[i];
}
// switch
// (task ids 0..3 each apply a distinct update; all others only assert)
switch (task_id) {
case 0:
value += 1;
break;
case 1:
value -= 1;
break;
case 2:
value *= 3;
break;
case 3:
value *= 5;
break;
default:
// compares the signed task_id against the unsigned num_points field
assert(task_id < arg->num_points);
break;
}
// select
// (nested conditional expression keyed on task_id)
value += (task_id >= 0) ? ((task_id > 5) ? src_ptr[0] : task_id) : ((task_id < 5) ? src_ptr[1] : -task_id);
// min/max
// (the second statement sees the value already updated by the first)
value += std::min(src_ptr[task_id], value);
value += std::max(src_ptr[task_id], value);
dst_ptr[task_id] = value;
}
// Kernel entry point: reads the kernel_arg_t located at
// KERNEL_ARG_DEV_MEM_ADDR and hands kernel_body to vx_spawn_tasks together
// with arg->num_points and the argument pointer. Always returns 0.
// NOTE(review): vx_spawn_tasks is declared in vx_spawn.h; presumably it calls
// kernel_body once per task id in [0, num_points) - confirm there.
int main() {
kernel_arg_t* arg = (kernel_arg_t*)KERNEL_ARG_DEV_MEM_ADDR;
vx_spawn_tasks(arg->num_points, (vx_spawn_tasks_cb)kernel_body, arg);
return 0;
}

View File

@@ -3,6 +3,7 @@
#include <string.h>
#include <vortex.h>
#include <vector>
#include <assert.h>
#include "common.h"
#define RT_CHECK(_expr) \
@@ -24,8 +25,8 @@ std::vector<int> src_data;
std::vector<int> ref_data;
vx_device_h device = nullptr;
vx_buffer_h staging_buf = nullptr;
kernel_arg_t kernel_arg;
std::vector<uint8_t> staging_buf;
kernel_arg_t kernel_arg = {};
static void show_usage() {
std::cout << "Vortex Test." << std::endl;
@@ -55,9 +56,6 @@ static void parse_args(int argc, char **argv) {
}
void cleanup() {
if (staging_buf) {
vx_buf_free(staging_buf);
}
if (device) {
vx_mem_free(device, kernel_arg.src_addr);
vx_mem_free(device, kernel_arg.dst_addr);
@@ -110,8 +108,38 @@ void gen_ref_data(uint32_t num_points) {
value = 0;
}
// loop
for (int j = 0, n = i; j < n; ++j) {
value += src_data.at(j);
}
// switch
switch (i) {
case 0:
value += 1;
break;
case 1:
value -= 1;
break;
case 2:
value *= 3;
break;
case 3:
value *= 5;
break;
default:
assert(i < (int)num_points);
break;
}
// select
value += (i >= 0) ? ((i > 5) ? src_data.at(0) : i) : ((i < 5) ? src_data.at(1) : -i);
// min/max
value += std::min(src_data.at(i), value);
value += std::max(src_data.at(i), value);
ref_data[i] = value;
//std::cout << std::dec << i << ": result=0x" << std::hex << value << std::endl;
}
}
@@ -124,17 +152,17 @@ int run_test(const kernel_arg_t& kernel_arg,
// wait for completion
std::cout << "wait for completion" << std::endl;
RT_CHECK(vx_ready_wait(device, MAX_TIMEOUT));
RT_CHECK(vx_ready_wait(device, VX_MAX_TIMEOUT));
// download destination buffer
std::cout << "download destination buffer" << std::endl;
RT_CHECK(vx_copy_from_dev(staging_buf, kernel_arg.dst_addr, buf_size, 0));
RT_CHECK(vx_copy_from_dev(device, staging_buf.data(), kernel_arg.dst_addr, buf_size));
// verify result
std::cout << "verify result" << std::endl;
{
int errors = 0;
auto buf_ptr = (int32_t*)vx_host_ptr(staging_buf);
auto buf_ptr = (int32_t*)staging_buf.data();
for (uint32_t i = 0; i < num_points; ++i) {
int ref = ref_data.at(i);
int cur = buf_ptr[i];
@@ -154,9 +182,7 @@ int run_test(const kernel_arg_t& kernel_arg,
return 0;
}
int main(int argc, char *argv[]) {
size_t value;
int main(int argc, char *argv[]) {
// parse command arguments
parse_args(argc, argv);
@@ -190,51 +216,46 @@ int main(int argc, char *argv[]) {
// allocate device memory
std::cout << "allocate device memory" << std::endl;
RT_CHECK(vx_mem_alloc(device, src_buf_size, &value));
kernel_arg.src_addr = value;
RT_CHECK(vx_mem_alloc(device, dst_buf_size, &value));
kernel_arg.dst_addr = value;
RT_CHECK(vx_mem_alloc(device, src_buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.src_addr));
RT_CHECK(vx_mem_alloc(device, dst_buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.dst_addr));
kernel_arg.num_points = num_points;
std::cout << "dev_src=" << std::hex << kernel_arg.src_addr << std::endl;
std::cout << "dev_dst=" << std::hex << kernel_arg.dst_addr << std::endl;
std::cout << "dev_src=0x" << std::hex << kernel_arg.src_addr << std::endl;
std::cout << "dev_dst=0x" << std::hex << kernel_arg.dst_addr << std::endl;
// allocate shared memory
std::cout << "allocate shared memory" << std::endl;
// allocate staging buffer
std::cout << "allocate staging buffer" << std::endl;
uint32_t staging_buf_size = std::max<uint32_t>(src_buf_size,
std::max<uint32_t>(dst_buf_size,
sizeof(kernel_arg_t)));
RT_CHECK(vx_buf_alloc(device, staging_buf_size, &staging_buf));
staging_buf.resize(staging_buf_size);
// upload kernel argument
std::cout << "upload kernel argument" << std::endl;
{
auto buf_ptr = (int*)vx_host_ptr(staging_buf);
auto buf_ptr = (int*)staging_buf.data();
memcpy(buf_ptr, &kernel_arg, sizeof(kernel_arg_t));
RT_CHECK(vx_copy_to_dev(staging_buf, KERNEL_ARG_DEV_MEM_ADDR, sizeof(kernel_arg_t), 0));
RT_CHECK(vx_copy_to_dev(device, KERNEL_ARG_DEV_MEM_ADDR, staging_buf.data(), sizeof(kernel_arg_t)));
}
// upload source buffer
{
auto buf_ptr = (int32_t*)vx_host_ptr(staging_buf);
for (uint32_t i = 0; i < num_points; ++i) {
buf_ptr[i] = src_data.at(i);
}
std::cout << "upload source buffer" << std::endl;
auto buf_ptr = staging_buf.data();
memcpy(buf_ptr, src_data.data(), num_points * sizeof(int32_t));
RT_CHECK(vx_copy_to_dev(device, kernel_arg.src_addr, staging_buf.data(), src_buf_size));
}
std::cout << "upload source buffer" << std::endl;
RT_CHECK(vx_copy_to_dev(staging_buf, kernel_arg.src_addr, src_buf_size, 0));
// clear destination buffer
{
auto buf_ptr = (int32_t*)vx_host_ptr(staging_buf);
std::cout << "clear destination buffer" << std::endl;
auto buf_ptr = (int32_t*)staging_buf.data();
for (uint32_t i = 0; i < num_points; ++i) {
buf_ptr[i] = 0xdeadbeef;
}
}
RT_CHECK(vx_copy_to_dev(device, kernel_arg.dst_addr, staging_buf.data(), dst_buf_size));
}
std::cout << "clear destination buffer" << std::endl;
RT_CHECK(vx_copy_to_dev(staging_buf, kernel_arg.dst_addr, dst_buf_size, 0));
// run tests
std::cout << "run tests" << std::endl;

View File

@@ -1,80 +1,9 @@
XLEN ?= 32
RISCV_TOOLCHAIN_PATH ?= /opt/riscv-gnu-toolchain
VORTEX_DRV_PATH ?= $(realpath ../../../driver)
VORTEX_RT_PATH ?= $(realpath ../../../runtime)
OPTS ?= -n64
VX_CC = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-gcc
VX_CXX = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-g++
VX_DP = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-objdump
VX_CP = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-objcopy
VX_CFLAGS += -march=rv32imf -mabi=ilp32f -O3 -Wstack-usage=1024 -ffreestanding -nostartfiles -fdata-sections -ffunction-sections
VX_CFLAGS += -I$(VORTEX_RT_PATH)/include -I$(VORTEX_RT_PATH)/../hw
VX_LDFLAGS += -Wl,-Bstatic,-T,$(VORTEX_RT_PATH)/linker/vx_link$(XLEN).ld -Wl,--gc-sections $(VORTEX_RT_PATH)/libvortexrt.a
VX_LDFLAGS += -lm
VX_SRCS = kernel.c
CXXFLAGS += -std=c++11 -Wall -Wextra -pedantic -Wfatal-errors
CXXFLAGS += -I$(VORTEX_DRV_PATH)/include -I$(VORTEX_RT_PATH)/../hw
LDFLAGS += -L$(VORTEX_DRV_PATH)/stub -lvortex
# Debugging
ifdef DEBUG
CXXFLAGS += -g -O0
else
CXXFLAGS += -O2 -DNDEBUG
endif
PROJECT = dogfood
SRCS = main.cpp
all: $(PROJECT) kernel.bin kernel.dump
kernel.dump: kernel.elf
$(VX_DP) -d -r -t kernel.elf > kernel.dump
VX_SRCS = kernel.cpp
kernel.bin: kernel.elf
$(VX_CP) -O binary kernel.elf kernel.bin
OPTS ?= -n64 -x19 -x20
kernel.elf: $(VX_SRCS)
$(VX_CC) $(VX_CFLAGS) $(VX_SRCS) $(VX_LDFLAGS) -o kernel.elf
$(PROJECT): $(SRCS)
$(CXX) $(CXXFLAGS) $^ $(LDFLAGS) -o $@
run-simx: $(PROJECT) kernel.bin
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/simx:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
run-fpga: $(PROJECT) kernel.bin
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/fpga:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
run-asesim: $(PROJECT) kernel.bin
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/asesim:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
run-vlsim: $(PROJECT) kernel.bin
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/vlsim:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
run-rtlsim: $(PROJECT) kernel.bin
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/rtlsim:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
.depend: $(SRCS)
$(CXX) $(CXXFLAGS) -MM $^ > .depend;
clean:
rm -rf $(PROJECT) *.o .depend
clean-all: clean
rm -rf *.elf *.bin *.dump
ifneq ($(MAKECMDGOALS),clean)
-include .depend
endif
include ../common.mk

View File

@@ -7,9 +7,9 @@ typedef struct {
uint32_t testid;
uint32_t num_tasks;
uint32_t task_size;
uint32_t src0_addr;
uint32_t src1_addr;
uint32_t dst_addr;
uint64_t src0_addr;
uint64_t src1_addr;
uint64_t dst_addr;
} kernel_arg_t;
#endif

View File

@@ -1,334 +0,0 @@
#include <stdint.h>
#include <math.h>
#include <vx_intrinsics.h>
#include <vx_spawn.h>
#include "common.h"
typedef void (*PFN_Kernel)(int task_id, kernel_arg_t* arg);
// Single-precision square root issued directly as one `fsqrt.s` instruction
// through inline asm; `x` is used as both the input and the output operand.
inline float __ieee754_sqrtf (float x) {
asm ("fsqrt.s %0, %1" : "=f" (x) : "f" (x));
return x;
}
void kernel_iadd(int task_id, kernel_arg_t* arg) {
	// Integer add: dst[k] = src0[k] + src1[k] over the task_size elements
	// that belong to this task.
	const uint32_t n = arg->task_size;
	int32_t* lhs = (int32_t*)arg->src0_addr;
	int32_t* rhs = (int32_t*)arg->src1_addr;
	int32_t* out = (int32_t*)arg->dst_addr;
	uint32_t idx = task_id * n; // first element owned by this task

	for (uint32_t left = n; left != 0; --left, ++idx) {
		const int32_t x = lhs[idx];
		const int32_t y = rhs[idx];
		out[idx] = x + y;
	}
}
void kernel_imul(int task_id, kernel_arg_t* arg) {
	// Integer multiply: dst[k] = src0[k] * src1[k] over this task's slice.
	const uint32_t n = arg->task_size;
	int32_t* lhs = (int32_t*)arg->src0_addr;
	int32_t* rhs = (int32_t*)arg->src1_addr;
	int32_t* out = (int32_t*)arg->dst_addr;
	uint32_t idx = task_id * n; // first element owned by this task

	for (uint32_t left = n; left != 0; --left, ++idx) {
		const int32_t x = lhs[idx];
		const int32_t y = rhs[idx];
		out[idx] = x * y;
	}
}
void kernel_idiv(int task_id, kernel_arg_t* arg) {
	// Integer divide: dst[k] = src0[k] / src1[k] over this task's slice.
	const uint32_t n = arg->task_size;
	int32_t* lhs = (int32_t*)arg->src0_addr;
	int32_t* rhs = (int32_t*)arg->src1_addr;
	int32_t* out = (int32_t*)arg->dst_addr;
	uint32_t idx = task_id * n; // first element owned by this task

	for (uint32_t left = n; left != 0; --left, ++idx) {
		const int32_t x = lhs[idx];
		const int32_t y = rhs[idx];
		out[idx] = x / y;
	}
}
void kernel_idiv_mul(int task_id, kernel_arg_t* arg) {
	// Mixed divide/multiply: dst[k] = (src0[k] / src1[k]) + (src0[k] * src1[k]).
	const uint32_t n = arg->task_size;
	int32_t* lhs = (int32_t*)arg->src0_addr;
	int32_t* rhs = (int32_t*)arg->src1_addr;
	int32_t* out = (int32_t*)arg->dst_addr;
	uint32_t idx = task_id * n; // first element owned by this task

	for (uint32_t left = n; left != 0; --left, ++idx) {
		const int32_t x = lhs[idx];
		const int32_t y = rhs[idx];
		const int32_t quot = x / y;
		const int32_t prod = x * y;
		out[idx] = quot + prod;
	}
}
void kernel_fadd(int task_id, kernel_arg_t* arg) {
	// Float add: dst[k] = src0[k] + src1[k] over this task's slice.
	const uint32_t n = arg->task_size;
	float* lhs = (float*)arg->src0_addr;
	float* rhs = (float*)arg->src1_addr;
	float* out = (float*)arg->dst_addr;
	uint32_t idx = task_id * n; // first element owned by this task

	for (uint32_t left = n; left != 0; --left, ++idx) {
		const float x = lhs[idx];
		const float y = rhs[idx];
		out[idx] = x + y;
	}
}
void kernel_fsub(int task_id, kernel_arg_t* arg) {
	// Float subtract: dst[k] = src0[k] - src1[k] over this task's slice.
	const uint32_t n = arg->task_size;
	float* lhs = (float*)arg->src0_addr;
	float* rhs = (float*)arg->src1_addr;
	float* out = (float*)arg->dst_addr;
	uint32_t idx = task_id * n; // first element owned by this task

	for (uint32_t left = n; left != 0; --left, ++idx) {
		const float x = lhs[idx];
		const float y = rhs[idx];
		out[idx] = x - y;
	}
}
void kernel_fmul(int task_id, kernel_arg_t* arg) {
	// Float multiply: dst[k] = src0[k] * src1[k] over this task's slice.
	const uint32_t n = arg->task_size;
	float* lhs = (float*)arg->src0_addr;
	float* rhs = (float*)arg->src1_addr;
	float* out = (float*)arg->dst_addr;
	uint32_t idx = task_id * n; // first element owned by this task

	for (uint32_t left = n; left != 0; --left, ++idx) {
		const float x = lhs[idx];
		const float y = rhs[idx];
		out[idx] = x * y;
	}
}
void kernel_fmadd(int task_id, kernel_arg_t* arg) {
	// Multiply-add pattern: dst[k] = src0[k] * src1[k] + src1[k].
	const uint32_t n = arg->task_size;
	float* lhs = (float*)arg->src0_addr;
	float* rhs = (float*)arg->src1_addr;
	float* out = (float*)arg->dst_addr;
	uint32_t idx = task_id * n; // first element owned by this task

	for (uint32_t left = n; left != 0; --left, ++idx) {
		const float x = lhs[idx];
		const float y = rhs[idx];
		out[idx] = x * y + y;
	}
}
void kernel_fmsub(int task_id, kernel_arg_t* arg) {
	// Multiply-subtract pattern: dst[k] = src0[k] * src1[k] - src1[k].
	const uint32_t n = arg->task_size;
	float* lhs = (float*)arg->src0_addr;
	float* rhs = (float*)arg->src1_addr;
	float* out = (float*)arg->dst_addr;
	uint32_t idx = task_id * n; // first element owned by this task

	for (uint32_t left = n; left != 0; --left, ++idx) {
		const float x = lhs[idx];
		const float y = rhs[idx];
		out[idx] = x * y - y;
	}
}
void kernel_fnmadd(int task_id, kernel_arg_t* arg) {
	// Negated multiply-add pattern: dst[k] = -src0[k] * src1[k] - src1[k].
	const uint32_t n = arg->task_size;
	float* lhs = (float*)arg->src0_addr;
	float* rhs = (float*)arg->src1_addr;
	float* out = (float*)arg->dst_addr;
	uint32_t idx = task_id * n; // first element owned by this task

	for (uint32_t left = n; left != 0; --left, ++idx) {
		const float x = lhs[idx];
		const float y = rhs[idx];
		out[idx] = -x * y - y;
	}
}
void kernel_fnmsub(int task_id, kernel_arg_t* arg) {
	// Negated multiply-subtract pattern: dst[k] = -src0[k] * src1[k] + src1[k].
	const uint32_t n = arg->task_size;
	float* lhs = (float*)arg->src0_addr;
	float* rhs = (float*)arg->src1_addr;
	float* out = (float*)arg->dst_addr;
	uint32_t idx = task_id * n; // first element owned by this task

	for (uint32_t left = n; left != 0; --left, ++idx) {
		const float x = lhs[idx];
		const float y = rhs[idx];
		out[idx] = -x * y + y;
	}
}
void kernel_fnmadd_madd(int task_id, kernel_arg_t* arg) {
	// Sum of the negated and plain multiply-add of the same operands:
	// dst[k] = (-a*b - b) + (a*b + b).
	const uint32_t n = arg->task_size;
	float* lhs = (float*)arg->src0_addr;
	float* rhs = (float*)arg->src1_addr;
	float* out = (float*)arg->dst_addr;
	uint32_t idx = task_id * n; // first element owned by this task

	for (uint32_t left = n; left != 0; --left, ++idx) {
		const float x = lhs[idx];
		const float y = rhs[idx];
		const float neg = -x * y - y;
		const float pos = x * y + y;
		out[idx] = neg + pos;
	}
}
void kernel_fdiv(int task_id, kernel_arg_t* arg) {
	// Float divide: dst[k] = src0[k] / src1[k] over this task's slice.
	const uint32_t n = arg->task_size;
	float* lhs = (float*)arg->src0_addr;
	float* rhs = (float*)arg->src1_addr;
	float* out = (float*)arg->dst_addr;
	uint32_t idx = task_id * n; // first element owned by this task

	for (uint32_t left = n; left != 0; --left, ++idx) {
		const float x = lhs[idx];
		const float y = rhs[idx];
		out[idx] = x / y;
	}
}
void kernel_fdiv2(int task_id, kernel_arg_t* arg) {
	// Two divisions per element: dst[k] = (a / b) + (b / a).
	const uint32_t n = arg->task_size;
	float* lhs = (float*)arg->src0_addr;
	float* rhs = (float*)arg->src1_addr;
	float* out = (float*)arg->dst_addr;
	uint32_t idx = task_id * n; // first element owned by this task

	for (uint32_t left = n; left != 0; --left, ++idx) {
		const float x = lhs[idx];
		const float y = rhs[idx];
		const float fwd = x / y;
		const float rev = y / x;
		out[idx] = fwd + rev;
	}
}
void kernel_fsqrt(int task_id, kernel_arg_t* arg) {
	// Square root of the product: dst[k] = sqrt(src0[k] * src1[k]).
	const uint32_t n = arg->task_size;
	float* lhs = (float*)arg->src0_addr;
	float* rhs = (float*)arg->src1_addr;
	float* out = (float*)arg->dst_addr;
	uint32_t idx = task_id * n; // first element owned by this task

	for (uint32_t left = n; left != 0; --left, ++idx) {
		const float x = lhs[idx];
		const float y = rhs[idx];
		out[idx] = __ieee754_sqrtf(x * y);
	}
}
void kernel_ftoi(int task_id, kernel_arg_t* arg) {
	// Float to signed int: dst[k] = (int32_t)(src0[k] + src1[k]).
	const uint32_t n = arg->task_size;
	float* lhs = (float*)arg->src0_addr;
	float* rhs = (float*)arg->src1_addr;
	int32_t* out = (int32_t*)arg->dst_addr;
	uint32_t idx = task_id * n; // first element owned by this task

	for (uint32_t left = n; left != 0; --left, ++idx) {
		const float x = lhs[idx];
		const float y = rhs[idx];
		const float sum = x + y;
		out[idx] = (int32_t)sum;
	}
}
void kernel_ftou(int task_id, kernel_arg_t* arg) {
	// Float to unsigned int: dst[k] = (uint32_t)(src0[k] + src1[k]).
	const uint32_t n = arg->task_size;
	float* lhs = (float*)arg->src0_addr;
	float* rhs = (float*)arg->src1_addr;
	uint32_t* out = (uint32_t*)arg->dst_addr;
	uint32_t idx = task_id * n; // first element owned by this task

	for (uint32_t left = n; left != 0; --left, ++idx) {
		const float x = lhs[idx];
		const float y = rhs[idx];
		const float sum = x + y;
		out[idx] = (uint32_t)sum;
	}
}
void kernel_itof(int task_id, kernel_arg_t* arg) {
	// Signed int to float: dst[k] = (float)(src0[k] + src1[k]).
	const uint32_t n = arg->task_size;
	int32_t* lhs = (int32_t*)arg->src0_addr;
	int32_t* rhs = (int32_t*)arg->src1_addr;
	float* out = (float*)arg->dst_addr;
	uint32_t idx = task_id * n; // first element owned by this task

	for (uint32_t left = n; left != 0; --left, ++idx) {
		const int32_t x = lhs[idx];
		const int32_t y = rhs[idx];
		const int32_t sum = x + y;
		out[idx] = (float)sum;
	}
}
void kernel_utof(int task_id, kernel_arg_t* arg) {
	// Unsigned int to float: the signed sum of the two inputs is stored in a
	// uint32_t, so the cast to float is an unsigned conversion.
	const uint32_t n = arg->task_size;
	int32_t* lhs = (int32_t*)arg->src0_addr;
	int32_t* rhs = (int32_t*)arg->src1_addr;
	float* out = (float*)arg->dst_addr;
	uint32_t idx = task_id * n; // first element owned by this task

	for (uint32_t left = n; left != 0; --left, ++idx) {
		const int32_t x = lhs[idx];
		const int32_t y = rhs[idx];
		const uint32_t sum = x + y;
		out[idx] = (float)sum;
	}
}
// Kernel dispatch table. main() picks the entry at index arg->testid, so the
// position of each kernel in this list is its test id.
// NOTE(review): the order presumably has to match the host-side test list — confirm.
static const PFN_Kernel sc_tests[] = {
kernel_iadd,
kernel_imul,
kernel_idiv,
kernel_idiv_mul,
kernel_fadd,
kernel_fsub,
kernel_fmul,
kernel_fmadd,
kernel_fmsub,
kernel_fnmadd,
kernel_fnmsub,
kernel_fnmadd_madd,
kernel_fdiv,
kernel_fdiv2,
kernel_fsqrt,
kernel_ftoi,
kernel_ftou,
kernel_itof,
kernel_utof,
};
void main() {
kernel_arg_t* arg = (kernel_arg_t*)KERNEL_ARG_DEV_MEM_ADDR;
vx_spawn_tasks(arg->num_tasks, (vx_spawn_tasks_cb)sc_tests[arg->testid], arg);
}

View File

@@ -0,0 +1,396 @@
#include <stdint.h>
#include <math.h>
#include <vx_intrinsics.h>
#include <vx_spawn.h>
#include "common.h"
typedef void (*PFN_Kernel)(int task_id, kernel_arg_t* __UNIFORM__ arg);
// Single-precision square root issued directly as one `fsqrt.s` instruction
// through inline asm; `x` is used as both the input and the output operand.
inline float __ieee754_sqrtf (float x) {
asm ("fsqrt.s %0, %1" : "=f" (x) : "f" (x));
return x;
}
void kernel_iadd(int task_id, kernel_arg_t* __UNIFORM__ arg) {
	// Integer add: dst[k] = src0[k] + src1[k] for the task_size elements
	// starting at task_id * task_size.
	const uint32_t n = arg->task_size;
	const uint32_t first = task_id * n;
	int32_t* const lhs = (int32_t*)arg->src0_addr;
	int32_t* const rhs = (int32_t*)arg->src1_addr;
	int32_t* const out = (int32_t*)arg->dst_addr;

	for (uint32_t k = first, last = first + n; k != last; ++k) {
		const int32_t x = lhs[k];
		const int32_t y = rhs[k];
		out[k] = x + y;
	}
}
void kernel_imul(int task_id, kernel_arg_t* __UNIFORM__ arg) {
	// Integer multiply: dst[k] = src0[k] * src1[k] over this task's slice.
	const uint32_t n = arg->task_size;
	const uint32_t first = task_id * n;
	int32_t* const lhs = (int32_t*)arg->src0_addr;
	int32_t* const rhs = (int32_t*)arg->src1_addr;
	int32_t* const out = (int32_t*)arg->dst_addr;

	for (uint32_t k = first, last = first + n; k != last; ++k) {
		const int32_t x = lhs[k];
		const int32_t y = rhs[k];
		out[k] = x * y;
	}
}
void kernel_idiv(int task_id, kernel_arg_t* __UNIFORM__ arg) {
	// Integer divide: dst[k] = src0[k] / src1[k] over this task's slice.
	const uint32_t n = arg->task_size;
	const uint32_t first = task_id * n;
	int32_t* const lhs = (int32_t*)arg->src0_addr;
	int32_t* const rhs = (int32_t*)arg->src1_addr;
	int32_t* const out = (int32_t*)arg->dst_addr;

	for (uint32_t k = first, last = first + n; k != last; ++k) {
		const int32_t x = lhs[k];
		const int32_t y = rhs[k];
		out[k] = x / y;
	}
}
void kernel_idiv_mul(int task_id, kernel_arg_t* __UNIFORM__ arg) {
	// Mixed divide/multiply: dst[k] = (src0[k] / src1[k]) + (src0[k] * src1[k]).
	const uint32_t n = arg->task_size;
	const uint32_t first = task_id * n;
	int32_t* const lhs = (int32_t*)arg->src0_addr;
	int32_t* const rhs = (int32_t*)arg->src1_addr;
	int32_t* const out = (int32_t*)arg->dst_addr;

	for (uint32_t k = first, last = first + n; k != last; ++k) {
		const int32_t x = lhs[k];
		const int32_t y = rhs[k];
		const int32_t quot = x / y;
		const int32_t prod = x * y;
		out[k] = quot + prod;
	}
}
void kernel_fadd(int task_id, kernel_arg_t* __UNIFORM__ arg) {
	// Float add: dst[k] = src0[k] + src1[k] over this task's slice.
	const uint32_t n = arg->task_size;
	const uint32_t first = task_id * n;
	float* const lhs = (float*)arg->src0_addr;
	float* const rhs = (float*)arg->src1_addr;
	float* const out = (float*)arg->dst_addr;

	for (uint32_t k = first, last = first + n; k != last; ++k) {
		const float x = lhs[k];
		const float y = rhs[k];
		out[k] = x + y;
	}
}
void kernel_fsub(int task_id, kernel_arg_t* __UNIFORM__ arg) {
	// Float subtract: dst[k] = src0[k] - src1[k] over this task's slice.
	const uint32_t n = arg->task_size;
	const uint32_t first = task_id * n;
	float* const lhs = (float*)arg->src0_addr;
	float* const rhs = (float*)arg->src1_addr;
	float* const out = (float*)arg->dst_addr;

	for (uint32_t k = first, last = first + n; k != last; ++k) {
		const float x = lhs[k];
		const float y = rhs[k];
		out[k] = x - y;
	}
}
void kernel_fmul(int task_id, kernel_arg_t* __UNIFORM__ arg) {
	// Float multiply: dst[k] = src0[k] * src1[k] over this task's slice.
	const uint32_t n = arg->task_size;
	const uint32_t first = task_id * n;
	float* const lhs = (float*)arg->src0_addr;
	float* const rhs = (float*)arg->src1_addr;
	float* const out = (float*)arg->dst_addr;

	for (uint32_t k = first, last = first + n; k != last; ++k) {
		const float x = lhs[k];
		const float y = rhs[k];
		out[k] = x * y;
	}
}
void kernel_fmadd(int task_id, kernel_arg_t* __UNIFORM__ arg) {
	// Multiply-add pattern: dst[k] = src0[k] * src1[k] + src1[k].
	const uint32_t n = arg->task_size;
	const uint32_t first = task_id * n;
	float* const lhs = (float*)arg->src0_addr;
	float* const rhs = (float*)arg->src1_addr;
	float* const out = (float*)arg->dst_addr;

	for (uint32_t k = first, last = first + n; k != last; ++k) {
		const float x = lhs[k];
		const float y = rhs[k];
		out[k] = x * y + y;
	}
}
void kernel_fmsub(int task_id, kernel_arg_t* __UNIFORM__ arg) {
	// Multiply-subtract pattern: dst[k] = src0[k] * src1[k] - src1[k].
	const uint32_t n = arg->task_size;
	const uint32_t first = task_id * n;
	float* const lhs = (float*)arg->src0_addr;
	float* const rhs = (float*)arg->src1_addr;
	float* const out = (float*)arg->dst_addr;

	for (uint32_t k = first, last = first + n; k != last; ++k) {
		const float x = lhs[k];
		const float y = rhs[k];
		out[k] = x * y - y;
	}
}
void kernel_fnmadd(int task_id, kernel_arg_t* __UNIFORM__ arg) {
	// Negated multiply-add pattern: dst[k] = -src0[k] * src1[k] - src1[k].
	const uint32_t n = arg->task_size;
	const uint32_t first = task_id * n;
	float* const lhs = (float*)arg->src0_addr;
	float* const rhs = (float*)arg->src1_addr;
	float* const out = (float*)arg->dst_addr;

	for (uint32_t k = first, last = first + n; k != last; ++k) {
		const float x = lhs[k];
		const float y = rhs[k];
		out[k] = -x * y - y;
	}
}
void kernel_fnmsub(int task_id, kernel_arg_t* __UNIFORM__ arg) {
	// Negated multiply-subtract pattern: dst[k] = -src0[k] * src1[k] + src1[k].
	const uint32_t n = arg->task_size;
	const uint32_t first = task_id * n;
	float* const lhs = (float*)arg->src0_addr;
	float* const rhs = (float*)arg->src1_addr;
	float* const out = (float*)arg->dst_addr;

	for (uint32_t k = first, last = first + n; k != last; ++k) {
		const float x = lhs[k];
		const float y = rhs[k];
		out[k] = -x * y + y;
	}
}
void kernel_fnmadd_madd(int task_id, kernel_arg_t* __UNIFORM__ arg) {
	// Sum of the negated and plain multiply-add of the same operands:
	// dst[k] = (-a*b - b) + (a*b + b).
	const uint32_t n = arg->task_size;
	const uint32_t first = task_id * n;
	float* const lhs = (float*)arg->src0_addr;
	float* const rhs = (float*)arg->src1_addr;
	float* const out = (float*)arg->dst_addr;

	for (uint32_t k = first, last = first + n; k != last; ++k) {
		const float x = lhs[k];
		const float y = rhs[k];
		const float neg = -x * y - y;
		const float pos = x * y + y;
		out[k] = neg + pos;
	}
}
void kernel_fdiv(int task_id, kernel_arg_t* __UNIFORM__ arg) {
	// Float divide: dst[k] = src0[k] / src1[k] over this task's slice.
	const uint32_t n = arg->task_size;
	const uint32_t first = task_id * n;
	float* const lhs = (float*)arg->src0_addr;
	float* const rhs = (float*)arg->src1_addr;
	float* const out = (float*)arg->dst_addr;

	for (uint32_t k = first, last = first + n; k != last; ++k) {
		const float x = lhs[k];
		const float y = rhs[k];
		out[k] = x / y;
	}
}
void kernel_fdiv2(int task_id, kernel_arg_t* __UNIFORM__ arg) {
	// Two divisions per element: dst[k] = (a / b) + (b / a).
	const uint32_t n = arg->task_size;
	const uint32_t first = task_id * n;
	float* const lhs = (float*)arg->src0_addr;
	float* const rhs = (float*)arg->src1_addr;
	float* const out = (float*)arg->dst_addr;

	for (uint32_t k = first, last = first + n; k != last; ++k) {
		const float x = lhs[k];
		const float y = rhs[k];
		const float fwd = x / y;
		const float rev = y / x;
		out[k] = fwd + rev;
	}
}
void kernel_fsqrt(int task_id, kernel_arg_t* __UNIFORM__ arg) {
	// Square root of the product: dst[k] = sqrt(src0[k] * src1[k]).
	const uint32_t n = arg->task_size;
	const uint32_t first = task_id * n;
	float* const lhs = (float*)arg->src0_addr;
	float* const rhs = (float*)arg->src1_addr;
	float* const out = (float*)arg->dst_addr;

	for (uint32_t k = first, last = first + n; k != last; ++k) {
		const float x = lhs[k];
		const float y = rhs[k];
		out[k] = __ieee754_sqrtf(x * y);
	}
}
void kernel_ftoi(int task_id, kernel_arg_t* __UNIFORM__ arg) {
	// Float to signed int: dst[k] = (int32_t)(src0[k] + src1[k]).
	const uint32_t n = arg->task_size;
	const uint32_t first = task_id * n;
	float* const lhs = (float*)arg->src0_addr;
	float* const rhs = (float*)arg->src1_addr;
	int32_t* const out = (int32_t*)arg->dst_addr;

	for (uint32_t k = first, last = first + n; k != last; ++k) {
		const float x = lhs[k];
		const float y = rhs[k];
		const float sum = x + y;
		out[k] = (int32_t)sum;
	}
}
void kernel_ftou(int task_id, kernel_arg_t* __UNIFORM__ arg) {
	// Float to unsigned int: dst[k] = (uint32_t)(src0[k] + src1[k]).
	const uint32_t n = arg->task_size;
	const uint32_t first = task_id * n;
	float* const lhs = (float*)arg->src0_addr;
	float* const rhs = (float*)arg->src1_addr;
	uint32_t* const out = (uint32_t*)arg->dst_addr;

	for (uint32_t k = first, last = first + n; k != last; ++k) {
		const float x = lhs[k];
		const float y = rhs[k];
		const float sum = x + y;
		out[k] = (uint32_t)sum;
	}
}
void kernel_itof(int task_id, kernel_arg_t* __UNIFORM__ arg) {
	// Signed int to float: dst[k] = (float)(src0[k] + src1[k]).
	const uint32_t n = arg->task_size;
	const uint32_t first = task_id * n;
	int32_t* const lhs = (int32_t*)arg->src0_addr;
	int32_t* const rhs = (int32_t*)arg->src1_addr;
	float* const out = (float*)arg->dst_addr;

	for (uint32_t k = first, last = first + n; k != last; ++k) {
		const int32_t x = lhs[k];
		const int32_t y = rhs[k];
		const int32_t sum = x + y;
		out[k] = (float)sum;
	}
}
void kernel_utof(int task_id, kernel_arg_t* __UNIFORM__ arg) {
	// Unsigned int to float: dst[k] = (float)(uint32_t)(src0[k] + src1[k])
	// for the task_size elements starting at task_id * task_size.
	const uint32_t n = arg->task_size;
	const uint32_t first = task_id * n;
	int32_t* const lhs = (int32_t*)arg->src0_addr;
	int32_t* const rhs = (int32_t*)arg->src1_addr;
	float* const out = (float*)arg->dst_addr;

	for (uint32_t k = first, last = first + n; k != last; ++k) {
		const int32_t x = lhs[k];
		const int32_t y = rhs[k];
		// The sum must be unsigned: with `auto` it was deduced as int32_t,
		// which made the cast below a signed conversion identical to
		// kernel_itof. Adding as uint32_t yields the same bits as the signed
		// add and is well defined on wrap-around.
		const uint32_t sum = (uint32_t)x + (uint32_t)y;
		out[k] = (float)sum;
	}
}
// Local barrier test. Every warp does a warp-id-dependent amount of work,
// issues a fence, waits on barrier 0 together with num_warps participants,
// and then writes dst[task_id] from the source element of the mirrored warp
// slot (num_warps - 1 - wid) of its own core, plus the accumulated value.
// Only src0_addr and dst_addr of the argument block are used.
void kernel_bar(int task_id, kernel_arg_t* __UNIFORM__ arg) {
auto num_warps = vx_num_warps();
auto num_threads = vx_num_threads();
auto cid = vx_core_id();
auto wid = vx_warp_id();
auto tid = vx_thread_id();
auto src0_ptr = (uint32_t*)arg->src0_addr;
auto dst_ptr = (uint32_t*)arg->dst_addr;
// per warp delay: warp `wid` runs wid + 1 iterations, presumably so that
// warps arrive at the barrier at different times
uint32_t barrier_stall = 0;
for (int i = 0; i <= wid; ++i) {
barrier_stall += src0_ptr[0] * src0_ptr[i];
}
// memory fence
vx_fence();
// local barrier (id 0, num_warps participants)
vx_barrier(0, num_warps);
// update destination: the source index reverses the warp order within this core
auto src_idx = (cid * num_warps + (num_warps - 1 - wid)) * num_threads + tid;
dst_ptr[task_id] = src0_ptr[src_idx] + barrier_stall;
}
// Global barrier test. Every warp does an amount of work that depends on its
// core id and warp id, issues a fence, waits on barrier 0x80000000 with
// num_cores participants, and then writes dst[task_id] from the source element
// of the mirrored core and warp slot, plus the accumulated value.
// Only src0_addr and dst_addr of the argument block are used.
void kernel_gbar(int task_id, kernel_arg_t* __UNIFORM__ arg) {
auto num_cores = vx_num_cores();
auto num_warps = vx_num_warps();
auto num_threads = vx_num_threads();
auto cid = vx_core_id();
auto wid = vx_warp_id();
auto tid = vx_thread_id();
auto src0_ptr = (uint32_t*)arg->src0_addr;
auto dst_ptr = (uint32_t*)arg->dst_addr;
// per core delay: (cid + 1) * (wid + 1) iterations, reading src0[i + j]
uint32_t barrier_stall = 0;
for (int i = 0; i <= cid; ++i) {
for (int j = 0; j <= wid; ++j) {
barrier_stall += src0_ptr[0] * src0_ptr[i + j];
}
}
// memory fence
vx_fence();
// global barrier
// NOTE(review): the high bit of the id presumably selects the cross-core
// barrier — confirm against the vx_barrier definition.
vx_barrier(0x80000000, num_cores);
// update destination: the source index reverses both the core and the warp order
auto src_idx = ((num_cores - 1 - cid) * num_warps + (num_warps - 1 - wid)) * num_threads + tid;
dst_ptr[task_id] = src0_ptr[src_idx] + barrier_stall;
}
// Kernel dispatch table. main() picks the entry at index arg->testid, so the
// position of each kernel in this list is its test id.
// NOTE(review): the order presumably has to match the host-side test list — confirm.
static const PFN_Kernel sc_tests[] = {
kernel_iadd,
kernel_imul,
kernel_idiv,
kernel_idiv_mul,
kernel_fadd,
kernel_fsub,
kernel_fmul,
kernel_fmadd,
kernel_fmsub,
kernel_fnmadd,
kernel_fnmsub,
kernel_fnmadd_madd,
kernel_fdiv,
kernel_fdiv2,
kernel_fsqrt,
kernel_ftoi,
kernel_ftou,
kernel_itof,
kernel_utof,
kernel_bar,
kernel_gbar
};
int main() {
auto arg = (kernel_arg_t*)KERNEL_ARG_DEV_MEM_ADDR;
vx_spawn_tasks(arg->num_tasks, (vx_spawn_tasks_cb)sc_tests[arg->testid], arg);
return 0;
}

View File

@@ -1,109 +1,49 @@
#include <iostream>
#include <vector>
#include <unordered_set>
#include <unistd.h>
#include <string.h>
#include <vector>
#include <vortex.h>
#include <VX_config.h>
#include "testcases.h"
#include "common.h"
// Evaluates _expr once and treats any non-zero result as a fatal error:
// prints the expression text and the returned code, calls cleanup(), and
// terminates the process with exit(-1). A zero result falls through.
#define RT_CHECK(_expr) \
do { \
int _ret = _expr; \
if (0 == _ret) \
break; \
printf("Error: '%s' returned %d!\n", #_expr, (int)_ret); \
cleanup(); \
exit(-1); \
} while (false)
///////////////////////////////////////////////////////////////////////////////
class TestMngr {
public:
TestMngr() {
this->add_test("iadd", new Test_IADD());
this->add_test("imul", new Test_IMUL());
this->add_test("idiv", new Test_IDIV());
this->add_test("idiv-mul", new Test_IDIV_MUL());
#ifdef EXT_F_ENABLE
this->add_test("fadd", new Test_FADD());
this->add_test("fsub", new Test_FSUB());
this->add_test("fmul", new Test_FMUL());
this->add_test("fmadd", new Test_FMADD());
this->add_test("fmsub", new Test_FMSUB());
this->add_test("fnmadd", new Test_FNMADD());
this->add_test("fnmsub", new Test_FNMSUB());
this->add_test("fnmadd-madd", new Test_FNMADD_MADD());
this->add_test("fdiv", new Test_FDIV());
this->add_test("fdiv2", new Test_FDIV2());
this->add_test("fsqrt", new Test_FSQRT());
this->add_test("ftoi", new Test_FTOI());
this->add_test("ftou", new Test_FTOU());
this->add_test("itof", new Test_ITOF());
this->add_test("utof", new Test_UTOF());
#endif
}
~TestMngr() {
for (size_t i = 0; i < _tests.size(); ++i) {
delete _tests[i];
}
}
const std::string& get_name(int testid) const {
return _names.at(testid);
}
ITestCase* get_test(int testid) const {
return _tests.at(testid);
}
void add_test(const char* name, ITestCase* test) {
_names.push_back(name);
_tests.push_back(test);
}
size_t size() const {
return _tests.size();
}
private:
std::vector<std::string> _names;
std::vector<ITestCase*> _tests;
};
///////////////////////////////////////////////////////////////////////////////
TestMngr testMngr;
TestSuite* testSuite = nullptr;
const char* kernel_file = "kernel.bin";
int count = 0;
int count = 0;
std::unordered_set<int> included;
std::unordered_set<int> excluded;
int testid_s = 0;
int testid_e = (testMngr.size() - 1);
int testid_e = 0;
bool stop_on_error = true;
vx_device_h device = nullptr;
vx_buffer_h arg_buf = nullptr;
vx_buffer_h src1_buf = nullptr;
vx_buffer_h src2_buf = nullptr;
vx_buffer_h dst_buf = nullptr;
kernel_arg_t kernel_arg;
vx_device_h device = nullptr;
std::vector<uint8_t> arg_buf;
std::vector<uint8_t> src1_buf;
std::vector<uint8_t> src2_buf;
std::vector<uint8_t> dst_buf;
kernel_arg_t kernel_arg = {};
static void show_usage() {
std::cout << "Vortex Test." << std::endl;
std::cout << "Usage: [-t:testid] [-s:testid] [-e:testid] [-k: kernel] [-n words] [-c] [-h: help]" << std::endl;
std::cout << "Usage: [-t<testid>: selected test] [-s<testid>: start test] [-e<testid>: end test] [-x<testid>: excluded tests]" << std::endl;
std::cout << " [-k<kernel>] [-n<words>] [-c] [-h: help]" << std::endl;
}
static void parse_args(int argc, char **argv) {
int c;
while ((c = getopt(argc, argv, "n:t:s:e:k:ch?")) != -1) {
while ((c = getopt(argc, argv, "n:t:x:s:e:k:ch?")) != -1) {
switch (c) {
case 'n':
count = atoi(optarg);
break;
case 't':
testid_s = atoi(optarg);
testid_e = atoi(optarg);
included.insert(atoi(optarg));
break;
case 'x':
excluded.insert(atoi(optarg));
break;
case 's':
testid_s = atoi(optarg);
@@ -130,17 +70,8 @@ static void parse_args(int argc, char **argv) {
}
void cleanup() {
if (arg_buf) {
vx_buf_free(arg_buf);
}
if (src1_buf) {
vx_buf_free(src1_buf);
}
if (src2_buf) {
vx_buf_free(src2_buf);
}
if (dst_buf) {
vx_buf_free(dst_buf);
if (testSuite) {
delete testSuite;
}
if (device) {
vx_mem_free(device, kernel_arg.src0_addr);
@@ -152,7 +83,6 @@ void cleanup() {
int main(int argc, char *argv[]) {
int exitcode = 0;
size_t value;
// parse command arguments
parse_args(argc, argv);
@@ -171,12 +101,12 @@ int main(int argc, char *argv[]) {
std::cout << "open device connection" << std::endl;
RT_CHECK(vx_dev_open(&device));
uint64_t max_cores, max_warps, max_threads;
RT_CHECK(vx_dev_caps(device, VX_CAPS_MAX_CORES, &max_cores));
RT_CHECK(vx_dev_caps(device, VX_CAPS_MAX_WARPS, &max_warps));
RT_CHECK(vx_dev_caps(device, VX_CAPS_MAX_THREADS, &max_threads));
uint64_t num_cores, num_warps, num_threads;
RT_CHECK(vx_dev_caps(device, VX_CAPS_NUM_CORES, &num_cores));
RT_CHECK(vx_dev_caps(device, VX_CAPS_NUM_WARPS, &num_warps));
RT_CHECK(vx_dev_caps(device, VX_CAPS_NUM_THREADS, &num_threads));
int num_tasks = max_cores * max_warps * max_threads;
int num_tasks = num_cores * num_warps * num_threads;
int num_points = count * num_tasks;
size_t buf_size = num_points * sizeof(uint32_t);
@@ -188,59 +118,69 @@ int main(int argc, char *argv[]) {
RT_CHECK(vx_upload_kernel_file(device, kernel_file));
// allocate device memory
std::cout << "allocate device memory" << std::endl;
RT_CHECK(vx_mem_alloc(device, buf_size, &value));
kernel_arg.src0_addr = value;
RT_CHECK(vx_mem_alloc(device, buf_size, &value));
kernel_arg.src1_addr = value;
RT_CHECK(vx_mem_alloc(device, buf_size, &value));
kernel_arg.dst_addr = value;
std::cout << "allocate device memory" << std::endl;
RT_CHECK(vx_mem_alloc(device, buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.src0_addr));
RT_CHECK(vx_mem_alloc(device, buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.src1_addr));
RT_CHECK(vx_mem_alloc(device, buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.dst_addr));
kernel_arg.num_tasks = num_tasks;
kernel_arg.task_size = count;
std::cout << "dev_src0=" << std::hex << kernel_arg.src0_addr << std::dec << std::endl;
std::cout << "dev_src1=" << std::hex << kernel_arg.src1_addr << std::dec << std::endl;
std::cout << "dev_dst=" << std::hex << kernel_arg.dst_addr << std::dec << std::endl;
std::cout << "dev_src0=0x" << std::hex << kernel_arg.src0_addr << std::dec << std::endl;
std::cout << "dev_src1=0x" << std::hex << kernel_arg.src1_addr << std::dec << std::endl;
std::cout << "dev_dst=0x" << std::hex << kernel_arg.dst_addr << std::dec << std::endl;
// allocate shared memory
std::cout << "allocate shared memory" << std::endl;
RT_CHECK(vx_buf_alloc(device, sizeof(kernel_arg_t), &arg_buf));
RT_CHECK(vx_buf_alloc(device, buf_size, &src1_buf));
RT_CHECK(vx_buf_alloc(device, buf_size, &src2_buf));
RT_CHECK(vx_buf_alloc(device, buf_size, &dst_buf));
// allocate staging buffer
std::cout << "allocate staging buffer" << std::endl;
arg_buf.resize(sizeof(kernel_arg_t));
src1_buf.resize(buf_size);
src2_buf.resize(buf_size);
dst_buf.resize(buf_size);
// allocate test suite
testSuite = new TestSuite(device);
if (testid_e == 0) {
testid_e = (testSuite->size() - 1);
}
// execute tests
for (int t = testid_s; t <= testid_e; ++t) {
auto name = testMngr.get_name(t);
auto test = testMngr.get_test(t);
if (!included.empty()) {
if (included.count(t) == 0)
continue;
}
if (!excluded.empty()) {
if (excluded.count(t) != 0)
continue;
}
auto test = testSuite->get_test(t);
auto name = test->name();
std::cout << "Test" << t << ": " << name << std::endl;
// upload kernel argument
std::cout << "upload kernel argument" << std::endl;
kernel_arg.testid = t;
memcpy((void*)vx_host_ptr(arg_buf), &kernel_arg, sizeof(kernel_arg_t));
RT_CHECK(vx_copy_to_dev(arg_buf, KERNEL_ARG_DEV_MEM_ADDR, sizeof(kernel_arg_t), 0));
memcpy(arg_buf.data(), &kernel_arg, sizeof(kernel_arg_t));
RT_CHECK(vx_copy_to_dev(device, KERNEL_ARG_DEV_MEM_ADDR, arg_buf.data(), sizeof(kernel_arg_t)));
// get test arguments
std::cout << "get test arguments" << std::endl;
test->setup(num_points, (void*)vx_host_ptr(src1_buf), (void*)vx_host_ptr(src2_buf));
RT_CHECK(test->setup(num_points, (void*)src1_buf.data(), (void*)src2_buf.data()));
// upload source buffer0
std::cout << "upload source buffer0" << std::endl;
RT_CHECK(vx_copy_to_dev(src1_buf, kernel_arg.src0_addr, buf_size, 0));
RT_CHECK(vx_copy_to_dev(device, kernel_arg.src0_addr, src1_buf.data(), buf_size));
// upload source buffer1
std::cout << "upload source buffer1" << std::endl;
RT_CHECK(vx_copy_to_dev(src2_buf, kernel_arg.src1_addr, buf_size, 0));
std::cout << "upload source buffer1" << std::endl;
RT_CHECK(vx_copy_to_dev(device, kernel_arg.src1_addr, src2_buf.data(), buf_size));
// clear destination buffer
std::cout << "clear destination buffer" << std::endl;
for (int i = 0; i < num_points; ++i) {
((uint32_t*)vx_host_ptr(dst_buf))[i] = 0xdeadbeef;
((uint32_t*)dst_buf.data())[i] = 0xdeadbeef;
}
RT_CHECK(vx_copy_to_dev(dst_buf, kernel_arg.dst_addr, buf_size, 0));
RT_CHECK(vx_copy_to_dev(device, kernel_arg.dst_addr, dst_buf.data(), buf_size));
// start device
std::cout << "start device" << std::endl;
@@ -248,18 +188,15 @@ int main(int argc, char *argv[]) {
// wait for completion
std::cout << "wait for completion" << std::endl;
RT_CHECK(vx_ready_wait(device, MAX_TIMEOUT));
RT_CHECK(vx_ready_wait(device, VX_MAX_TIMEOUT));
// download destination buffer
std::cout << "download destination buffer" << std::endl;
RT_CHECK(vx_copy_from_dev(dst_buf, kernel_arg.dst_addr, buf_size, 0));
RT_CHECK(vx_copy_from_dev(device, dst_buf.data(), kernel_arg.dst_addr, buf_size));
// verify destination
std::cout << "verify test result" << std::endl;
int errors = test->verify(num_points,
(void*)vx_host_ptr(dst_buf),
(void*)vx_host_ptr(src1_buf),
(void*)vx_host_ptr(src2_buf));
int errors = test->verify(num_points, dst_buf.data(), src1_buf.data(), src2_buf.data());
if (errors != 0) {
std::cout << "found " << std::dec << errors << " errors!" << std::endl;
std::cout << "Test" << t << "-" << name << " FAILED!" << std::endl << std::flush;

View File

@@ -3,6 +3,19 @@
#include <iostream>
#include <math.h>
#include <limits>
#include <assert.h>
// Program-supplied cleanup hook. It is only declared here; the definition is
// expected to come from the program that uses RT_CHECK.
void cleanup();

// Evaluates `_expr` exactly once and stores the result as an int status code.
// A zero status means success and execution simply continues. Any other
// status is printed together with the expression text, cleanup() is invoked
// and the process terminates with exit(-1).
#define RT_CHECK(_expr)                                          \
  do {                                                           \
    int _ret = _expr;                                            \
    if (_ret != 0) {                                             \
      printf("Error: '%s' returned %d!\n", #_expr, (int)_ret);   \
      cleanup();                                                 \
      exit(-1);                                                  \
    }                                                            \
  } while (false)
union Float_t {
float f;
@@ -47,33 +60,72 @@ inline bool almost_equal(float a, float b) {
return almost_equal_ulp(a, b);
}
class ITestCase;
// Collection of test cases that run against one device.
//
// Test cases handed to add_test() are stored as raw pointers and deleted by
// the destructor, so the suite is their sole owner. Copying is therefore
// disabled: a copied suite would hold the same pointers and both destructors
// would delete them.
class TestSuite {
public:
  // Builds the suite for `device`; the constructor is defined out of line.
  TestSuite(vx_device_h device);
  ~TestSuite();

  // Non-copyable because of the owning raw pointers in _tests.
  TestSuite(const TestSuite&) = delete;
  TestSuite& operator=(const TestSuite&) = delete;

  // Returns the test case registered under index `testid`.
  ITestCase* get_test(int testid) const;

  // Appends `test` to the suite, which takes ownership of it.
  void add_test(ITestCase* test);

  // Number of registered test cases.
  size_t size() const;

  // Device handle this suite was constructed with.
  vx_device_h device() const;

private:
  std::vector<ITestCase*> _tests;  // owned; deleted in ~TestSuite()
  vx_device_h device_;
};
// Abstract base class of a single test case that belongs to a TestSuite.
// Concrete tests implement setup() to fill the source buffers and verify()
// to check the destination buffer.
//
// NOTE(review): this text appears to interleave the pre-change and the
// post-change lines of a diff (two setup()/verify() declarations, a default
// constructor next to a const member) — confirm which lines belong to the
// current version.
class ITestCase {
public:
// NOTE(review): does not initialize suite_ or name_; presumably the
// pre-change constructor — confirm.
ITestCase() {}
// Remembers the owning suite and the test name. Only the pointer to the name
// is stored, the characters are not copied.
ITestCase(TestSuite* suite, const char* name)
: suite_(suite)
, name_(name)
{}
// Virtual so that derived test cases can be deleted through an ITestCase*.
virtual ~ITestCase() {}
// NOTE(review): int-sized variants, presumably the pre-change signatures — confirm.
virtual void setup(int n, void* src1, void* src2) = 0;
virtual int verify(int n, void* dst, const void* src1, const void* src2) = 0;
// Suite this test case was constructed with.
TestSuite* suite() const {
return suite_;
}
// Name passed to the constructor.
const char* name() const {
return name_;
}
// Fills the source buffers for `n` elements. The implementations in this
// file return 0 on success and a non-zero value on failure.
virtual int setup(uint32_t n, void* src1, void* src2) = 0;
// Compares `n` destination elements against a reference computed from the
// source buffers; the implementations in this file return the number of
// mismatches found.
virtual int verify(uint32_t n, void* dst, const void* src1, const void* src2) = 0;
protected:
TestSuite* suite_;
const char* const name_;
};
class Test_IADD : public ITestCase {
public:
Test_IADD(TestSuite* suite) : ITestCase(suite, "iadd") {}
void setup(int n, void* src1, void* src2) override {
int setup(uint32_t n, void* src1, void* src2) override {
auto a = (int32_t*)src1;
auto b = (int32_t*)src2;
for (int i = 0; i < n; ++i) {
for (uint32_t i = 0; i < n; ++i) {
a[i] = n/2 - i;
b[i] = n/2 + i;
}
return 0;
}
int verify(int n, void* dst, const void* src1, const void* src2) override {
int verify(uint32_t n, void* dst, const void* src1, const void* src2) override {
int errors = 0;
auto a = (int32_t*)src1;
auto b = (int32_t*)src2;
auto c = (int32_t*)dst;
for (int i = 0; i < n; ++i) {
for (uint32_t i = 0; i < n; ++i) {
auto ref = a[i] + b[i];
if (c[i] != ref) {
std::cout << "error at result #" << i << ": expected=" << ref << ", actual=" << c[i] << ", a=" << a[i] << ", b=" << b[i] << std::endl;
@@ -86,22 +138,24 @@ public:
class Test_IMUL : public ITestCase {
public:
Test_IMUL(TestSuite* suite) : ITestCase(suite, "imul") {}
void setup(int n, void* src1, void* src2) override {
int setup(uint32_t n, void* src1, void* src2) override {
auto a = (int32_t*)src1;
auto b = (int32_t*)src2;
for (int i = 0; i < n; ++i) {
for (uint32_t i = 0; i < n; ++i) {
a[i] = n/2 - i;
b[i] = n/2 + i;
}
return 0;
}
int verify(int n, void* dst, const void* src1, const void* src2) override {
int verify(uint32_t n, void* dst, const void* src1, const void* src2) override {
int errors = 0;
auto a = (int32_t*)src1;
auto b = (int32_t*)src2;
auto c = (int32_t*)dst;
for (int i = 0; i < n; ++i) {
for (uint32_t i = 0; i < n; ++i) {
auto ref = a[i] * b[i];
if (c[i] != ref) {
std::cout << "error at result #" << i << ": expected=" << ref << ", actual=" << c[i] << ", a=" << a[i] << ", b=" << b[i] << std::endl;
@@ -114,22 +168,24 @@ public:
class Test_IDIV : public ITestCase {
public:
Test_IDIV(TestSuite* suite) : ITestCase(suite, "idiv") {}
void setup(int n, void* src1, void* src2) override {
int setup(uint32_t n, void* src1, void* src2) override {
auto a = (int32_t*)src1;
auto b = (int32_t*)src2;
for (int i = 0; i < n; ++i) {
for (uint32_t i = 0; i < n; ++i) {
a[i] = n/2 - i;
b[i] = n/2 + i;
}
return 0;
}
int verify(int n, void* dst, const void* src1, const void* src2) override {
int verify(uint32_t n, void* dst, const void* src1, const void* src2) override {
int errors = 0;
auto a = (int32_t*)src1;
auto b = (int32_t*)src2;
auto c = (int32_t*)dst;
for (int i = 0; i < n; ++i) {
for (uint32_t i = 0; i < n; ++i) {
auto ref = a[i] / b[i];
if (c[i] != ref) {
std::cout << "error at result #" << i << ": expected=" << ref << ", actual=" << c[i] << ", a=" << a[i] << ", b=" << b[i] << std::endl;
@@ -142,22 +198,24 @@ public:
class Test_IDIV_MUL : public ITestCase {
public:
Test_IDIV_MUL(TestSuite* suite) : ITestCase(suite, "idiv-mul") {}
void setup(int n, void* src1, void* src2) override {
int setup(uint32_t n, void* src1, void* src2) override {
auto a = (int32_t*)src1;
auto b = (int32_t*)src2;
for (int i = 0; i < n; ++i) {
for (uint32_t i = 0; i < n; ++i) {
a[i] = n/2 - i;
b[i] = n/2 + i;
}
return 0;
}
int verify(int n, void* dst, const void* src1, const void* src2) override {
int verify(uint32_t n, void* dst, const void* src1, const void* src2) override {
int errors = 0;
auto a = (int32_t*)src1;
auto b = (int32_t*)src2;
auto c = (int32_t*)dst;
for (int i = 0; i < n; ++i) {
for (uint32_t i = 0; i < n; ++i) {
auto x = a[i] / b[i];
auto y = a[i] * b[i];
auto ref = x + y;
@@ -172,22 +230,24 @@ public:
class Test_FADD : public ITestCase {
public:
Test_FADD(TestSuite* suite) : ITestCase(suite, "fadd") {}
void setup(int n, void* src1, void* src2) override {
int setup(uint32_t n, void* src1, void* src2) override {
auto a = (float*)src1;
auto b = (float*)src2;
for (int i = 0; i < n; ++i) {
for (uint32_t i = 0; i < n; ++i) {
a[i] = fround((n - i) * (1.0f/n));
b[i] = fround((n + i) * (1.0f/n));
}
return 0;
}
int verify(int n, void* dst, const void* src1, const void* src2) override {
int verify(uint32_t n, void* dst, const void* src1, const void* src2) override {
int errors = 0;
auto a = (float*)src1;
auto b = (float*)src2;
auto c = (float*)dst;
for (int i = 0; i < n; ++i) {
for (uint32_t i = 0; i < n; ++i) {
auto ref = a[i] + b[i];
if (!almost_equal(c[i], ref)) {
std::cout << "error at result #" << i << ": expected=" << ref << ", actual=" << c[i] << ", a=" << a[i] << ", b=" << b[i] << std::endl;
@@ -200,22 +260,24 @@ public:
class Test_FSUB : public ITestCase {
public:
Test_FSUB(TestSuite* suite) : ITestCase(suite, "fsub") {}
void setup(int n, void* src1, void* src2) override {
int setup(uint32_t n, void* src1, void* src2) override {
auto a = (float*)src1;
auto b = (float*)src2;
for (int i = 0; i < n; ++i) {
for (uint32_t i = 0; i < n; ++i) {
a[i] = fround((n - i) * (1.0f/n));
b[i] = fround((n + i) * (1.0f/n));
}
return 0;
}
int verify(int n, void* dst, const void* src1, const void* src2) override {
int verify(uint32_t n, void* dst, const void* src1, const void* src2) override {
int errors = 0;
auto a = (float*)src1;
auto b = (float*)src2;
auto c = (float*)dst;
for (int i = 0; i < n; ++i) {
for (uint32_t i = 0; i < n; ++i) {
auto ref = a[i] - b[i];
if (!almost_equal(c[i], ref)) {
std::cout << "error at result #" << i << ": expected=" << ref << ", actual=" << c[i] << ", a=" << a[i] << ", b=" << b[i] << std::endl;
@@ -228,22 +290,24 @@ public:
class Test_FMUL : public ITestCase {
public:
Test_FMUL(TestSuite* suite) : ITestCase(suite, "fmul") {}
void setup(int n, void* src1, void* src2) override {
int setup(uint32_t n, void* src1, void* src2) override {
auto a = (float*)src1;
auto b = (float*)src2;
for (int i = 0; i < n; ++i) {
for (uint32_t i = 0; i < n; ++i) {
a[i] = fround((n - i) * (1.0f/n));
b[i] = fround((n + i) * (1.0f/n));
}
return 0;
}
int verify(int n, void* dst, const void* src1, const void* src2) override {
int verify(uint32_t n, void* dst, const void* src1, const void* src2) override {
int errors = 0;
auto a = (float*)src1;
auto b = (float*)src2;
auto c = (float*)dst;
for (int i = 0; i < n; ++i) {
for (uint32_t i = 0; i < n; ++i) {
auto ref = a[i] * b[i];
if (!almost_equal(c[i], ref)) {
std::cout << "error at result #" << i << ": expected=" << ref << ", actual=" << c[i] << ", a=" << a[i] << ", b=" << b[i] << std::endl;
@@ -256,22 +320,24 @@ public:
class Test_FMADD : public ITestCase {
public:
Test_FMADD(TestSuite* suite) : ITestCase(suite, "fmadd") {}
void setup(int n, void* src1, void* src2) override {
int setup(uint32_t n, void* src1, void* src2) override {
auto a = (float*)src1;
auto b = (float*)src2;
for (int i = 0; i < n; ++i) {
for (uint32_t i = 0; i < n; ++i) {
a[i] = fround((n - i) * (1.0f/n));
b[i] = fround((n + i) * (1.0f/n));
}
return 0;
}
int verify(int n, void* dst, const void* src1, const void* src2) override {
int verify(uint32_t n, void* dst, const void* src1, const void* src2) override {
int errors = 0;
auto a = (float*)src1;
auto b = (float*)src2;
auto c = (float*)dst;
for (int i = 0; i < n; ++i) {
for (uint32_t i = 0; i < n; ++i) {
auto ref = a[i] * b[i] + b[i];
if (!almost_equal(c[i], ref)) {
std::cout << "error at result #" << i << ": expected=" << ref << ", actual=" << c[i] << ", a=" << a[i] << ", b=" << b[i] << std::endl;
@@ -284,22 +350,24 @@ public:
class Test_FMSUB : public ITestCase {
public:
Test_FMSUB(TestSuite* suite) : ITestCase(suite, "fmsub") {}
void setup(int n, void* src1, void* src2) override {
int setup(uint32_t n, void* src1, void* src2) override {
auto a = (float*)src1;
auto b = (float*)src2;
for (int i = 0; i < n; ++i) {
for (uint32_t i = 0; i < n; ++i) {
a[i] = fround((n - i) * (1.0f/n));
b[i] = fround((n + i) * (1.0f/n));
}
return 0;
}
int verify(int n, void* dst, const void* src1, const void* src2) override {
int verify(uint32_t n, void* dst, const void* src1, const void* src2) override {
int errors = 0;
auto a = (float*)src1;
auto b = (float*)src2;
auto c = (float*)dst;
for (int i = 0; i < n; ++i) {
for (uint32_t i = 0; i < n; ++i) {
auto ref = a[i] * b[i] - b[i];
if (!almost_equal(c[i], ref)) {
std::cout << "error at result #" << i << ": expected=" << ref << ", actual=" << c[i] << ", a=" << a[i] << ", b=" << b[i] << std::endl;
@@ -312,22 +380,24 @@ public:
class Test_FNMADD : public ITestCase {
public:
Test_FNMADD(TestSuite* suite) : ITestCase(suite, "fnmadd") {}
void setup(int n, void* src1, void* src2) override {
int setup(uint32_t n, void* src1, void* src2) override {
auto a = (float*)src1;
auto b = (float*)src2;
for (int i = 0; i < n; ++i) {
for (uint32_t i = 0; i < n; ++i) {
a[i] = fround((n - i) * (1.0f/n));
b[i] = fround((n + i) * (1.0f/n));
}
return 0;
}
int verify(int n, void* dst, const void* src1, const void* src2) override {
int verify(uint32_t n, void* dst, const void* src1, const void* src2) override {
int errors = 0;
auto a = (float*)src1;
auto b = (float*)src2;
auto c = (float*)dst;
for (int i = 0; i < n; ++i) {
for (uint32_t i = 0; i < n; ++i) {
auto ref = -a[i] * b[i] - b[i];
if (!almost_equal(c[i], ref)) {
std::cout << "error at result #" << i << ": expected=" << ref << ", actual=" << c[i] << ", a=" << a[i] << ", b=" << b[i] << std::endl;
@@ -340,22 +410,24 @@ public:
class Test_FNMSUB : public ITestCase {
public:
Test_FNMSUB(TestSuite* suite) : ITestCase(suite, "fnmsub") {}
void setup(int n, void* src1, void* src2) override {
int setup(uint32_t n, void* src1, void* src2) override {
auto a = (float*)src1;
auto b = (float*)src2;
for (int i = 0; i < n; ++i) {
for (uint32_t i = 0; i < n; ++i) {
a[i] = fround((n - i) * (1.0f/n));
b[i] = fround((n + i) * (1.0f/n));
}
return 0;
}
int verify(int n, void* dst, const void* src1, const void* src2) override {
int verify(uint32_t n, void* dst, const void* src1, const void* src2) override {
int errors = 0;
auto a = (float*)src1;
auto b = (float*)src2;
auto c = (float*)dst;
for (int i = 0; i < n; ++i) {
for (uint32_t i = 0; i < n; ++i) {
auto ref = -a[i] * b[i] + b[i];
if (!almost_equal(c[i], ref)) {
std::cout << "error at result #" << i << ": expected=" << ref << ", actual=" << c[i] << ", a=" << a[i] << ", b=" << b[i] << std::endl;
@@ -368,22 +440,24 @@ public:
class Test_FNMADD_MADD : public ITestCase {
public:
Test_FNMADD_MADD(TestSuite* suite) : ITestCase(suite, "fnmadd-madd") {}
void setup(int n, void* src1, void* src2) override {
int setup(uint32_t n, void* src1, void* src2) override {
auto a = (float*)src1;
auto b = (float*)src2;
for (int i = 0; i < n; ++i) {
for (uint32_t i = 0; i < n; ++i) {
a[i] = fround((n - i) * (1.0f/n));
b[i] = fround((n + i) * (1.0f/n));
}
return 0;
}
int verify(int n, void* dst, const void* src1, const void* src2) override {
int verify(uint32_t n, void* dst, const void* src1, const void* src2) override {
int errors = 0;
auto a = (float*)src1;
auto b = (float*)src2;
auto c = (float*)dst;
for (int i = 0; i < n; ++i) {
for (uint32_t i = 0; i < n; ++i) {
auto x = -a[i] * b[i] - b[i];
auto y = a[i] * b[i] + b[i];
auto ref = x + y;
@@ -398,22 +472,24 @@ public:
class Test_FDIV : public ITestCase {
public:
Test_FDIV(TestSuite* suite) : ITestCase(suite, "fdiv") {}
void setup(int n, void* src1, void* src2) override {
int setup(uint32_t n, void* src1, void* src2) override {
auto a = (float*)src1;
auto b = (float*)src2;
for (int i = 0; i < n; ++i) {
for (uint32_t i = 0; i < n; ++i) {
a[i] = fround((n - i) * (1.0f/n));
b[i] = fround((n + i) * (1.0f/n));
}
return 0;
}
int verify(int n, void* dst, const void* src1, const void* src2) override {
int verify(uint32_t n, void* dst, const void* src1, const void* src2) override {
int errors = 0;
auto a = (float*)src1;
auto b = (float*)src2;
auto c = (float*)dst;
for (int i = 0; i < n; ++i) {
for (uint32_t i = 0; i < n; ++i) {
auto ref = a[i] / b[i];
if (!almost_equal(c[i], ref)) {
std::cout << "error at result #" << i << ": expected=" << ref << ", actual=" << c[i] << ", a=" << a[i] << ", b=" << b[i] << std::endl;
@@ -426,22 +502,24 @@ public:
class Test_FDIV2 : public ITestCase {
public:
Test_FDIV2(TestSuite* suite) : ITestCase(suite, "fdiv2") {}
void setup(int n, void* src1, void* src2) override {
int setup(uint32_t n, void* src1, void* src2) override {
auto a = (float*)src1;
auto b = (float*)src2;
for (int i = 0; i < n; ++i) {
for (uint32_t i = 0; i < n; ++i) {
a[i] = fround((n - i) * (1.0f/n));
b[i] = fround((n + i) * (1.0f/n));
}
return 0;
}
int verify(int n, void* dst, const void* src1, const void* src2) override {
int verify(uint32_t n, void* dst, const void* src1, const void* src2) override {
int errors = 0;
auto a = (float*)src1;
auto b = (float*)src2;
auto c = (float*)dst;
for (int i = 0; i < n; ++i) {
for (uint32_t i = 0; i < n; ++i) {
auto x = a[i] / b[i];
auto y = b[i] / a[i];
auto ref = x + y;
@@ -456,23 +534,25 @@ public:
class Test_FSQRT : public ITestCase {
public:
Test_FSQRT(TestSuite* suite) : ITestCase(suite, "fsqrt") {}
void setup(int n, void* src1, void* src2) override {
int setup(uint32_t n, void* src1, void* src2) override {
auto a = (float*)src1;
auto b = (float*)src2;
for (int i = 0; i < n; ++i) {
for (uint32_t i = 0; i < n; ++i) {
float q = 1.0f + (i % 64);
a[i] = q;
b[i] = q;
}
return 0;
}
int verify(int n, void* dst, const void* src1, const void* src2) override {
int verify(uint32_t n, void* dst, const void* src1, const void* src2) override {
int errors = 0;
auto a = (float*)src1;
auto b = (float*)src2;
auto c = (float*)dst;
for (int i = 0; i < n; ++i) {
for (uint32_t i = 0; i < n; ++i) {
auto ref = sqrt(a[i] * b[i]);
if (!almost_equal(c[i], ref)) {
std::cout << "error at result #" << i << ": expected=" << ref << ", actual=" << c[i] << ", a=" << a[i] << ", b=" << b[i] << std::endl;
@@ -485,22 +565,25 @@ public:
class Test_FTOI : public ITestCase {
public:
Test_FTOI(TestSuite* suite) : ITestCase(suite, "ftoi") {}
void setup(int n, void* src1, void* src2) override {
int setup(uint32_t n, void* src1, void* src2) override {
auto a = (float*)src1;
auto b = (float*)src2;
for (int i = 0; i < n; ++i) {
a[i] = fround((n/2 - i) + (float(i)/n));
b[i] = fround((n/2 - i) + (float(i)/n));
for (uint32_t i = 0; i < n; ++i) {
float q = fround(float(n/2) - i + (float(i) / n));
a[i] = q;
b[i] = q;
}
return 0;
}
int verify(int n, void* dst, const void* src1, const void* src2) override {
int verify(uint32_t n, void* dst, const void* src1, const void* src2) override {
int errors = 0;
auto a = (float*)src1;
auto b = (float*)src2;
auto c = (int32_t*)dst;
for (int i = 0; i < n; ++i) {
for (uint32_t i = 0; i < n; ++i) {
auto x = a[i] + b[i];
auto ref = (int32_t)x;
if (c[i] != ref) {
@@ -514,22 +597,25 @@ public:
class Test_FTOU : public ITestCase {
public:
Test_FTOU(TestSuite* suite) : ITestCase(suite, "ftou") {}
void setup(int n, void* src1, void* src2) override {
int setup(uint32_t n, void* src1, void* src2) override {
auto a = (float*)src1;
auto b = (float*)src2;
for (int i = 0; i < n; ++i) {
a[i] = fround(i + (float(i)/n));
b[i] = fround(i + (float(i)/n));
for (uint32_t i = 0; i < n; ++i) {
float q = fround(i + (float(i) / n));
a[i] = q;
b[i] = q;
}
return 0;
}
int verify(int n, void* dst, const void* src1, const void* src2) override {
int verify(uint32_t n, void* dst, const void* src1, const void* src2) override {
int errors = 0;
auto a = (float*)src1;
auto b = (float*)src2;
auto c = (uint32_t*)dst;
for (int i = 0; i < n; ++i) {
for (uint32_t i = 0; i < n; ++i) {
auto x = a[i] + b[i];
auto ref = (uint32_t)x;
if (c[i] != ref) {
@@ -543,22 +629,24 @@ public:
class Test_ITOF : public ITestCase {
public:
Test_ITOF(TestSuite* suite) : ITestCase(suite, "itof") {}
void setup(int n, void* src1, void* src2) override {
int setup(uint32_t n, void* src1, void* src2) override {
auto a = (int32_t*)src1;
auto b = (int32_t*)src2;
for (int i = 0; i < n; ++i) {
for (uint32_t i = 0; i < n; ++i) {
a[i] = n/2 - i;
b[i] = n/2 - i;
}
return 0;
}
int verify(int n, void* dst, const void* src1, const void* src2) override {
int verify(uint32_t n, void* dst, const void* src1, const void* src2) override {
int errors = 0;
auto a = (int32_t*)src1;
auto b = (int32_t*)src2;
auto c = (float*)dst;
for (int i = 0; i < n; ++i) {
for (uint32_t i = 0; i < n; ++i) {
auto x = a[i] + b[i];
auto ref = (float)x;
if (!almost_equal(c[i], ref)) {
@@ -572,22 +660,24 @@ public:
class Test_UTOF : public ITestCase {
public:
Test_UTOF(TestSuite* suite) : ITestCase(suite, "utof") {}
void setup(int n, void* src1, void* src2) override {
int setup(uint32_t n, void* src1, void* src2) override {
auto a = (uint32_t*)src1;
auto b = (uint32_t*)src2;
for (int i = 0; i < n; ++i) {
for (uint32_t i = 0; i < n; ++i) {
a[i] = i;
b[i] = i;
}
return 0;
}
int verify(int n, void* dst, const void* src1, const void* src2) override {
int verify(uint32_t n, void* dst, const void* src1, const void* src2) override {
int errors = 0;
auto a = (uint32_t*)src1;
auto b = (uint32_t*)src2;
auto c = (float*)dst;
for (int i = 0; i < n; ++i) {
for (uint32_t i = 0; i < n; ++i) {
auto x = a[i] + b[i];
auto ref = (float)x;
if (!almost_equal(c[i], ref)) {
@@ -597,4 +687,135 @@ public:
}
return errors;
}
};
};
// Host-side half of the "bar" test case.
//
// setup() reads the device's warp and thread counts and fills the first
// source buffer with 0..n-1. verify() expects each destination element to
// equal the source element that has the same thread and core position but the
// mirrored warp index (wid -> num_warps_ - 1 - wid).
class Test_BAR : public ITestCase {
public:
  Test_BAR(TestSuite* suite) : ITestCase(suite, "bar") {}

  // Returns 0 on success and -1 when the device reports a single warp.
  // A failing vx_dev_caps() call is handled by RT_CHECK.
  int setup(uint32_t n, void* src1, void* /*src2*/) override {
    RT_CHECK(vx_dev_caps(suite_->device(), VX_CAPS_NUM_WARPS, &num_warps_));
    if (num_warps_ == 1) {
      std::cout << "Error: multiple warps configuration required!" << std::endl;
      return -1;
    }
    RT_CHECK(vx_dev_caps(suite_->device(), VX_CAPS_NUM_THREADS, &num_threads_));
    auto a = static_cast<uint32_t*>(src1);
    for (uint32_t i = 0; i < n; ++i) {
      a[i] = i;
    }
    return 0;
  }

  // Returns the number of mismatching elements among the first `n` results.
  // Relies on num_warps_ and num_threads_ having been set by setup().
  int verify(uint32_t n, void* dst, const void* src1, const void* /*src2*/) override {
    int errors = 0;
    auto a = static_cast<const uint32_t*>(src1);
    auto c = static_cast<const uint32_t*>(dst);
    for (uint32_t i = 0; i < n; ++i) {
      // Split the flat index into thread, warp and core coordinates.
      auto tid = i % num_threads_;
      auto wid = (i / num_threads_) % num_warps_;
      auto cid = i / (num_warps_ * num_threads_);
      // NOTE(review): src_idx stays below n only when n is a multiple of
      // num_warps_ * num_threads_ — confirm against the caller.
      auto src_idx = (cid * num_warps_ + (num_warps_ - 1 - wid)) * num_threads_ + tid;
      uint32_t ref = a[src_idx];
      if (c[i] != ref) {
        // std::hex is sticky: force decimal for the index and restore decimal
        // afterwards so later output is not printed in hexadecimal.
        std::cout << "error at result #" << std::dec << i
                  << ": expected=" << std::hex << ref
                  << ", actual=" << c[i] << std::dec << std::endl;
        ++errors;
      }
    }
    return errors;
  }

  // Device capabilities read in setup(); zero until then.
  uint64_t num_warps_ = 0;
  uint64_t num_threads_ = 0;
};
// Host-side half of the "gbar" test case.
//
// setup() reads the device's core, warp and thread counts and fills the first
// source buffer with 0..n-1. verify() expects each destination element to
// equal the source element that has the same thread position but mirrored
// warp and core indices.
class Test_GBAR : public ITestCase {
public:
  Test_GBAR(TestSuite* suite) : ITestCase(suite, "gbar") {}

  // Returns 0 on success and -1 when the device reports a single core.
  // A failing vx_dev_caps() call is handled by RT_CHECK.
  int setup(uint32_t n, void* src1, void* /*src2*/) override {
    RT_CHECK(vx_dev_caps(suite_->device(), VX_CAPS_NUM_CORES, &num_cores_));
    if (num_cores_ == 1) {
      std::cout << "Error: multiple cores configuration required!" << std::endl;
      return -1;
    }
    RT_CHECK(vx_dev_caps(suite_->device(), VX_CAPS_NUM_WARPS, &num_warps_));
    RT_CHECK(vx_dev_caps(suite_->device(), VX_CAPS_NUM_THREADS, &num_threads_));
    auto a = static_cast<uint32_t*>(src1);
    for (uint32_t i = 0; i < n; ++i) {
      a[i] = i;
    }
    return 0;
  }

  // Returns the number of mismatching elements among the first `n` results.
  // Relies on the capability members having been set by setup().
  int verify(uint32_t n, void* dst, const void* src1, const void* /*src2*/) override {
    int errors = 0;
    auto a = static_cast<const uint32_t*>(src1);
    auto c = static_cast<const uint32_t*>(dst);
    for (uint32_t i = 0; i < n; ++i) {
      // Split the flat index into thread, warp and core coordinates.
      auto tid = i % num_threads_;
      auto wid = (i / num_threads_) % num_warps_;
      auto cid = i / (num_warps_ * num_threads_);
      // NOTE(review): presumably n == num_cores_ * num_warps_ * num_threads_;
      // otherwise the mirrored index can fall outside [0, n) — confirm
      // against the caller.
      auto src_idx = ((num_cores_ - 1 - cid) * num_warps_ + (num_warps_ - 1 - wid)) * num_threads_ + tid;
      uint32_t ref = a[src_idx];
      if (c[i] != ref) {
        // std::hex is sticky: force decimal for the index and restore decimal
        // afterwards so later output is not printed in hexadecimal.
        std::cout << "error at result #" << std::dec << i
                  << ": expected=" << std::hex << ref
                  << ", actual=" << c[i] << std::dec << std::endl;
        ++errors;
      }
    }
    return errors;
  }

  // Device capabilities read in setup(); zero until then.
  uint64_t num_cores_ = 0;
  uint64_t num_warps_ = 0;
  uint64_t num_threads_ = 0;
};
///////////////////////////////////////////////////////////////////////////////
// Stores the device handle and registers every test case. get_test() looks
// tests up by position, so the registration order below defines the test ids
// (iadd is 0, gbar is 20).
TestSuite::TestSuite(vx_device_h device)
  : device_(device)
{
  // integer arithmetic
  add_test(new Test_IADD(this));
  add_test(new Test_IMUL(this));
  add_test(new Test_IDIV(this));
  add_test(new Test_IDIV_MUL(this));

  // floating-point arithmetic
  add_test(new Test_FADD(this));
  add_test(new Test_FSUB(this));
  add_test(new Test_FMUL(this));
  add_test(new Test_FMADD(this));
  add_test(new Test_FMSUB(this));
  add_test(new Test_FNMADD(this));
  add_test(new Test_FNMSUB(this));
  add_test(new Test_FNMADD_MADD(this));
  add_test(new Test_FDIV(this));
  add_test(new Test_FDIV2(this));
  add_test(new Test_FSQRT(this));

  // conversions
  add_test(new Test_FTOI(this));
  add_test(new Test_FTOU(this));
  add_test(new Test_ITOF(this));
  add_test(new Test_UTOF(this));

  // barriers
  add_test(new Test_BAR(this));
  add_test(new Test_GBAR(this));
}
// Deletes every registered test case; the suite owns them.
TestSuite::~TestSuite() {
  for (ITestCase* test : _tests) {
    delete test;
  }
}
// Returns the test case registered at position `testid`. The lookup goes
// through std::vector::at, which throws std::out_of_range for an index that
// is not a valid position.
ITestCase* TestSuite::get_test(int testid) const {
  ITestCase* const test = _tests.at(testid);
  return test;
}
// Appends `test` to the end of the suite. The pointer is deleted by the
// destructor, so ownership passes to the suite.
void TestSuite::add_test(ITestCase* test) {
  _tests.emplace_back(test);
}
// Number of test cases currently registered.
size_t TestSuite::size() const {
  const size_t count = _tests.size();
  return count;
}
// Device handle that was passed to the constructor.
vx_device_h TestSuite::device() const {
  vx_device_h handle = device_;
  return handle;
}

View File

@@ -1,78 +1,9 @@
XLEN ?= 32
RISCV_TOOLCHAIN_PATH ?= /opt/riscv-gnu-toolchain
VORTEX_DRV_PATH ?= $(realpath ../../../driver)
VORTEX_RT_PATH ?= $(realpath ../../../runtime)
OPTS ?= -n64
VX_CC = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-gcc
VX_CXX = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-g++
VX_DP = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-objdump
VX_CP = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-objcopy
VX_CFLAGS += -march=rv32imf -mabi=ilp32f -O3 -Wstack-usage=1024 -ffreestanding -nostartfiles -fdata-sections -ffunction-sections
VX_CFLAGS += -I$(VORTEX_RT_PATH)/include -I$(VORTEX_RT_PATH)/../hw
VX_LDFLAGS += -Wl,-Bstatic,-T,$(VORTEX_RT_PATH)/linker/vx_link$(XLEN).ld -Wl,--gc-sections $(VORTEX_RT_PATH)/libvortexrt.a
VX_SRCS = kernel.c
CXXFLAGS += -std=c++11 -Wall -Wextra -pedantic -Wfatal-errors
CXXFLAGS += -I$(VORTEX_DRV_PATH)/include
LDFLAGS += -L$(VORTEX_DRV_PATH)/stub -lvortex
# Debugging
ifdef DEBUG
CXXFLAGS += -g -O0
else
CXXFLAGS += -O2 -DNDEBUG
endif
PROJECT = fence
SRCS = main.cpp
all: $(PROJECT) kernel.bin kernel.dump
kernel.dump: kernel.elf
$(VX_DP) -D kernel.elf > kernel.dump
VX_SRCS = kernel.cpp
kernel.bin: kernel.elf
$(VX_CP) -O binary kernel.elf kernel.bin
OPTS ?= -n64
kernel.elf: $(VX_SRCS)
$(VX_CC) $(VX_CFLAGS) $(VX_SRCS) $(VX_LDFLAGS) -o kernel.elf
$(PROJECT): $(SRCS)
$(CXX) $(CXXFLAGS) $^ $(LDFLAGS) -o $@
run-simx: $(PROJECT) kernel.bin
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/simx:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
run-fpga: $(PROJECT) kernel.bin
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/fpga:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
run-asesim: $(PROJECT) kernel.bin
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/asesim:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
run-vlsim: $(PROJECT) kernel.bin
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/vlsim:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
run-rtlsim: $(PROJECT) kernel.bin
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/rtlsim:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
.depend: $(SRCS)
$(CXX) $(CXXFLAGS) -MM $^ > .depend;
clean:
rm -rf $(PROJECT) *.o .depend
clean-all: clean
rm -rf *.elf *.bin *.dump
ifneq ($(MAKECMDGOALS),clean)
-include .depend
endif
include ../common.mk

View File

@@ -6,9 +6,9 @@
// Argument block shared between the host program and the device kernel.
// NOTE(review): each address field appears twice below (uint32_t and
// uint64_t); this looks like both sides of a diff — confirm which width
// applies.
typedef struct {
// Number of tasks the kernel spawns.
uint32_t num_tasks;
// Number of elements each task processes.
uint32_t task_size;
uint32_t src0_addr;
uint32_t src1_addr;
uint32_t dst_addr;
// Device addresses of the two source buffers and of the destination buffer.
uint64_t src0_addr;
uint64_t src1_addr;
uint64_t dst_addr;
} kernel_arg_t;
#endif

View File

@@ -3,14 +3,13 @@
#include <vx_spawn.h>
#include "common.h"
void kernel_body(int task_id, kernel_arg_t* arg) {
void kernel_body(int task_id, kernel_arg_t* __UNIFORM__ arg) {
uint32_t count = arg->task_size;
int32_t* src0_ptr = (int32_t*)arg->src0_addr;
int32_t* src1_ptr = (int32_t*)arg->src1_addr;
int32_t* dst_ptr = (int32_t*)arg->dst_addr;
uint32_t offset = task_id * count;
for (uint32_t i = 0; i < count; ++i) {
dst_ptr[offset+i] = src0_ptr[offset+i] + src1_ptr[offset+i];
}
@@ -18,7 +17,8 @@ void kernel_body(int task_id, kernel_arg_t* arg) {
vx_fence();
}
void main() {
int main() {
kernel_arg_t* arg = (kernel_arg_t*)KERNEL_ARG_DEV_MEM_ADDR;
vx_spawn_tasks(arg->num_tasks, (vx_spawn_tasks_cb)kernel_body, arg);
}
return 0;
}

View File

@@ -1,6 +1,7 @@
#include <iostream>
#include <unistd.h>
#include <string.h>
#include <vector>
#include <vortex.h>
#include "common.h"
@@ -20,8 +21,8 @@ const char* kernel_file = "kernel.bin";
uint32_t count = 0;
vx_device_h device = nullptr;
vx_buffer_h staging_buf = nullptr;
kernel_arg_t kernel_arg;
std::vector<uint8_t> staging_buf;
kernel_arg_t kernel_arg = {};
static void show_usage() {
std::cout << "Vortex Test." << std::endl;
@@ -51,9 +52,6 @@ static void parse_args(int argc, char **argv) {
}
void cleanup() {
if (staging_buf) {
vx_buf_free(staging_buf);
}
if (device) {
vx_mem_free(device, kernel_arg.src0_addr);
vx_mem_free(device, kernel_arg.src1_addr);
@@ -71,17 +69,17 @@ int run_test(const kernel_arg_t& kernel_arg,
// wait for completion
std::cout << "wait for completion" << std::endl;
RT_CHECK(vx_ready_wait(device, MAX_TIMEOUT));
RT_CHECK(vx_ready_wait(device, VX_MAX_TIMEOUT));
// download destination buffer
std::cout << "download destination buffer" << std::endl;
RT_CHECK(vx_copy_from_dev(staging_buf, kernel_arg.dst_addr, buf_size, 0));
RT_CHECK(vx_copy_from_dev(device, staging_buf.data(), kernel_arg.dst_addr, buf_size));
// verify result
std::cout << "verify result" << std::endl;
{
int errors = 0;
auto buf_ptr = (int32_t*)vx_host_ptr(staging_buf);
auto buf_ptr = (int32_t*)staging_buf.data();
for (uint32_t i = 0; i < num_points; ++i) {
int ref = i + i;
int cur = buf_ptr[i];
@@ -101,9 +99,7 @@ int run_test(const kernel_arg_t& kernel_arg,
return 0;
}
int main(int argc, char *argv[]) {
size_t value;
int main(int argc, char *argv[]) {
// parse command arguments
parse_args(argc, argv);
@@ -115,12 +111,12 @@ int main(int argc, char *argv[]) {
std::cout << "open device connection" << std::endl;
RT_CHECK(vx_dev_open(&device));
uint64_t max_cores, max_warps, max_threads;
RT_CHECK(vx_dev_caps(device, VX_CAPS_MAX_CORES, &max_cores));
RT_CHECK(vx_dev_caps(device, VX_CAPS_MAX_WARPS, &max_warps));
RT_CHECK(vx_dev_caps(device, VX_CAPS_MAX_THREADS, &max_threads));
uint64_t num_cores, num_warps, num_threads;
RT_CHECK(vx_dev_caps(device, VX_CAPS_NUM_CORES, &num_cores));
RT_CHECK(vx_dev_caps(device, VX_CAPS_NUM_WARPS, &num_warps));
RT_CHECK(vx_dev_caps(device, VX_CAPS_NUM_THREADS, &num_threads));
uint32_t num_tasks = max_cores * max_warps * max_threads;
uint32_t num_tasks = num_cores * num_warps * num_threads;
uint32_t num_points = count * num_tasks;
uint32_t buf_size = num_points * sizeof(int32_t);
@@ -132,64 +128,60 @@ int main(int argc, char *argv[]) {
RT_CHECK(vx_upload_kernel_file(device, kernel_file));
// allocate device memory
std::cout << "allocate device memory" << std::endl;
RT_CHECK(vx_mem_alloc(device, buf_size, &value));
kernel_arg.src0_addr = value;
RT_CHECK(vx_mem_alloc(device, buf_size, &value));
kernel_arg.src1_addr = value;
RT_CHECK(vx_mem_alloc(device, buf_size, &value));
kernel_arg.dst_addr = value;
std::cout << "allocate device memory" << std::endl;
RT_CHECK(vx_mem_alloc(device, buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.src0_addr));
RT_CHECK(vx_mem_alloc(device, buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.src1_addr));
RT_CHECK(vx_mem_alloc(device, buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.dst_addr));
kernel_arg.num_tasks = num_tasks;
kernel_arg.task_size = count;
std::cout << "dev_src0=" << std::hex << kernel_arg.src0_addr << std::endl;
std::cout << "dev_src1=" << std::hex << kernel_arg.src1_addr << std::endl;
std::cout << "dev_dst=" << std::hex << kernel_arg.dst_addr << std::endl;
std::cout << "dev_src0=0x" << std::hex << kernel_arg.src0_addr << std::endl;
std::cout << "dev_src1=0x" << std::hex << kernel_arg.src1_addr << std::endl;
std::cout << "dev_dst=0x" << std::hex << kernel_arg.dst_addr << std::endl;
// allocate shared memory
std::cout << "allocate shared memory" << std::endl;
// allocate staging buffer
std::cout << "allocate staging buffer" << std::endl;
uint32_t alloc_size = std::max<uint32_t>(buf_size, sizeof(kernel_arg_t));
RT_CHECK(vx_buf_alloc(device, alloc_size, &staging_buf));
staging_buf.resize(alloc_size);
// upload kernel argument
std::cout << "upload kernel argument" << std::endl;
{
auto buf_ptr = (int*)vx_host_ptr(staging_buf);
auto buf_ptr = (int*)staging_buf.data();
memcpy(buf_ptr, &kernel_arg, sizeof(kernel_arg_t));
RT_CHECK(vx_copy_to_dev(staging_buf, KERNEL_ARG_DEV_MEM_ADDR, sizeof(kernel_arg_t), 0));
RT_CHECK(vx_copy_to_dev(device, KERNEL_ARG_DEV_MEM_ADDR, staging_buf.data(), sizeof(kernel_arg_t)));
}
// upload source buffer0
{
auto buf_ptr = (int32_t*)vx_host_ptr(staging_buf);
std::cout << "upload source buffer0" << std::endl;
auto buf_ptr = (int32_t*)staging_buf.data();
for (uint32_t i = 0; i < num_points; ++i) {
buf_ptr[i] = i-1;
}
}
RT_CHECK(vx_copy_to_dev(device, kernel_arg.src0_addr, staging_buf.data(), buf_size));
}
std::cout << "upload source buffer0" << std::endl;
RT_CHECK(vx_copy_to_dev(staging_buf, kernel_arg.src0_addr, buf_size, 0));
// upload source buffer1
{
auto buf_ptr = (int32_t*)vx_host_ptr(staging_buf);
std::cout << "upload source buffer1" << std::endl;
auto buf_ptr = (int32_t*)staging_buf.data();
for (uint32_t i = 0; i < num_points; ++i) {
buf_ptr[i] = i+1;
}
}
RT_CHECK(vx_copy_to_dev(device, kernel_arg.src1_addr, staging_buf.data(), buf_size));
}
std::cout << "upload source buffer1" << std::endl;
RT_CHECK(vx_copy_to_dev(staging_buf, kernel_arg.src1_addr, buf_size, 0));
// clear destination buffer
{
auto buf_ptr = (int32_t*)vx_host_ptr(staging_buf);
std::cout << "clear destination buffer" << std::endl;
auto buf_ptr = (int32_t*)staging_buf.data();
for (uint32_t i = 0; i < num_points; ++i) {
buf_ptr[i] = 0xdeadbeef;
}
}
RT_CHECK(vx_copy_to_dev(device, kernel_arg.dst_addr, staging_buf.data(), buf_size));
}
std::cout << "clear destination buffer" << std::endl;
RT_CHECK(vx_copy_to_dev(staging_buf, kernel_arg.dst_addr, buf_size, 0));
// run tests
std::cout << "run tests" << std::endl;

View File

@@ -1,78 +1,9 @@
XLEN ?= 32
RISCV_TOOLCHAIN_PATH ?= /opt/riscv-gnu-toolchain
VORTEX_DRV_PATH ?= $(realpath ../../../driver)
VORTEX_RT_PATH ?= $(realpath ../../../runtime)
OPTS ?= -n16
VX_CC = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-gcc
VX_CXX = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-g++
VX_DP = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-objdump
VX_CP = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-objcopy
VX_CFLAGS += -march=rv32imf -mabi=ilp32f -O3 -Wstack-usage=1024 -ffreestanding -nostartfiles -fdata-sections -ffunction-sections
VX_CFLAGS += -I$(VORTEX_RT_PATH)/include -I$(VORTEX_RT_PATH)/../hw
VX_LDFLAGS += -Wl,-Bstatic,-T,$(VORTEX_RT_PATH)/linker/vx_link$(XLEN).ld -Wl,--gc-sections $(VORTEX_RT_PATH)/libvortexrt.a
VX_SRCS = kernel.c
CXXFLAGS += -std=c++11 -Wall -Wextra -pedantic -Wfatal-errors
CXXFLAGS += -I$(VORTEX_DRV_PATH)/include -I../../../hw
LDFLAGS += -L$(VORTEX_DRV_PATH)/stub -lvortex
# Debugging
ifdef DEBUG
CXXFLAGS += -g -O0
else
CXXFLAGS += -O2 -DNDEBUG
endif
PROJECT = io_addr
SRCS = main.cpp
all: $(PROJECT) kernel.bin kernel.dump
kernel.dump: kernel.elf
$(VX_DP) -D kernel.elf > kernel.dump
VX_SRCS = kernel.cpp
kernel.bin: kernel.elf
$(VX_CP) -O binary kernel.elf kernel.bin
OPTS ?= -n16
kernel.elf: $(VX_SRCS)
$(VX_CC) $(VX_CFLAGS) $(VX_SRCS) $(VX_LDFLAGS) -o kernel.elf
$(PROJECT): $(SRCS)
$(CXX) $(CXXFLAGS) $^ $(LDFLAGS) -o $@
run-simx: $(PROJECT) kernel.bin
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/simx:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
run-fpga: $(PROJECT) kernel.bin
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/fpga:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
run-asesim: $(PROJECT) kernel.bin
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/asesim:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
run-vlsim: $(PROJECT) kernel.bin
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/vlsim:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
run-rtlsim: $(PROJECT) kernel.bin
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/rtlsim:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
.depend: $(SRCS)
$(CXX) $(CXXFLAGS) -MM $^ > .depend;
clean:
rm -rf $(PROJECT) *.o .depend
clean-all: clean
rm -rf *.elf *.bin *.dump
ifneq ($(MAKECMDGOALS),clean)
-include .depend
endif
include ../common.mk

View File

@@ -5,8 +5,8 @@
typedef struct {
uint32_t num_points;
uint32_t src_addr;
uint32_t dst_addr;
uint64_t src_addr;
uint64_t dst_addr;
} kernel_arg_t;
#endif

View File

@@ -3,8 +3,8 @@
#include <vx_spawn.h>
#include "common.h"
void kernel_body(int task_id, kernel_arg_t* arg) {
uint32_t* src_ptr = (uint32_t*)arg->src_addr;
void kernel_body(int task_id, kernel_arg_t* __UNIFORM__ arg) {
uint64_t* src_ptr = (uint64_t*)arg->src_addr;
uint32_t* dst_ptr = (uint32_t*)arg->dst_addr;
int32_t* addr_ptr = (int32_t*)(src_ptr[task_id]);
@@ -12,7 +12,8 @@ void kernel_body(int task_id, kernel_arg_t* arg) {
dst_ptr[task_id] = *addr_ptr;
}
void main() {
int main() {
kernel_arg_t* arg = (kernel_arg_t*)KERNEL_ARG_DEV_MEM_ADDR;
vx_spawn_tasks(arg->num_points, (vx_spawn_tasks_cb)kernel_body, arg);
}
return 0;
}

View File

@@ -23,14 +23,16 @@
const char* kernel_file = "kernel.bin";
uint32_t count = 0;
size_t usr_test_mem;
static uint64_t io_base_addr = IO_CSR_ADDR + IO_CSR_SIZE;
std::vector<uint32_t> src_data;
uint64_t usr_test_mem;
std::vector<uint64_t> src_addrs;
std::vector<int32_t> ref_data;
vx_device_h device = nullptr;
vx_buffer_h staging_buf = nullptr;
kernel_arg_t kernel_arg;
std::vector<uint8_t> staging_buf;
kernel_arg_t kernel_arg = {};
static void show_usage() {
std::cout << "Vortex Test." << std::endl;
@@ -60,18 +62,16 @@ static void parse_args(int argc, char **argv) {
}
void cleanup() {
if (staging_buf) {
vx_buf_free(staging_buf);
}
if (device) {
vx_mem_free(device, kernel_arg.src_addr);
vx_mem_free(device, kernel_arg.dst_addr);
vx_mem_free(device, usr_test_mem);
vx_dev_close(device);
}
}
void gen_input_data(uint32_t num_points) {
src_data.resize(num_points);
void gen_src_addrs(uint32_t num_points) {
src_addrs.resize(num_points);
uint32_t u = 0, k = 0;
for (uint32_t i = 0; i < num_points; ++i) {
@@ -80,9 +80,9 @@ void gen_input_data(uint32_t num_points) {
++u;
}
uint32_t j = i % NUM_ADDRS;
uint32_t v = ((j == k) ? usr_test_mem : IO_BASE_ADDR) + j * sizeof(uint32_t);
src_data[i] = v;
std::cout << std::dec << i << "," << k << ": value=0x" << std::hex << v << std::endl;
uint64_t a = ((j == k) ? usr_test_mem : io_base_addr) + j * sizeof(uint32_t);
std::cout << std::dec << i << "," << k << ": value=0x" << std::hex << a << std::endl;
src_addrs[i] = a;
}
}
@@ -90,7 +90,7 @@ void gen_ref_data(uint32_t num_points) {
ref_data.resize(num_points);
for (uint32_t i = 0; i < num_points; ++i) {
uint32_t j = i % NUM_ADDRS;
int32_t j = i % NUM_ADDRS;
ref_data[i] = j * j;
}
}
@@ -104,17 +104,17 @@ int run_test(const kernel_arg_t& kernel_arg,
// wait for completion
std::cout << "wait for completion" << std::endl;
RT_CHECK(vx_ready_wait(device, MAX_TIMEOUT));
RT_CHECK(vx_ready_wait(device, VX_MAX_TIMEOUT));
// download destination buffer
std::cout << "download destination buffer" << std::endl;
RT_CHECK(vx_copy_from_dev(staging_buf, kernel_arg.dst_addr, buf_size, 0));
RT_CHECK(vx_copy_from_dev(device, staging_buf.data(), kernel_arg.dst_addr, buf_size));
// verify result
std::cout << "verify result" << std::endl;
{
int errors = 0;
auto buf_ptr = (int32_t*)vx_host_ptr(staging_buf);
auto buf_ptr = (int32_t*)staging_buf.data();
for (uint32_t i = 0; i < num_points; ++i) {
int ref = ref_data.at(i);
int cur = buf_ptr[i];
@@ -135,7 +135,7 @@ int run_test(const kernel_arg_t& kernel_arg,
}
int main(int argc, char *argv[]) {
size_t value;
uint64_t value;
// parse command arguments
parse_args(argc, argv);
@@ -152,19 +152,18 @@ int main(int argc, char *argv[]) {
uint32_t num_points = count;
RT_CHECK(vx_mem_alloc(device, NUM_ADDRS * sizeof(uint32_t), &usr_test_mem));
RT_CHECK(vx_mem_alloc(device, NUM_ADDRS * sizeof(int32_t), VX_MEM_TYPE_GLOBAL, &usr_test_mem));
// generate input data
gen_input_data(num_points);
gen_src_addrs(num_points);
// generate reference data
gen_ref_data(num_points);
uint32_t src_buf_size = src_data.size() * sizeof(int32_t);
uint32_t dst_buf_size = src_data.size() * sizeof(int32_t);
uint32_t src_buf_size = num_points * sizeof(uint64_t);
uint32_t dst_buf_size = num_points * sizeof(int32_t);
std::cout << "number of points: " << num_points << std::endl;
std::cout << "buffer size: " << dst_buf_size << " bytes" << std::endl;
std::cout << "number of points: " << std::dec << num_points << std::endl;
// upload program
std::cout << "upload program" << std::endl;
@@ -173,61 +172,59 @@ int main(int argc, char *argv[]) {
// allocate device memory
std::cout << "allocate device memory" << std::endl;
RT_CHECK(vx_mem_alloc(device, src_buf_size, &value));
RT_CHECK(vx_mem_alloc(device, src_buf_size, VX_MEM_TYPE_GLOBAL, &value));
kernel_arg.src_addr = value;
RT_CHECK(vx_mem_alloc(device, dst_buf_size, &value));
RT_CHECK(vx_mem_alloc(device, dst_buf_size, VX_MEM_TYPE_GLOBAL, &value));
kernel_arg.dst_addr = value;
kernel_arg.num_points = num_points;
std::cout << "dev_src=" << std::hex << kernel_arg.src_addr << std::endl;
std::cout << "dev_dst=" << std::hex << kernel_arg.dst_addr << std::endl;
std::cout << "dev_src=0x" << std::hex << kernel_arg.src_addr << std::endl;
std::cout << "dev_dst=0x" << std::hex << kernel_arg.dst_addr << std::endl;
// allocate shared memory
std::cout << "allocate shared memory" << std::endl;
uint32_t staging_buf_size = std::max<uint32_t>(NUM_ADDRS * sizeof(uint32_t),
// allocate staging buffer
std::cout << "allocate staging buffer" << std::endl;
uint32_t staging_buf_size = std::max<uint32_t>(NUM_ADDRS * sizeof(uint64_t),
std::max<uint32_t>(src_buf_size,
std::max<uint32_t>(dst_buf_size,
sizeof(kernel_arg_t))));
RT_CHECK(vx_buf_alloc(device, staging_buf_size, &staging_buf));
staging_buf.resize(staging_buf_size);
// upload kernel argument
std::cout << "upload kernel argument" << std::endl;
// upload kernel argument
{
auto buf_ptr = (int*)vx_host_ptr(staging_buf);
std::cout << "upload kernel argument" << std::endl;
auto buf_ptr = (int*)staging_buf.data();
memcpy(buf_ptr, &kernel_arg, sizeof(kernel_arg_t));
RT_CHECK(vx_copy_to_dev(staging_buf, KERNEL_ARG_DEV_MEM_ADDR, sizeof(kernel_arg_t), 0));
RT_CHECK(vx_copy_to_dev(device, KERNEL_ARG_DEV_MEM_ADDR, staging_buf.data(), sizeof(kernel_arg_t)));
}
// upload test address data
{
auto buf_ptr = (int32_t*)vx_host_ptr(staging_buf);
std::cout << "upload test address data" << std::endl;
auto buf_ptr = (int32_t*)staging_buf.data();
for (uint32_t i = 0; i < NUM_ADDRS; ++i) {
buf_ptr[i] = i * i;
}
RT_CHECK(vx_copy_to_dev(device, io_base_addr, staging_buf.data(), NUM_ADDRS * sizeof(int32_t)));
RT_CHECK(vx_copy_to_dev(device, usr_test_mem, staging_buf.data(), NUM_ADDRS * sizeof(int32_t)));
}
RT_CHECK(vx_copy_to_dev(staging_buf, 0xFF000000, NUM_ADDRS * sizeof(uint32_t), 0));
RT_CHECK(vx_copy_to_dev(staging_buf, usr_test_mem, NUM_ADDRS * sizeof(uint32_t), 0));
// upload source buffer
{
auto buf_ptr = (int32_t*)vx_host_ptr(staging_buf);
for (uint32_t i = 0; i < num_points; ++i) {
buf_ptr[i] = src_data.at(i);
}
std::cout << "upload source buffer" << std::endl;
auto buf_ptr = (uint64_t*)staging_buf.data();
memcpy(buf_ptr, src_addrs.data(), src_buf_size);
RT_CHECK(vx_copy_to_dev(device, kernel_arg.src_addr, staging_buf.data(), src_buf_size));
}
std::cout << "upload source buffer" << std::endl;
RT_CHECK(vx_copy_to_dev(staging_buf, kernel_arg.src_addr, src_buf_size, 0));
// clear destination buffer
{
auto buf_ptr = (int32_t*)vx_host_ptr(staging_buf);
std::cout << "clear destination buffer" << std::endl;
auto buf_ptr = (int32_t*)staging_buf.data();
for (uint32_t i = 0; i < num_points; ++i) {
buf_ptr[i] = 0xdeadbeef;
}
}
RT_CHECK(vx_copy_to_dev(device, kernel_arg.dst_addr, staging_buf.data(), dst_buf_size));
}
std::cout << "clear destination buffer" << std::endl;
RT_CHECK(vx_copy_to_dev(staging_buf, kernel_arg.dst_addr, dst_buf_size, 0));
// run tests
std::cout << "run tests" << std::endl;

View File

@@ -1,78 +1,9 @@
XLEN ?= 32
RISCV_TOOLCHAIN_PATH ?= /opt/riscv-gnu-toolchain
VORTEX_DRV_PATH ?= $(realpath ../../../driver)
VORTEX_RT_PATH ?= $(realpath ../../../runtime)
OPTS ?= -n64
VX_CC = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-gcc
VX_CXX = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-g++
VX_DP = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-objdump
VX_CP = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-objcopy
VX_CFLAGS += -march=rv32imf -mabi=ilp32f -O3 -Wstack-usage=1024 -ffreestanding -nostartfiles -fdata-sections -ffunction-sections
VX_CFLAGS += -I$(VORTEX_RT_PATH)/include -I$(VORTEX_RT_PATH)/../hw
VX_LDFLAGS += -Wl,-Bstatic,-T,$(VORTEX_RT_PATH)/linker/vx_link$(XLEN).ld -Wl,--gc-sections $(VORTEX_RT_PATH)/libvortexrt.a
VX_SRCS = kernel.c
CXXFLAGS += -std=c++11 -Wall -Wextra -pedantic -Wfatal-errors
CXXFLAGS += -I$(VORTEX_DRV_PATH)/include
LDFLAGS += -L$(VORTEX_DRV_PATH)/stub -lvortex
# Debugging
ifdef DEBUG
CXXFLAGS += -g -O0
else
CXXFLAGS += -O2 -DNDEBUG
endif
PROJECT = mstress
SRCS = main.cpp
all: $(PROJECT) kernel.bin kernel.dump
kernel.dump: kernel.elf
$(VX_DP) -D kernel.elf > kernel.dump
VX_SRCS = kernel.cpp
kernel.bin: kernel.elf
$(VX_CP) -O binary kernel.elf kernel.bin
OPTS ?= -n64
kernel.elf: $(VX_SRCS)
$(VX_CC) $(VX_CFLAGS) $(VX_SRCS) $(VX_LDFLAGS) -o kernel.elf
$(PROJECT): $(SRCS)
$(CXX) $(CXXFLAGS) $^ $(LDFLAGS) -o $@
run-simx: $(PROJECT) kernel.bin
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/simx:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
run-fpga: $(PROJECT) kernel.bin
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/fpga:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
run-asesim: $(PROJECT) kernel.bin
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/asesim:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
run-vlsim: $(PROJECT) kernel.bin
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/vlsim:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
run-rtlsim: $(PROJECT) kernel.bin
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/rtlsim:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
.depend: $(SRCS)
$(CXX) $(CXXFLAGS) -MM $^ > .depend;
clean:
rm -rf $(PROJECT) *.o .depend
clean-all: clean
rm -rf *.elf *.bin *.dump
ifneq ($(MAKECMDGOALS),clean)
-include .depend
endif
include ../common.mk

View File

@@ -9,9 +9,9 @@ typedef struct {
uint32_t num_tasks;
uint32_t size;
uint32_t stride;
uint32_t src0_addr;
uint32_t src1_addr;
uint32_t dst_addr;
uint64_t src0_addr;
uint64_t src1_addr;
uint64_t dst_addr;
} kernel_arg_t;
#endif

View File

@@ -3,7 +3,7 @@
#include <vx_spawn.h>
#include "common.h"
void kernel_body(int task_id, kernel_arg_t* arg) {
void kernel_body(int task_id, kernel_arg_t* __UNIFORM__ arg) {
uint32_t stride = arg->stride;
uint32_t* addr_ptr = (uint32_t*)arg->src0_addr;
float* src_ptr = (float*)arg->src1_addr;
@@ -22,7 +22,8 @@ void kernel_body(int task_id, kernel_arg_t* arg) {
}
}
void main() {
int main() {
kernel_arg_t* arg = (kernel_arg_t*)KERNEL_ARG_DEV_MEM_ADDR;
vx_spawn_tasks(arg->num_tasks, (vx_spawn_tasks_cb)kernel_body, arg);
}
return 0;
}

View File

@@ -72,8 +72,8 @@ std::vector<float> test_data;
std::vector<uint32_t> addr_table;
vx_device_h device = nullptr;
vx_buffer_h staging_buf = nullptr;
kernel_arg_t kernel_arg;
std::vector<uint8_t> staging_buf;
kernel_arg_t kernel_arg = {};
static void show_usage() {
std::cout << "Vortex Test." << std::endl;
@@ -103,9 +103,6 @@ static void parse_args(int argc, char **argv) {
}
void cleanup() {
if (staging_buf) {
vx_buf_free(staging_buf);
}
if (device) {
vx_mem_free(device, kernel_arg.src0_addr);
vx_mem_free(device, kernel_arg.src1_addr);
@@ -140,17 +137,17 @@ int run_test(const kernel_arg_t& kernel_arg,
// wait for completion
std::cout << "wait for completion" << std::endl;
RT_CHECK(vx_ready_wait(device, MAX_TIMEOUT));
RT_CHECK(vx_ready_wait(device, VX_MAX_TIMEOUT));
// download destination buffer
std::cout << "download destination buffer" << std::endl;
RT_CHECK(vx_copy_from_dev(staging_buf, kernel_arg.dst_addr, dst_buf_size, 0));
RT_CHECK(vx_copy_from_dev(device, staging_buf.data(), kernel_arg.dst_addr, dst_buf_size));
// verify result
std::cout << "verify result" << std::endl;
{
int errors = 0;
auto buf_ptr = (float*)vx_host_ptr(staging_buf);
auto buf_ptr = (float*)staging_buf.data();
for (uint32_t i = 0; i < num_points; ++i) {
@@ -181,9 +178,7 @@ int run_test(const kernel_arg_t& kernel_arg,
return 0;
}
int main(int argc, char *argv[]) {
size_t value;
int main(int argc, char *argv[]) {
// parse command arguments
parse_args(argc, argv);
@@ -197,12 +192,12 @@ int main(int argc, char *argv[]) {
std::cout << "open device connection" << std::endl;
RT_CHECK(vx_dev_open(&device));
uint64_t max_cores, max_warps, max_threads;
RT_CHECK(vx_dev_caps(device, VX_CAPS_MAX_CORES, &max_cores));
RT_CHECK(vx_dev_caps(device, VX_CAPS_MAX_WARPS, &max_warps));
RT_CHECK(vx_dev_caps(device, VX_CAPS_MAX_THREADS, &max_threads));
uint64_t num_cores, num_warps, num_threads;
RT_CHECK(vx_dev_caps(device, VX_CAPS_NUM_CORES, &num_cores));
RT_CHECK(vx_dev_caps(device, VX_CAPS_NUM_WARPS, &num_warps));
RT_CHECK(vx_dev_caps(device, VX_CAPS_NUM_THREADS, &num_threads));
uint32_t num_tasks = max_cores * max_warps * max_threads;
uint32_t num_tasks = num_cores * num_warps * num_threads;
uint32_t num_points = count * num_tasks;
// generate input data
@@ -220,67 +215,59 @@ int main(int argc, char *argv[]) {
RT_CHECK(vx_upload_kernel_file(device, kernel_file));
// allocate device memory
std::cout << "allocate device memory" << std::endl;
RT_CHECK(vx_mem_alloc(device, addr_buf_size, &value));
kernel_arg.src0_addr = value;
RT_CHECK(vx_mem_alloc(device, src_buf_size, &value));
kernel_arg.src1_addr = value;
RT_CHECK(vx_mem_alloc(device, dst_buf_size, &value));
kernel_arg.dst_addr = value;
std::cout << "allocate device memory" << std::endl;
RT_CHECK(vx_mem_alloc(device, addr_buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.src0_addr));
RT_CHECK(vx_mem_alloc(device, src_buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.src1_addr));
RT_CHECK(vx_mem_alloc(device, dst_buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.dst_addr));
kernel_arg.num_tasks = num_tasks;
kernel_arg.stride = count;
std::cout << "dev_addr=" << std::hex << kernel_arg.src0_addr << std::endl;
std::cout << "dev_src=" << std::hex << kernel_arg.src1_addr << std::endl;
std::cout << "dev_dst=" << std::hex << kernel_arg.dst_addr << std::endl;
std::cout << "dev_addr=0x" << std::hex << kernel_arg.src0_addr << std::endl;
std::cout << "dev_src=0x" << std::hex << kernel_arg.src1_addr << std::endl;
std::cout << "dev_dst=0x" << std::hex << kernel_arg.dst_addr << std::endl;
// allocate shared memory
std::cout << "allocate shared memory" << std::endl;
// allocate staging buffer
std::cout << "allocate staging buffer" << std::endl;
uint32_t staging_buf_size = std::max<uint32_t>(src_buf_size,
std::max<uint32_t>(addr_buf_size,
std::max<uint32_t>(dst_buf_size,
sizeof(kernel_arg_t))));
RT_CHECK(vx_buf_alloc(device, staging_buf_size, &staging_buf));
staging_buf.resize(staging_buf_size);
// upload kernel argument
std::cout << "upload kernel argument" << std::endl;
// upload kernel argument
{
auto buf_ptr = (int*)vx_host_ptr(staging_buf);
std::cout << "upload kernel argument" << std::endl;
auto buf_ptr = (int*)staging_buf.data();
memcpy(buf_ptr, &kernel_arg, sizeof(kernel_arg_t));
RT_CHECK(vx_copy_to_dev(staging_buf, KERNEL_ARG_DEV_MEM_ADDR, sizeof(kernel_arg_t), 0));
RT_CHECK(vx_copy_to_dev(device, KERNEL_ARG_DEV_MEM_ADDR, staging_buf.data(), sizeof(kernel_arg_t)));
}
// upload address buffer
{
auto buf_ptr = (int32_t*)vx_host_ptr(staging_buf);
for (uint32_t i = 0; i < addr_table.size(); ++i) {
buf_ptr[i] = addr_table.at(i);
}
std::cout << "upload address buffer" << std::endl;
auto buf_ptr = staging_buf.data();
memcpy(buf_ptr, addr_table.data(), addr_table.size() * sizeof(int32_t));
RT_CHECK(vx_copy_to_dev(device, kernel_arg.src0_addr, staging_buf.data(), addr_buf_size));
}
std::cout << "upload address buffer" << std::endl;
RT_CHECK(vx_copy_to_dev(staging_buf, kernel_arg.src0_addr, addr_buf_size, 0));
// upload source buffer1
{
auto buf_ptr = (int32_t*)vx_host_ptr(staging_buf);
for (uint32_t i = 0; i < test_data.size(); ++i) {
buf_ptr[i] = test_data.at(i);
}
std::cout << "upload source buffer" << std::endl;
auto buf_ptr = staging_buf.data();
memcpy(buf_ptr, test_data.data(), test_data.size() * sizeof(int32_t));
RT_CHECK(vx_copy_to_dev(device, kernel_arg.src1_addr, staging_buf.data(), src_buf_size));
}
std::cout << "upload source buffer" << std::endl;
RT_CHECK(vx_copy_to_dev(staging_buf, kernel_arg.src1_addr, src_buf_size, 0));
// clear destination buffer
{
auto buf_ptr = (int32_t*)vx_host_ptr(staging_buf);
std::cout << "clear destination buffer" << std::endl;
auto buf_ptr = (int32_t*)staging_buf.data();
for (uint32_t i = 0; i < test_data.size(); ++i) {
buf_ptr[i] = 0xdeadbeef;
}
}
RT_CHECK(vx_copy_to_dev(device, kernel_arg.dst_addr, staging_buf.data(), dst_buf_size));
}
std::cout << "clear destination buffer" << std::endl;
RT_CHECK(vx_copy_to_dev(staging_buf, kernel_arg.dst_addr, dst_buf_size, 0));
// run tests
std::cout << "run tests" << std::endl;

View File

@@ -1,78 +1,9 @@
XLEN ?= 32
RISCV_TOOLCHAIN_PATH ?= /opt/riscv-gnu-toolchain
VORTEX_DRV_PATH ?= $(realpath ../../../driver)
VORTEX_RT_PATH ?= $(realpath ../../../runtime)
OPTS ?= -n8
VX_CC = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-gcc
VX_CXX = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-g++
VX_DP = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-objdump
VX_CP = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-objcopy
VX_CFLAGS += -march=rv32imf -mabi=ilp32f -O3 -Wstack-usage=1024 -ffreestanding -nostartfiles -fdata-sections -ffunction-sections
VX_CFLAGS += -I$(VORTEX_RT_PATH)/include -I$(VORTEX_RT_PATH)/../hw
VX_LDFLAGS += -Wl,-Bstatic,-T,$(VORTEX_RT_PATH)/linker/vx_link$(XLEN).ld -Wl,--gc-sections $(VORTEX_RT_PATH)/libvortexrt.a
VX_SRCS = kernel.c
CXXFLAGS += -std=c++11 -Wall -Wextra -pedantic -Wfatal-errors
CXXFLAGS += -I$(VORTEX_DRV_PATH)/include
LDFLAGS += -L$(VORTEX_DRV_PATH)/stub -lvortex
# Debugging
ifdef DEBUG
CXXFLAGS += -g -O0
else
CXXFLAGS += -O2 -DNDEBUG
endif
PROJECT = no_mf_ext
SRCS = main.cpp
all: $(PROJECT) kernel.bin kernel.dump
kernel.dump: kernel.elf
$(VX_DP) -D kernel.elf > kernel.dump
VX_SRCS = kernel.cpp
kernel.bin: kernel.elf
$(VX_CP) -O binary kernel.elf kernel.bin
OPTS ?= -n8
kernel.elf: $(VX_SRCS)
$(VX_CC) $(VX_CFLAGS) $(VX_SRCS) $(VX_LDFLAGS) -o kernel.elf
$(PROJECT): $(SRCS)
$(CXX) $(CXXFLAGS) $^ $(LDFLAGS) -o $@
run-simx: $(PROJECT) kernel.bin
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/simx:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
run-fpga: $(PROJECT) kernel.bin
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/fpga:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
run-asesim: $(PROJECT) kernel.bin
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/asesim:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
run-vlsim: $(PROJECT) kernel.bin
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/vlsim:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
run-rtlsim: $(PROJECT) kernel.bin
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/rtlsim:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
.depend: $(SRCS)
$(CXX) $(CXXFLAGS) -MM $^ > .depend;
clean:
rm -rf $(PROJECT) *.o .depend
clean-all: clean
rm -rf *.elf *.bin *.dump
ifneq ($(MAKECMDGOALS),clean)
-include .depend
endif
include ../common.mk

View File

@@ -5,8 +5,8 @@
typedef struct {
uint32_t size;
uint32_t src_addr;
uint32_t dst_addr;
uint64_t src_addr;
uint64_t dst_addr;
} kernel_arg_t;
#endif

View File

@@ -3,7 +3,7 @@
#include <vx_spawn.h>
#include "common.h"
void main() {
int main() {
kernel_arg_t* arg = (kernel_arg_t*)KERNEL_ARG_DEV_MEM_ADDR;
uint32_t size = arg->size;
@@ -13,4 +13,6 @@ void main() {
for (uint32_t i = 0; i < size; ++i) {
dst_ptr[i] = src_ptr[i];
}
}
return 0;
}

View File

@@ -1,6 +1,7 @@
#include <iostream>
#include <unistd.h>
#include <string.h>
#include <vector>
#include <vortex.h>
#include "common.h"
@@ -20,8 +21,8 @@ const char* kernel_file = "kernel.bin";
uint32_t count = 0;
vx_device_h device = nullptr;
vx_buffer_h staging_buf = nullptr;
kernel_arg_t kernel_arg;
std::vector<uint8_t> staging_buf;
kernel_arg_t kernel_arg = {};
static void show_usage() {
std::cout << "Vortex Test." << std::endl;
@@ -51,9 +52,6 @@ static void parse_args(int argc, char **argv) {
}
void cleanup() {
if (staging_buf) {
vx_buf_free(staging_buf);
}
if (device) {
vx_mem_free(device, kernel_arg.src_addr);
vx_mem_free(device, kernel_arg.dst_addr);
@@ -70,17 +68,17 @@ int run_test(const kernel_arg_t& kernel_arg,
// wait for completion
std::cout << "wait for completion" << std::endl;
RT_CHECK(vx_ready_wait(device, MAX_TIMEOUT));
RT_CHECK(vx_ready_wait(device, VX_MAX_TIMEOUT));
// download destination buffer
std::cout << "download destination buffer" << std::endl;
RT_CHECK(vx_copy_from_dev(staging_buf, kernel_arg.dst_addr, buf_size, 0));
RT_CHECK(vx_copy_from_dev(device, staging_buf.data(), kernel_arg.dst_addr, buf_size));
// verify result
std::cout << "verify result" << std::endl;
{
int errors = 0;
auto buf_ptr = (int32_t*)vx_host_ptr(staging_buf);
auto buf_ptr = (int32_t*)staging_buf.data();
for (uint32_t i = 0; i < num_points; ++i) {
int ref = i-1;
int cur = buf_ptr[i];
@@ -100,9 +98,7 @@ int run_test(const kernel_arg_t& kernel_arg,
return 0;
}
int main(int argc, char *argv[]) {
size_t value;
int main(int argc, char *argv[]) {
// parse command arguments
parse_args(argc, argv);
@@ -125,51 +121,47 @@ int main(int argc, char *argv[]) {
RT_CHECK(vx_upload_kernel_file(device, kernel_file));
// allocate device memory
std::cout << "allocate device memory" << std::endl;
RT_CHECK(vx_mem_alloc(device, buf_size, &value));
kernel_arg.src_addr = value;
RT_CHECK(vx_mem_alloc(device, buf_size, &value));
kernel_arg.dst_addr = value;
std::cout << "allocate device memory" << std::endl;
RT_CHECK(vx_mem_alloc(device, buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.src_addr));
RT_CHECK(vx_mem_alloc(device, buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.dst_addr));
kernel_arg.size = num_points;
std::cout << "dev_src=" << std::hex << kernel_arg.src_addr << std::endl;
std::cout << "dev_dst=" << std::hex << kernel_arg.dst_addr << std::endl;
std::cout << "dev_src=0x" << std::hex << kernel_arg.src_addr << std::endl;
std::cout << "dev_dst=0x" << std::hex << kernel_arg.dst_addr << std::endl;
// allocate shared memory
std::cout << "allocate shared memory" << std::endl;
// allocate staging buffer
std::cout << "allocate staging buffer" << std::endl;
uint32_t alloc_size = std::max<uint32_t>(buf_size, sizeof(kernel_arg_t));
RT_CHECK(vx_buf_alloc(device, alloc_size, &staging_buf));
staging_buf.resize(alloc_size);
// upload kernel argument
std::cout << "upload kernel argument" << std::endl;
// upload kernel argument
{
auto buf_ptr = (int*)vx_host_ptr(staging_buf);
std::cout << "upload kernel argument" << std::endl;
auto buf_ptr = (int*)staging_buf.data();
memcpy(buf_ptr, &kernel_arg, sizeof(kernel_arg_t));
RT_CHECK(vx_copy_to_dev(staging_buf, KERNEL_ARG_DEV_MEM_ADDR, sizeof(kernel_arg_t), 0));
RT_CHECK(vx_copy_to_dev(device, KERNEL_ARG_DEV_MEM_ADDR, staging_buf.data(), sizeof(kernel_arg_t)));
}
// upload source buffer
{
auto buf_ptr = (int32_t*)vx_host_ptr(staging_buf);
std::cout << "upload source buffer" << std::endl;
auto buf_ptr = (int32_t*)staging_buf.data();
for (uint32_t i = 0; i < num_points; ++i) {
buf_ptr[i] = i-1;
}
}
RT_CHECK(vx_copy_to_dev(device, kernel_arg.src_addr, staging_buf.data(), buf_size));
}
std::cout << "upload source buffer" << std::endl;
RT_CHECK(vx_copy_to_dev(staging_buf, kernel_arg.src_addr, buf_size, 0));
// clear destination buffer
{
auto buf_ptr = (int32_t*)vx_host_ptr(staging_buf);
std::cout << "clear destination buffer" << std::endl;
auto buf_ptr = (int32_t*)staging_buf.data();
for (uint32_t i = 0; i < num_points; ++i) {
buf_ptr[i] = 0xdeadbeef;
}
}
RT_CHECK(vx_copy_to_dev(device, kernel_arg.dst_addr, staging_buf.data(), buf_size));
}
std::cout << "clear destination buffer" << std::endl;
RT_CHECK(vx_copy_to_dev(staging_buf, kernel_arg.dst_addr, buf_size, 0));
// run tests
std::cout << "run tests" << std::endl;

View File

@@ -1,80 +1,17 @@
XLEN ?= 32
RISCV_TOOLCHAIN_PATH ?= /opt/riscv-gnu-toolchain
VORTEX_DRV_PATH ?= $(realpath ../../../driver)
VORTEX_RT_PATH ?= $(realpath ../../../runtime)
PROJECT = no_smem
OPTS ?= -n8
VX_CC = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-gcc
VX_CXX = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-g++
VX_DP = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-objdump
VX_CP = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-objcopy
VX_CFLAGS += -march=rv32imf -mabi=ilp32f -O3 -Wstack-usage=1024 -ffreestanding -nostartfiles -fdata-sections -ffunction-sections
VX_CFLAGS += -DSM_ENABLE=0 -I$(VORTEX_RT_PATH)/include -I$(VORTEX_RT_PATH)/../hw
VX_LDFLAGS += -Wl,-Bstatic,-T,$(VORTEX_RT_PATH)/linker/vx_link$(XLEN).ld -Wl,--gc-sections,--defsym=__stack_top=0xfefff000
VX_RUNTIME = $(VORTEX_RT_PATH)/src/vx_start.S $(VORTEX_RT_PATH)/src/vx_perf.c $(VORTEX_RT_PATH)/src/vx_syscalls.c
VX_SRCS = kernel.c $(VX_RUNTIME)
CXXFLAGS += -std=c++11 -Wall -Wextra -pedantic -Wfatal-errors
CXXFLAGS += -I$(VORTEX_DRV_PATH)/include
LDFLAGS += -L$(VORTEX_DRV_PATH)/stub -lvortex
# Debugging
ifdef DEBUG
CXXFLAGS += -g -O0
else
CXXFLAGS += -O2 -DNDEBUG
endif
PROJECT = no_smem
VX_SRCS = kernel.cpp $(VORTEX_KN_PATH)/src/vx_perf.c $(VORTEX_KN_PATH)/src/vx_syscalls.c $(VORTEX_KN_PATH)/src/vx_print.S $(VORTEX_KN_PATH)/src/vx_start.S
SRCS = main.cpp
all: $(PROJECT) kernel.bin kernel.dump
kernel.dump: kernel.elf
$(VX_DP) -D kernel.elf > kernel.dump
include ../common.mk
kernel.bin: kernel.elf
$(VX_CP) -O binary kernel.elf kernel.bin
VX_CFLAGS += -DSM_DISABLE
VX_LDFLAGS = -Wl,-Bstatic,--gc-sections,-T,$(VORTEX_KN_PATH)/linker/vx_link$(XLEN).ld,--defsym=STARTUP_ADDR=$(STARTUP_ADDR)
kernel.elf: $(VX_SRCS)
$(VX_CC) $(VX_CFLAGS) $(VX_SRCS) $(VX_LDFLAGS) -o kernel.elf
$(PROJECT): $(SRCS)
$(CXX) $(CXXFLAGS) $^ $(LDFLAGS) -o $@
run-simx: $(PROJECT) kernel.bin
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/simx:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
run-fpga: $(PROJECT) kernel.bin
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/fpga:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
run-asesim: $(PROJECT) kernel.bin
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/asesim:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
run-vlsim: $(PROJECT) kernel.bin
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/vlsim:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
run-rtlsim: $(PROJECT) kernel.bin
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/rtlsim:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
.depend: $(SRCS)
$(CXX) $(CXXFLAGS) -MM $^ > .depend;
clean:
rm -rf $(PROJECT) *.o .depend
clean-all: clean
rm -rf *.elf *.bin *.dump
ifneq ($(MAKECMDGOALS),clean)
-include .depend
endif
VX_CC = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)-gcc
VX_CXX = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)-g++
VX_DP = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)-objdump
VX_CP = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)-objcopy

View File

@@ -5,8 +5,8 @@
typedef struct {
uint32_t size;
uint32_t src_addr;
uint32_t dst_addr;
uint64_t src_addr;
uint64_t dst_addr;
} kernel_arg_t;
#endif

View File

@@ -3,7 +3,7 @@
#include <vx_spawn.h>
#include "common.h"
void main() {
int main() {
kernel_arg_t* arg = (kernel_arg_t*)KERNEL_ARG_DEV_MEM_ADDR;
uint32_t size = arg->size;
@@ -13,4 +13,6 @@ void main() {
for (uint32_t i = 0; i < size; ++i) {
dst_ptr[i] = src_ptr[i];
}
}
return 0;
}

View File

@@ -1,6 +1,7 @@
#include <iostream>
#include <unistd.h>
#include <string.h>
#include <vector>
#include <vortex.h>
#include "common.h"
@@ -20,8 +21,8 @@ const char* kernel_file = "kernel.bin";
uint32_t count = 0;
vx_device_h device = nullptr;
vx_buffer_h staging_buf = nullptr;
kernel_arg_t kernel_arg;
std::vector<uint8_t> staging_buf;
kernel_arg_t kernel_arg = {};
static void show_usage() {
std::cout << "Vortex Test." << std::endl;
@@ -51,9 +52,6 @@ static void parse_args(int argc, char **argv) {
}
void cleanup() {
if (staging_buf) {
vx_buf_free(staging_buf);
}
if (device) {
vx_mem_free(device, kernel_arg.src_addr);
vx_mem_free(device, kernel_arg.dst_addr);
@@ -70,17 +68,17 @@ int run_test(const kernel_arg_t& kernel_arg,
// wait for completion
std::cout << "wait for completion" << std::endl;
RT_CHECK(vx_ready_wait(device, MAX_TIMEOUT));
RT_CHECK(vx_ready_wait(device, VX_MAX_TIMEOUT));
// download destination buffer
std::cout << "download destination buffer" << std::endl;
RT_CHECK(vx_copy_from_dev(staging_buf, kernel_arg.dst_addr, buf_size, 0));
RT_CHECK(vx_copy_from_dev(device, staging_buf.data(), kernel_arg.dst_addr, buf_size));
// verify result
std::cout << "verify result" << std::endl;
{
int errors = 0;
auto buf_ptr = (int32_t*)vx_host_ptr(staging_buf);
auto buf_ptr = (int32_t*)staging_buf.data();
for (uint32_t i = 0; i < num_points; ++i) {
int ref = i-1;
int cur = buf_ptr[i];
@@ -100,9 +98,7 @@ int run_test(const kernel_arg_t& kernel_arg,
return 0;
}
int main(int argc, char *argv[]) {
size_t value;
int main(int argc, char *argv[]) {
// parse command arguments
parse_args(argc, argv);
@@ -125,51 +121,47 @@ int main(int argc, char *argv[]) {
RT_CHECK(vx_upload_kernel_file(device, kernel_file));
// allocate device memory
std::cout << "allocate device memory" << std::endl;
RT_CHECK(vx_mem_alloc(device, buf_size, &value));
kernel_arg.src_addr = value;
RT_CHECK(vx_mem_alloc(device, buf_size, &value));
kernel_arg.dst_addr = value;
std::cout << "allocate device memory" << std::endl;
RT_CHECK(vx_mem_alloc(device, buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.src_addr));
RT_CHECK(vx_mem_alloc(device, buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.dst_addr));
kernel_arg.size = num_points;
std::cout << "dev_src=" << std::hex << kernel_arg.src_addr << std::endl;
std::cout << "dev_dst=" << std::hex << kernel_arg.dst_addr << std::endl;
std::cout << "dev_src=0x" << std::hex << kernel_arg.src_addr << std::endl;
std::cout << "dev_dst=0x" << std::hex << kernel_arg.dst_addr << std::endl;
// allocate shared memory
std::cout << "allocate shared memory" << std::endl;
// allocate staging buffer
std::cout << "allocate staging buffer" << std::endl;
uint32_t alloc_size = std::max<uint32_t>(buf_size, sizeof(kernel_arg_t));
RT_CHECK(vx_buf_alloc(device, alloc_size, &staging_buf));
staging_buf.resize(alloc_size);
// upload kernel argument
std::cout << "upload kernel argument" << std::endl;
{
auto buf_ptr = (int*)vx_host_ptr(staging_buf);
auto buf_ptr = (int*)staging_buf.data();
memcpy(buf_ptr, &kernel_arg, sizeof(kernel_arg_t));
RT_CHECK(vx_copy_to_dev(staging_buf, KERNEL_ARG_DEV_MEM_ADDR, sizeof(kernel_arg_t), 0));
RT_CHECK(vx_copy_to_dev(device, KERNEL_ARG_DEV_MEM_ADDR, staging_buf.data(), sizeof(kernel_arg_t)));
}
// upload source buffer0
{
auto buf_ptr = (int32_t*)vx_host_ptr(staging_buf);
std::cout << "upload source buffer" << std::endl;
auto buf_ptr = (int32_t*)staging_buf.data();
for (uint32_t i = 0; i < num_points; ++i) {
buf_ptr[i] = i-1;
}
}
RT_CHECK(vx_copy_to_dev(device, kernel_arg.src_addr, staging_buf.data(), buf_size));
}
std::cout << "upload source buffer" << std::endl;
RT_CHECK(vx_copy_to_dev(staging_buf, kernel_arg.src_addr, buf_size, 0));
// clear destination buffer
{
auto buf_ptr = (int32_t*)vx_host_ptr(staging_buf);
std::cout << "clear destination buffer" << std::endl;
auto buf_ptr = (int32_t*)staging_buf.data();
for (uint32_t i = 0; i < num_points; ++i) {
buf_ptr[i] = 0xdeadbeef;
}
RT_CHECK(vx_copy_to_dev(device, kernel_arg.dst_addr, staging_buf.data(), buf_size));
}
std::cout << "clear destination buffer" << std::endl;
RT_CHECK(vx_copy_to_dev(staging_buf, kernel_arg.dst_addr, buf_size, 0));
// run tests
std::cout << "run tests" << std::endl;

View File

@@ -1,78 +0,0 @@
# Build and run rules for the "prefetch" regression test: a RISC-V kernel
# image (kernel.bin) plus a host driver program ($(PROJECT)).
# Every "?=" variable below may be overridden from the environment or the
# make command line.
XLEN ?= 32
RISCV_TOOLCHAIN_PATH ?= /opt/riscv-gnu-toolchain
VORTEX_DRV_PATH ?= $(realpath ../../../driver)
VORTEX_RT_PATH ?= $(realpath ../../../runtime)
# Arguments passed to the host program by the run-* targets.
OPTS ?= -n32
# Cross toolchain used for the device-side kernel.
VX_CC = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-gcc
VX_CXX = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-g++
VX_DP = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-objdump
VX_CP = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-objcopy
# Kernel compile/link flags: freestanding rv32imf build, linked statically
# against the Vortex runtime with the linker script selected by $(XLEN).
VX_CFLAGS += -march=rv32imf -mabi=ilp32f -O3 -Wstack-usage=1024 -ffreestanding -nostartfiles -fdata-sections -ffunction-sections
VX_CFLAGS += -I$(VORTEX_RT_PATH)/include -I$(VORTEX_RT_PATH)/../hw
VX_LDFLAGS += -Wl,-Bstatic,-T,$(VORTEX_RT_PATH)/linker/vx_link$(XLEN).ld -Wl,--gc-sections $(VORTEX_RT_PATH)/libvortexrt.a
VX_SRCS = kernel.c
# Host program flags; the host links against the stub driver library.
CXXFLAGS += -std=c++11 -Wall -Wextra -pedantic -Wfatal-errors
CXXFLAGS += -I$(VORTEX_DRV_PATH)/include
LDFLAGS += -L$(VORTEX_DRV_PATH)/stub -lvortex
# Debugging: define DEBUG for an unoptimized host build with symbols.
ifdef DEBUG
CXXFLAGS += -g -O0
else
CXXFLAGS += -O2 -DNDEBUG
endif
PROJECT = prefetch
SRCS = main.cpp
all: $(PROJECT) kernel.bin kernel.dump
# Disassembly of the kernel ELF.
kernel.dump: kernel.elf
$(VX_DP) -D kernel.elf > kernel.dump
# Raw binary image of the kernel; this is what the host program uploads.
kernel.bin: kernel.elf
$(VX_CP) -O binary kernel.elf kernel.bin
kernel.elf: $(VX_SRCS)
$(VX_CC) $(VX_CFLAGS) $(VX_SRCS) $(VX_LDFLAGS) -o kernel.elf
$(PROJECT): $(SRCS)
$(CXX) $(CXXFLAGS) $^ $(LDFLAGS) -o $@
# The run-* targets differ only in which driver directory is placed on
# LD_LIBRARY_PATH ahead of the host program.
# NOTE(review): POCL_RT_PATH is not defined in this file; presumably it is
# expected from the environment.
run-simx: $(PROJECT) kernel.bin
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/simx:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
run-fpga: $(PROJECT) kernel.bin
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/fpga:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
run-asesim: $(PROJECT) kernel.bin
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/asesim:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
run-vlsim: $(PROJECT) kernel.bin
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/vlsim:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
run-rtlsim: $(PROJECT) kernel.bin
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/rtlsim:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
# Header dependency list for the host sources, generated with -MM.
.depend: $(SRCS)
$(CXX) $(CXXFLAGS) -MM $^ > .depend;
# "clean" removes host build products; "clean-all" also removes kernel images.
clean:
rm -rf $(PROJECT) *.o .depend
clean-all: clean
rm -rf *.elf *.bin *.dump
# Pull in the generated dependencies unless the goal is exactly "clean".
ifneq ($(MAKECMDGOALS),clean)
-include .depend
endif

View File

@@ -1,14 +0,0 @@
#ifndef _COMMON_H_
#define _COMMON_H_

/* The fixed-width types below come from <stdint.h>; include it here so the
 * header does not depend on the including file having done so first. */
#include <stdint.h>

/* Device address where the host writes the kernel_arg_t block and from which
 * the kernel entry point reads it. */
#define KERNEL_ARG_DEV_MEM_ADDR 0x7ffff000

/* Launch parameters shared between the host program and the device kernel.
 * The host copies this struct verbatim to KERNEL_ARG_DEV_MEM_ADDR, so both
 * sides must be built against the same layout. */
typedef struct {
  uint32_t num_tasks;  /* number of tasks handed to vx_spawn_tasks() */
  uint32_t task_size;  /* number of int32 elements each task processes */
  uint32_t src0_addr;  /* device address of the first input buffer */
  uint32_t src1_addr;  /* device address of the second input buffer */
  uint32_t dst_addr;   /* device address of the output buffer */
} kernel_arg_t;

#endif

View File

@@ -1,43 +0,0 @@
#include <stdint.h>
#include <vx_intrinsics.h>
#include <vx_spawn.h>
#include <vx_print.h>
#include "common.h"
/* Prefetch granularity in bytes (presumably the cache line size; TODO confirm). */
#define BLOCK_SIZE 64

/*
 * Per-task body: adds `task_size` consecutive int32 elements of the two source
 * buffers into the destination buffer, starting at element
 * `task_id * task_size`.
 *
 * While iterating, whenever the address of the current source element is a
 * multiple of BLOCK_SIZE, a prefetch is issued for the address one block
 * (BLOCK_SIZE bytes) ahead, provided that address is still inside this task's
 * slice of the source buffer.
 *
 * task_id: index of the task being executed.
 * arg:     launch parameters read from device memory.
 */
void kernel_body(int task_id, kernel_arg_t* arg) {
  uint32_t count  = arg->task_size;
  uint32_t offset = task_id * count;

  int32_t* src0_ptr = (int32_t*)arg->src0_addr + offset;
  int32_t* src1_ptr = (int32_t*)arg->src1_addr + offset;
  int32_t* dst_ptr  = (int32_t*)arg->dst_addr + offset;

  /* One-past-the-end addresses of this task's source slices. */
  uint32_t src0_end = (uint32_t)(src0_ptr + count);
  uint32_t src1_end = (uint32_t)(src1_ptr + count);

  for (uint32_t i = 0; i < count; ++i) {
    dst_ptr[i] = src0_ptr[i] + src1_ptr[i];

    /* BLOCK_SIZE/4 is the number of 4-byte elements in one block. */
    uint32_t src0_mask = ((uint32_t)(src0_ptr + i)) % BLOCK_SIZE;
    uint32_t src0_next = (uint32_t)(src0_ptr + i + BLOCK_SIZE/4);
    if (src0_mask == 0 && src0_next < src0_end) {
      //vx_printf("src0_next=%d\n", src0_next);
      vx_prefetch(src0_next);
    }

    uint32_t src1_mask = ((uint32_t)(src1_ptr + i)) % BLOCK_SIZE;
    uint32_t src1_next = (uint32_t)(src1_ptr + i + BLOCK_SIZE/4);
    if (src1_mask == 0 && src1_next < src1_end) {
      //vx_printf("src1_next=%d\n", src1_next);
      vx_prefetch(src1_next);
    }
  }
}
/* Kernel entry point: reads the launch parameters from their fixed device
 * address and spawns one kernel_body invocation per task. */
void main() {
  kernel_arg_t* kargs = (kernel_arg_t*)KERNEL_ARG_DEV_MEM_ADDR;
  vx_spawn_tasks(kargs->num_tasks, (vx_spawn_tasks_cb)kernel_body, kargs);
}

View File

@@ -1,205 +0,0 @@
#include <iostream>
#include <unistd.h>
#include <string.h>
#include <vortex.h>
#include "common.h"
// Evaluates `_expr` once and treats any non-zero result as a fatal error:
// prints the failing expression text and its return value, calls cleanup(),
// and terminates the process with exit(-1). A zero result falls through.
// The do/while(false) wrapper makes the macro usable as a single statement.
#define RT_CHECK(_expr) \
do { \
int _ret = _expr; \
if (0 == _ret) \
break; \
printf("Error: '%s' returned %d!\n", #_expr, (int)_ret); \
cleanup(); \
exit(-1); \
} while (false)
///////////////////////////////////////////////////////////////////////////////
// Path of the kernel image to upload; replaced by the -k option.
const char* kernel_file = "kernel.bin";
// Value of the -n option; main() uses it as the per-task element count and
// bumps 0 up to 1.
uint32_t count = 0;
// Device handle filled in by vx_dev_open(); nullptr until then, which is
// what cleanup() checks before releasing device resources.
vx_device_h device = nullptr;
// Staging buffer used for every host<->device copy; nullptr until allocated.
vx_buffer_h staging_buf = nullptr;
// Launch parameters; filled in by main() and copied to the device.
kernel_arg_t kernel_arg;
// Writes the program banner and the command-line synopsis to stdout.
static void show_usage() {
  std::cout << "Vortex Test." << std::endl
            << "Usage: [-k: kernel] [-n words] [-h: help]" << std::endl;
}
// Parses the command line with getopt():
//   -n <words>  stored in the global `count` (converted with atoi)
//   -k <file>   stored in the global `kernel_file`
//   -h / -?     prints the usage text and exits with status 0
// Any other value returned by getopt() prints the usage text and exits
// with status -1.
static void parse_args(int argc, char **argv) {
  for (;;) {
    const int opt = getopt(argc, argv, "n:k:h?");
    if (opt == -1) {
      break;
    }
    if (opt == 'n') {
      count = atoi(optarg);
    } else if (opt == 'k') {
      kernel_file = optarg;
    } else if (opt == 'h' || opt == '?') {
      show_usage();
      exit(0);
    } else {
      show_usage();
      exit(-1);
    }
  }
}
// Releases everything main() may have acquired: the staging buffer (if one
// was allocated) and, when a device is open, the three device buffers
// recorded in kernel_arg followed by the device connection itself.
// Also invoked by RT_CHECK on failure, so it can run with only part of the
// setup completed.
void cleanup() {
  if (staging_buf) {
    vx_buf_free(staging_buf);
  }

  if (!device) {
    return;
  }

  vx_mem_free(device, kernel_arg.src0_addr);
  vx_mem_free(device, kernel_arg.src1_addr);
  vx_mem_free(device, kernel_arg.dst_addr);
  vx_dev_close(device);
}
// Runs the uploaded kernel once and checks its output.
//
// kernel_arg: launch parameters; only dst_addr is read here.
// buf_size:   size in bytes of the destination buffer to download.
// num_points: number of int32 results to verify.
//
// Returns 0 when every result matches, 1 when at least one differs. Runtime
// API failures do not return: RT_CHECK cleans up and exits the process.
int run_test(const kernel_arg_t& kernel_arg,
             uint32_t buf_size,
             uint32_t num_points) {
  std::cout << "start device" << std::endl;
  RT_CHECK(vx_start(device));

  std::cout << "wait for completion" << std::endl;
  RT_CHECK(vx_ready_wait(device, -1));

  std::cout << "download destination buffer" << std::endl;
  RT_CHECK(vx_copy_from_dev(staging_buf, kernel_arg.dst_addr, buf_size, 0));

  std::cout << "verify result" << std::endl;
  const auto results = (int32_t*)vx_host_ptr(staging_buf);
  int mismatches = 0;
  for (uint32_t idx = 0; idx < num_points; ++idx) {
    // The expected sum of the two inputs at index idx is 2*idx.
    const int expected = idx + idx;
    const int actual = results[idx];
    if (actual == expected) {
      continue;
    }
    std::cout << "error at result #" << std::dec << idx
              << std::hex << ": actual 0x" << actual << ", expected 0x" << expected << std::endl;
    ++mismatches;
  }

  if (mismatches == 0) {
    return 0;
  }

  std::cout << "Found " << std::dec << mismatches << " errors!" << std::endl;
  std::cout << "FAILED!" << std::endl;
  return 1;
}
// Host driver for the test: sizes the problem from the device capabilities,
// uploads the kernel and its inputs, runs it through run_test(), and prints
// "PASSED!" on success. Any failing runtime call (or a failed verification)
// goes through RT_CHECK, which cleans up and exits with -1.
int main(int argc, char *argv[]) {
// receives each device address returned by vx_mem_alloc()
size_t value;
// parse command arguments
parse_args(argc, argv);
// a per-task element count of 0 (the default) is treated as 1
if (count == 0) {
count = 1;
}
// open device connection
std::cout << "open device connection" << std::endl;
RT_CHECK(vx_dev_open(&device));
// one task per hardware thread: cores * warps * threads
uint64_t max_cores, max_warps, max_threads;
RT_CHECK(vx_dev_caps(device, VX_CAPS_MAX_CORES, &max_cores));
RT_CHECK(vx_dev_caps(device, VX_CAPS_MAX_WARPS, &max_warps));
RT_CHECK(vx_dev_caps(device, VX_CAPS_MAX_THREADS, &max_threads));
uint32_t num_tasks = max_cores * max_warps * max_threads;
// NOTE(review): these products are computed in 32 bits; very large -n
// values would wrap silently.
uint32_t num_points = count * num_tasks;
uint32_t buf_size = num_points * sizeof(int32_t);
std::cout << "number of points: " << num_points << std::endl;
std::cout << "buffer size: " << buf_size << " bytes" << std::endl;
// upload program
std::cout << "upload program" << std::endl;
RT_CHECK(vx_upload_kernel_file(device, kernel_file));
// allocate device memory: two inputs and one output, buf_size bytes each
std::cout << "allocate device memory" << std::endl;
RT_CHECK(vx_mem_alloc(device, buf_size, &value));
kernel_arg.src0_addr = value;
RT_CHECK(vx_mem_alloc(device, buf_size, &value));
kernel_arg.src1_addr = value;
RT_CHECK(vx_mem_alloc(device, buf_size, &value));
kernel_arg.dst_addr = value;
kernel_arg.num_tasks = num_tasks;
kernel_arg.task_size = count;
// std::hex is sticky: std::cout stays in hex mode after these lines
std::cout << "dev_src0=" << std::hex << kernel_arg.src0_addr << std::endl;
std::cout << "dev_src1=" << std::hex << kernel_arg.src1_addr << std::endl;
std::cout << "dev_dst=" << std::hex << kernel_arg.dst_addr << std::endl;
// allocate the staging buffer, large enough for either a data buffer or
// the kernel argument block
// NOTE(review): std::max is used but <algorithm> is not included directly
std::cout << "allocate shared memory" << std::endl;
uint32_t alloc_size = std::max<uint32_t>(buf_size, sizeof(kernel_arg_t));
RT_CHECK(vx_buf_alloc(device, alloc_size, &staging_buf));
// upload kernel argument
std::cout << "upload kernel argument" << std::endl;
{
auto buf_ptr = (int*)vx_host_ptr(staging_buf);
memcpy(buf_ptr, &kernel_arg, sizeof(kernel_arg_t));
RT_CHECK(vx_copy_to_dev(staging_buf, KERNEL_ARG_DEV_MEM_ADDR, sizeof(kernel_arg_t), 0));
}
// upload source buffer0: element i holds i-1
{
auto buf_ptr = (int32_t*)vx_host_ptr(staging_buf);
for (uint32_t i = 0; i < num_points; ++i) {
buf_ptr[i] = i-1;
}
}
std::cout << "upload source buffer0" << std::endl;
RT_CHECK(vx_copy_to_dev(staging_buf, kernel_arg.src0_addr, buf_size, 0));
// upload source buffer1: element i holds i+1, so src0[i] + src1[i] == 2*i
{
auto buf_ptr = (int32_t*)vx_host_ptr(staging_buf);
for (uint32_t i = 0; i < num_points; ++i) {
buf_ptr[i] = i+1;
}
}
std::cout << "upload source buffer1" << std::endl;
RT_CHECK(vx_copy_to_dev(staging_buf, kernel_arg.src1_addr, buf_size, 0));
// clear destination buffer: fill it with a sentinel pattern before the run
{
auto buf_ptr = (int32_t*)vx_host_ptr(staging_buf);
for (uint32_t i = 0; i < num_points; ++i) {
buf_ptr[i] = 0xdeadbeef;
}
}
std::cout << "clear destination buffer" << std::endl;
RT_CHECK(vx_copy_to_dev(staging_buf, kernel_arg.dst_addr, buf_size, 0));
// run tests; a non-zero result from run_test() makes RT_CHECK exit
std::cout << "run tests" << std::endl;
RT_CHECK(run_test(kernel_arg, buf_size, num_points));
// cleanup
std::cout << "cleanup" << std::endl;
cleanup();
std::cout << "PASSED!" << std::endl;
return 0;
}

View File

@@ -1,78 +1,9 @@
XLEN ?= 32
RISCV_TOOLCHAIN_PATH ?= /opt/riscv-gnu-toolchain
VORTEX_DRV_PATH ?= $(realpath ../../../driver)
VORTEX_RT_PATH ?= $(realpath ../../../runtime)
OPTS ?= -n4
VX_CC = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-gcc
VX_CXX = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-g++
VX_DP = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-objdump
VX_CP = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-objcopy
VX_CFLAGS += -march=rv32imf -mabi=ilp32f -O3 -Wstack-usage=1024 -ffreestanding -nostartfiles -fdata-sections -ffunction-sections
VX_CFLAGS += -I$(VORTEX_RT_PATH)/include -I$(VORTEX_RT_PATH)/../hw
VX_LDFLAGS += -Wl,-Bstatic,-T,$(VORTEX_RT_PATH)/linker/vx_link$(XLEN).ld -Wl,--gc-sections $(VORTEX_RT_PATH)/libvortexrt.a
VX_SRCS = kernel.c
CXXFLAGS += -std=c++11 -Wall -Wextra -pedantic -Wfatal-errors
CXXFLAGS += -I$(VORTEX_DRV_PATH)/include
LDFLAGS += -L$(VORTEX_DRV_PATH)/stub -lvortex
# Debugging
ifdef DEBUG
CXXFLAGS += -g -O0
else
CXXFLAGS += -O2 -DNDEBUG
endif
PROJECT = printf
SRCS = main.cpp
all: $(PROJECT) kernel.bin kernel.dump
kernel.dump: kernel.elf
$(VX_DP) -D kernel.elf > kernel.dump
VX_SRCS = kernel.cpp
kernel.bin: kernel.elf
$(VX_CP) -O binary kernel.elf kernel.bin
OPTS ?= -n4
kernel.elf: $(VX_SRCS)
$(VX_CC) $(VX_CFLAGS) $(VX_SRCS) $(VX_LDFLAGS) -o kernel.elf
$(PROJECT): $(SRCS)
$(CXX) $(CXXFLAGS) $^ $(LDFLAGS) -o $@
run-simx: $(PROJECT) kernel.bin
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/simx:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
run-fpga: $(PROJECT) kernel.bin
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/fpga:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
run-asesim: $(PROJECT) kernel.bin
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/asesim:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
run-vlsim: $(PROJECT) kernel.bin
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/vlsim:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
run-rtlsim: $(PROJECT) kernel.bin
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/rtlsim:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
.depend: $(SRCS)
$(CXX) $(CXXFLAGS) -MM $^ > .depend;
clean:
rm -rf $(PROJECT) *.o .depend
clean-all: clean
rm -rf *.elf *.bin *.dump
ifneq ($(MAKECMDGOALS),clean)
-include .depend
endif
include ../common.mk

View File

@@ -5,7 +5,7 @@
typedef struct {
uint32_t num_points;
uint32_t src_addr;
uint64_t src_addr;
} kernel_arg_t;
#endif

View File

@@ -4,13 +4,15 @@
#include <vx_spawn.h>
#include "common.h"
void kernel_body(int task_id, kernel_arg_t* arg) {
void kernel_body(int task_id, kernel_arg_t* __UNIFORM__ arg) {
int cid = vx_core_id();
int* src_ptr = (int*)arg->src_addr;
char value = 'A' + src_ptr[task_id];
vx_printf("task=%d, value=%c\n", task_id, value);
vx_printf("cid=%d: task=%d, value=%c\n", cid, task_id, value);
}
void main() {
int main() {
kernel_arg_t* arg = (kernel_arg_t*)KERNEL_ARG_DEV_MEM_ADDR;
vx_spawn_tasks(arg->num_points, (vx_spawn_tasks_cb)kernel_body, arg);
}
return 0;
}

View File

@@ -1,6 +1,7 @@
#include <iostream>
#include <unistd.h>
#include <string.h>
#include <vector>
#include <vortex.h>
#include "common.h"
@@ -20,8 +21,8 @@ const char* kernel_file = "kernel.bin";
uint32_t count = 4;
vx_device_h device = nullptr;
vx_buffer_h staging_buf = nullptr;
kernel_arg_t kernel_arg;
std::vector<uint8_t> staging_buf;
kernel_arg_t kernel_arg = {};
static void show_usage() {
std::cout << "Vortex Test." << std::endl;
@@ -51,9 +52,6 @@ static void parse_args(int argc, char **argv) {
}
void cleanup() {
if (staging_buf) {
vx_buf_free(staging_buf);
}
if (device) {
vx_mem_free(device, kernel_arg.src_addr);
vx_dev_close(device);
@@ -67,14 +65,12 @@ int run_test() {
// wait for completion
std::cout << "wait for completion" << std::endl;
RT_CHECK(vx_ready_wait(device, MAX_TIMEOUT));
RT_CHECK(vx_ready_wait(device, VX_MAX_TIMEOUT));
return 0;
}
int main(int argc, char *argv[]) {
size_t value;
int main(int argc, char *argv[]) {
// parse command arguments
parse_args(argc, argv);
@@ -86,10 +82,9 @@ int main(int argc, char *argv[]) {
std::cout << "open device connection" << std::endl;
RT_CHECK(vx_dev_open(&device));
uint64_t max_cores, max_warps, max_threads;
RT_CHECK(vx_dev_caps(device, VX_CAPS_MAX_CORES, &max_cores));
RT_CHECK(vx_dev_caps(device, VX_CAPS_MAX_WARPS, &max_warps));
RT_CHECK(vx_dev_caps(device, VX_CAPS_MAX_THREADS, &max_threads));
uint64_t num_warps, num_threads;
RT_CHECK(vx_dev_caps(device, VX_CAPS_NUM_WARPS, &num_warps));
RT_CHECK(vx_dev_caps(device, VX_CAPS_NUM_THREADS, &num_threads));
uint32_t num_points = count;
uint32_t buf_size = count * sizeof(int32_t);
@@ -102,37 +97,35 @@ int main(int argc, char *argv[]) {
RT_CHECK(vx_upload_kernel_file(device, kernel_file));
// allocate device memory
std::cout << "allocate device memory" << std::endl;
RT_CHECK(vx_mem_alloc(device, buf_size, &value));
kernel_arg.src_addr = value;
std::cout << "allocate device memory" << std::endl;
RT_CHECK(vx_mem_alloc(device, buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.src_addr));
kernel_arg.num_points = num_points;
std::cout << "dev_src=" << std::hex << kernel_arg.src_addr << std::endl;
std::cout << "dev_src=0x" << std::hex << kernel_arg.src_addr << std::endl;
// allocate shared memory
std::cout << "allocate shared memory" << std::endl;
// allocate staging buffer
std::cout << "allocate staging buffer" << std::endl;
uint32_t alloc_size = std::max<uint32_t>(buf_size, sizeof(kernel_arg_t));
RT_CHECK(vx_buf_alloc(device, alloc_size, &staging_buf));
staging_buf.resize(alloc_size);
// upload kernel argument
std::cout << "upload kernel argument" << std::endl;
// upload kernel argument
{
auto buf_ptr = (void*)vx_host_ptr(staging_buf);
std::cout << "upload kernel argument" << std::endl;
auto buf_ptr = (void*)staging_buf.data();
memcpy(buf_ptr, &kernel_arg, sizeof(kernel_arg_t));
RT_CHECK(vx_copy_to_dev(staging_buf, KERNEL_ARG_DEV_MEM_ADDR, sizeof(kernel_arg_t), 0));
RT_CHECK(vx_copy_to_dev(device, KERNEL_ARG_DEV_MEM_ADDR, staging_buf.data(), sizeof(kernel_arg_t)));
}
// upload source buffer0
{
auto buf_ptr = (int*)vx_host_ptr(staging_buf);
std::cout << "upload source buffer" << std::endl;
auto buf_ptr = (int32_t*)staging_buf.data();
for (uint32_t i = 0; i < num_points; ++i) {
buf_ptr[i] = i;
}
}
RT_CHECK(vx_copy_to_dev(device, kernel_arg.src_addr, staging_buf.data(), buf_size));
}
std::cout << "upload source buffer" << std::endl;
RT_CHECK(vx_copy_to_dev(staging_buf, kernel_arg.src_addr, buf_size, 0));
// run tests
std::cout << "run tests" << std::endl;

View File

@@ -1,86 +1,9 @@
XLEN ?= 32
RISCV_TOOLCHAIN_PATH ?= /opt/riscv-gnu-toolchain
VORTEX_DRV_PATH ?= $(realpath ../../../driver)
VORTEX_RT_PATH ?= $(realpath ../../../runtime)
VORTEX_HW_PATH ?= $(realpath ../../../hw)
LLVM_PREFIX ?= /opt/llvm-riscv
SYSROOT=${RISCV_TOOLCHAIN_PATH}/riscv32-unknown-elf
OPTS ?= -n16
VX_CC = ${LLVM_PREFIX}/bin/clang
VX_CXX = ${LLVM_PREFIX}/bin/clang++
VX_DP = ${LLVM_PREFIX}/bin/llvm-objdump
VX_CP = ${LLVM_PREFIX}/bin/llvm-objcopy
VX_CFLAGS += -O3 -march=rv32imf -mabi=ilp32f -fno-rtti -fno-exceptions -ffreestanding -nostartfiles -fdata-sections -ffunction-sections
VX_CFLAGS += -Xclang -target-feature -Xclang +vortex
VX_CFLAGS += --sysroot=${SYSROOT} --gcc-toolchain=${RISCV_TOOLCHAIN_PATH}
VX_CFLAGS += -I${VORTEX_HW_PATH} -I${VORTEX_RT_PATH}/include
VX_LDFLAGS += -Wl,-Bstatic,-T${VORTEX_RT_PATH}/linker/vx_link$(XLEN).ld,--gc-sections ${VORTEX_RT_PATH}/libvortexrt.a
VX_DPFLAGS = -arch=riscv32 -mcpu=generic-rv32 -mattr=+m,+f -mattr=+vortex
VX_SRCS = kernel.c
CXXFLAGS += -std=c++11 -Wall -Wextra -pedantic -Wfatal-errors
CXXFLAGS += -I$(VORTEX_DRV_PATH)/include
LDFLAGS += -L$(VORTEX_DRV_PATH)/stub -lvortex
# Debugging
ifdef DEBUG
CXXFLAGS += -g -O0
else
CXXFLAGS += -O2 -DNDEBUG
endif
PROJECT = sort
SRCS = main.cpp
all: $(PROJECT) kernel.bin kernel.dump
kernel.dump: kernel.elf
$(VX_DP) $(VX_DPFLAGS) -D kernel.elf > kernel.dump
VX_SRCS = kernel.cpp
kernel.bin: kernel.elf
$(VX_CP) -O binary kernel.elf kernel.bin
OPTS ?= -n16
kernel.elf: $(VX_SRCS)
$(VX_CC) $(VX_CFLAGS) $(VX_SRCS) $(VX_LDFLAGS) -o kernel.elf
$(PROJECT): $(SRCS)
$(CXX) $(CXXFLAGS) $^ $(LDFLAGS) -o $@
run-simx: $(PROJECT) kernel.bin
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/simx:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
run-fpga: $(PROJECT) kernel.bin
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/fpga:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
run-asesim: $(PROJECT) kernel.bin
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/asesim:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
run-vlsim: $(PROJECT) kernel.bin
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/vlsim:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
run-rtlsim: $(PROJECT) kernel.bin
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/rtlsim:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
.depend: $(SRCS)
$(CXX) $(CXXFLAGS) -MM $^ > .depend;
clean:
rm -rf $(PROJECT) *.o .depend
clean-all: clean
rm -rf *.elf *.bin *.dump
ifneq ($(MAKECMDGOALS),clean)
-include .depend
endif
include ../common.mk

View File

@@ -3,10 +3,18 @@
#define KERNEL_ARG_DEV_MEM_ADDR 0x7ffff000
#define FP_ENABLE
#ifdef FP_ENABLE
#define TYPE float
#else
#define TYPE int
#endif
typedef struct {
uint32_t num_points;
uint32_t src_addr;
uint32_t dst_addr;
uint64_t src_addr;
uint64_t dst_addr;
} kernel_arg_t;
#endif

View File

@@ -3,22 +3,23 @@
#include <vx_spawn.h>
#include "common.h"
void kernel_body(int __DIVERGENT__ task_id, kernel_arg_t* arg) {
void kernel_body(int task_id, kernel_arg_t* __UNIFORM__ arg) {
uint32_t num_points = arg->num_points;
int32_t* src_ptr = (int32_t*)arg->src_addr;
int32_t* dst_ptr = (int32_t*)arg->dst_addr;
TYPE* src_ptr = (TYPE*)arg->src_addr;
TYPE* dst_ptr = (TYPE*)arg->dst_addr;
int32_t ref_value = src_ptr[task_id];
TYPE ref_value = src_ptr[task_id];
uint32_t pos = 0;
for (uint32_t i = 0; i < num_points; ++i) {
int32_t cur_value = src_ptr[i];
TYPE cur_value = src_ptr[i];
pos += (cur_value < ref_value) || ((cur_value == ref_value) && (i < task_id));
}
dst_ptr[pos] = ref_value;
}
void main() {
int main() {
kernel_arg_t* arg = (kernel_arg_t*)KERNEL_ARG_DEV_MEM_ADDR;
vx_spawn_tasks(arg->num_points, (vx_spawn_tasks_cb)kernel_body, arg);
}
return 0;
}

View File

@@ -20,12 +20,12 @@
const char* kernel_file = "kernel.bin";
uint32_t count = 0;
std::vector<int32_t> src_data;
std::vector<int32_t> ref_data;
std::vector<TYPE> src_data;
std::vector<TYPE> ref_data;
vx_device_h device = nullptr;
vx_buffer_h staging_buf = nullptr;
kernel_arg_t kernel_arg;
std::vector<uint8_t> staging_buf;
kernel_arg_t kernel_arg = {};
static void show_usage() {
std::cout << "Vortex Test." << std::endl;
@@ -55,9 +55,6 @@ static void parse_args(int argc, char **argv) {
}
void cleanup() {
if (staging_buf) {
vx_buf_free(staging_buf);
}
if (device) {
vx_mem_free(device, kernel_arg.src_addr);
vx_mem_free(device, kernel_arg.dst_addr);
@@ -70,9 +67,9 @@ void gen_input_data(uint32_t num_points) {
for (uint32_t i = 0; i < num_points; ++i) {
float r = static_cast<float>(std::rand()) / RAND_MAX;
int32_t value = r * num_points;
TYPE value = r * num_points;
src_data[i] = value;
std::cout << std::dec << i << ": value=0x" << std::hex << value << std::endl;
std::cout << std::dec << i << ": value=" << value << std::endl;
}
}
@@ -80,13 +77,11 @@ void gen_ref_data(uint32_t num_points) {
ref_data.resize(num_points);
for (uint32_t i = 0; i < num_points; ++i) {
int32_t ref_value = src_data.at(i);
TYPE ref_value = src_data.at(i);
uint32_t pos = 0;
for (uint32_t j = 0; j < num_points; ++j) {
int32_t cur_value = src_data.at(j);
int is_smaller = (cur_value < ref_value)
|| (cur_value == ref_value && j < i);
pos += is_smaller;
TYPE cur_value = src_data.at(j);
pos += (cur_value < ref_value) || (cur_value == ref_value && j < i);
}
ref_data.at(pos) = ref_value;
}
@@ -101,23 +96,23 @@ int run_test(const kernel_arg_t& kernel_arg,
// wait for completion
std::cout << "wait for completion" << std::endl;
RT_CHECK(vx_ready_wait(device, MAX_TIMEOUT));
RT_CHECK(vx_ready_wait(device, VX_MAX_TIMEOUT));
// download destination buffer
std::cout << "download destination buffer" << std::endl;
RT_CHECK(vx_copy_from_dev(staging_buf, kernel_arg.dst_addr, buf_size, 0));
RT_CHECK(vx_copy_from_dev(device, staging_buf.data(), kernel_arg.dst_addr, buf_size));
// verify result
std::cout << "verify result" << std::endl;
{
int errors = 0;
auto buf_ptr = (int32_t*)vx_host_ptr(staging_buf);
auto buf_ptr = (TYPE*)staging_buf.data();
for (uint32_t i = 0; i < num_points; ++i) {
int ref = ref_data.at(i);
int cur = buf_ptr[i];
TYPE ref = ref_data.at(i);
TYPE cur = buf_ptr[i];
if (cur != ref) {
std::cout << "error at result #" << std::dec << i
<< std::hex << ": actual 0x" << cur << ", expected 0x" << ref << std::endl;
<< std::hex << ": actual=" << cur << ", expected=" << ref << std::endl;
++errors;
}
}
@@ -131,9 +126,7 @@ int run_test(const kernel_arg_t& kernel_arg,
return 0;
}
int main(int argc, char *argv[]) {
size_t value;
int main(int argc, char *argv[]) {
// parse command arguments
parse_args(argc, argv);
@@ -166,52 +159,49 @@ int main(int argc, char *argv[]) {
RT_CHECK(vx_upload_kernel_file(device, kernel_file));
// allocate device memory
std::cout << "allocate device memory" << std::endl;
RT_CHECK(vx_mem_alloc(device, src_buf_size, &value));
kernel_arg.src_addr = value;
RT_CHECK(vx_mem_alloc(device, dst_buf_size, &value));
kernel_arg.dst_addr = value;
std::cout << "allocate device memory" << std::endl;
RT_CHECK(vx_mem_alloc(device, src_buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.src_addr));
RT_CHECK(vx_mem_alloc(device, dst_buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.dst_addr));
kernel_arg.num_points = num_points;
std::cout << "dev_src=" << std::hex << kernel_arg.src_addr << std::endl;
std::cout << "dev_dst=" << std::hex << kernel_arg.dst_addr << std::endl;
std::cout << "dev_src=0x" << std::hex << kernel_arg.src_addr << std::endl;
std::cout << "dev_dst=0x" << std::hex << kernel_arg.dst_addr << std::endl;
// allocate shared memory
std::cout << "allocate shared memory" << std::endl;
uint32_t staging_buf_size = std::max<uint32_t>(src_buf_size,
std::max<uint32_t>(dst_buf_size,
sizeof(kernel_arg_t)));
RT_CHECK(vx_buf_alloc(device, staging_buf_size, &staging_buf));
// upload kernel argument
std::cout << "upload kernel argument" << std::endl;
// allocate staging buffer
{
auto buf_ptr = (int*)vx_host_ptr(staging_buf);
std::cout << "allocate staging buffer" << std::endl;
uint32_t staging_buf_size = std::max<uint32_t>(src_buf_size,
std::max<uint32_t>(dst_buf_size,
sizeof(kernel_arg_t)));
staging_buf.resize(staging_buf_size);
}
// upload kernel argument
{
std::cout << "upload kernel argument" << std::endl;
auto buf_ptr = staging_buf.data();
memcpy(buf_ptr, &kernel_arg, sizeof(kernel_arg_t));
RT_CHECK(vx_copy_to_dev(staging_buf, KERNEL_ARG_DEV_MEM_ADDR, sizeof(kernel_arg_t), 0));
RT_CHECK(vx_copy_to_dev(device, KERNEL_ARG_DEV_MEM_ADDR, staging_buf.data(), sizeof(kernel_arg_t)));
}
// upload source buffer
{
auto buf_ptr = (int32_t*)vx_host_ptr(staging_buf);
for (uint32_t i = 0; i < num_points; ++i) {
buf_ptr[i] = src_data.at(i);
}
std::cout << "upload source buffer" << std::endl;
auto buf_ptr = staging_buf.data();
memcpy(buf_ptr, src_data.data(), num_points * sizeof(TYPE));
RT_CHECK(vx_copy_to_dev(device, kernel_arg.src_addr, staging_buf.data(), src_buf_size));
}
std::cout << "upload source buffer" << std::endl;
RT_CHECK(vx_copy_to_dev(staging_buf, kernel_arg.src_addr, src_buf_size, 0));
// clear destination buffer
{
auto buf_ptr = (int32_t*)vx_host_ptr(staging_buf);
std::cout << "clear destination buffer" << std::endl;
auto buf_ptr = (int32_t*)staging_buf.data();
for (uint32_t i = 0; i < num_points; ++i) {
buf_ptr[i] = 0xdeadbeef;
}
}
RT_CHECK(vx_copy_to_dev(device, kernel_arg.dst_addr, staging_buf.data(), dst_buf_size));
}
std::cout << "clear destination buffer" << std::endl;
RT_CHECK(vx_copy_to_dev(staging_buf, kernel_arg.dst_addr, dst_buf_size, 0));
// run tests
std::cout << "run tests" << std::endl;

View File

@@ -1,78 +0,0 @@
# Build and run rules for the "tex" (texture sampling) regression test: a
# RISC-V kernel image (kernel.bin) plus a host driver program ($(PROJECT)).
# Every "?=" variable below may be overridden from the environment or the
# make command line.
XLEN ?= 32
RISCV_TOOLCHAIN_PATH ?= /opt/riscv-gnu-toolchain
VORTEX_DRV_PATH ?= $(realpath ../../../driver)
VORTEX_RT_PATH ?= $(wildcard ../../../runtime)
# Arguments passed to the host program by the run-* targets.
OPTS ?= -g1
# Cross toolchain used for the device-side kernel.
VX_CC = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-gcc
VX_CXX = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-g++
VX_DP = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-objdump
VX_CP = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-objcopy
# Kernel compile/link flags. The kernel is built as C++11 (see the kernel.elf
# rule, which uses $(VX_CXX)) with ENABLE_SW defined.
VX_CFLAGS += -std=c++11 -march=rv32imf -mabi=ilp32f -O3 -Wstack-usage=1024 -ffreestanding -nostartfiles -fdata-sections -ffunction-sections
VX_CFLAGS += -DENABLE_SW -I$(VORTEX_RT_PATH)/include -I$(VORTEX_RT_PATH)/../hw -I$(VORTEX_RT_PATH)/../sim/common -I$(VORTEX_RT_PATH)/../third_party
VX_LDFLAGS += -Wl,-Bstatic,-T,$(VORTEX_RT_PATH)/linker/vx_link$(XLEN).ld -Wl,--gc-sections $(VORTEX_RT_PATH)/libvortexrt.a
VX_SRCS = kernel.c
# Host program flags; the host links against the stub driver library, the
# bundled cocogfx static library and zlib.
CXXFLAGS += -std=c++11 -Wall -Wextra -Wfatal-errors
CXXFLAGS += -I$(VORTEX_DRV_PATH)/include -I$(VORTEX_RT_PATH)/../hw -I$(VORTEX_RT_PATH)/../sim/common -I$(VORTEX_RT_PATH)/../third_party
LDFLAGS += -L$(VORTEX_DRV_PATH)/stub -lvortex $(VORTEX_RT_PATH)/../third_party/cocogfx/libcocogfx.a -lz
# Debugging: define DEBUG for an unoptimized host build with symbols.
ifdef DEBUG
CXXFLAGS += -g -O0
else
CXXFLAGS += -O2 -DNDEBUG
endif
PROJECT = tex
SRCS = main.cpp utils.cpp
all: $(PROJECT) kernel.bin kernel.dump
# Disassembly of the kernel ELF.
kernel.dump: kernel.elf
$(VX_DP) -D kernel.elf > kernel.dump
# Raw binary image of the kernel; this is what the host program uploads.
kernel.bin: kernel.elf
$(VX_CP) -O binary kernel.elf kernel.bin
kernel.elf: $(VX_SRCS)
$(VX_CXX) $(VX_CFLAGS) $(VX_SRCS) $(VX_LDFLAGS) -o kernel.elf
$(PROJECT): $(SRCS)
$(CXX) $(CXXFLAGS) $^ $(LDFLAGS) -o $@
# The run-* targets differ only in which driver directory is placed on
# LD_LIBRARY_PATH ahead of the host program.
# NOTE(review): POCL_RT_PATH is not defined in this file; presumably it is
# expected from the environment.
run-simx: $(PROJECT) kernel.bin
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/simx:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
run-fpga: $(PROJECT) kernel.bin
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/fpga:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
run-asesim: $(PROJECT) kernel.bin
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/asesim:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
run-vlsim: $(PROJECT) kernel.bin
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/vlsim:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
run-rtlsim: $(PROJECT) kernel.bin
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/rtlsim:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
# Header dependency list for the host sources, generated with -MM.
.depend: $(SRCS)
$(CXX) $(CXXFLAGS) -MM $^ > .depend;
# "clean" removes host build products; "clean-all" also removes kernel images.
clean:
rm -rf $(PROJECT) *.o .depend
clean-all: clean
rm -rf *.elf *.bin *.dump
# Pull in the generated dependencies unless the goal is exactly "clean".
ifneq ($(MAKECMDGOALS),clean)
-include .depend
endif

View File

@@ -1,26 +0,0 @@
#ifndef _COMMON_H_
#define _COMMON_H_

/* The fixed-width types below come from <stdint.h>; include it here so the
 * header does not depend on the including file having done so first. */
#include <stdint.h>
#include <VX_config.h>

/* Device address from which the kernel entry point reads the kernel_arg_t
 * block. */
#define KERNEL_ARG_DEV_MEM_ADDR 0x7ffff000

/* Launch parameters for the texture test, shared between the host program
 * and the device kernel. Both sides must be built against the same layout. */
typedef struct {
  bool     use_sw;        /* presumably selects the software sampling path; TODO confirm */
  uint32_t num_tasks;     /* number of tasks the destination rows are split across */
  uint8_t  format;        /* texel format, written to CSR_TEX_FORMAT */
  uint8_t  filter;        /* any non-zero value is written to CSR_TEX_FILTER as 1 */
  uint8_t  wrapu;         /* wrap mode along u, written to CSR_TEX_WRAPU */
  uint8_t  wrapv;         /* wrap mode along v, written to CSR_TEX_WRAPV */
  uint8_t  src_logwidth;  /* log2 of the source width (kernel uses 1 << src_logwidth) */
  uint8_t  src_logheight; /* log2 of the source height (kernel uses 1 << src_logheight) */
  uint32_t src_addr;      /* device address of the source texture, written to CSR_TEX_ADDR */
  uint32_t mip_offs[TEX_LOD_MAX+1]; /* per-level offsets, written to CSR_TEX_MIPOFF(i) */
  uint32_t dst_width;     /* destination width in pixels */
  uint32_t dst_height;    /* destination height in pixels */
  uint8_t  dst_stride;    /* multiplied by the x offset; presumably bytes per pixel, TODO confirm */
  uint32_t dst_pitch;     /* byte distance between consecutive destination rows */
  uint32_t dst_addr;      /* device address of the destination image */
} kernel_arg_t;

#endif

Binary file not shown.

Before

Width:  |  Height:  |  Size: 14 MiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 4.2 MiB

View File

@@ -1,98 +0,0 @@
#include <stdint.h>
#include <vx_intrinsics.h>
#include <vx_spawn.h>
#include <vx_print.h>
#include "texsw.h"
// Per-launch tiling parameters handed to every kernel_body() invocation.
struct tile_arg_t {
  kernel_arg_t* state;   // launch parameters read from device memory
  uint32_t tile_width;   // pixels per row processed by one task
  uint32_t tile_height;  // rows processed by one task
  float deltaX;          // step in u between horizontally adjacent pixels
  float deltaY;          // step in v between consecutive rows
  float minification;    // larger of the source/destination width and height ratios
};
// Compile-time unrolled loop: invoking static_for_t<T, Start, End>()(fn)
// calls fn(Start), fn(Start + 1), ..., fn(End - 1) in order, via template
// recursion rather than a runtime loop.
template <typename T, T Begin, T Limit>
struct static_for_t {
  template <typename Callable>
  inline void operator()(const Callable& fn) const {
    fn(Begin);
    static_for_t<T, Begin + 1, Limit> remaining;
    remaining(fn);
  }
};

// Recursion terminator: once the index reaches the limit, nothing is called.
template <typename T, T Limit>
struct static_for_t<T, Limit, Limit> {
  template <typename Callable>
  inline void operator()(const Callable&) const {}
};
// Per-task body: fills a horizontal band of the destination image,
// `tile_height` rows starting at row `task_id * tile_height`, by sampling
// the texture once per pixel with tex_load() and storing the 32-bit result.
//
// task_id: index of the band to render.
// arg:     tiling parameters shared by all tasks.
//
// NOTE(review): the loop always runs `tile_height` rows. If tile_height is a
// rounded-up share of dst_height, the last band(s) may write rows beyond
// dst_height; confirm the destination buffer is sized for that.
void kernel_body(int task_id, tile_arg_t* arg) {
kernel_arg_t* state = arg->state;
uint32_t xoffset = 0;
uint32_t yoffset = task_id * arg->tile_height;
// byte address of the first pixel of this band
uint8_t* dst_ptr = (uint8_t*)(state->dst_addr + xoffset * state->dst_stride + yoffset * state->dst_pitch);
// third coordinate passed to tex_load(), built from the minification ratio
Fixed<16> xj(arg->minification);
/*vx_printf("task_id=%d, tile_width=%d, tile_height=%d, deltaX=%f, deltaY=%f, minification=%f\n",
task_id, arg->tile_width, arg->tile_height, arg->deltaX, arg->deltaY, arg->minification);*/
// the +0.5f offsets sample at pixel centres; fv/fu then advance by one
// delta per row/pixel
float fv = (yoffset + 0.5f) * arg->deltaY;
for (uint32_t y = 0; y < arg->tile_height; ++y) {
uint32_t* dst_row = (uint32_t*)dst_ptr;
float fu = (xoffset + 0.5f) * arg->deltaX;
for (uint32_t x = 0; x < arg->tile_width; ++x) {
// convert the float coordinates to fixed point for tex_load()
Fixed<TEX_FXD_FRAC> xu(fu);
Fixed<TEX_FXD_FRAC> xv(fv);
uint32_t color = tex_load(state, xu, xv, xj);
//vx_printf("task_id=%d, x=%d, y=%d, fu=%f, fv=%f, xu=0x%x, xv=0x%x, color=0x%x\n", task_id, x, y, fu, fv, xu.data(), xv.data(), color);
dst_row[x] = color;
fu += arg->deltaX;
}
// advance to the next destination row
dst_ptr += state->dst_pitch;
fv += arg->deltaY;
}
}
// Kernel entry point: programs the texture unit from the launch parameters,
// derives the per-task tiling parameters, and spawns one kernel_body()
// invocation per task. Always returns 0.
int main() {
kernel_arg_t* arg = (kernel_arg_t*)KERNEL_ARG_DEV_MEM_ADDR;
// configure texture unit
csr_write(CSR_TEX_UNIT, 0);
csr_write(CSR_TEX_WIDTH, arg->src_logwidth);
csr_write(CSR_TEX_HEIGHT, arg->src_logheight);
csr_write(CSR_TEX_FORMAT, arg->format);
csr_write(CSR_TEX_WRAPU, arg->wrapu);
csr_write(CSR_TEX_WRAPV, arg->wrapv);
// any non-zero filter value is programmed as 1
csr_write(CSR_TEX_FILTER, (arg->filter ? 1 : 0));
csr_write(CSR_TEX_ADDR, arg->src_addr);
// one mip-offset register per level, 0..TEX_LOD_MAX; unrolled at compile
// time (presumably because csr_write needs a constant register number;
// TODO confirm)
static_for_t<int, 0, TEX_LOD_MAX+1>()([&](int i) {
csr_write(CSR_TEX_MIPOFF(i), arg->mip_offs[i]);
});
tile_arg_t targ;
targ.state = arg;
// each task covers full rows; the row count per task is rounded up
// NOTE(review): num_tasks, dst_width and dst_height are used as divisors
// below without a zero check.
targ.tile_width = arg->dst_width;
targ.tile_height = (arg->dst_height + arg->num_tasks - 1) / arg->num_tasks;
targ.deltaX = 1.0f / arg->dst_width;
targ.deltaY = 1.0f / arg->dst_height;
{
// minification = larger of the source/destination size ratios
uint32_t src_width = (1 << arg->src_logwidth);
uint32_t src_height = (1 << arg->src_logheight);
float width_ratio = float(src_width) / arg->dst_width;
float height_ratio = float(src_height) / arg->dst_height;
targ.minification = std::max<float>(width_ratio, height_ratio);
}
vx_spawn_tasks(arg->num_tasks, (vx_spawn_tasks_cb)kernel_body, &targ);
// sequential alternative to the spawn above, kept disabled
/*for (uint32_t t=0; t < arg->num_tasks; ++t) {
kernel_body(t, &targ);
}*/
return 0;
}

View File

@@ -1,274 +0,0 @@
#include <iostream>
#include <vector>
#include <unistd.h>
#include <string.h>
#include <chrono>
#include <cmath>
#include <assert.h>
#include <vortex.h>
#include "common.h"
#include "utils.h"
using namespace cocogfx;
// Evaluates _expr once as an int status. A zero result means success and
// execution continues; any other value is reported on stdout together with the
// expression text, after which cleanup() is called and the process exits with
// status -1.
#define RT_CHECK(_expr) \
do { \
int _ret = _expr; \
if (0 == _ret) \
break; \
printf("Error: '%s' returned %d!\n", #_expr, (int)_ret); \
cleanup(); \
exit(-1); \
} while (false)
///////////////////////////////////////////////////////////////////////////////
// Test configuration, overridable from the command line (see parse_args).
// kernel binary uploaded to the device (-k)
const char* kernel_file = "kernel.bin";
// source texture image (-i)
const char* input_file = "palette64.png";
// image file the rendered result is saved to (-o)
const char* output_file = "output.png";
// wrap mode, applied to both the u and v axes (-w)
int wrap = 0;
int filter = 0; // 0-> point, 1->bilinear, 2->trilinear
// destination size as a multiple of the source size (-s)
float scale = 1.0f;
// numeric texture format index passed to the kernel (-f)
int format = 0;
// when true the kernel samples in software instead of calling vx_tex (-z)
bool use_sw = false;
// pixel format matching 'format'; the source image is loaded in this format
ePixelFormat eformat = FORMAT_A8R8G8B8;
// device and staging-buffer handles; released by cleanup() when non-null
vx_device_h device = nullptr;
vx_buffer_h buffer = nullptr;
// host copy of the kernel arguments that is uploaded to the device
kernel_arg_t kernel_arg;
// Prints the program banner and the list of command line options to stdout.
static void show_usage() {
  std::cout << "Vortex Texture Test." << std::endl
            << "Usage: [-k: kernel] [-i image] [-o image] [-s scale] [-w wrap] [-f format] [-g filter] [-z no_hw] [-h: help]" << std::endl;
}
static void parse_args(int argc, char **argv) {
int c;
while ((c = getopt(argc, argv, "zi:o:k:w:f:g:s:h?")) != -1) {
switch (c) {
case 'i':
input_file = optarg;
break;
case 'o':
output_file = optarg;
break;
case 's':
scale = std::stof(optarg, NULL);
break;
case 'w':
wrap = std::atoi(optarg);
break;
case 'z':
use_sw = true;
break;
case 'f': {
format = std::atoi(optarg);
switch (format) {
case 0: eformat = FORMAT_A8R8G8B8; break;
case 1: eformat = FORMAT_R5G6B5; break;
case 2: eformat = FORMAT_A1R5G5B5; break;
case 3: eformat = FORMAT_A4R4G4B4; break;
case 4: eformat = FORMAT_A8L8; break;
case 5: eformat = FORMAT_L8; break;
case 6: eformat = FORMAT_A8; break;
default:
std::cout << "Error: invalid format: " << format << std::endl;
exit(1);
}
} break;
case 'g':
filter = std::atoi(optarg);
break;
case 'k':
kernel_file = optarg;
break;
case 'h':
case '?': {
show_usage();
exit(0);
} break;
default:
show_usage();
exit(-1);
}
}
}
// Releases the staging buffer (if allocated) and, when a device is open, the
// source and destination device allocations followed by the device itself.
void cleanup() {
  if (buffer != nullptr)
    vx_buf_free(buffer);

  if (device == nullptr)
    return;

  vx_mem_free(device, kernel_arg.src_addr);
  vx_mem_free(device, kernel_arg.dst_addr);
  vx_dev_close(device);
}
int run_test(const kernel_arg_t& kernel_arg,
uint32_t buf_size,
uint32_t width,
uint32_t height,
uint32_t bpp) {
(void)bpp;
auto time_start = std::chrono::high_resolution_clock::now();
// start device
std::cout << "start device" << std::endl;
RT_CHECK(vx_start(device));
// wait for completion
std::cout << "wait for completion" << std::endl;
RT_CHECK(vx_ready_wait(device, MAX_TIMEOUT));
auto time_end = std::chrono::high_resolution_clock::now();
double elapsed = std::chrono::duration_cast<std::chrono::milliseconds>(time_end - time_start).count();
printf("Elapsed time: %lg ms\n", elapsed);
// download destination buffer
std::cout << "download destination buffer" << std::endl;
RT_CHECK(vx_copy_from_dev(buffer, kernel_arg.dst_addr, buf_size, 0));
std::vector<uint8_t> dst_pixels(buf_size);
auto buf_ptr = (uint8_t*)vx_host_ptr(buffer);
for (uint32_t i = 0; i < buf_size; ++i) {
dst_pixels[i] = buf_ptr[i];
}
// save output image
std::cout << "save output image" << std::endl;
//dump_image(dst_pixels, width, height, bpp);
RT_CHECK(SaveImage(output_file, FORMAT_A8R8G8B8, dst_pixels, width, height));
return 0;
}
// Host-side entry point of the texture test.
//
// Loads the source image and builds its mipmap chain, opens the device,
// uploads the kernel, its arguments and the texture, runs the kernel and
// saves the rendered image. Returns 0 on success and -1 on invalid input;
// a failing runtime call terminates the process through RT_CHECK.
int main(int argc, char *argv[]) {
  std::vector<uint8_t> src_pixels;
  std::vector<uint32_t> mip_offsets;
  uint32_t src_width;
  uint32_t src_height;

  // parse command arguments
  parse_args(argc, argv);

  // a non-positive (or NaN) scale cannot be converted to a valid image size
  if (!(scale > 0.0f)) {
    std::cout << "Error: invalid scale: " << scale << std::endl;
    return -1;
  }

  {
    std::vector<uint8_t> staging;
    RT_CHECK(LoadImage(input_file, eformat, staging, &src_width, &src_height));
    uint32_t src_bpp = GetInfo(eformat).BytePerPixel;
    //dump_image(staging, src_width, src_height, src_bpp);
    RT_CHECK(GenerateMipmaps(src_pixels, mip_offsets, staging, eformat, src_width, src_height, src_width * src_bpp));
  }

  // check power of two support
  if (!ispow2(src_width) || !ispow2(src_height)) {
    std::cout << "Error: only power of two textures supported: width=" << src_width << ", heigth=" << src_height << std::endl;
    return -1;
  }

  uint32_t src_logwidth = log2ceil(src_width);
  uint32_t src_logheight = log2ceil(src_height);
  uint32_t src_bufsize = src_pixels.size();

  uint32_t dst_width = (uint32_t)(src_width * scale);
  uint32_t dst_height = (uint32_t)(src_height * scale);
  uint32_t dst_bpp = 4;
  uint32_t dst_bufsize = dst_bpp * dst_width * dst_height;

  // the kernel divides by the destination size and by the task count, which
  // is capped by dst_height, so an empty destination must be rejected here
  if (dst_width == 0 || dst_height == 0) {
    std::cout << "Error: empty destination image: width=" << dst_width << ", height=" << dst_height << std::endl;
    return -1;
  }

  // open device connection
  std::cout << "open device connection" << std::endl;
  RT_CHECK(vx_dev_open(&device));

  uint64_t max_cores, max_warps, max_threads;
  RT_CHECK(vx_dev_caps(device, VX_CAPS_MAX_CORES, &max_cores));
  RT_CHECK(vx_dev_caps(device, VX_CAPS_MAX_WARPS, &max_warps));
  RT_CHECK(vx_dev_caps(device, VX_CAPS_MAX_THREADS, &max_threads));

  uint32_t num_tasks = max_cores * max_warps * max_threads;

  std::cout << "number of tasks: " << std::dec << num_tasks << std::endl;
  std::cout << "source buffer: width=" << src_width << ", heigth=" << src_height << ", size=" << src_bufsize << " bytes" << std::endl;
  std::cout << "destination buffer: width=" << dst_width << ", heigth=" << dst_height << ", size=" << dst_bufsize << " bytes" << std::endl;

  // upload program
  std::cout << "upload program" << std::endl;
  RT_CHECK(vx_upload_kernel_file(device, kernel_file));

  // allocate device memory
  // cleanup() frees the addresses stored in kernel_arg, so record each one as
  // soon as it is allocated; otherwise a later failure would leak it
  std::cout << "allocate device memory" << std::endl;
  uint64_t src_addr, dst_addr;
  RT_CHECK(vx_mem_alloc(device, src_bufsize, &src_addr));
  kernel_arg.src_addr = src_addr;
  RT_CHECK(vx_mem_alloc(device, dst_bufsize, &dst_addr));
  kernel_arg.dst_addr = dst_addr;

  std::cout << "src_addr=0x" << std::hex << src_addr << std::endl;
  std::cout << "dst_addr=0x" << std::hex << dst_addr << std::endl;

  // allocate staging shared memory (large enough for every transfer below)
  std::cout << "allocate shared memory" << std::endl;
  uint32_t alloc_size = std::max<uint32_t>(sizeof(kernel_arg_t),
                          std::max<uint32_t>(src_bufsize, dst_bufsize));
  RT_CHECK(vx_buf_alloc(device, alloc_size, &buffer));

  // upload kernel argument
  std::cout << "upload kernel argument" << std::endl;
  {
    kernel_arg.use_sw = use_sw;
    // never spawn more tasks than there are destination rows
    kernel_arg.num_tasks = std::min<uint32_t>(num_tasks, dst_height);
    kernel_arg.format = format;
    kernel_arg.filter = filter;
    kernel_arg.wrapu = wrap;
    kernel_arg.wrapv = wrap;

    kernel_arg.src_logwidth = src_logwidth;
    kernel_arg.src_logheight = src_logheight;
    for (uint32_t i = 0; i < mip_offsets.size(); ++i) {
      assert(i < TEX_LOD_MAX);
      kernel_arg.mip_offs[i] = mip_offsets.at(i);
    }

    kernel_arg.dst_width = dst_width;
    kernel_arg.dst_height = dst_height;
    kernel_arg.dst_stride = dst_bpp;
    kernel_arg.dst_pitch = dst_bpp * dst_width;

    auto buf_ptr = (uint8_t*)vx_host_ptr(buffer);
    memcpy(buf_ptr, &kernel_arg, sizeof(kernel_arg_t));
    RT_CHECK(vx_copy_to_dev(buffer, KERNEL_ARG_DEV_MEM_ADDR, sizeof(kernel_arg_t), 0));
  }

  // upload source buffer
  std::cout << "upload source buffer" << std::endl;
  {
    auto buf_ptr = (uint8_t*)vx_host_ptr(buffer);
    memcpy(buf_ptr, src_pixels.data(), src_bufsize);
    RT_CHECK(vx_copy_to_dev(buffer, kernel_arg.src_addr, src_bufsize, 0));
  }

  // clear destination buffer with a recognizable pattern so that pixels the
  // kernel never wrote stand out in the output
  std::cout << "clear destination buffer" << std::endl;
  {
    auto buf_ptr = (uint32_t*)vx_host_ptr(buffer);
    for (uint32_t i = 0; i < (dst_bufsize/4); ++i) {
      buf_ptr[i] = 0xdeadbeef;
    }
    RT_CHECK(vx_copy_to_dev(buffer, kernel_arg.dst_addr, dst_bufsize, 0));
  }

  // run tests
  std::cout << "run tests" << std::endl;
  RT_CHECK(run_test(kernel_arg, dst_bufsize, dst_width, dst_height, dst_bpp));

  // cleanup
  std::cout << "cleanup" << std::endl;
  cleanup();

  std::cout << "PASSED!" << std::endl;

  return 0;
}

Binary file not shown.

Before

Width:  |  Height:  |  Size: 543 B

Binary file not shown.

Before

Width:  |  Height:  |  Size: 534 B

Binary file not shown.

Before

Width:  |  Height:  |  Size: 1.3 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 1.1 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 5.4 KiB

View File

@@ -1,125 +0,0 @@
#pragma once
#include <vx_intrinsics.h>
#include <texturing.h>
#include "common.h"
using namespace cocogfx;
// Reads `count` texels of `stride` bytes each (1, 2 or 4) from the given
// addresses and stores every one zero-extended to 32 bits in `texels`.
// Any other stride aborts the program.
inline void texel_read(uint32_t* texels,
                       uint8_t** addresses,
                       uint32_t count,
                       uint32_t stride) {
  // validate up front so that a bad stride aborts even when count is zero
  if (stride != 1 && stride != 2 && stride != 4)
    std::abort();

  for (uint32_t idx = 0; idx != count; ++idx) {
    uint8_t* src = addresses[idx];
    uint32_t value;
    if (stride == 1) {
      value = *(uint8_t*)src;
    } else if (stride == 2) {
      value = *(uint16_t*)src;
    } else {
      value = *(uint32_t*)src;
    }
    texels[idx] = value;
  }
}
// Software texture sampler: fetches the color at (xu, xv) from mip level
// `lod` of the texture described by `state`. A non-zero state->filter blends
// the four texels addressed by TexAddressLinear; otherwise the single texel
// addressed by TexAddressPoint is used.
inline uint32_t vx_tex_sw(kernel_arg_t* state,
                          Fixed<TEX_FXD_FRAC> xu,
                          Fixed<TEX_FXD_FRAC> xv,
                          uint32_t lod) {
  auto format = (TexFormat)state->format;
  auto wrapu = (WrapMode)state->wrapu;
  auto wrapv = (WrapMode)state->wrapv;
  auto stride = Stride(format);

  // start of the selected mip level and its log2 dimensions (never below 0)
  uint8_t* mip_base = ((uint8_t*)state->src_addr) + state->mip_offs[lod];
  uint32_t log_width = std::max<int32_t>(state->src_logwidth - lod, 0);
  uint32_t log_height = std::max<int32_t>(state->src_logheight - lod, 0);

  if (!state->filter) {
    // point sampling: address, fetch and convert a single texel
    uint32_t offset;
    TexAddressPoint(xu, xv, log_width, log_height, wrapu, wrapv, &offset);

    uint8_t* addr = mip_base + offset * stride;
    uint32_t texel;
    texel_read(&texel, &addr, 1, stride);

    return TexFilterPoint(format, texel);
  }

  // linear filtering: address the 2x2 neighborhood and the blend weights
  uint32_t offsets[4];
  uint32_t alpha, beta;
  TexAddressLinear(xu, xv, log_width, log_height, wrapu, wrapv,
                   &offsets[0], &offsets[1], &offsets[2], &offsets[3], &alpha, &beta);

  uint8_t* addr[4];
  for (uint32_t i = 0; i < 4; ++i) {
    addr[i] = mip_base + offsets[i] * stride;
  }

  // memory fetch
  uint32_t texel[4];
  texel_read(texel, addr, 4, stride);

  // filtering
  return TexFilterLinear(
    format, texel[0], texel[1], texel[2], texel[3], alpha, beta);
}
// Samples the texture at (xu, xv), choosing the mip level from the
// minification factor xj. With state->filter == 2 the two adjacent mip levels
// are sampled and blended; otherwise a single level is sampled. Each sample
// comes from vx_tex_sw() when state->use_sw is set and from vx_tex() otherwise.
inline uint32_t tex_load(kernel_arg_t* state,
                         Fixed<TEX_FXD_FRAC> xu,
                         Fixed<TEX_FXD_FRAC> xv,
                         Fixed<16> xj) {
  // one sample at the given level through the selected path
  auto sample = [&](uint32_t lod) -> uint32_t {
    uint32_t texel;
    if (state->use_sw) {
      texel = vx_tex_sw(state, xu, xv, lod);
    } else {
      texel = vx_tex(0, xu.data(), xv.data(), lod);
    }
    return texel;
  };

  // minification clamped to at least 1.0 (16.16 fixed point) and the
  // corresponding mip level, limited to TEX_LOD_MAX
  uint32_t j = std::max<int32_t>(xj.data(), Fixed<16>::ONE);
  uint32_t l = std::min<uint32_t>(log2floor(j) - 16, TEX_LOD_MAX);

  if (state->filter != 2)
    return sample(l);

  // next level and the 8-bit blend fraction between the two levels
  uint32_t ln = std::min<uint32_t>(l + 1, TEX_LOD_MAX);
  uint32_t f = (j - (1 << (l + 16))) >> (l + 16 - 8);

  uint32_t texel0 = sample(l);
  uint32_t texel1 = sample(ln);

  // blend the low and high channel pairs separately, then repack
  uint32_t c0l, c0h, c1l, c1h;
  Unpack8888(texel0, &c0l, &c0h);
  Unpack8888(texel1, &c1l, &c1h);
  uint32_t cl = Lerp8888(c0l, c1l, f);
  uint32_t ch = Lerp8888(c0h, c1h, f);

  //vx_printf("j=0x%x, l=%d, ln=%d, f=%d, texel0=0x%x, texel1=0x%x, color=0x%x\n", j, l, ln, f, texel0, texel1, color);
  return Pack8888(cl, ch);
}

Binary file not shown.

Before

Width:  |  Height:  |  Size: 6.2 KiB

View File

@@ -1,122 +0,0 @@
#include "utils.h"
#include <assert.h>
#include <string>
#include <iostream>
#include <iomanip>
#include <cocogfx/include/tga.h>
#include <cocogfx/include/png.h>
using namespace cocogfx;
// Returns the text after the last '.' in str, or an empty string when str
// contains no '.'.
std::string getFileExt(const std::string& str) {
  const std::string::size_type dot = str.rfind('.');
  return (dot == std::string::npos) ? std::string() : str.substr(dot + 1);
}
// Case-insensitive string comparison: returns true when a and b have the same
// length and every pair of characters matches after lower-casing.
bool iequals(const std::string& a, const std::string& b) {
  const auto sz = a.size();
  if (b.size() != sz)
    return false;
  for (size_t i = 0; i < sz; ++i) {
    // tolower() is undefined for negative values other than EOF, which plain
    // char produces for non-ASCII bytes; go through unsigned char first
    const auto ca = static_cast<unsigned char>(a[i]);
    const auto cb = static_cast<unsigned char>(b[i]);
    if (tolower(ca) != tolower(cb))
      return false;
  }
  return true;
}
// Loads a TGA or PNG image (chosen by file extension, case-insensitive) into
// `pixels`, converting it to `format` when the file's own format differs, and
// reports its dimensions through `width` and `height`.
// Returns 0 on success, -1 for an unsupported extension, or the non-zero
// status of the failing loader/converter. Aborts on an unexpected
// bytes-per-pixel value.
int LoadImage(const char *filename,
              ePixelFormat format,
              std::vector<uint8_t> &pixels,
              uint32_t *width,
              uint32_t *height) {
  uint32_t img_width;
  uint32_t img_height;
  uint32_t img_bpp;

  // pick the decoder from the file extension
  const auto ext = getFileExt(filename);
  int status;
  if (iequals(ext, "tga")) {
    status = LoadTGA(filename, pixels, &img_width, &img_height, &img_bpp);
  } else if (iequals(ext, "png")) {
    status = LoadPNG(filename, pixels, &img_width, &img_height, &img_bpp);
  } else {
    std::cerr << "invalid file extension: " << ext << "!" << std::endl;
    return -1;
  }
  if (status)
    return status;

  // derive the decoded pixel format from its bytes per pixel
  ePixelFormat img_format;
  switch (img_bpp) {
  case 1: img_format = FORMAT_A8; break;
  case 2: img_format = FORMAT_A1R5G5B5; break;
  case 3: img_format = FORMAT_R8G8B8; break;
  case 4: img_format = FORMAT_A8R8G8B8; break;
  default:
    std::abort();
  }

  // convert to the requested format when the decoded one differs
  if (img_format != format) {
    std::vector<uint8_t> converted;
    status = ConvertImage(converted, format, pixels, img_format, img_width, img_height, img_width * img_bpp);
    if (status)
      return status;
    pixels.swap(converted);
  }

  *width = img_width;
  *height = img_height;

  return 0;
}
// Writes `pixels` as a TGA or PNG file, chosen by the extension of `filename`
// (case-insensitive), using the bytes per pixel of `format`.
// Returns the writer's status, or -1 for an unsupported extension.
int SaveImage(const char *filename,
              ePixelFormat format,
              const std::vector<uint8_t> &pixels,
              uint32_t width,
              uint32_t height) {
  const uint32_t bpp = GetInfo(format).BytePerPixel;
  const auto ext = getFileExt(filename);

  if (iequals(ext, "tga"))
    return SaveTGA(filename, pixels, width, height, bpp);

  if (iequals(ext, "png"))
    return SavePNG(filename, pixels, width, height, bpp);

  std::cerr << "invalid file extension: " << ext << "!" << std::endl;
  return -1;
}
// Prints an image to stdout as hexadecimal pixel values, one image row per
// line with pixels separated by ", ". Each pixel is assembled from `bpp`
// consecutive bytes, least significant byte first, and printed zero-padded to
// bpp * 2 digits. `pixels` must hold exactly width * height * bpp bytes.
void dump_image(const std::vector<uint8_t>& pixels, uint32_t width, uint32_t height, uint32_t bpp) {
  assert(width * height * bpp == pixels.size());

  // std::hex and std::setfill are sticky; remember the stream's formatting so
  // it can be restored and later numeric output is not printed in hex
  const std::ios_base::fmtflags saved_flags = std::cout.flags();
  const char saved_fill = std::cout.fill();

  const uint8_t* pixel_bytes = pixels.data();
  for (uint32_t y = 0; y < height; ++y) {
    for (uint32_t x = 0; x < width; ++x) {
      uint32_t pixel32 = 0;
      for (uint32_t b = 0; b < bpp; ++b) {
        uint32_t pixel8 = *pixel_bytes++;
        pixel32 |= pixel8 << (b * 8);
      }
      if (x) std::cout << ", ";
      std::cout << std::hex << std::setw(bpp * 2) << std::setfill('0') << pixel32;
    }
    std::cout << std::endl;
  }

  std::cout.flags(saved_flags);
  std::cout.fill(saved_fill);
}

View File

@@ -1,22 +0,0 @@
#include <cstdint>
#include <vector>
#include <bitmanip.h>
#include <cocogfx/include/format.h>
#include <cocogfx/include/blitter.h>
// Loads the TGA or PNG image `filename` into `pixels` in pixel format
// `format` and reports its size through `width` and `height`.
// Returns 0 on success and a non-zero value on failure.
int LoadImage(const char *filename,
cocogfx::ePixelFormat format,
std::vector<uint8_t> &pixels,
uint32_t *width,
uint32_t *height);
// Saves `pixels` (width x height, pixel format `format`) to the TGA or PNG
// file `filename`; the file type is chosen by the file extension.
// Returns -1 when the extension is not supported.
int SaveImage(const char *filename,
cocogfx::ePixelFormat format,
const std::vector<uint8_t> &pixels,
uint32_t width,
uint32_t height);
// Prints the image to stdout as hexadecimal pixel values, one row per line.
// `bpp` is the number of bytes per pixel.
void dump_image(const std::vector<uint8_t>& pixels,
uint32_t width,
uint32_t height,
uint32_t bpp);