Vortex 2.0 changes:

+ Microarchitecture optimizations
+ 64-bit support
+ Xilinx FPGA support
+ LLVM-16 support
+ Refactoring and quality control fixes

minor update

minor update

minor update

minor update

minor update

minor update

cleanup

cleanup

cache bindings and memory perf refactory

minor update

minor update

hw unit tests fixes

minor update

minor update

minor update

minor update

minor update

minor udpate

minor update

minor update

minor update

minor update

minor update

minor update

minor update

minor updates

minor updates

minor update

minor update

minor update

minor update

minor update

minor update

minor updates

minor updates

minor updates

minor updates

minor update

minor update
This commit is contained in:
Blaise Tine
2023-10-19 20:51:22 -07:00
parent d69a64c32c
commit c1e168fdbe
1309 changed files with 247412 additions and 311463 deletions

View File

@@ -10,7 +10,6 @@ all:
$(MAKE) -C fence
$(MAKE) -C no_mf_ext
$(MAKE) -C no_smem
$(MAKE) -C prefetch
run-simx:
$(MAKE) -C basic run-simx
@@ -24,7 +23,6 @@ run-simx:
$(MAKE) -C fence run-simx
$(MAKE) -C no_mf_ext run-simx
$(MAKE) -C no_smem run-simx
$(MAKE) -C prefetch run-simx
run-rtlsim:
$(MAKE) -C basic run-rtlsim
@@ -38,21 +36,19 @@ run-rtlsim:
$(MAKE) -C fence run-rtlsim
$(MAKE) -C no_mf_ext run-rtlsim
$(MAKE) -C no_smem run-rtlsim
$(MAKE) -C prefetch run-rtlsim
run-vlsim:
$(MAKE) -C basic run-vlsim
$(MAKE) -C demo run-vlsim
$(MAKE) -C dogfood run-vlsim
$(MAKE) -C mstress run-vlsim
$(MAKE) -C io_addr run-vlsim
$(MAKE) -C printf run-vlsim
$(MAKE) -C diverge run-vlsim
$(MAKE) -C sort run-vlsim
$(MAKE) -C fence run-vlsim
$(MAKE) -C no_mf_ext run-vlsim
$(MAKE) -C no_smem run-vlsim
$(MAKE) -C prefetch run-vlsim
run-opae:
$(MAKE) -C basic run-opae
$(MAKE) -C demo run-opae
$(MAKE) -C dogfood run-opae
$(MAKE) -C mstress run-opae
$(MAKE) -C io_addr run-opae
$(MAKE) -C printf run-opae
$(MAKE) -C diverge run-opae
$(MAKE) -C sort run-opae
$(MAKE) -C fence run-opae
$(MAKE) -C no_mf_ext run-opae
$(MAKE) -C no_smem run-opae
clean:
$(MAKE) -C basic clean
@@ -66,7 +62,6 @@ clean:
$(MAKE) -C fence clean
$(MAKE) -C no_mf_ext clean
$(MAKE) -C no_smem clean
$(MAKE) -C prefetch clean
clean-all:
$(MAKE) -C basic clean-all
@@ -80,4 +75,3 @@ clean-all:
$(MAKE) -C fence clean-all
$(MAKE) -C no_mf_ext clean-all
$(MAKE) -C no_smem clean-all
$(MAKE) -C prefetch clean-all

View File

@@ -1,78 +1,16 @@
XLEN ?= 32
RISCV_TOOLCHAIN_PATH ?= /opt/riscv-gnu-toolchain
VORTEX_DRV_PATH ?= $(realpath ../../../driver)
VORTEX_RT_PATH ?= $(realpath ../../../runtime)
OPTS ?= -n256
VX_CC = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-gcc
VX_CXX = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-g++
VX_DP = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-objdump
VX_CP = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-objcopy
VX_CFLAGS += -march=rv32imf -mabi=ilp32f -O3 -Wstack-usage=1024 -ffreestanding -nostartfiles -fdata-sections -ffunction-sections
VX_CFLAGS += -I$(VORTEX_RT_PATH)/include -I$(VORTEX_RT_PATH)/../hw
VX_LDFLAGS += -Wl,-Bstatic,-T,$(VORTEX_RT_PATH)/linker/vx_link$(XLEN).ld -Wl,--gc-sections $(VORTEX_RT_PATH)/libvortexrt.a
VX_SRCS = kernel.c
CXXFLAGS += -std=c++11 -Wall -Wextra -pedantic -Wfatal-errors
CXXFLAGS += -I$(VORTEX_DRV_PATH)/include
LDFLAGS += -L$(VORTEX_DRV_PATH)/stub -lvortex
# Debugigng
ifdef DEBUG
CXXFLAGS += -g -O0
else
CXXFLAGS += -O2 -DNDEBUG
endif
PROJECT = basic
SRCS = main.cpp
all: $(PROJECT) kernel.bin kernel.dump
VX_SRCS = kernel.cpp ../../../kernel/src/vx_perf.c start.S
kernel.dump: kernel.elf
$(VX_DP) -D kernel.elf > kernel.dump
OPTS ?= -n256
kernel.bin: kernel.elf
$(VX_CP) -O binary kernel.elf kernel.bin
include ../common.mk
kernel.elf: $(VX_SRCS)
$(VX_CC) $(VX_CFLAGS) $(VX_SRCS) $(VX_LDFLAGS) -o kernel.elf
VX_LDFLAGS = -Wl,-Bstatic,--gc-sections,-T,$(VORTEX_KN_PATH)/linker/vx_link$(XLEN).ld,--defsym=STARTUP_ADDR=$(STARTUP_ADDR)
$(PROJECT): $(SRCS)
$(CXX) $(CXXFLAGS) $^ $(LDFLAGS) -o $@
run-simx: $(PROJECT) kernel.bin
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/simx:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
run-fpga: $(PROJECT) kernel.bin
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/fpga:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
run-asesim: $(PROJECT) kernel.bin
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/asesim:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
run-vlsim: $(PROJECT) kernel.bin
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/vlsim:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
run-rtlsim: $(PROJECT) kernel.bin
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/rtlsim:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
.depend: $(SRCS)
$(CXX) $(CXXFLAGS) -MM $^ > .depend;
clean:
rm -rf $(PROJECT) *.o .depend
clean-all: clean
rm -rf *.elf *.bin *.dump
ifneq ($(MAKECMDGOALS),clean)
-include .depend
endif
VX_CC = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)-gcc
VX_CXX = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)-g++
VX_DP = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)-objdump
VX_CP = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)-objcopy

View File

@@ -5,8 +5,8 @@
typedef struct {
uint32_t count;
uint32_t src_addr;
uint32_t dst_addr;
uint64_t src_addr;
uint64_t dst_addr;
} kernel_arg_t;
#endif

View File

@@ -2,15 +2,17 @@
#include <vx_intrinsics.h>
#include "common.h"
void main() {
kernel_arg_t* arg = (kernel_arg_t*)KERNEL_ARG_DEV_MEM_ADDR;
int main() {
kernel_arg_t* __UNIFORM__ arg = (kernel_arg_t*)KERNEL_ARG_DEV_MEM_ADDR;
uint32_t count = arg->count;
int32_t* src_ptr = (int32_t*)arg->src_addr;
int32_t* dst_ptr = (int32_t*)arg->dst_addr;
uint32_t offset = vx_core_id() * count;
for (uint32_t i = 0; i < count; ++i) {
dst_ptr[offset + i] = src_ptr[offset + i];
}
}
return 0;
}

View File

@@ -3,6 +3,7 @@
#include <string.h>
#include <vortex.h>
#include <chrono>
#include <vector>
#include "common.h"
#define RT_CHECK(_expr) \
@@ -22,8 +23,8 @@ int test = -1;
uint32_t count = 0;
vx_device_h device = nullptr;
vx_buffer_h staging_buf = nullptr;
kernel_arg_t kernel_arg;
std::vector<uint8_t> staging_buf;
kernel_arg_t kernel_arg = {};
static void show_usage() {
std::cout << "Vortex Test." << std::endl;
@@ -56,9 +57,6 @@ static void parse_args(int argc, char **argv) {
}
void cleanup() {
if (staging_buf) {
vx_buf_free(staging_buf);
}
if (device) {
vx_mem_free(device, kernel_arg.src_addr);
vx_mem_free(device, kernel_arg.dst_addr);
@@ -77,15 +75,15 @@ int run_memcopy_test(uint32_t dev_addr, uint64_t value, int num_blocks) {
int num_blocks_8 = (64 * num_blocks) / 8;
// update source buffer
// update source buffer
for (int i = 0; i < num_blocks_8; ++i) {
((uint64_t*)vx_host_ptr(staging_buf))[i] = shuffle(i, value);
((uint64_t*)staging_buf.data())[i] = shuffle(i, value);
}
/*for (int i = 0; i < num_blocks; ++i) {
std::cout << "data[" << i << "]=0x";
for (int j = 7; j >= 0; --j) {
std::cout << std::hex << ((uint64_t*)vx_host_ptr(staging_buf))[i * 8 +j];
std::cout << std::hex << ((uint64_t*)staging_buf.data())[i * 8 +j];
}
std::cout << std::endl;
}*/
@@ -93,24 +91,24 @@ int run_memcopy_test(uint32_t dev_addr, uint64_t value, int num_blocks) {
// write source buffer to local memory
std::cout << "write source buffer to local memory" << std::endl;
auto t0 = std::chrono::high_resolution_clock::now();
RT_CHECK(vx_copy_to_dev(staging_buf, dev_addr, 64 * num_blocks, 0));
RT_CHECK(vx_copy_to_dev(device, dev_addr, staging_buf.data(), 64 * num_blocks));
auto t1 = std::chrono::high_resolution_clock::now();
// clear destination buffer
for (int i = 0; i < num_blocks_8; ++i) {
((uint64_t*)vx_host_ptr(staging_buf))[i] = 0;
((uint64_t*)staging_buf.data())[i] = 0;
}
// read destination buffer from local memory
std::cout << "read destination buffer from local memory" << std::endl;
auto t2 = std::chrono::high_resolution_clock::now();
RT_CHECK(vx_copy_from_dev(staging_buf, dev_addr, 64 * num_blocks, 0));
RT_CHECK(vx_copy_from_dev(device, staging_buf.data(), dev_addr, 64 * num_blocks));
auto t3 = std::chrono::high_resolution_clock::now();
// verify result
std::cout << "verify result" << std::endl;
for (int i = 0; i < num_blocks_8; ++i) {
auto curr = ((uint64_t*)vx_host_ptr(staging_buf))[i];
auto curr = ((uint64_t*)staging_buf.data())[i];
auto ref = shuffle(i, value);
if (curr != ref) {
std::cout << "error at 0x" << std::hex << (dev_addr + 8 * i)
@@ -147,44 +145,44 @@ int run_kernel_test(const kernel_arg_t& kernel_arg,
// update source buffer
{
auto buf_ptr = (int32_t*)vx_host_ptr(staging_buf);
std::cout << "upload source buffer" << std::endl;
auto buf_ptr = (int32_t*)staging_buf.data();
for (uint32_t i = 0; i < num_points; ++i) {
buf_ptr[i] = i;
}
}
std::cout << "upload source buffer" << std::endl;
}
auto t0 = std::chrono::high_resolution_clock::now();
RT_CHECK(vx_copy_to_dev(staging_buf, kernel_arg.src_addr, buf_size, 0));
RT_CHECK(vx_copy_to_dev(device, kernel_arg.src_addr, staging_buf.data(), buf_size));
auto t1 = std::chrono::high_resolution_clock::now();
// clear destination buffer
{
auto buf_ptr = (int32_t*)vx_host_ptr(staging_buf);
std::cout << "clear destination buffer" << std::endl;
auto buf_ptr = (int32_t*)staging_buf.data();
for (uint32_t i = 0; i < num_points; ++i) {
buf_ptr[i] = 0xdeadbeef;
}
}
std::cout << "clear destination buffer" << std::endl;
RT_CHECK(vx_copy_to_dev(staging_buf, kernel_arg.dst_addr, buf_size, 0));
RT_CHECK(vx_copy_to_dev(device, kernel_arg.dst_addr, staging_buf.data(), buf_size));
}
// start device
std::cout << "start execution" << std::endl;
auto t2 = std::chrono::high_resolution_clock::now();
RT_CHECK(vx_start(device));
RT_CHECK(vx_ready_wait(device, MAX_TIMEOUT));
RT_CHECK(vx_ready_wait(device, VX_MAX_TIMEOUT));
auto t3 = std::chrono::high_resolution_clock::now();
// read destination buffer from local memory
std::cout << "read destination buffer from local memory" << std::endl;
auto t4 = std::chrono::high_resolution_clock::now();
RT_CHECK(vx_copy_from_dev(staging_buf, kernel_arg.dst_addr, buf_size, 0));
RT_CHECK(vx_copy_from_dev(device, staging_buf.data(), kernel_arg.dst_addr, buf_size));
auto t5 = std::chrono::high_resolution_clock::now();
// verify result
std::cout << "verify result" << std::endl;
for (uint32_t i = 0; i < num_points; ++i) {
int32_t curr = ((int32_t*)vx_host_ptr(staging_buf))[i];
int32_t curr = ((int32_t*)staging_buf.data())[i];
int32_t ref = i;
if (curr != ref) {
std::cout << "error at result #" << std::dec << i
@@ -215,9 +213,6 @@ int run_kernel_test(const kernel_arg_t& kernel_arg,
}
int main(int argc, char *argv[]) {
size_t value;
// parse command arguments
parse_args(argc, argv);
@@ -228,10 +223,11 @@ int main(int argc, char *argv[]) {
// open device connection
std::cout << "open device connection" << std::endl;
RT_CHECK(vx_dev_open(&device));
uint64_t max_cores;
RT_CHECK(vx_dev_caps(device, VX_CAPS_MAX_CORES, &max_cores));
uint32_t num_points = count;
uint64_t num_cores;
RT_CHECK(vx_dev_caps(device, VX_CAPS_NUM_CORES, &num_cores));
uint32_t num_points = count * num_cores;
uint32_t num_blocks = (num_points * sizeof(int32_t) + 63) / 64;
uint32_t buf_size = num_blocks * 64;
@@ -239,20 +235,19 @@ int main(int argc, char *argv[]) {
std::cout << "buffer size: " << buf_size << " bytes" << std::endl;
// allocate device memory
RT_CHECK(vx_mem_alloc(device, buf_size, &value));
kernel_arg.src_addr = value;
RT_CHECK(vx_mem_alloc(device, buf_size, &value));
kernel_arg.dst_addr = value;
std::cout << "allocate device memory" << std::endl;
RT_CHECK(vx_mem_alloc(device, buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.src_addr));
RT_CHECK(vx_mem_alloc(device, buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.dst_addr));
kernel_arg.count = num_points;
std::cout << "dev_src=" << std::hex << kernel_arg.src_addr << std::endl;
std::cout << "dev_dst=" << std::hex << kernel_arg.dst_addr << std::endl;
std::cout << "dev_src=0x" << std::hex << kernel_arg.src_addr << std::endl;
std::cout << "dev_dst=0x" << std::hex << kernel_arg.dst_addr << std::endl;
// allocate shared memory
std::cout << "allocate shared memory" << std::endl;
// allocate staging buffer
std::cout << "allocate staging buffer" << std::endl;
uint32_t alloc_size = std::max<uint32_t>(buf_size, sizeof(kernel_arg_t));
RT_CHECK(vx_buf_alloc(device, alloc_size, &staging_buf));
staging_buf.resize(alloc_size);
// run tests
if (0 == test || -1 == test) {
@@ -268,9 +263,9 @@ int main(int argc, char *argv[]) {
// upload kernel argument
std::cout << "upload kernel argument" << std::endl;
{
auto buf_ptr = (void*)vx_host_ptr(staging_buf);
auto buf_ptr = (void*)staging_buf.data();
memcpy(buf_ptr, &kernel_arg, sizeof(kernel_arg_t));
RT_CHECK(vx_copy_to_dev(staging_buf, KERNEL_ARG_DEV_MEM_ADDR, sizeof(kernel_arg_t), 0));
RT_CHECK(vx_copy_to_dev(device, KERNEL_ARG_DEV_MEM_ADDR, staging_buf.data(), sizeof(kernel_arg_t)));
}
std::cout << "run kernel test" << std::endl;

View File

@@ -0,0 +1,13 @@
.section .init, "ax"
.global _start
.type _start, @function
_start:
# call main routine
call main
# dump perf counter
call vx_perf_dump
# end execution
.insn r 0x0b, 0, 0, x0, x0, x0
.size _start, .-_start

120
tests/regression/common.mk Normal file
View File

@@ -0,0 +1,120 @@
XLEN ?= 32
TARGET ?= opaesim
XRT_SYN_DIR ?= ../../../hw/syn/xilinx/xrt
XRT_DEVICE_INDEX ?= 0
ifeq ($(XLEN),64)
RISCV_TOOLCHAIN_PATH ?= /opt/riscv64-gnu-toolchain
VX_CFLAGS += -march=rv64imafd -mabi=lp64d
STARTUP_ADDR ?= 0x180000000
else
RISCV_TOOLCHAIN_PATH ?= /opt/riscv-gnu-toolchain
VX_CFLAGS += -march=rv32imaf -mabi=ilp32f
STARTUP_ADDR ?= 0x80000000
endif
RISCV_PREFIX ?= riscv$(XLEN)-unknown-elf
RISCV_SYSROOT ?= $(RISCV_TOOLCHAIN_PATH)/$(RISCV_PREFIX)
VORTEX_RT_PATH ?= $(realpath ../../../runtime)
VORTEX_KN_PATH ?= $(realpath ../../../kernel)
FPGA_BIN_DIR ?= $(VORTEX_RT_PATH)/opae
LLVM_VORTEX ?= /opt/llvm-vortex
LLVM_CFLAGS += --sysroot=$(RISCV_SYSROOT)
LLVM_CFLAGS += --gcc-toolchain=$(RISCV_TOOLCHAIN_PATH)
LLVM_CFLAGS += -Xclang -target-feature -Xclang +vortex
#LLVM_CFLAGS += -mllvm -vortex-branch-divergence=2
#LLVM_CFLAGS += -mllvm -print-after-all
#LLVM_CFLAGS += -I$(RISCV_SYSROOT)/include/c++/9.2.0/$(RISCV_PREFIX)
#LLVM_CFLAGS += -I$(RISCV_SYSROOT)/include/c++/9.2.0
#LLVM_CFLAGS += -Wl,-L$(RISCV_TOOLCHAIN_PATH)/lib/gcc/$(RISCV_PREFIX)/9.2.0
#LLVM_CFLAGS += --rtlib=libgcc
VX_CC = $(LLVM_VORTEX)/bin/clang $(LLVM_CFLAGS)
VX_CXX = $(LLVM_VORTEX)/bin/clang++ $(LLVM_CFLAGS)
VX_DP = $(LLVM_VORTEX)/bin/llvm-objdump
VX_CP = $(LLVM_VORTEX)/bin/llvm-objcopy
#VX_CC = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)-gcc
#VX_CXX = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)-g++
#VX_DP = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)-objdump
#VX_CP = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)-objcopy
VX_CFLAGS += -v -O3 -std=c++17
VX_CFLAGS += -mcmodel=medany -fno-rtti -fno-exceptions -nostartfiles -fdata-sections -ffunction-sections
VX_CFLAGS += -I$(VORTEX_KN_PATH)/include -I$(VORTEX_KN_PATH)/../hw
VX_CFLAGS += -DNDEBUG -DLLVM_VORTEX
VX_LDFLAGS += -Wl,-Bstatic,--gc-sections,-T,$(VORTEX_KN_PATH)/linker/vx_link$(XLEN).ld,--defsym=STARTUP_ADDR=$(STARTUP_ADDR) $(VORTEX_KN_PATH)/libvortexrt.a
CXXFLAGS += -std=c++17 -Wall -Wextra -pedantic -Wfatal-errors
CXXFLAGS += -I$(VORTEX_RT_PATH)/include -I$(VORTEX_KN_PATH)/../hw
LDFLAGS += -L$(VORTEX_RT_PATH)/stub -lvortex
# Debugigng
ifdef DEBUG
CXXFLAGS += -g -O0
else
CXXFLAGS += -O2 -DNDEBUG
endif
ifeq ($(TARGET), fpga)
OPAE_DRV_PATHS ?= libopae-c.so
else
ifeq ($(TARGET), asesim)
OPAE_DRV_PATHS ?= libopae-c-ase.so
else
ifeq ($(TARGET), opaesim)
OPAE_DRV_PATHS ?= libopae-c-sim.so
endif
endif
endif
all: $(PROJECT) kernel.bin kernel.dump
kernel.dump: kernel.elf
$(VX_DP) -D kernel.elf > kernel.dump
kernel.bin: kernel.elf
$(VX_CP) -O binary kernel.elf kernel.bin
kernel.elf: $(VX_SRCS)
$(VX_CXX) $(VX_CFLAGS) $(VX_SRCS) $(VX_LDFLAGS) -o kernel.elf
$(PROJECT): $(SRCS)
$(CXX) $(CXXFLAGS) $^ $(LDFLAGS) -o $@
run-simx: $(PROJECT) kernel.bin
LD_LIBRARY_PATH=$(VORTEX_RT_PATH)/simx:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
run-opae: $(PROJECT) kernel.bin
SCOPE_JSON_PATH=$(FPGA_BIN_DIR)/scope.json OPAE_DRV_PATHS=$(OPAE_DRV_PATHS) LD_LIBRARY_PATH=$(VORTEX_RT_PATH)/opae:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
run-rtlsim: $(PROJECT) kernel.bin
LD_LIBRARY_PATH=$(VORTEX_RT_PATH)/rtlsim:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
run-xrt: $(PROJECT) kernel.bin
ifeq ($(TARGET), hw)
SCOPE_JSON_PATH=$(FPGA_BIN_DIR)/scope.json XRT_INI_PATH=$(XRT_SYN_DIR)/xrt.ini EMCONFIG_PATH=$(FPGA_BIN_DIR) XRT_DEVICE_INDEX=$(XRT_DEVICE_INDEX) XRT_XCLBIN_PATH=$(FPGA_BIN_DIR)/vortex_afu.xclbin LD_LIBRARY_PATH=$(XILINX_XRT)/lib:$(VORTEX_RT_PATH)/xrt:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
else
XCL_EMULATION_MODE=$(TARGET) XRT_INI_PATH=$(XRT_SYN_DIR)/xrt.ini EMCONFIG_PATH=$(FPGA_BIN_DIR) XRT_DEVICE_INDEX=$(XRT_DEVICE_INDEX) XRT_XCLBIN_PATH=$(FPGA_BIN_DIR)/vortex_afu.xclbin LD_LIBRARY_PATH=$(XILINX_XRT)/lib:$(VORTEX_RT_PATH)/xrt:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
endif
.depend: $(SRCS)
$(CXX) $(CXXFLAGS) -MM $^ > .depend;
clean:
rm -rf $(PROJECT) *.o .depend
clean-all: clean
rm -rf *.elf *.bin *.dump
ifneq ($(MAKECMDGOALS),clean)
-include .depend
endif

View File

@@ -1,78 +1,9 @@
XLEN ?= 32
RISCV_TOOLCHAIN_PATH ?= /opt/riscv-gnu-toolchain
VORTEX_DRV_PATH ?= $(realpath ../../../driver)
VORTEX_RT_PATH ?= $(realpath ../../../runtime)
OPTS ?= -n64
VX_CC = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-gcc
VX_CXX = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-g++
VX_DP = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-objdump
VX_CP = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-objcopy
VX_CFLAGS += -march=rv32imf -mabi=ilp32f -O3 -Wstack-usage=1024 -ffreestanding -nostartfiles -fdata-sections -ffunction-sections
VX_CFLAGS += -I$(VORTEX_RT_PATH)/include -I$(VORTEX_RT_PATH)/../hw
VX_LDFLAGS += -Wl,-Bstatic,-T,$(VORTEX_RT_PATH)/linker/vx_link$(XLEN).ld -Wl,--gc-sections $(VORTEX_RT_PATH)/libvortexrt.a
VX_SRCS = kernel.c
CXXFLAGS += -std=c++11 -Wall -Wextra -pedantic -Wfatal-errors
CXXFLAGS += -I$(VORTEX_DRV_PATH)/include
LDFLAGS += -L$(VORTEX_DRV_PATH)/stub -lvortex
# Debugigng
ifdef DEBUG
CXXFLAGS += -g -O0
else
CXXFLAGS += -O2 -DNDEBUG
endif
PROJECT = demo
SRCS = main.cpp
all: $(PROJECT) kernel.bin kernel.dump
kernel.dump: kernel.elf
$(VX_DP) -D kernel.elf > kernel.dump
VX_SRCS = kernel.cpp
kernel.bin: kernel.elf
$(VX_CP) -O binary kernel.elf kernel.bin
OPTS ?= -n64
kernel.elf: $(VX_SRCS)
$(VX_CC) $(VX_CFLAGS) $(VX_SRCS) $(VX_LDFLAGS) -o kernel.elf
$(PROJECT): $(SRCS)
$(CXX) $(CXXFLAGS) $^ $(LDFLAGS) -o $@
run-simx: $(PROJECT) kernel.bin
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/simx:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
run-fpga: $(PROJECT) kernel.bin
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/fpga:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
run-asesim: $(PROJECT) kernel.bin
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/asesim:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
run-vlsim: $(PROJECT) kernel.bin
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/vlsim:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
run-rtlsim: $(PROJECT) kernel.bin
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/rtlsim:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
.depend: $(SRCS)
$(CXX) $(CXXFLAGS) -MM $^ > .depend;
clean:
rm -rf $(PROJECT) *.o .depend
clean-all: clean
rm -rf *.elf *.bin *.dump
ifneq ($(MAKECMDGOALS),clean)
-include .depend
endif
include ../common.mk

View File

@@ -6,9 +6,9 @@
typedef struct {
uint32_t num_tasks;
uint32_t task_size;
uint32_t src0_addr;
uint32_t src1_addr;
uint32_t dst_addr;
uint64_t src0_addr;
uint64_t src1_addr;
uint64_t dst_addr;
} kernel_arg_t;
#endif

View File

@@ -3,7 +3,7 @@
#include <vx_spawn.h>
#include "common.h"
void kernel_body(int task_id, kernel_arg_t* arg) {
void kernel_body(int task_id, kernel_arg_t* __UNIFORM__ arg) {
uint32_t count = arg->task_size;
int32_t* src0_ptr = (int32_t*)arg->src0_addr;
int32_t* src1_ptr = (int32_t*)arg->src1_addr;
@@ -16,7 +16,8 @@ void kernel_body(int task_id, kernel_arg_t* arg) {
}
}
void main() {
int main() {
kernel_arg_t* arg = (kernel_arg_t*)KERNEL_ARG_DEV_MEM_ADDR;
vx_spawn_tasks(arg->num_tasks, (vx_spawn_tasks_cb)kernel_body, arg);
}
return 0;
}

View File

@@ -1,6 +1,7 @@
#include <iostream>
#include <unistd.h>
#include <string.h>
#include <vector>
#include <vortex.h>
#include "common.h"
@@ -20,8 +21,8 @@ const char* kernel_file = "kernel.bin";
uint32_t count = 0;
vx_device_h device = nullptr;
vx_buffer_h staging_buf = nullptr;
kernel_arg_t kernel_arg;
std::vector<uint8_t> staging_buf;
kernel_arg_t kernel_arg = {};
static void show_usage() {
std::cout << "Vortex Test." << std::endl;
@@ -51,9 +52,6 @@ static void parse_args(int argc, char **argv) {
}
void cleanup() {
if (staging_buf) {
vx_buf_free(staging_buf);
}
if (device) {
vx_mem_free(device, kernel_arg.src0_addr);
vx_mem_free(device, kernel_arg.src1_addr);
@@ -64,24 +62,24 @@ void cleanup() {
int run_test(const kernel_arg_t& kernel_arg,
uint32_t buf_size,
uint32_t num_points) {
uint32_t num_points) {
// start device
std::cout << "start device" << std::endl;
RT_CHECK(vx_start(device));
// wait for completion
std::cout << "wait for completion" << std::endl;
RT_CHECK(vx_ready_wait(device, MAX_TIMEOUT));
RT_CHECK(vx_ready_wait(device, VX_MAX_TIMEOUT));
// download destination buffer
std::cout << "download destination buffer" << std::endl;
RT_CHECK(vx_copy_from_dev(staging_buf, kernel_arg.dst_addr, buf_size, 0));
RT_CHECK(vx_copy_from_dev(device, staging_buf.data(), kernel_arg.dst_addr, buf_size));
// verify result
std::cout << "verify result" << std::endl;
{
int errors = 0;
auto buf_ptr = (int32_t*)vx_host_ptr(staging_buf);
auto buf_ptr = (int32_t*)staging_buf.data();
for (uint32_t i = 0; i < num_points; ++i) {
int ref = i + i;
int cur = buf_ptr[i];
@@ -101,9 +99,7 @@ int run_test(const kernel_arg_t& kernel_arg,
return 0;
}
int main(int argc, char *argv[]) {
size_t value;
int main(int argc, char *argv[]) {
// parse command arguments
parse_args(argc, argv);
@@ -115,12 +111,12 @@ int main(int argc, char *argv[]) {
std::cout << "open device connection" << std::endl;
RT_CHECK(vx_dev_open(&device));
uint64_t max_cores, max_warps, max_threads;
RT_CHECK(vx_dev_caps(device, VX_CAPS_MAX_CORES, &max_cores));
RT_CHECK(vx_dev_caps(device, VX_CAPS_MAX_WARPS, &max_warps));
RT_CHECK(vx_dev_caps(device, VX_CAPS_MAX_THREADS, &max_threads));
uint64_t num_cores, num_warps, num_threads;
RT_CHECK(vx_dev_caps(device, VX_CAPS_NUM_CORES, &num_cores));
RT_CHECK(vx_dev_caps(device, VX_CAPS_NUM_WARPS, &num_warps));
RT_CHECK(vx_dev_caps(device, VX_CAPS_NUM_THREADS, &num_threads));
uint32_t num_tasks = max_cores * max_warps * max_threads;
uint32_t num_tasks = num_cores * num_warps * num_threads;
uint32_t num_points = count * num_tasks;
uint32_t buf_size = num_points * sizeof(int32_t);
@@ -132,64 +128,60 @@ int main(int argc, char *argv[]) {
RT_CHECK(vx_upload_kernel_file(device, kernel_file));
// allocate device memory
std::cout << "allocate device memory" << std::endl;
RT_CHECK(vx_mem_alloc(device, buf_size, &value));
kernel_arg.src0_addr = value;
RT_CHECK(vx_mem_alloc(device, buf_size, &value));
kernel_arg.src1_addr = value;
RT_CHECK(vx_mem_alloc(device, buf_size, &value));
kernel_arg.dst_addr = value;
std::cout << "allocate device memory" << std::endl;
RT_CHECK(vx_mem_alloc(device, buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.src0_addr));
RT_CHECK(vx_mem_alloc(device, buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.src1_addr));
RT_CHECK(vx_mem_alloc(device, buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.dst_addr));
kernel_arg.num_tasks = num_tasks;
kernel_arg.task_size = count;
std::cout << "dev_src0=" << std::hex << kernel_arg.src0_addr << std::endl;
std::cout << "dev_src1=" << std::hex << kernel_arg.src1_addr << std::endl;
std::cout << "dev_dst=" << std::hex << kernel_arg.dst_addr << std::endl;
std::cout << "dev_src0=0x" << std::hex << kernel_arg.src0_addr << std::endl;
std::cout << "dev_src1=0x" << std::hex << kernel_arg.src1_addr << std::endl;
std::cout << "dev_dst=0x" << std::hex << kernel_arg.dst_addr << std::endl;
// allocate shared memory
std::cout << "allocate shared memory" << std::endl;
// allocate staging buffer
std::cout << "allocate staging buffer" << std::endl;
uint32_t alloc_size = std::max<uint32_t>(buf_size, sizeof(kernel_arg_t));
RT_CHECK(vx_buf_alloc(device, alloc_size, &staging_buf));
staging_buf.resize(alloc_size);
// upload kernel argument
std::cout << "upload kernel argument" << std::endl;
{
auto buf_ptr = (int*)vx_host_ptr(staging_buf);
auto buf_ptr = (int*)staging_buf.data();
memcpy(buf_ptr, &kernel_arg, sizeof(kernel_arg_t));
RT_CHECK(vx_copy_to_dev(staging_buf, KERNEL_ARG_DEV_MEM_ADDR, sizeof(kernel_arg_t), 0));
RT_CHECK(vx_copy_to_dev(device, KERNEL_ARG_DEV_MEM_ADDR, staging_buf.data(), sizeof(kernel_arg_t)));
}
// upload source buffer0
{
auto buf_ptr = (int32_t*)vx_host_ptr(staging_buf);
std::cout << "upload source buffer0" << std::endl;
auto buf_ptr = (int32_t*)staging_buf.data();
for (uint32_t i = 0; i < num_points; ++i) {
buf_ptr[i] = i-1;
}
RT_CHECK(vx_copy_to_dev(device, kernel_arg.src0_addr, staging_buf.data(), buf_size));
}
std::cout << "upload source buffer0" << std::endl;
RT_CHECK(vx_copy_to_dev(staging_buf, kernel_arg.src0_addr, buf_size, 0));
// upload source buffer1
{
auto buf_ptr = (int32_t*)vx_host_ptr(staging_buf);
std::cout << "upload source buffer1" << std::endl;
auto buf_ptr = (int32_t*)staging_buf.data();
for (uint32_t i = 0; i < num_points; ++i) {
buf_ptr[i] = i+1;
}
}
RT_CHECK(vx_copy_to_dev(device, kernel_arg.src1_addr, staging_buf.data(), buf_size));
}
std::cout << "upload source buffer1" << std::endl;
RT_CHECK(vx_copy_to_dev(staging_buf, kernel_arg.src1_addr, buf_size, 0));
// clear destination buffer
{
auto buf_ptr = (int32_t*)vx_host_ptr(staging_buf);
std::cout << "clear destination buffer" << std::endl;
auto buf_ptr = (int32_t*)staging_buf.data();
for (uint32_t i = 0; i < num_points; ++i) {
buf_ptr[i] = 0xdeadbeef;
}
}
RT_CHECK(vx_copy_to_dev(device, kernel_arg.dst_addr, staging_buf.data(), buf_size));
}
std::cout << "clear destination buffer" << std::endl;
RT_CHECK(vx_copy_to_dev(staging_buf, kernel_arg.dst_addr, buf_size, 0));
// run tests
std::cout << "run tests" << std::endl;

View File

@@ -1,78 +1,9 @@
XLEN ?= 32
RISCV_TOOLCHAIN_PATH ?= /opt/riscv-gnu-toolchain
VORTEX_DRV_PATH ?= $(realpath ../../../driver)
VORTEX_RT_PATH ?= $(realpath ../../../runtime)
OPTS ?= -n16
VX_CC = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-gcc
VX_CXX = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-g++
VX_DP = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-objdump
VX_CP = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-objcopy
VX_CFLAGS += -march=rv32imf -mabi=ilp32f -O1 -Wstack-usage=1024 -ffreestanding -nostartfiles -fdata-sections -ffunction-sections
VX_CFLAGS += -I$(VORTEX_RT_PATH)/include -I$(VORTEX_RT_PATH)/../hw
VX_LDFLAGS += -Wl,-Bstatic,-T,$(VORTEX_RT_PATH)/linker/vx_link$(XLEN).ld -Wl,--gc-sections $(VORTEX_RT_PATH)/libvortexrt.a
VX_SRCS = kernel.c
CXXFLAGS += -std=c++11 -Wall -Wextra -pedantic -Wfatal-errors
CXXFLAGS += -I$(VORTEX_DRV_PATH)/include
LDFLAGS += -L$(VORTEX_DRV_PATH)/stub -lvortex
# Debugigng
ifdef DEBUG
CXXFLAGS += -g -O0
else
CXXFLAGS += -O2 -DNDEBUG
endif
PROJECT = diverge
SRCS = main.cpp
all: $(PROJECT) kernel.bin kernel.dump
kernel.dump: kernel.elf
$(VX_DP) -D kernel.elf > kernel.dump
VX_SRCS = kernel.cpp
kernel.bin: kernel.elf
$(VX_CP) -O binary kernel.elf kernel.bin
OPTS ?= -n16
kernel.elf: $(VX_SRCS)
$(VX_CC) $(VX_CFLAGS) $(VX_SRCS) $(VX_LDFLAGS) -o kernel.elf
$(PROJECT): $(SRCS)
$(CXX) $(CXXFLAGS) $^ $(LDFLAGS) -o $@
run-simx: $(PROJECT) kernel.bin
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/simx:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
run-fpga: $(PROJECT) kernel.bin
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/fpga:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
run-asesim: $(PROJECT) kernel.bin
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/asesim:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
run-vlsim: $(PROJECT) kernel.bin
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/vlsim:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
run-rtlsim: $(PROJECT) kernel.bin
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/rtlsim:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
.depend: $(SRCS)
$(CXX) $(CXXFLAGS) -MM $^ > .depend;
clean:
rm -rf $(PROJECT) *.o .depend
clean-all: clean
rm -rf *.elf *.bin *.dump
ifneq ($(MAKECMDGOALS),clean)
-include .depend
endif
include ../common.mk

View File

@@ -5,8 +5,8 @@
typedef struct {
uint32_t num_points;
uint32_t src_addr;
uint32_t dst_addr;
uint64_t src_addr;
uint64_t dst_addr;
} kernel_arg_t;
#endif

View File

@@ -1,49 +0,0 @@
#include <stdint.h>
#include <vx_intrinsics.h>
#include <vx_spawn.h>
#include "common.h"
// Parallel Selection sort
void kernel_body(int task_id, kernel_arg_t* arg) {
int32_t* src_ptr = (int32_t*)arg->src_addr;
int32_t* dst_ptr = (int32_t*)arg->dst_addr;
int value = src_ptr[task_id];
// none taken
__if (task_id >= 0x7fffffff) {
value = 0;
}__else {
value += 2;
}__endif
// diverge
__if (task_id > 1) {
__if (task_id > 2) {
value += 6;
}__else {
value += 5;
}__endif
}__else {
__if (task_id > 0) {
value += 4;
}__else {
value += 3;
}__endif
}__endif
// all taken
__if (task_id >= 0) {
value += 7;
}__else {
value = 0;
}__endif
dst_ptr[task_id] = value;
}
void main() {
kernel_arg_t* arg = (kernel_arg_t*)KERNEL_ARG_DEV_MEM_ADDR;
vx_spawn_tasks(arg->num_points, (vx_spawn_tasks_cb)kernel_body, arg);
}

View File

@@ -0,0 +1,83 @@
#include <stdint.h>
#include <assert.h>
#include <algorithm>
#include <vx_intrinsics.h>
#include <vx_spawn.h>
#include "common.h"
// Parallel Selection sort
void kernel_body(int task_id, kernel_arg_t* __UNIFORM__ arg) {
int32_t* src_ptr = (int32_t*)arg->src_addr;
int32_t* dst_ptr = (int32_t*)arg->dst_addr;
int value = src_ptr[task_id];
// none taken
if (task_id >= 0x7fffffff) {
value = 0;
} else {
value += 2;
}
// diverge
if (task_id > 1) {
if (task_id > 2) {
value += 6;
} else {
value += 5;
}
} else {
if (task_id > 0) {
value += 4;
} else {
value += 3;
}
}
// all taken
if (task_id >= 0) {
value += 7;
} else {
value = 0;
}
// loop
for (int i = 0, n = task_id; i < n; ++i) {
value += src_ptr[i];
}
// switch
switch (task_id) {
case 0:
value += 1;
break;
case 1:
value -= 1;
break;
case 2:
value *= 3;
break;
case 3:
value *= 5;
break;
default:
assert(task_id < arg->num_points);
break;
}
// select
value += (task_id >= 0) ? ((task_id > 5) ? src_ptr[0] : task_id) : ((task_id < 5) ? src_ptr[1] : -task_id);
// min/max
value += std::min(src_ptr[task_id], value);
value += std::max(src_ptr[task_id], value);
dst_ptr[task_id] = value;
}
int main() {
kernel_arg_t* arg = (kernel_arg_t*)KERNEL_ARG_DEV_MEM_ADDR;
vx_spawn_tasks(arg->num_points, (vx_spawn_tasks_cb)kernel_body, arg);
return 0;
}

View File

@@ -3,6 +3,7 @@
#include <string.h>
#include <vortex.h>
#include <vector>
#include <assert.h>
#include "common.h"
#define RT_CHECK(_expr) \
@@ -24,8 +25,8 @@ std::vector<int> src_data;
std::vector<int> ref_data;
vx_device_h device = nullptr;
vx_buffer_h staging_buf = nullptr;
kernel_arg_t kernel_arg;
std::vector<uint8_t> staging_buf;
kernel_arg_t kernel_arg = {};
static void show_usage() {
std::cout << "Vortex Test." << std::endl;
@@ -55,9 +56,6 @@ static void parse_args(int argc, char **argv) {
}
void cleanup() {
if (staging_buf) {
vx_buf_free(staging_buf);
}
if (device) {
vx_mem_free(device, kernel_arg.src_addr);
vx_mem_free(device, kernel_arg.dst_addr);
@@ -110,8 +108,38 @@ void gen_ref_data(uint32_t num_points) {
value = 0;
}
// loop
for (int j = 0, n = i; j < n; ++j) {
value += src_data.at(j);
}
// switch
switch (i) {
case 0:
value += 1;
break;
case 1:
value -= 1;
break;
case 2:
value *= 3;
break;
case 3:
value *= 5;
break;
default:
assert(i < (int)num_points);
break;
}
// select
value += (i >= 0) ? ((i > 5) ? src_data.at(0) : i) : ((i < 5) ? src_data.at(1) : -i);
// min/max
value += std::min(src_data.at(i), value);
value += std::max(src_data.at(i), value);
ref_data[i] = value;
//std::cout << std::dec << i << ": result=0x" << std::hex << value << std::endl;
}
}
@@ -124,17 +152,17 @@ int run_test(const kernel_arg_t& kernel_arg,
// wait for completion
std::cout << "wait for completion" << std::endl;
RT_CHECK(vx_ready_wait(device, MAX_TIMEOUT));
RT_CHECK(vx_ready_wait(device, VX_MAX_TIMEOUT));
// download destination buffer
std::cout << "download destination buffer" << std::endl;
RT_CHECK(vx_copy_from_dev(staging_buf, kernel_arg.dst_addr, buf_size, 0));
RT_CHECK(vx_copy_from_dev(device, staging_buf.data(), kernel_arg.dst_addr, buf_size));
// verify result
std::cout << "verify result" << std::endl;
{
int errors = 0;
auto buf_ptr = (int32_t*)vx_host_ptr(staging_buf);
auto buf_ptr = (int32_t*)staging_buf.data();
for (uint32_t i = 0; i < num_points; ++i) {
int ref = ref_data.at(i);
int cur = buf_ptr[i];
@@ -154,9 +182,7 @@ int run_test(const kernel_arg_t& kernel_arg,
return 0;
}
int main(int argc, char *argv[]) {
size_t value;
int main(int argc, char *argv[]) {
// parse command arguments
parse_args(argc, argv);
@@ -190,51 +216,46 @@ int main(int argc, char *argv[]) {
// allocate device memory
std::cout << "allocate device memory" << std::endl;
RT_CHECK(vx_mem_alloc(device, src_buf_size, &value));
kernel_arg.src_addr = value;
RT_CHECK(vx_mem_alloc(device, dst_buf_size, &value));
kernel_arg.dst_addr = value;
RT_CHECK(vx_mem_alloc(device, src_buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.src_addr));
RT_CHECK(vx_mem_alloc(device, dst_buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.dst_addr));
kernel_arg.num_points = num_points;
std::cout << "dev_src=" << std::hex << kernel_arg.src_addr << std::endl;
std::cout << "dev_dst=" << std::hex << kernel_arg.dst_addr << std::endl;
std::cout << "dev_src=0x" << std::hex << kernel_arg.src_addr << std::endl;
std::cout << "dev_dst=0x" << std::hex << kernel_arg.dst_addr << std::endl;
// allocate shared memory
std::cout << "allocate shared memory" << std::endl;
// allocate staging buffer
std::cout << "allocate staging buffer" << std::endl;
uint32_t staging_buf_size = std::max<uint32_t>(src_buf_size,
std::max<uint32_t>(dst_buf_size,
sizeof(kernel_arg_t)));
RT_CHECK(vx_buf_alloc(device, staging_buf_size, &staging_buf));
staging_buf.resize(staging_buf_size);
// upload kernel argument
std::cout << "upload kernel argument" << std::endl;
{
auto buf_ptr = (int*)vx_host_ptr(staging_buf);
auto buf_ptr = (int*)staging_buf.data();
memcpy(buf_ptr, &kernel_arg, sizeof(kernel_arg_t));
RT_CHECK(vx_copy_to_dev(staging_buf, KERNEL_ARG_DEV_MEM_ADDR, sizeof(kernel_arg_t), 0));
RT_CHECK(vx_copy_to_dev(device, KERNEL_ARG_DEV_MEM_ADDR, staging_buf.data(), sizeof(kernel_arg_t)));
}
// upload source buffer
{
auto buf_ptr = (int32_t*)vx_host_ptr(staging_buf);
for (uint32_t i = 0; i < num_points; ++i) {
buf_ptr[i] = src_data.at(i);
}
std::cout << "upload source buffer" << std::endl;
auto buf_ptr = staging_buf.data();
memcpy(buf_ptr, src_data.data(), num_points * sizeof(int32_t));
RT_CHECK(vx_copy_to_dev(device, kernel_arg.src_addr, staging_buf.data(), src_buf_size));
}
std::cout << "upload source buffer" << std::endl;
RT_CHECK(vx_copy_to_dev(staging_buf, kernel_arg.src_addr, src_buf_size, 0));
// clear destination buffer
{
auto buf_ptr = (int32_t*)vx_host_ptr(staging_buf);
std::cout << "clear destination buffer" << std::endl;
auto buf_ptr = (int32_t*)staging_buf.data();
for (uint32_t i = 0; i < num_points; ++i) {
buf_ptr[i] = 0xdeadbeef;
}
}
RT_CHECK(vx_copy_to_dev(device, kernel_arg.dst_addr, staging_buf.data(), dst_buf_size));
}
std::cout << "clear destination buffer" << std::endl;
RT_CHECK(vx_copy_to_dev(staging_buf, kernel_arg.dst_addr, dst_buf_size, 0));
// run tests
std::cout << "run tests" << std::endl;

View File

@@ -1,80 +1,9 @@
XLEN ?= 32
RISCV_TOOLCHAIN_PATH ?= /opt/riscv-gnu-toolchain
VORTEX_DRV_PATH ?= $(realpath ../../../driver)
VORTEX_RT_PATH ?= $(realpath ../../../runtime)
OPTS ?= -n64
VX_CC = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-gcc
VX_CXX = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-g++
VX_DP = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-objdump
VX_CP = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-objcopy
VX_CFLAGS += -march=rv32imf -mabi=ilp32f -O3 -Wstack-usage=1024 -ffreestanding -nostartfiles -fdata-sections -ffunction-sections
VX_CFLAGS += -I$(VORTEX_RT_PATH)/include -I$(VORTEX_RT_PATH)/../hw
VX_LDFLAGS += -Wl,-Bstatic,-T,$(VORTEX_RT_PATH)/linker/vx_link$(XLEN).ld -Wl,--gc-sections $(VORTEX_RT_PATH)/libvortexrt.a
VX_LDFLAGS += -lm
VX_SRCS = kernel.c
CXXFLAGS += -std=c++11 -Wall -Wextra -pedantic -Wfatal-errors
CXXFLAGS += -I$(VORTEX_DRV_PATH)/include -I$(VORTEX_RT_PATH)/../hw
LDFLAGS += -L$(VORTEX_DRV_PATH)/stub -lvortex
# Debugigng
ifdef DEBUG
CXXFLAGS += -g -O0
else
CXXFLAGS += -O2 -DNDEBUG
endif
PROJECT = dogfood
SRCS = main.cpp
all: $(PROJECT) kernel.bin kernel.dump
kernel.dump: kernel.elf
$(VX_DP) -d -r -t kernel.elf > kernel.dump
VX_SRCS = kernel.cpp
kernel.bin: kernel.elf
$(VX_CP) -O binary kernel.elf kernel.bin
OPTS ?= -n64 -x19 -x20
kernel.elf: $(VX_SRCS)
$(VX_CC) $(VX_CFLAGS) $(VX_SRCS) $(VX_LDFLAGS) -o kernel.elf
$(PROJECT): $(SRCS)
$(CXX) $(CXXFLAGS) $^ $(LDFLAGS) -o $@
run-simx: $(PROJECT) kernel.bin
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/simx:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
run-fpga: $(PROJECT) kernel.bin
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/fpga:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
run-asesim: $(PROJECT) kernel.bin
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/asesim:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
run-vlsim: $(PROJECT) kernel.bin
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/vlsim:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
run-rtlsim: $(PROJECT) kernel.bin
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/rtlsim:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
.depend: $(SRCS)
$(CXX) $(CXXFLAGS) -MM $^ > .depend;
clean:
rm -rf $(PROJECT) *.o .depend
clean-all: clean
rm -rf *.elf *.bin *.dump
ifneq ($(MAKECMDGOALS),clean)
-include .depend
endif
include ../common.mk

View File

@@ -7,9 +7,9 @@ typedef struct {
uint32_t testid;
uint32_t num_tasks;
uint32_t task_size;
uint32_t src0_addr;
uint32_t src1_addr;
uint32_t dst_addr;
uint64_t src0_addr;
uint64_t src1_addr;
uint64_t dst_addr;
} kernel_arg_t;
#endif

View File

@@ -1,334 +0,0 @@
#include <stdint.h>
#include <math.h>
#include <vx_intrinsics.h>
#include <vx_spawn.h>
#include "common.h"
typedef void (*PFN_Kernel)(int task_id, kernel_arg_t* arg);
inline float __ieee754_sqrtf (float x) {
asm ("fsqrt.s %0, %1" : "=f" (x) : "f" (x));
return x;
}
void kernel_iadd(int task_id, kernel_arg_t* arg) {
uint32_t count = arg->task_size;
int32_t* src0_ptr = (int32_t*)arg->src0_addr;
int32_t* src1_ptr = (int32_t*)arg->src1_addr;
int32_t* dst_ptr = (int32_t*)arg->dst_addr;
uint32_t offset = task_id * count;
for (uint32_t i = 0; i < count; ++i) {
int32_t a = src0_ptr[offset+i];
int32_t b = src1_ptr[offset+i];
int32_t c = a + b;
dst_ptr[offset+i] = c;
}
}
void kernel_imul(int task_id, kernel_arg_t* arg) {
uint32_t count = arg->task_size;
int32_t* src0_ptr = (int32_t*)arg->src0_addr;
int32_t* src1_ptr = (int32_t*)arg->src1_addr;
int32_t* dst_ptr = (int32_t*)arg->dst_addr;
uint32_t offset = task_id * count;
for (uint32_t i = 0; i < count; ++i) {
int32_t a = src0_ptr[offset+i];
int32_t b = src1_ptr[offset+i];
int32_t c = a * b;
dst_ptr[offset+i] = c;
}
}
void kernel_idiv(int task_id, kernel_arg_t* arg) {
uint32_t count = arg->task_size;
int32_t* src0_ptr = (int32_t*)arg->src0_addr;
int32_t* src1_ptr = (int32_t*)arg->src1_addr;
int32_t* dst_ptr = (int32_t*)arg->dst_addr;
uint32_t offset = task_id * count;
for (uint32_t i = 0; i < count; ++i) {
int32_t a = src0_ptr[offset+i];
int32_t b = src1_ptr[offset+i];
int32_t c = a / b;
dst_ptr[offset+i] = c;
}
}
void kernel_idiv_mul(int task_id, kernel_arg_t* arg) {
uint32_t count = arg->task_size;
int32_t* src0_ptr = (int32_t*)arg->src0_addr;
int32_t* src1_ptr = (int32_t*)arg->src1_addr;
int32_t* dst_ptr = (int32_t*)arg->dst_addr;
uint32_t offset = task_id * count;
for (uint32_t i = 0; i < count; ++i) {
int32_t a = src0_ptr[offset+i];
int32_t b = src1_ptr[offset+i];
int32_t c = a / b;
int32_t d = a * b;
int32_t e = c + d;
dst_ptr[offset+i] = e;
}
}
void kernel_fadd(int task_id, kernel_arg_t* arg) {
uint32_t count = arg->task_size;
float* src0_ptr = (float*)arg->src0_addr;
float* src1_ptr = (float*)arg->src1_addr;
float* dst_ptr = (float*)arg->dst_addr;
uint32_t offset = task_id * count;
for (uint32_t i = 0; i < count; ++i) {
float a = src0_ptr[offset+i];
float b = src1_ptr[offset+i];
float c = a + b;
dst_ptr[offset+i] = c;
}
}
void kernel_fsub(int task_id, kernel_arg_t* arg) {
uint32_t count = arg->task_size;
float* src0_ptr = (float*)arg->src0_addr;
float* src1_ptr = (float*)arg->src1_addr;
float* dst_ptr = (float*)arg->dst_addr;
uint32_t offset = task_id * count;
for (uint32_t i = 0; i < count; ++i) {
float a = src0_ptr[offset+i];
float b = src1_ptr[offset+i];
float c = a - b;
dst_ptr[offset+i] = c;
}
}
void kernel_fmul(int task_id, kernel_arg_t* arg) {
uint32_t count = arg->task_size;
float* src0_ptr = (float*)arg->src0_addr;
float* src1_ptr = (float*)arg->src1_addr;
float* dst_ptr = (float*)arg->dst_addr;
uint32_t offset = task_id * count;
for (uint32_t i = 0; i < count; ++i) {
float a = src0_ptr[offset+i];
float b = src1_ptr[offset+i];
float c = a * b;
dst_ptr[offset+i] = c;
}
}
void kernel_fmadd(int task_id, kernel_arg_t* arg) {
uint32_t count = arg->task_size;
float* src0_ptr = (float*)arg->src0_addr;
float* src1_ptr = (float*)arg->src1_addr;
float* dst_ptr = (float*)arg->dst_addr;
uint32_t offset = task_id * count;
for (uint32_t i = 0; i < count; ++i) {
float a = src0_ptr[offset+i];
float b = src1_ptr[offset+i];
float c = a * b + b;
dst_ptr[offset+i] = c;
}
}
void kernel_fmsub(int task_id, kernel_arg_t* arg) {
uint32_t count = arg->task_size;
float* src0_ptr = (float*)arg->src0_addr;
float* src1_ptr = (float*)arg->src1_addr;
float* dst_ptr = (float*)arg->dst_addr;
uint32_t offset = task_id * count;
for (uint32_t i = 0; i < count; ++i) {
float a = src0_ptr[offset+i];
float b = src1_ptr[offset+i];
float c = a * b - b;
dst_ptr[offset+i] = c;
}
}
void kernel_fnmadd(int task_id, kernel_arg_t* arg) {
uint32_t count = arg->task_size;
float* src0_ptr = (float*)arg->src0_addr;
float* src1_ptr = (float*)arg->src1_addr;
float* dst_ptr = (float*)arg->dst_addr;
uint32_t offset = task_id * count;
for (uint32_t i = 0; i < count; ++i) {
float a = src0_ptr[offset+i];
float b = src1_ptr[offset+i];
float c =-a * b - b;
dst_ptr[offset+i] = c;
}
}
void kernel_fnmsub(int task_id, kernel_arg_t* arg) {
uint32_t count = arg->task_size;
float* src0_ptr = (float*)arg->src0_addr;
float* src1_ptr = (float*)arg->src1_addr;
float* dst_ptr = (float*)arg->dst_addr;
uint32_t offset = task_id * count;
for (uint32_t i = 0; i < count; ++i) {
float a = src0_ptr[offset+i];
float b = src1_ptr[offset+i];
float c =-a * b + b;
dst_ptr[offset+i] = c;
}
}
void kernel_fnmadd_madd(int task_id, kernel_arg_t* arg) {
uint32_t count = arg->task_size;
float* src0_ptr = (float*)arg->src0_addr;
float* src1_ptr = (float*)arg->src1_addr;
float* dst_ptr = (float*)arg->dst_addr;
uint32_t offset = task_id * count;
for (uint32_t i = 0; i < count; ++i) {
float a = src0_ptr[offset+i];
float b = src1_ptr[offset+i];
float c =-a * b - b;
float d = a * b + b;
float e = c + d;
dst_ptr[offset+i] = e;
}
}
void kernel_fdiv(int task_id, kernel_arg_t* arg) {
uint32_t count = arg->task_size;
float* src0_ptr = (float*)arg->src0_addr;
float* src1_ptr = (float*)arg->src1_addr;
float* dst_ptr = (float*)arg->dst_addr;
uint32_t offset = task_id * count;
for (uint32_t i = 0; i < count; ++i) {
float a = src0_ptr[offset+i];
float b = src1_ptr[offset+i];
float c = a / b;
dst_ptr[offset+i] = c;
}
}
void kernel_fdiv2(int task_id, kernel_arg_t* arg) {
uint32_t count = arg->task_size;
float* src0_ptr = (float*)arg->src0_addr;
float* src1_ptr = (float*)arg->src1_addr;
float* dst_ptr = (float*)arg->dst_addr;
uint32_t offset = task_id * count;
for (uint32_t i = 0; i < count; ++i) {
float a = src0_ptr[offset+i];
float b = src1_ptr[offset+i];
float c = a / b;
float d = b / a;
float e = c + d;
dst_ptr[offset+i] = e;
}
}
void kernel_fsqrt(int task_id, kernel_arg_t* arg) {
uint32_t count = arg->task_size;
float* src0_ptr = (float*)arg->src0_addr;
float* src1_ptr = (float*)arg->src1_addr;
float* dst_ptr = (float*)arg->dst_addr;
uint32_t offset = task_id * count;
for (uint32_t i = 0; i < count; ++i) {
float a = src0_ptr[offset+i];
float b = src1_ptr[offset+i];
float c = __ieee754_sqrtf(a * b);
dst_ptr[offset+i] = c;
}
}
void kernel_ftoi(int task_id, kernel_arg_t* arg) {
uint32_t count = arg->task_size;
float* src0_ptr = (float*)arg->src0_addr;
float* src1_ptr = (float*)arg->src1_addr;
int32_t* dst_ptr = (int32_t*)arg->dst_addr;
uint32_t offset = task_id * count;
for (uint32_t i = 0; i < count; ++i) {
float a = src0_ptr[offset+i];
float b = src1_ptr[offset+i];
float c = a + b;
int32_t d = (int32_t)c;
dst_ptr[offset+i] = d;
}
}
void kernel_ftou(int task_id, kernel_arg_t* arg) {
uint32_t count = arg->task_size;
float* src0_ptr = (float*)arg->src0_addr;
float* src1_ptr = (float*)arg->src1_addr;
uint32_t* dst_ptr = (uint32_t*)arg->dst_addr;
uint32_t offset = task_id * count;
for (uint32_t i = 0; i < count; ++i) {
float a = src0_ptr[offset+i];
float b = src1_ptr[offset+i];
float c = a + b;
uint32_t d = (uint32_t)c;
dst_ptr[offset+i] = d;
}
}
void kernel_itof(int task_id, kernel_arg_t* arg) {
uint32_t count = arg->task_size;
int32_t* src0_ptr = (int32_t*)arg->src0_addr;
int32_t* src1_ptr = (int32_t*)arg->src1_addr;
float* dst_ptr = (float*)arg->dst_addr;
uint32_t offset = task_id * count;
for (uint32_t i = 0; i < count; ++i) {
int32_t a = src0_ptr[offset+i];
int32_t b = src1_ptr[offset+i];
int32_t c = a + b;
float d = (float)c;
dst_ptr[offset+i] = d;
}
}
void kernel_utof(int task_id, kernel_arg_t* arg) {
uint32_t count = arg->task_size;
int32_t* src0_ptr = (int32_t*)arg->src0_addr;
int32_t* src1_ptr = (int32_t*)arg->src1_addr;
float* dst_ptr = (float*)arg->dst_addr;
uint32_t offset = task_id * count;
for (uint32_t i = 0; i < count; ++i) {
int32_t a = src0_ptr[offset+i];
int32_t b = src1_ptr[offset+i];
uint32_t c = a + b;
float d = (float)c;
dst_ptr[offset+i] = d;
}
}
static const PFN_Kernel sc_tests[] = {
kernel_iadd,
kernel_imul,
kernel_idiv,
kernel_idiv_mul,
kernel_fadd,
kernel_fsub,
kernel_fmul,
kernel_fmadd,
kernel_fmsub,
kernel_fnmadd,
kernel_fnmsub,
kernel_fnmadd_madd,
kernel_fdiv,
kernel_fdiv2,
kernel_fsqrt,
kernel_ftoi,
kernel_ftou,
kernel_itof,
kernel_utof,
};
void main() {
kernel_arg_t* arg = (kernel_arg_t*)KERNEL_ARG_DEV_MEM_ADDR;
vx_spawn_tasks(arg->num_tasks, (vx_spawn_tasks_cb)sc_tests[arg->testid], arg);
}

View File

@@ -0,0 +1,396 @@
#include <stdint.h>
#include <math.h>
#include <vx_intrinsics.h>
#include <vx_spawn.h>
#include "common.h"
typedef void (*PFN_Kernel)(int task_id, kernel_arg_t* __UNIFORM__ arg);
inline float __ieee754_sqrtf (float x) {
asm ("fsqrt.s %0, %1" : "=f" (x) : "f" (x));
return x;
}
void kernel_iadd(int task_id, kernel_arg_t* __UNIFORM__ arg) {
auto count = arg->task_size;
auto src0_ptr = (int32_t*)arg->src0_addr;
auto src1_ptr = (int32_t*)arg->src1_addr;
auto dst_ptr = (int32_t*)arg->dst_addr;
auto offset = task_id * count;
for (uint32_t i = 0; i < count; ++i) {
int32_t a = src0_ptr[offset+i];
int32_t b = src1_ptr[offset+i];
int32_t c = a + b;
dst_ptr[offset+i] = c;
}
}
void kernel_imul(int task_id, kernel_arg_t* __UNIFORM__ arg) {
auto count = arg->task_size;
auto src0_ptr = (int32_t*)arg->src0_addr;
auto src1_ptr = (int32_t*)arg->src1_addr;
auto dst_ptr = (int32_t*)arg->dst_addr;
auto offset = task_id * count;
for (uint32_t i = 0; i < count; ++i) {
auto a = src0_ptr[offset+i];
auto b = src1_ptr[offset+i];
auto c = a * b;
dst_ptr[offset+i] = c;
}
}
void kernel_idiv(int task_id, kernel_arg_t* __UNIFORM__ arg) {
auto count = arg->task_size;
auto src0_ptr = (int32_t*)arg->src0_addr;
auto src1_ptr = (int32_t*)arg->src1_addr;
auto dst_ptr = (int32_t*)arg->dst_addr;
auto offset = task_id * count;
for (uint32_t i = 0; i < count; ++i) {
auto a = src0_ptr[offset+i];
auto b = src1_ptr[offset+i];
auto c = a / b;
dst_ptr[offset+i] = c;
}
}
void kernel_idiv_mul(int task_id, kernel_arg_t* __UNIFORM__ arg) {
auto count = arg->task_size;
auto src0_ptr = (int32_t*)arg->src0_addr;
auto src1_ptr = (int32_t*)arg->src1_addr;
auto dst_ptr = (int32_t*)arg->dst_addr;
auto offset = task_id * count;
for (uint32_t i = 0; i < count; ++i) {
auto a = src0_ptr[offset+i];
auto b = src1_ptr[offset+i];
auto c = a / b;
auto d = a * b;
auto e = c + d;
dst_ptr[offset+i] = e;
}
}
void kernel_fadd(int task_id, kernel_arg_t* __UNIFORM__ arg) {
auto count = arg->task_size;
auto src0_ptr = (float*)arg->src0_addr;
auto src1_ptr = (float*)arg->src1_addr;
auto dst_ptr = (float*)arg->dst_addr;
auto offset = task_id * count;
for (uint32_t i = 0; i < count; ++i) {
float a = src0_ptr[offset+i];
float b = src1_ptr[offset+i];
float c = a + b;
dst_ptr[offset+i] = c;
}
}
void kernel_fsub(int task_id, kernel_arg_t* __UNIFORM__ arg) {
auto count = arg->task_size;
auto src0_ptr = (float*)arg->src0_addr;
auto src1_ptr = (float*)arg->src1_addr;
auto dst_ptr = (float*)arg->dst_addr;
auto offset = task_id * count;
for (uint32_t i = 0; i < count; ++i) {
auto a = src0_ptr[offset+i];
auto b = src1_ptr[offset+i];
auto c = a - b;
dst_ptr[offset+i] = c;
}
}
void kernel_fmul(int task_id, kernel_arg_t* __UNIFORM__ arg) {
auto count = arg->task_size;
auto src0_ptr = (float*)arg->src0_addr;
auto src1_ptr = (float*)arg->src1_addr;
auto dst_ptr = (float*)arg->dst_addr;
auto offset = task_id * count;
for (uint32_t i = 0; i < count; ++i) {
auto a = src0_ptr[offset+i];
auto b = src1_ptr[offset+i];
auto c = a * b;
dst_ptr[offset+i] = c;
}
}
void kernel_fmadd(int task_id, kernel_arg_t* __UNIFORM__ arg) {
auto count = arg->task_size;
auto src0_ptr = (float*)arg->src0_addr;
auto src1_ptr = (float*)arg->src1_addr;
auto dst_ptr = (float*)arg->dst_addr;
auto offset = task_id * count;
for (uint32_t i = 0; i < count; ++i) {
auto a = src0_ptr[offset+i];
auto b = src1_ptr[offset+i];
auto c = a * b + b;
dst_ptr[offset+i] = c;
}
}
void kernel_fmsub(int task_id, kernel_arg_t* __UNIFORM__ arg) {
auto count = arg->task_size;
auto src0_ptr = (float*)arg->src0_addr;
auto src1_ptr = (float*)arg->src1_addr;
auto dst_ptr = (float*)arg->dst_addr;
auto offset = task_id * count;
for (uint32_t i = 0; i < count; ++i) {
auto a = src0_ptr[offset+i];
auto b = src1_ptr[offset+i];
auto c = a * b - b;
dst_ptr[offset+i] = c;
}
}
void kernel_fnmadd(int task_id, kernel_arg_t* __UNIFORM__ arg) {
auto count = arg->task_size;
auto src0_ptr = (float*)arg->src0_addr;
auto src1_ptr = (float*)arg->src1_addr;
auto dst_ptr = (float*)arg->dst_addr;
auto offset = task_id * count;
for (uint32_t i = 0; i < count; ++i) {
auto a = src0_ptr[offset+i];
auto b = src1_ptr[offset+i];
auto c =-a * b - b;
dst_ptr[offset+i] = c;
}
}
void kernel_fnmsub(int task_id, kernel_arg_t* __UNIFORM__ arg) {
auto count = arg->task_size;
auto src0_ptr = (float*)arg->src0_addr;
auto src1_ptr = (float*)arg->src1_addr;
auto dst_ptr = (float*)arg->dst_addr;
auto offset = task_id * count;
for (uint32_t i = 0; i < count; ++i) {
auto a = src0_ptr[offset+i];
auto b = src1_ptr[offset+i];
auto c =-a * b + b;
dst_ptr[offset+i] = c;
}
}
void kernel_fnmadd_madd(int task_id, kernel_arg_t* __UNIFORM__ arg) {
auto count = arg->task_size;
auto src0_ptr = (float*)arg->src0_addr;
auto src1_ptr = (float*)arg->src1_addr;
auto dst_ptr = (float*)arg->dst_addr;
auto offset = task_id * count;
for (uint32_t i = 0; i < count; ++i) {
auto a = src0_ptr[offset+i];
auto b = src1_ptr[offset+i];
auto c =-a * b - b;
auto d = a * b + b;
auto e = c + d;
dst_ptr[offset+i] = e;
}
}
void kernel_fdiv(int task_id, kernel_arg_t* __UNIFORM__ arg) {
auto count = arg->task_size;
auto src0_ptr = (float*)arg->src0_addr;
auto src1_ptr = (float*)arg->src1_addr;
auto dst_ptr = (float*)arg->dst_addr;
auto offset = task_id * count;
for (uint32_t i = 0; i < count; ++i) {
auto a = src0_ptr[offset+i];
auto b = src1_ptr[offset+i];
auto c = a / b;
dst_ptr[offset+i] = c;
}
}
void kernel_fdiv2(int task_id, kernel_arg_t* __UNIFORM__ arg) {
auto count = arg->task_size;
auto src0_ptr = (float*)arg->src0_addr;
auto src1_ptr = (float*)arg->src1_addr;
auto dst_ptr = (float*)arg->dst_addr;
auto offset = task_id * count;
for (uint32_t i = 0; i < count; ++i) {
auto a = src0_ptr[offset+i];
auto b = src1_ptr[offset+i];
auto c = a / b;
auto d = b / a;
auto e = c + d;
dst_ptr[offset+i] = e;
}
}
void kernel_fsqrt(int task_id, kernel_arg_t* __UNIFORM__ arg) {
auto count = arg->task_size;
auto src0_ptr = (float*)arg->src0_addr;
auto src1_ptr = (float*)arg->src1_addr;
auto dst_ptr = (float*)arg->dst_addr;
auto offset = task_id * count;
for (uint32_t i = 0; i < count; ++i) {
auto a = src0_ptr[offset+i];
auto b = src1_ptr[offset+i];
auto c = __ieee754_sqrtf(a * b);
dst_ptr[offset+i] = c;
}
}
void kernel_ftoi(int task_id, kernel_arg_t* __UNIFORM__ arg) {
auto count = arg->task_size;
auto src0_ptr = (float*)arg->src0_addr;
auto src1_ptr = (float*)arg->src1_addr;
auto dst_ptr = (int32_t*)arg->dst_addr;
auto offset = task_id * count;
for (uint32_t i = 0; i < count; ++i) {
auto a = src0_ptr[offset+i];
auto b = src1_ptr[offset+i];
auto c = a + b;
auto d = (int32_t)c;
dst_ptr[offset+i] = d;
}
}
void kernel_ftou(int task_id, kernel_arg_t* __UNIFORM__ arg) {
auto count = arg->task_size;
auto src0_ptr = (float*)arg->src0_addr;
auto src1_ptr = (float*)arg->src1_addr;
auto dst_ptr = (uint32_t*)arg->dst_addr;
auto offset = task_id * count;
for (uint32_t i = 0; i < count; ++i) {
auto a = src0_ptr[offset+i];
auto b = src1_ptr[offset+i];
auto c = a + b;
auto d = (uint32_t)c;
dst_ptr[offset+i] = d;
}
}
void kernel_itof(int task_id, kernel_arg_t* __UNIFORM__ arg) {
auto count = arg->task_size;
auto src0_ptr = (int32_t*)arg->src0_addr;
auto src1_ptr = (int32_t*)arg->src1_addr;
auto dst_ptr = (float*)arg->dst_addr;
auto offset = task_id * count;
for (uint32_t i = 0; i < count; ++i) {
auto a = src0_ptr[offset+i];
auto b = src1_ptr[offset+i];
auto c = a + b;
auto d = (float)c;
dst_ptr[offset+i] = d;
}
}
void kernel_utof(int task_id, kernel_arg_t* __UNIFORM__ arg) {
auto count = arg->task_size;
auto src0_ptr = (int32_t*)arg->src0_addr;
auto src1_ptr = (int32_t*)arg->src1_addr;
auto dst_ptr = (float*)arg->dst_addr;
auto offset = task_id * count;
for (uint32_t i = 0; i < count; ++i) {
auto a = src0_ptr[offset+i];
auto b = src1_ptr[offset+i];
auto c = a + b;
auto d = (float)c;
dst_ptr[offset+i] = d;
}
}
void kernel_bar(int task_id, kernel_arg_t* __UNIFORM__ arg) {
auto num_warps = vx_num_warps();
auto num_threads = vx_num_threads();
auto cid = vx_core_id();
auto wid = vx_warp_id();
auto tid = vx_thread_id();
auto src0_ptr = (uint32_t*)arg->src0_addr;
auto dst_ptr = (uint32_t*)arg->dst_addr;
// per warp delay
uint32_t barrier_stall = 0;
for (int i = 0; i <= wid; ++i) {
barrier_stall += src0_ptr[0] * src0_ptr[i];
}
// memory fence
vx_fence();
// local barrier
vx_barrier(0, num_warps);
// update destination
auto src_idx = (cid * num_warps + (num_warps - 1 - wid)) * num_threads + tid;
dst_ptr[task_id] = src0_ptr[src_idx] + barrier_stall;
}
void kernel_gbar(int task_id, kernel_arg_t* __UNIFORM__ arg) {
auto num_cores = vx_num_cores();
auto num_warps = vx_num_warps();
auto num_threads = vx_num_threads();
auto cid = vx_core_id();
auto wid = vx_warp_id();
auto tid = vx_thread_id();
auto src0_ptr = (uint32_t*)arg->src0_addr;
auto dst_ptr = (uint32_t*)arg->dst_addr;
// per core delay
uint32_t barrier_stall = 0;
for (int i = 0; i <= cid; ++i) {
for (int j = 0; j <= wid; ++j) {
barrier_stall += src0_ptr[0] * src0_ptr[i + j];
}
}
// memory fence
vx_fence();
// global barrier
vx_barrier(0x80000000, num_cores);
// update destination
auto src_idx = ((num_cores - 1 - cid) * num_warps + (num_warps - 1 - wid)) * num_threads + tid;
dst_ptr[task_id] = src0_ptr[src_idx] + barrier_stall;
}
static const PFN_Kernel sc_tests[] = {
kernel_iadd,
kernel_imul,
kernel_idiv,
kernel_idiv_mul,
kernel_fadd,
kernel_fsub,
kernel_fmul,
kernel_fmadd,
kernel_fmsub,
kernel_fnmadd,
kernel_fnmsub,
kernel_fnmadd_madd,
kernel_fdiv,
kernel_fdiv2,
kernel_fsqrt,
kernel_ftoi,
kernel_ftou,
kernel_itof,
kernel_utof,
kernel_bar,
kernel_gbar
};
int main() {
auto arg = (kernel_arg_t*)KERNEL_ARG_DEV_MEM_ADDR;
vx_spawn_tasks(arg->num_tasks, (vx_spawn_tasks_cb)sc_tests[arg->testid], arg);
return 0;
}

View File

@@ -1,109 +1,49 @@
#include <iostream>
#include <vector>
#include <unordered_set>
#include <unistd.h>
#include <string.h>
#include <vector>
#include <vortex.h>
#include <VX_config.h>
#include "testcases.h"
#include "common.h"
#define RT_CHECK(_expr) \
do { \
int _ret = _expr; \
if (0 == _ret) \
break; \
printf("Error: '%s' returned %d!\n", #_expr, (int)_ret); \
cleanup(); \
exit(-1); \
} while (false)
///////////////////////////////////////////////////////////////////////////////
class TestMngr {
public:
TestMngr() {
this->add_test("iadd", new Test_IADD());
this->add_test("imul", new Test_IMUL());
this->add_test("idiv", new Test_IDIV());
this->add_test("idiv-mul", new Test_IDIV_MUL());
#ifdef EXT_F_ENABLE
this->add_test("fadd", new Test_FADD());
this->add_test("fsub", new Test_FSUB());
this->add_test("fmul", new Test_FMUL());
this->add_test("fmadd", new Test_FMADD());
this->add_test("fmsub", new Test_FMSUB());
this->add_test("fnmadd", new Test_FNMADD());
this->add_test("fnmsub", new Test_FNMSUB());
this->add_test("fnmadd-madd", new Test_FNMADD_MADD());
this->add_test("fdiv", new Test_FDIV());
this->add_test("fdiv2", new Test_FDIV2());
this->add_test("fsqrt", new Test_FSQRT());
this->add_test("ftoi", new Test_FTOI());
this->add_test("ftou", new Test_FTOU());
this->add_test("itof", new Test_ITOF());
this->add_test("utof", new Test_UTOF());
#endif
}
~TestMngr() {
for (size_t i = 0; i < _tests.size(); ++i) {
delete _tests[i];
}
}
const std::string& get_name(int testid) const {
return _names.at(testid);
}
ITestCase* get_test(int testid) const {
return _tests.at(testid);
}
void add_test(const char* name, ITestCase* test) {
_names.push_back(name);
_tests.push_back(test);
}
size_t size() const {
return _tests.size();
}
private:
std::vector<std::string> _names;
std::vector<ITestCase*> _tests;
};
///////////////////////////////////////////////////////////////////////////////
TestMngr testMngr;
TestSuite* testSuite = nullptr;
const char* kernel_file = "kernel.bin";
int count = 0;
int count = 0;
std::unordered_set<int> included;
std::unordered_set<int> excluded;
int testid_s = 0;
int testid_e = (testMngr.size() - 1);
int testid_e = 0;
bool stop_on_error = true;
vx_device_h device = nullptr;
vx_buffer_h arg_buf = nullptr;
vx_buffer_h src1_buf = nullptr;
vx_buffer_h src2_buf = nullptr;
vx_buffer_h dst_buf = nullptr;
kernel_arg_t kernel_arg;
vx_device_h device = nullptr;
std::vector<uint8_t> arg_buf;
std::vector<uint8_t> src1_buf;
std::vector<uint8_t> src2_buf;
std::vector<uint8_t> dst_buf;
kernel_arg_t kernel_arg = {};
static void show_usage() {
std::cout << "Vortex Test." << std::endl;
std::cout << "Usage: [-t:testid] [-s:testid] [-e:testid] [-k: kernel] [-n words] [-c] [-h: help]" << std::endl;
std::cout << "Usage: [-t<testid>: selected test] [-s<testid>: start test] [-e<testid>: end test] [-x<testid>: excluded tests]" << std::endl;
std::cout << " [-k<kernel>] [-n<words>] [-c] [-h: help]" << std::endl;
}
static void parse_args(int argc, char **argv) {
int c;
while ((c = getopt(argc, argv, "n:t:s:e:k:ch?")) != -1) {
while ((c = getopt(argc, argv, "n:t:x:s:e:k:ch?")) != -1) {
switch (c) {
case 'n':
count = atoi(optarg);
break;
case 't':
testid_s = atoi(optarg);
testid_e = atoi(optarg);
included.insert(atoi(optarg));
break;
case 'x':
excluded.insert(atoi(optarg));
break;
case 's':
testid_s = atoi(optarg);
@@ -130,17 +70,8 @@ static void parse_args(int argc, char **argv) {
}
void cleanup() {
if (arg_buf) {
vx_buf_free(arg_buf);
}
if (src1_buf) {
vx_buf_free(src1_buf);
}
if (src2_buf) {
vx_buf_free(src2_buf);
}
if (dst_buf) {
vx_buf_free(dst_buf);
if (testSuite) {
delete testSuite;
}
if (device) {
vx_mem_free(device, kernel_arg.src0_addr);
@@ -152,7 +83,6 @@ void cleanup() {
int main(int argc, char *argv[]) {
int exitcode = 0;
size_t value;
// parse command arguments
parse_args(argc, argv);
@@ -171,12 +101,12 @@ int main(int argc, char *argv[]) {
std::cout << "open device connection" << std::endl;
RT_CHECK(vx_dev_open(&device));
uint64_t max_cores, max_warps, max_threads;
RT_CHECK(vx_dev_caps(device, VX_CAPS_MAX_CORES, &max_cores));
RT_CHECK(vx_dev_caps(device, VX_CAPS_MAX_WARPS, &max_warps));
RT_CHECK(vx_dev_caps(device, VX_CAPS_MAX_THREADS, &max_threads));
uint64_t num_cores, num_warps, num_threads;
RT_CHECK(vx_dev_caps(device, VX_CAPS_NUM_CORES, &num_cores));
RT_CHECK(vx_dev_caps(device, VX_CAPS_NUM_WARPS, &num_warps));
RT_CHECK(vx_dev_caps(device, VX_CAPS_NUM_THREADS, &num_threads));
int num_tasks = max_cores * max_warps * max_threads;
int num_tasks = num_cores * num_warps * num_threads;
int num_points = count * num_tasks;
size_t buf_size = num_points * sizeof(uint32_t);
@@ -188,59 +118,69 @@ int main(int argc, char *argv[]) {
RT_CHECK(vx_upload_kernel_file(device, kernel_file));
// allocate device memory
std::cout << "allocate device memory" << std::endl;
RT_CHECK(vx_mem_alloc(device, buf_size, &value));
kernel_arg.src0_addr = value;
RT_CHECK(vx_mem_alloc(device, buf_size, &value));
kernel_arg.src1_addr = value;
RT_CHECK(vx_mem_alloc(device, buf_size, &value));
kernel_arg.dst_addr = value;
std::cout << "allocate device memory" << std::endl;
RT_CHECK(vx_mem_alloc(device, buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.src0_addr));
RT_CHECK(vx_mem_alloc(device, buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.src1_addr));
RT_CHECK(vx_mem_alloc(device, buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.dst_addr));
kernel_arg.num_tasks = num_tasks;
kernel_arg.task_size = count;
std::cout << "dev_src0=" << std::hex << kernel_arg.src0_addr << std::dec << std::endl;
std::cout << "dev_src1=" << std::hex << kernel_arg.src1_addr << std::dec << std::endl;
std::cout << "dev_dst=" << std::hex << kernel_arg.dst_addr << std::dec << std::endl;
std::cout << "dev_src0=0x" << std::hex << kernel_arg.src0_addr << std::dec << std::endl;
std::cout << "dev_src1=0x" << std::hex << kernel_arg.src1_addr << std::dec << std::endl;
std::cout << "dev_dst=0x" << std::hex << kernel_arg.dst_addr << std::dec << std::endl;
// allocate shared memory
std::cout << "allocate shared memory" << std::endl;
RT_CHECK(vx_buf_alloc(device, sizeof(kernel_arg_t), &arg_buf));
RT_CHECK(vx_buf_alloc(device, buf_size, &src1_buf));
RT_CHECK(vx_buf_alloc(device, buf_size, &src2_buf));
RT_CHECK(vx_buf_alloc(device, buf_size, &dst_buf));
// allocate staging buffer
std::cout << "allocate staging buffer" << std::endl;
arg_buf.resize(sizeof(kernel_arg_t));
src1_buf.resize(buf_size);
src2_buf.resize(buf_size);
dst_buf.resize(buf_size);
// allocate test suite
testSuite = new TestSuite(device);
if (testid_e == 0) {
testid_e = (testSuite->size() - 1);
}
// execute tests
for (int t = testid_s; t <= testid_e; ++t) {
auto name = testMngr.get_name(t);
auto test = testMngr.get_test(t);
if (!included.empty()) {
if (included.count(t) == 0)
continue;
}
if (!excluded.empty()) {
if (excluded.count(t) != 0)
continue;
}
auto test = testSuite->get_test(t);
auto name = test->name();
std::cout << "Test" << t << ": " << name << std::endl;
// upload kernel argument
std::cout << "upload kernel argument" << std::endl;
kernel_arg.testid = t;
memcpy((void*)vx_host_ptr(arg_buf), &kernel_arg, sizeof(kernel_arg_t));
RT_CHECK(vx_copy_to_dev(arg_buf, KERNEL_ARG_DEV_MEM_ADDR, sizeof(kernel_arg_t), 0));
memcpy(arg_buf.data(), &kernel_arg, sizeof(kernel_arg_t));
RT_CHECK(vx_copy_to_dev(device, KERNEL_ARG_DEV_MEM_ADDR, arg_buf.data(), sizeof(kernel_arg_t)));
// get test arguments
std::cout << "get test arguments" << std::endl;
test->setup(num_points, (void*)vx_host_ptr(src1_buf), (void*)vx_host_ptr(src2_buf));
RT_CHECK(test->setup(num_points, (void*)src1_buf.data(), (void*)src2_buf.data()));
// upload source buffer0
std::cout << "upload source buffer0" << std::endl;
RT_CHECK(vx_copy_to_dev(src1_buf, kernel_arg.src0_addr, buf_size, 0));
RT_CHECK(vx_copy_to_dev(device, kernel_arg.src0_addr, src1_buf.data(), buf_size));
// upload source buffer1
std::cout << "upload source buffer1" << std::endl;
RT_CHECK(vx_copy_to_dev(src2_buf, kernel_arg.src1_addr, buf_size, 0));
std::cout << "upload source buffer1" << std::endl;
RT_CHECK(vx_copy_to_dev(device, kernel_arg.src1_addr, src2_buf.data(), buf_size));
// clear destination buffer
std::cout << "clear destination buffer" << std::endl;
for (int i = 0; i < num_points; ++i) {
((uint32_t*)vx_host_ptr(dst_buf))[i] = 0xdeadbeef;
((uint32_t*)dst_buf.data())[i] = 0xdeadbeef;
}
RT_CHECK(vx_copy_to_dev(dst_buf, kernel_arg.dst_addr, buf_size, 0));
RT_CHECK(vx_copy_to_dev(device, kernel_arg.dst_addr, dst_buf.data(), buf_size));
// start device
std::cout << "start device" << std::endl;
@@ -248,18 +188,15 @@ int main(int argc, char *argv[]) {
// wait for completion
std::cout << "wait for completion" << std::endl;
RT_CHECK(vx_ready_wait(device, MAX_TIMEOUT));
RT_CHECK(vx_ready_wait(device, VX_MAX_TIMEOUT));
// download destination buffer
std::cout << "download destination buffer" << std::endl;
RT_CHECK(vx_copy_from_dev(dst_buf, kernel_arg.dst_addr, buf_size, 0));
RT_CHECK(vx_copy_from_dev(device, dst_buf.data(), kernel_arg.dst_addr, buf_size));
// verify destination
std::cout << "verify test result" << std::endl;
int errors = test->verify(num_points,
(void*)vx_host_ptr(dst_buf),
(void*)vx_host_ptr(src1_buf),
(void*)vx_host_ptr(src2_buf));
int errors = test->verify(num_points, dst_buf.data(), src1_buf.data(), src2_buf.data());
if (errors != 0) {
std::cout << "found " << std::dec << errors << " errors!" << std::endl;
std::cout << "Test" << t << "-" << name << " FAILED!" << std::endl << std::flush;

View File

@@ -3,6 +3,19 @@
#include <iostream>
#include <math.h>
#include <limits>
#include <assert.h>
void cleanup();
#define RT_CHECK(_expr) \
do { \
int _ret = _expr; \
if (0 == _ret) \
break; \
printf("Error: '%s' returned %d!\n", #_expr, (int)_ret); \
cleanup(); \
exit(-1); \
} while (false)
union Float_t {
float f;
@@ -47,33 +60,72 @@ inline bool almost_equal(float a, float b) {
return almost_equal_ulp(a, b);
}
class ITestCase;
class TestSuite {
public:
TestSuite(vx_device_h device);
~TestSuite();
ITestCase* get_test(int testid) const;
void add_test(ITestCase* test);
size_t size() const;
vx_device_h device() const;
private:
std::vector<ITestCase*> _tests;
vx_device_h device_;
};
class ITestCase {
public:
ITestCase() {}
ITestCase(TestSuite* suite, const char* name)
: suite_(suite)
, name_(name)
{}
virtual ~ITestCase() {}
virtual void setup(int n, void* src1, void* src2) = 0;
virtual int verify(int n, void* dst, const void* src1, const void* src2) = 0;
TestSuite* suite() const {
return suite_;
}
const char* name() const {
return name_;
}
virtual int setup(uint32_t n, void* src1, void* src2) = 0;
virtual int verify(uint32_t n, void* dst, const void* src1, const void* src2) = 0;
protected:
TestSuite* suite_;
const char* const name_;
};
class Test_IADD : public ITestCase {
public:
Test_IADD(TestSuite* suite) : ITestCase(suite, "iadd") {}
void setup(int n, void* src1, void* src2) override {
int setup(uint32_t n, void* src1, void* src2) override {
auto a = (int32_t*)src1;
auto b = (int32_t*)src2;
for (int i = 0; i < n; ++i) {
for (uint32_t i = 0; i < n; ++i) {
a[i] = n/2 - i;
b[i] = n/2 + i;
}
return 0;
}
int verify(int n, void* dst, const void* src1, const void* src2) override {
int verify(uint32_t n, void* dst, const void* src1, const void* src2) override {
int errors = 0;
auto a = (int32_t*)src1;
auto b = (int32_t*)src2;
auto c = (int32_t*)dst;
for (int i = 0; i < n; ++i) {
for (uint32_t i = 0; i < n; ++i) {
auto ref = a[i] + b[i];
if (c[i] != ref) {
std::cout << "error at result #" << i << ": expected=" << ref << ", actual=" << c[i] << ", a=" << a[i] << ", b=" << b[i] << std::endl;
@@ -86,22 +138,24 @@ public:
class Test_IMUL : public ITestCase {
public:
Test_IMUL(TestSuite* suite) : ITestCase(suite, "imul") {}
void setup(int n, void* src1, void* src2) override {
int setup(uint32_t n, void* src1, void* src2) override {
auto a = (int32_t*)src1;
auto b = (int32_t*)src2;
for (int i = 0; i < n; ++i) {
for (uint32_t i = 0; i < n; ++i) {
a[i] = n/2 - i;
b[i] = n/2 + i;
}
return 0;
}
int verify(int n, void* dst, const void* src1, const void* src2) override {
int verify(uint32_t n, void* dst, const void* src1, const void* src2) override {
int errors = 0;
auto a = (int32_t*)src1;
auto b = (int32_t*)src2;
auto c = (int32_t*)dst;
for (int i = 0; i < n; ++i) {
for (uint32_t i = 0; i < n; ++i) {
auto ref = a[i] * b[i];
if (c[i] != ref) {
std::cout << "error at result #" << i << ": expected=" << ref << ", actual=" << c[i] << ", a=" << a[i] << ", b=" << b[i] << std::endl;
@@ -114,22 +168,24 @@ public:
class Test_IDIV : public ITestCase {
public:
Test_IDIV(TestSuite* suite) : ITestCase(suite, "idiv") {}
void setup(int n, void* src1, void* src2) override {
int setup(uint32_t n, void* src1, void* src2) override {
auto a = (int32_t*)src1;
auto b = (int32_t*)src2;
for (int i = 0; i < n; ++i) {
for (uint32_t i = 0; i < n; ++i) {
a[i] = n/2 - i;
b[i] = n/2 + i;
}
return 0;
}
int verify(int n, void* dst, const void* src1, const void* src2) override {
int verify(uint32_t n, void* dst, const void* src1, const void* src2) override {
int errors = 0;
auto a = (int32_t*)src1;
auto b = (int32_t*)src2;
auto c = (int32_t*)dst;
for (int i = 0; i < n; ++i) {
for (uint32_t i = 0; i < n; ++i) {
auto ref = a[i] / b[i];
if (c[i] != ref) {
std::cout << "error at result #" << i << ": expected=" << ref << ", actual=" << c[i] << ", a=" << a[i] << ", b=" << b[i] << std::endl;
@@ -142,22 +198,24 @@ public:
class Test_IDIV_MUL : public ITestCase {
public:
Test_IDIV_MUL(TestSuite* suite) : ITestCase(suite, "idiv-mul") {}
void setup(int n, void* src1, void* src2) override {
int setup(uint32_t n, void* src1, void* src2) override {
auto a = (int32_t*)src1;
auto b = (int32_t*)src2;
for (int i = 0; i < n; ++i) {
for (uint32_t i = 0; i < n; ++i) {
a[i] = n/2 - i;
b[i] = n/2 + i;
}
return 0;
}
int verify(int n, void* dst, const void* src1, const void* src2) override {
int verify(uint32_t n, void* dst, const void* src1, const void* src2) override {
int errors = 0;
auto a = (int32_t*)src1;
auto b = (int32_t*)src2;
auto c = (int32_t*)dst;
for (int i = 0; i < n; ++i) {
for (uint32_t i = 0; i < n; ++i) {
auto x = a[i] / b[i];
auto y = a[i] * b[i];
auto ref = x + y;
@@ -172,22 +230,24 @@ public:
class Test_FADD : public ITestCase {
public:
Test_FADD(TestSuite* suite) : ITestCase(suite, "fadd") {}
void setup(int n, void* src1, void* src2) override {
int setup(uint32_t n, void* src1, void* src2) override {
auto a = (float*)src1;
auto b = (float*)src2;
for (int i = 0; i < n; ++i) {
for (uint32_t i = 0; i < n; ++i) {
a[i] = fround((n - i) * (1.0f/n));
b[i] = fround((n + i) * (1.0f/n));
}
return 0;
}
int verify(int n, void* dst, const void* src1, const void* src2) override {
int verify(uint32_t n, void* dst, const void* src1, const void* src2) override {
int errors = 0;
auto a = (float*)src1;
auto b = (float*)src2;
auto c = (float*)dst;
for (int i = 0; i < n; ++i) {
for (uint32_t i = 0; i < n; ++i) {
auto ref = a[i] + b[i];
if (!almost_equal(c[i], ref)) {
std::cout << "error at result #" << i << ": expected=" << ref << ", actual=" << c[i] << ", a=" << a[i] << ", b=" << b[i] << std::endl;
@@ -200,22 +260,24 @@ public:
class Test_FSUB : public ITestCase {
public:
Test_FSUB(TestSuite* suite) : ITestCase(suite, "fsub") {}
void setup(int n, void* src1, void* src2) override {
int setup(uint32_t n, void* src1, void* src2) override {
auto a = (float*)src1;
auto b = (float*)src2;
for (int i = 0; i < n; ++i) {
for (uint32_t i = 0; i < n; ++i) {
a[i] = fround((n - i) * (1.0f/n));
b[i] = fround((n + i) * (1.0f/n));
}
return 0;
}
int verify(int n, void* dst, const void* src1, const void* src2) override {
int verify(uint32_t n, void* dst, const void* src1, const void* src2) override {
int errors = 0;
auto a = (float*)src1;
auto b = (float*)src2;
auto c = (float*)dst;
for (int i = 0; i < n; ++i) {
for (uint32_t i = 0; i < n; ++i) {
auto ref = a[i] - b[i];
if (!almost_equal(c[i], ref)) {
std::cout << "error at result #" << i << ": expected=" << ref << ", actual=" << c[i] << ", a=" << a[i] << ", b=" << b[i] << std::endl;
@@ -228,22 +290,24 @@ public:
class Test_FMUL : public ITestCase {
public:
Test_FMUL(TestSuite* suite) : ITestCase(suite, "fmul") {}
void setup(int n, void* src1, void* src2) override {
int setup(uint32_t n, void* src1, void* src2) override {
auto a = (float*)src1;
auto b = (float*)src2;
for (int i = 0; i < n; ++i) {
for (uint32_t i = 0; i < n; ++i) {
a[i] = fround((n - i) * (1.0f/n));
b[i] = fround((n + i) * (1.0f/n));
}
return 0;
}
int verify(int n, void* dst, const void* src1, const void* src2) override {
int verify(uint32_t n, void* dst, const void* src1, const void* src2) override {
int errors = 0;
auto a = (float*)src1;
auto b = (float*)src2;
auto c = (float*)dst;
for (int i = 0; i < n; ++i) {
for (uint32_t i = 0; i < n; ++i) {
auto ref = a[i] * b[i];
if (!almost_equal(c[i], ref)) {
std::cout << "error at result #" << i << ": expected=" << ref << ", actual=" << c[i] << ", a=" << a[i] << ", b=" << b[i] << std::endl;
@@ -256,22 +320,24 @@ public:
class Test_FMADD : public ITestCase {
public:
Test_FMADD(TestSuite* suite) : ITestCase(suite, "fmadd") {}
void setup(int n, void* src1, void* src2) override {
int setup(uint32_t n, void* src1, void* src2) override {
auto a = (float*)src1;
auto b = (float*)src2;
for (int i = 0; i < n; ++i) {
for (uint32_t i = 0; i < n; ++i) {
a[i] = fround((n - i) * (1.0f/n));
b[i] = fround((n + i) * (1.0f/n));
}
return 0;
}
int verify(int n, void* dst, const void* src1, const void* src2) override {
int verify(uint32_t n, void* dst, const void* src1, const void* src2) override {
int errors = 0;
auto a = (float*)src1;
auto b = (float*)src2;
auto c = (float*)dst;
for (int i = 0; i < n; ++i) {
for (uint32_t i = 0; i < n; ++i) {
auto ref = a[i] * b[i] + b[i];
if (!almost_equal(c[i], ref)) {
std::cout << "error at result #" << i << ": expected=" << ref << ", actual=" << c[i] << ", a=" << a[i] << ", b=" << b[i] << std::endl;
@@ -284,22 +350,24 @@ public:
class Test_FMSUB : public ITestCase {
public:
Test_FMSUB(TestSuite* suite) : ITestCase(suite, "fmsub") {}
void setup(int n, void* src1, void* src2) override {
int setup(uint32_t n, void* src1, void* src2) override {
auto a = (float*)src1;
auto b = (float*)src2;
for (int i = 0; i < n; ++i) {
for (uint32_t i = 0; i < n; ++i) {
a[i] = fround((n - i) * (1.0f/n));
b[i] = fround((n + i) * (1.0f/n));
}
return 0;
}
int verify(int n, void* dst, const void* src1, const void* src2) override {
int verify(uint32_t n, void* dst, const void* src1, const void* src2) override {
int errors = 0;
auto a = (float*)src1;
auto b = (float*)src2;
auto c = (float*)dst;
for (int i = 0; i < n; ++i) {
for (uint32_t i = 0; i < n; ++i) {
auto ref = a[i] * b[i] - b[i];
if (!almost_equal(c[i], ref)) {
std::cout << "error at result #" << i << ": expected=" << ref << ", actual=" << c[i] << ", a=" << a[i] << ", b=" << b[i] << std::endl;
@@ -312,22 +380,24 @@ public:
class Test_FNMADD : public ITestCase {
public:
Test_FNMADD(TestSuite* suite) : ITestCase(suite, "fnmadd") {}
void setup(int n, void* src1, void* src2) override {
int setup(uint32_t n, void* src1, void* src2) override {
auto a = (float*)src1;
auto b = (float*)src2;
for (int i = 0; i < n; ++i) {
for (uint32_t i = 0; i < n; ++i) {
a[i] = fround((n - i) * (1.0f/n));
b[i] = fround((n + i) * (1.0f/n));
}
return 0;
}
int verify(int n, void* dst, const void* src1, const void* src2) override {
int verify(uint32_t n, void* dst, const void* src1, const void* src2) override {
int errors = 0;
auto a = (float*)src1;
auto b = (float*)src2;
auto c = (float*)dst;
for (int i = 0; i < n; ++i) {
for (uint32_t i = 0; i < n; ++i) {
auto ref = -a[i] * b[i] - b[i];
if (!almost_equal(c[i], ref)) {
std::cout << "error at result #" << i << ": expected=" << ref << ", actual=" << c[i] << ", a=" << a[i] << ", b=" << b[i] << std::endl;
@@ -340,22 +410,24 @@ public:
class Test_FNMSUB : public ITestCase {
public:
Test_FNMSUB(TestSuite* suite) : ITestCase(suite, "fnmsub") {}
void setup(int n, void* src1, void* src2) override {
int setup(uint32_t n, void* src1, void* src2) override {
auto a = (float*)src1;
auto b = (float*)src2;
for (int i = 0; i < n; ++i) {
for (uint32_t i = 0; i < n; ++i) {
a[i] = fround((n - i) * (1.0f/n));
b[i] = fround((n + i) * (1.0f/n));
}
return 0;
}
int verify(int n, void* dst, const void* src1, const void* src2) override {
int verify(uint32_t n, void* dst, const void* src1, const void* src2) override {
int errors = 0;
auto a = (float*)src1;
auto b = (float*)src2;
auto c = (float*)dst;
for (int i = 0; i < n; ++i) {
for (uint32_t i = 0; i < n; ++i) {
auto ref = -a[i] * b[i] + b[i];
if (!almost_equal(c[i], ref)) {
std::cout << "error at result #" << i << ": expected=" << ref << ", actual=" << c[i] << ", a=" << a[i] << ", b=" << b[i] << std::endl;
@@ -368,22 +440,24 @@ public:
class Test_FNMADD_MADD : public ITestCase {
public:
Test_FNMADD_MADD(TestSuite* suite) : ITestCase(suite, "fnmadd-madd") {}
void setup(int n, void* src1, void* src2) override {
int setup(uint32_t n, void* src1, void* src2) override {
auto a = (float*)src1;
auto b = (float*)src2;
for (int i = 0; i < n; ++i) {
for (uint32_t i = 0; i < n; ++i) {
a[i] = fround((n - i) * (1.0f/n));
b[i] = fround((n + i) * (1.0f/n));
}
return 0;
}
int verify(int n, void* dst, const void* src1, const void* src2) override {
int verify(uint32_t n, void* dst, const void* src1, const void* src2) override {
int errors = 0;
auto a = (float*)src1;
auto b = (float*)src2;
auto c = (float*)dst;
for (int i = 0; i < n; ++i) {
for (uint32_t i = 0; i < n; ++i) {
auto x = -a[i] * b[i] - b[i];
auto y = a[i] * b[i] + b[i];
auto ref = x + y;
@@ -398,22 +472,24 @@ public:
class Test_FDIV : public ITestCase {
public:
Test_FDIV(TestSuite* suite) : ITestCase(suite, "fdiv") {}
void setup(int n, void* src1, void* src2) override {
int setup(uint32_t n, void* src1, void* src2) override {
auto a = (float*)src1;
auto b = (float*)src2;
for (int i = 0; i < n; ++i) {
for (uint32_t i = 0; i < n; ++i) {
a[i] = fround((n - i) * (1.0f/n));
b[i] = fround((n + i) * (1.0f/n));
}
return 0;
}
int verify(int n, void* dst, const void* src1, const void* src2) override {
int verify(uint32_t n, void* dst, const void* src1, const void* src2) override {
int errors = 0;
auto a = (float*)src1;
auto b = (float*)src2;
auto c = (float*)dst;
for (int i = 0; i < n; ++i) {
for (uint32_t i = 0; i < n; ++i) {
auto ref = a[i] / b[i];
if (!almost_equal(c[i], ref)) {
std::cout << "error at result #" << i << ": expected=" << ref << ", actual=" << c[i] << ", a=" << a[i] << ", b=" << b[i] << std::endl;
@@ -426,22 +502,24 @@ public:
class Test_FDIV2 : public ITestCase {
public:
Test_FDIV2(TestSuite* suite) : ITestCase(suite, "fdiv2") {}
void setup(int n, void* src1, void* src2) override {
int setup(uint32_t n, void* src1, void* src2) override {
auto a = (float*)src1;
auto b = (float*)src2;
for (int i = 0; i < n; ++i) {
for (uint32_t i = 0; i < n; ++i) {
a[i] = fround((n - i) * (1.0f/n));
b[i] = fround((n + i) * (1.0f/n));
}
return 0;
}
int verify(int n, void* dst, const void* src1, const void* src2) override {
int verify(uint32_t n, void* dst, const void* src1, const void* src2) override {
int errors = 0;
auto a = (float*)src1;
auto b = (float*)src2;
auto c = (float*)dst;
for (int i = 0; i < n; ++i) {
for (uint32_t i = 0; i < n; ++i) {
auto x = a[i] / b[i];
auto y = b[i] / a[i];
auto ref = x + y;
@@ -456,23 +534,25 @@ public:
class Test_FSQRT : public ITestCase {
public:
Test_FSQRT(TestSuite* suite) : ITestCase(suite, "fsqrt") {}
void setup(int n, void* src1, void* src2) override {
int setup(uint32_t n, void* src1, void* src2) override {
auto a = (float*)src1;
auto b = (float*)src2;
for (int i = 0; i < n; ++i) {
for (uint32_t i = 0; i < n; ++i) {
float q = 1.0f + (i % 64);
a[i] = q;
b[i] = q;
}
return 0;
}
int verify(int n, void* dst, const void* src1, const void* src2) override {
int verify(uint32_t n, void* dst, const void* src1, const void* src2) override {
int errors = 0;
auto a = (float*)src1;
auto b = (float*)src2;
auto c = (float*)dst;
for (int i = 0; i < n; ++i) {
for (uint32_t i = 0; i < n; ++i) {
auto ref = sqrt(a[i] * b[i]);
if (!almost_equal(c[i], ref)) {
std::cout << "error at result #" << i << ": expected=" << ref << ", actual=" << c[i] << ", a=" << a[i] << ", b=" << b[i] << std::endl;
@@ -485,22 +565,25 @@ public:
class Test_FTOI : public ITestCase {
public:
Test_FTOI(TestSuite* suite) : ITestCase(suite, "ftoi") {}
void setup(int n, void* src1, void* src2) override {
int setup(uint32_t n, void* src1, void* src2) override {
auto a = (float*)src1;
auto b = (float*)src2;
for (int i = 0; i < n; ++i) {
a[i] = fround((n/2 - i) + (float(i)/n));
b[i] = fround((n/2 - i) + (float(i)/n));
for (uint32_t i = 0; i < n; ++i) {
float q = fround(float(n/2) - i + (float(i) / n));
a[i] = q;
b[i] = q;
}
return 0;
}
int verify(int n, void* dst, const void* src1, const void* src2) override {
int verify(uint32_t n, void* dst, const void* src1, const void* src2) override {
int errors = 0;
auto a = (float*)src1;
auto b = (float*)src2;
auto c = (int32_t*)dst;
for (int i = 0; i < n; ++i) {
for (uint32_t i = 0; i < n; ++i) {
auto x = a[i] + b[i];
auto ref = (int32_t)x;
if (c[i] != ref) {
@@ -514,22 +597,25 @@ public:
class Test_FTOU : public ITestCase {
public:
Test_FTOU(TestSuite* suite) : ITestCase(suite, "ftou") {}
void setup(int n, void* src1, void* src2) override {
int setup(uint32_t n, void* src1, void* src2) override {
auto a = (float*)src1;
auto b = (float*)src2;
for (int i = 0; i < n; ++i) {
a[i] = fround(i + (float(i)/n));
b[i] = fround(i + (float(i)/n));
for (uint32_t i = 0; i < n; ++i) {
float q = fround(i + (float(i) / n));
a[i] = q;
b[i] = q;
}
return 0;
}
int verify(int n, void* dst, const void* src1, const void* src2) override {
int verify(uint32_t n, void* dst, const void* src1, const void* src2) override {
int errors = 0;
auto a = (float*)src1;
auto b = (float*)src2;
auto c = (uint32_t*)dst;
for (int i = 0; i < n; ++i) {
for (uint32_t i = 0; i < n; ++i) {
auto x = a[i] + b[i];
auto ref = (uint32_t)x;
if (c[i] != ref) {
@@ -543,22 +629,24 @@ public:
class Test_ITOF : public ITestCase {
public:
Test_ITOF(TestSuite* suite) : ITestCase(suite, "itof") {}
void setup(int n, void* src1, void* src2) override {
int setup(uint32_t n, void* src1, void* src2) override {
auto a = (int32_t*)src1;
auto b = (int32_t*)src2;
for (int i = 0; i < n; ++i) {
for (uint32_t i = 0; i < n; ++i) {
a[i] = n/2 - i;
b[i] = n/2 - i;
}
return 0;
}
int verify(int n, void* dst, const void* src1, const void* src2) override {
int verify(uint32_t n, void* dst, const void* src1, const void* src2) override {
int errors = 0;
auto a = (int32_t*)src1;
auto b = (int32_t*)src2;
auto c = (float*)dst;
for (int i = 0; i < n; ++i) {
for (uint32_t i = 0; i < n; ++i) {
auto x = a[i] + b[i];
auto ref = (float)x;
if (!almost_equal(c[i], ref)) {
@@ -572,22 +660,24 @@ public:
class Test_UTOF : public ITestCase {
public:
Test_UTOF(TestSuite* suite) : ITestCase(suite, "utof") {}
void setup(int n, void* src1, void* src2) override {
int setup(uint32_t n, void* src1, void* src2) override {
auto a = (uint32_t*)src1;
auto b = (uint32_t*)src2;
for (int i = 0; i < n; ++i) {
for (uint32_t i = 0; i < n; ++i) {
a[i] = i;
b[i] = i;
}
return 0;
}
int verify(int n, void* dst, const void* src1, const void* src2) override {
int verify(uint32_t n, void* dst, const void* src1, const void* src2) override {
int errors = 0;
auto a = (uint32_t*)src1;
auto b = (uint32_t*)src2;
auto c = (float*)dst;
for (int i = 0; i < n; ++i) {
for (uint32_t i = 0; i < n; ++i) {
auto x = a[i] + b[i];
auto ref = (float)x;
if (!almost_equal(c[i], ref)) {
@@ -597,4 +687,135 @@ public:
}
return errors;
}
};
};
class Test_BAR : public ITestCase {
public:
Test_BAR(TestSuite* suite) : ITestCase(suite, "bar") {}
int setup(uint32_t n, void* src1, void* /*src2*/) override {
RT_CHECK(vx_dev_caps(suite_->device(), VX_CAPS_NUM_WARPS, &num_warps_));
if (num_warps_ == 1) {
std::cout << "Error: multiple warps configuration required!" << std::endl;
return -1;
}
RT_CHECK(vx_dev_caps(suite_->device(), VX_CAPS_NUM_THREADS, &num_threads_));
auto a = (uint32_t*)src1;
for (uint32_t i = 0; i < n; ++i) {
a[i] = i;
}
return 0;
}
int verify(uint32_t n, void* dst, const void* src1, const void* /*src2*/) override {
int errors = 0;
auto a = (uint32_t*)src1;
auto c = (uint32_t*)dst;
for (uint32_t i = 0; i < n; ++i) {
auto tid = i % num_threads_;
auto wid = (i / num_threads_) % num_warps_;
auto cid = i / (num_warps_ * num_threads_);
auto src_idx = (cid * num_warps_ + (num_warps_ - 1 - wid)) * num_threads_ + tid;
uint32_t ref = a[src_idx];
if (c[i] != ref) {
std::cout << "error at result #" << i << ": expected=" << std::hex << ref << ", actual=" << c[i] << std::endl;
++errors;
}
}
return errors;
}
uint64_t num_warps_;
uint64_t num_threads_;
};
class Test_GBAR : public ITestCase {
public:
Test_GBAR(TestSuite* suite) : ITestCase(suite, "gbar") {}
int setup(uint32_t n, void* src1, void* /*src2*/) override {
RT_CHECK(vx_dev_caps(suite_->device(), VX_CAPS_NUM_CORES, &num_cores_));
if (num_cores_ == 1) {
std::cout << "Error: multiple cores configuration required!" << std::endl;
return -1;
}
RT_CHECK(vx_dev_caps(suite_->device(), VX_CAPS_NUM_WARPS, &num_warps_));
RT_CHECK(vx_dev_caps(suite_->device(), VX_CAPS_NUM_THREADS, &num_threads_));
auto a = (uint32_t*)src1;
for (uint32_t i = 0; i < n; ++i) {
a[i] = i;
}
return 0;
}
int verify(uint32_t n, void* dst, const void* src1, const void* /*src2*/) override {
int errors = 0;
auto a = (uint32_t*)src1;
auto c = (uint32_t*)dst;
for (uint32_t i = 0; i < n; ++i) {
auto tid = i % num_threads_;
auto wid = (i / num_threads_) % num_warps_;
auto cid = i / (num_warps_ * num_threads_);
auto src_idx = ((num_cores_ - 1 - cid) * num_warps_ + (num_warps_ - 1 - wid)) * num_threads_ + tid;
uint32_t ref = a[src_idx];
if (c[i] != ref) {
std::cout << "error at result #" << i << ": expected=" << std::hex << ref << ", actual=" << c[i] << std::endl;
++errors;
}
}
return errors;
}
uint64_t num_cores_;
uint64_t num_warps_;
uint64_t num_threads_;
};
///////////////////////////////////////////////////////////////////////////////
TestSuite::TestSuite(vx_device_h device)
: device_(device) {
this->add_test(new Test_IADD(this));
this->add_test(new Test_IMUL(this));
this->add_test(new Test_IDIV(this));
this->add_test(new Test_IDIV_MUL(this));
this->add_test(new Test_FADD(this));
this->add_test(new Test_FSUB(this));
this->add_test(new Test_FMUL(this));
this->add_test(new Test_FMADD(this));
this->add_test(new Test_FMSUB(this));
this->add_test(new Test_FNMADD(this));
this->add_test(new Test_FNMSUB(this));
this->add_test(new Test_FNMADD_MADD(this));
this->add_test(new Test_FDIV(this));
this->add_test(new Test_FDIV2(this));
this->add_test(new Test_FSQRT(this));
this->add_test(new Test_FTOI(this));
this->add_test(new Test_FTOU(this));
this->add_test(new Test_ITOF(this));
this->add_test(new Test_UTOF(this));
this->add_test(new Test_BAR(this));
this->add_test(new Test_GBAR(this));
}
TestSuite::~TestSuite() {
for (size_t i = 0; i < _tests.size(); ++i) {
delete _tests[i];
}
}
ITestCase* TestSuite::get_test(int testid) const {
return _tests.at(testid);
}
void TestSuite::add_test(ITestCase* test) {
_tests.push_back(test);
}
size_t TestSuite::size() const {
return _tests.size();
}
vx_device_h TestSuite::device() const {
return device_;
}

View File

@@ -1,78 +1,9 @@
XLEN ?= 32
RISCV_TOOLCHAIN_PATH ?= /opt/riscv-gnu-toolchain
VORTEX_DRV_PATH ?= $(realpath ../../../driver)
VORTEX_RT_PATH ?= $(realpath ../../../runtime)
OPTS ?= -n64
VX_CC = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-gcc
VX_CXX = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-g++
VX_DP = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-objdump
VX_CP = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-objcopy
VX_CFLAGS += -march=rv32imf -mabi=ilp32f -O3 -Wstack-usage=1024 -ffreestanding -nostartfiles -fdata-sections -ffunction-sections
VX_CFLAGS += -I$(VORTEX_RT_PATH)/include -I$(VORTEX_RT_PATH)/../hw
VX_LDFLAGS += -Wl,-Bstatic,-T,$(VORTEX_RT_PATH)/linker/vx_link$(XLEN).ld -Wl,--gc-sections $(VORTEX_RT_PATH)/libvortexrt.a
VX_SRCS = kernel.c
CXXFLAGS += -std=c++11 -Wall -Wextra -pedantic -Wfatal-errors
CXXFLAGS += -I$(VORTEX_DRV_PATH)/include
LDFLAGS += -L$(VORTEX_DRV_PATH)/stub -lvortex
# Debugigng
ifdef DEBUG
CXXFLAGS += -g -O0
else
CXXFLAGS += -O2 -DNDEBUG
endif
PROJECT = fence
SRCS = main.cpp
all: $(PROJECT) kernel.bin kernel.dump
kernel.dump: kernel.elf
$(VX_DP) -D kernel.elf > kernel.dump
VX_SRCS = kernel.cpp
kernel.bin: kernel.elf
$(VX_CP) -O binary kernel.elf kernel.bin
OPTS ?= -n64
kernel.elf: $(VX_SRCS)
$(VX_CC) $(VX_CFLAGS) $(VX_SRCS) $(VX_LDFLAGS) -o kernel.elf
$(PROJECT): $(SRCS)
$(CXX) $(CXXFLAGS) $^ $(LDFLAGS) -o $@
run-simx: $(PROJECT) kernel.bin
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/simx:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
run-fpga: $(PROJECT) kernel.bin
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/fpga:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
run-asesim: $(PROJECT) kernel.bin
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/asesim:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
run-vlsim: $(PROJECT) kernel.bin
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/vlsim:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
run-rtlsim: $(PROJECT) kernel.bin
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/rtlsim:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
.depend: $(SRCS)
$(CXX) $(CXXFLAGS) -MM $^ > .depend;
clean:
rm -rf $(PROJECT) *.o .depend
clean-all: clean
rm -rf *.elf *.bin *.dump
ifneq ($(MAKECMDGOALS),clean)
-include .depend
endif
include ../common.mk

View File

@@ -6,9 +6,9 @@
typedef struct {
uint32_t num_tasks;
uint32_t task_size;
uint32_t src0_addr;
uint32_t src1_addr;
uint32_t dst_addr;
uint64_t src0_addr;
uint64_t src1_addr;
uint64_t dst_addr;
} kernel_arg_t;
#endif

View File

@@ -3,14 +3,13 @@
#include <vx_spawn.h>
#include "common.h"
void kernel_body(int task_id, kernel_arg_t* arg) {
void kernel_body(int task_id, kernel_arg_t* __UNIFORM__ arg) {
uint32_t count = arg->task_size;
int32_t* src0_ptr = (int32_t*)arg->src0_addr;
int32_t* src1_ptr = (int32_t*)arg->src1_addr;
int32_t* dst_ptr = (int32_t*)arg->dst_addr;
uint32_t offset = task_id * count;
for (uint32_t i = 0; i < count; ++i) {
dst_ptr[offset+i] = src0_ptr[offset+i] + src1_ptr[offset+i];
}
@@ -18,7 +17,8 @@ void kernel_body(int task_id, kernel_arg_t* arg) {
vx_fence();
}
void main() {
int main() {
kernel_arg_t* arg = (kernel_arg_t*)KERNEL_ARG_DEV_MEM_ADDR;
vx_spawn_tasks(arg->num_tasks, (vx_spawn_tasks_cb)kernel_body, arg);
}
return 0;
}

View File

@@ -1,6 +1,7 @@
#include <iostream>
#include <unistd.h>
#include <string.h>
#include <vector>
#include <vortex.h>
#include "common.h"
@@ -20,8 +21,8 @@ const char* kernel_file = "kernel.bin";
uint32_t count = 0;
vx_device_h device = nullptr;
vx_buffer_h staging_buf = nullptr;
kernel_arg_t kernel_arg;
std::vector<uint8_t> staging_buf;
kernel_arg_t kernel_arg = {};
static void show_usage() {
std::cout << "Vortex Test." << std::endl;
@@ -51,9 +52,6 @@ static void parse_args(int argc, char **argv) {
}
void cleanup() {
if (staging_buf) {
vx_buf_free(staging_buf);
}
if (device) {
vx_mem_free(device, kernel_arg.src0_addr);
vx_mem_free(device, kernel_arg.src1_addr);
@@ -71,17 +69,17 @@ int run_test(const kernel_arg_t& kernel_arg,
// wait for completion
std::cout << "wait for completion" << std::endl;
RT_CHECK(vx_ready_wait(device, MAX_TIMEOUT));
RT_CHECK(vx_ready_wait(device, VX_MAX_TIMEOUT));
// download destination buffer
std::cout << "download destination buffer" << std::endl;
RT_CHECK(vx_copy_from_dev(staging_buf, kernel_arg.dst_addr, buf_size, 0));
RT_CHECK(vx_copy_from_dev(device, staging_buf.data(), kernel_arg.dst_addr, buf_size));
// verify result
std::cout << "verify result" << std::endl;
{
int errors = 0;
auto buf_ptr = (int32_t*)vx_host_ptr(staging_buf);
auto buf_ptr = (int32_t*)staging_buf.data();
for (uint32_t i = 0; i < num_points; ++i) {
int ref = i + i;
int cur = buf_ptr[i];
@@ -101,9 +99,7 @@ int run_test(const kernel_arg_t& kernel_arg,
return 0;
}
int main(int argc, char *argv[]) {
size_t value;
int main(int argc, char *argv[]) {
// parse command arguments
parse_args(argc, argv);
@@ -115,12 +111,12 @@ int main(int argc, char *argv[]) {
std::cout << "open device connection" << std::endl;
RT_CHECK(vx_dev_open(&device));
uint64_t max_cores, max_warps, max_threads;
RT_CHECK(vx_dev_caps(device, VX_CAPS_MAX_CORES, &max_cores));
RT_CHECK(vx_dev_caps(device, VX_CAPS_MAX_WARPS, &max_warps));
RT_CHECK(vx_dev_caps(device, VX_CAPS_MAX_THREADS, &max_threads));
uint64_t num_cores, num_warps, num_threads;
RT_CHECK(vx_dev_caps(device, VX_CAPS_NUM_CORES, &num_cores));
RT_CHECK(vx_dev_caps(device, VX_CAPS_NUM_WARPS, &num_warps));
RT_CHECK(vx_dev_caps(device, VX_CAPS_NUM_THREADS, &num_threads));
uint32_t num_tasks = max_cores * max_warps * max_threads;
uint32_t num_tasks = num_cores * num_warps * num_threads;
uint32_t num_points = count * num_tasks;
uint32_t buf_size = num_points * sizeof(int32_t);
@@ -132,64 +128,60 @@ int main(int argc, char *argv[]) {
RT_CHECK(vx_upload_kernel_file(device, kernel_file));
// allocate device memory
std::cout << "allocate device memory" << std::endl;
RT_CHECK(vx_mem_alloc(device, buf_size, &value));
kernel_arg.src0_addr = value;
RT_CHECK(vx_mem_alloc(device, buf_size, &value));
kernel_arg.src1_addr = value;
RT_CHECK(vx_mem_alloc(device, buf_size, &value));
kernel_arg.dst_addr = value;
std::cout << "allocate device memory" << std::endl;
RT_CHECK(vx_mem_alloc(device, buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.src0_addr));
RT_CHECK(vx_mem_alloc(device, buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.src1_addr));
RT_CHECK(vx_mem_alloc(device, buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.dst_addr));
kernel_arg.num_tasks = num_tasks;
kernel_arg.task_size = count;
std::cout << "dev_src0=" << std::hex << kernel_arg.src0_addr << std::endl;
std::cout << "dev_src1=" << std::hex << kernel_arg.src1_addr << std::endl;
std::cout << "dev_dst=" << std::hex << kernel_arg.dst_addr << std::endl;
std::cout << "dev_src0=0x" << std::hex << kernel_arg.src0_addr << std::endl;
std::cout << "dev_src1=0x" << std::hex << kernel_arg.src1_addr << std::endl;
std::cout << "dev_dst=0x" << std::hex << kernel_arg.dst_addr << std::endl;
// allocate shared memory
std::cout << "allocate shared memory" << std::endl;
// allocate staging buffer
std::cout << "allocate staging buffer" << std::endl;
uint32_t alloc_size = std::max<uint32_t>(buf_size, sizeof(kernel_arg_t));
RT_CHECK(vx_buf_alloc(device, alloc_size, &staging_buf));
staging_buf.resize(alloc_size);
// upload kernel argument
std::cout << "upload kernel argument" << std::endl;
{
auto buf_ptr = (int*)vx_host_ptr(staging_buf);
auto buf_ptr = (int*)staging_buf.data();
memcpy(buf_ptr, &kernel_arg, sizeof(kernel_arg_t));
RT_CHECK(vx_copy_to_dev(staging_buf, KERNEL_ARG_DEV_MEM_ADDR, sizeof(kernel_arg_t), 0));
RT_CHECK(vx_copy_to_dev(device, KERNEL_ARG_DEV_MEM_ADDR, staging_buf.data(), sizeof(kernel_arg_t)));
}
// upload source buffer0
{
auto buf_ptr = (int32_t*)vx_host_ptr(staging_buf);
std::cout << "upload source buffer0" << std::endl;
auto buf_ptr = (int32_t*)staging_buf.data();
for (uint32_t i = 0; i < num_points; ++i) {
buf_ptr[i] = i-1;
}
}
RT_CHECK(vx_copy_to_dev(device, kernel_arg.src0_addr, staging_buf.data(), buf_size));
}
std::cout << "upload source buffer0" << std::endl;
RT_CHECK(vx_copy_to_dev(staging_buf, kernel_arg.src0_addr, buf_size, 0));
// upload source buffer1
{
auto buf_ptr = (int32_t*)vx_host_ptr(staging_buf);
std::cout << "upload source buffer1" << std::endl;
auto buf_ptr = (int32_t*)staging_buf.data();
for (uint32_t i = 0; i < num_points; ++i) {
buf_ptr[i] = i+1;
}
}
RT_CHECK(vx_copy_to_dev(device, kernel_arg.src1_addr, staging_buf.data(), buf_size));
}
std::cout << "upload source buffer1" << std::endl;
RT_CHECK(vx_copy_to_dev(staging_buf, kernel_arg.src1_addr, buf_size, 0));
// clear destination buffer
{
auto buf_ptr = (int32_t*)vx_host_ptr(staging_buf);
std::cout << "clear destination buffer" << std::endl;
auto buf_ptr = (int32_t*)staging_buf.data();
for (uint32_t i = 0; i < num_points; ++i) {
buf_ptr[i] = 0xdeadbeef;
}
}
RT_CHECK(vx_copy_to_dev(device, kernel_arg.dst_addr, staging_buf.data(), buf_size));
}
std::cout << "clear destination buffer" << std::endl;
RT_CHECK(vx_copy_to_dev(staging_buf, kernel_arg.dst_addr, buf_size, 0));
// run tests
std::cout << "run tests" << std::endl;

View File

@@ -1,78 +1,9 @@
XLEN ?= 32
RISCV_TOOLCHAIN_PATH ?= /opt/riscv-gnu-toolchain
VORTEX_DRV_PATH ?= $(realpath ../../../driver)
VORTEX_RT_PATH ?= $(realpath ../../../runtime)
OPTS ?= -n16
VX_CC = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-gcc
VX_CXX = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-g++
VX_DP = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-objdump
VX_CP = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-objcopy
VX_CFLAGS += -march=rv32imf -mabi=ilp32f -O3 -Wstack-usage=1024 -ffreestanding -nostartfiles -fdata-sections -ffunction-sections
VX_CFLAGS += -I$(VORTEX_RT_PATH)/include -I$(VORTEX_RT_PATH)/../hw
VX_LDFLAGS += -Wl,-Bstatic,-T,$(VORTEX_RT_PATH)/linker/vx_link$(XLEN).ld -Wl,--gc-sections $(VORTEX_RT_PATH)/libvortexrt.a
VX_SRCS = kernel.c
CXXFLAGS += -std=c++11 -Wall -Wextra -pedantic -Wfatal-errors
CXXFLAGS += -I$(VORTEX_DRV_PATH)/include -I../../../hw
LDFLAGS += -L$(VORTEX_DRV_PATH)/stub -lvortex
# Debugigng
ifdef DEBUG
CXXFLAGS += -g -O0
else
CXXFLAGS += -O2 -DNDEBUG
endif
PROJECT = io_addr
SRCS = main.cpp
all: $(PROJECT) kernel.bin kernel.dump
kernel.dump: kernel.elf
$(VX_DP) -D kernel.elf > kernel.dump
VX_SRCS = kernel.cpp
kernel.bin: kernel.elf
$(VX_CP) -O binary kernel.elf kernel.bin
OPTS ?= -n16
kernel.elf: $(VX_SRCS)
$(VX_CC) $(VX_CFLAGS) $(VX_SRCS) $(VX_LDFLAGS) -o kernel.elf
$(PROJECT): $(SRCS)
$(CXX) $(CXXFLAGS) $^ $(LDFLAGS) -o $@
run-simx: $(PROJECT) kernel.bin
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/simx:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
run-fpga: $(PROJECT) kernel.bin
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/fpga:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
run-asesim: $(PROJECT) kernel.bin
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/asesim:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
run-vlsim: $(PROJECT) kernel.bin
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/vlsim:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
run-rtlsim: $(PROJECT) kernel.bin
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/rtlsim:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
.depend: $(SRCS)
$(CXX) $(CXXFLAGS) -MM $^ > .depend;
clean:
rm -rf $(PROJECT) *.o .depend
clean-all: clean
rm -rf *.elf *.bin *.dump
ifneq ($(MAKECMDGOALS),clean)
-include .depend
endif
include ../common.mk

View File

@@ -5,8 +5,8 @@
typedef struct {
uint32_t num_points;
uint32_t src_addr;
uint32_t dst_addr;
uint64_t src_addr;
uint64_t dst_addr;
} kernel_arg_t;
#endif

View File

@@ -3,8 +3,8 @@
#include <vx_spawn.h>
#include "common.h"
void kernel_body(int task_id, kernel_arg_t* arg) {
uint32_t* src_ptr = (uint32_t*)arg->src_addr;
void kernel_body(int task_id, kernel_arg_t* __UNIFORM__ arg) {
uint64_t* src_ptr = (uint64_t*)arg->src_addr;
uint32_t* dst_ptr = (uint32_t*)arg->dst_addr;
int32_t* addr_ptr = (int32_t*)(src_ptr[task_id]);
@@ -12,7 +12,8 @@ void kernel_body(int task_id, kernel_arg_t* arg) {
dst_ptr[task_id] = *addr_ptr;
}
void main() {
int main() {
kernel_arg_t* arg = (kernel_arg_t*)KERNEL_ARG_DEV_MEM_ADDR;
vx_spawn_tasks(arg->num_points, (vx_spawn_tasks_cb)kernel_body, arg);
}
return 0;
}

View File

@@ -23,14 +23,16 @@
const char* kernel_file = "kernel.bin";
uint32_t count = 0;
size_t usr_test_mem;
static uint64_t io_base_addr = IO_CSR_ADDR + IO_CSR_SIZE;
std::vector<uint32_t> src_data;
uint64_t usr_test_mem;
std::vector<uint64_t> src_addrs;
std::vector<int32_t> ref_data;
vx_device_h device = nullptr;
vx_buffer_h staging_buf = nullptr;
kernel_arg_t kernel_arg;
std::vector<uint8_t> staging_buf;
kernel_arg_t kernel_arg = {};
static void show_usage() {
std::cout << "Vortex Test." << std::endl;
@@ -60,18 +62,16 @@ static void parse_args(int argc, char **argv) {
}
void cleanup() {
if (staging_buf) {
vx_buf_free(staging_buf);
}
if (device) {
vx_mem_free(device, kernel_arg.src_addr);
vx_mem_free(device, kernel_arg.dst_addr);
vx_mem_free(device, usr_test_mem);
vx_dev_close(device);
}
}
void gen_input_data(uint32_t num_points) {
src_data.resize(num_points);
void gen_src_addrs(uint32_t num_points) {
src_addrs.resize(num_points);
uint32_t u = 0, k = 0;
for (uint32_t i = 0; i < num_points; ++i) {
@@ -80,9 +80,9 @@ void gen_input_data(uint32_t num_points) {
++u;
}
uint32_t j = i % NUM_ADDRS;
uint32_t v = ((j == k) ? usr_test_mem : IO_BASE_ADDR) + j * sizeof(uint32_t);
src_data[i] = v;
std::cout << std::dec << i << "," << k << ": value=0x" << std::hex << v << std::endl;
uint64_t a = ((j == k) ? usr_test_mem : io_base_addr) + j * sizeof(uint32_t);
std::cout << std::dec << i << "," << k << ": value=0x" << std::hex << a << std::endl;
src_addrs[i] = a;
}
}
@@ -90,7 +90,7 @@ void gen_ref_data(uint32_t num_points) {
ref_data.resize(num_points);
for (uint32_t i = 0; i < num_points; ++i) {
uint32_t j = i % NUM_ADDRS;
int32_t j = i % NUM_ADDRS;
ref_data[i] = j * j;
}
}
@@ -104,17 +104,17 @@ int run_test(const kernel_arg_t& kernel_arg,
// wait for completion
std::cout << "wait for completion" << std::endl;
RT_CHECK(vx_ready_wait(device, MAX_TIMEOUT));
RT_CHECK(vx_ready_wait(device, VX_MAX_TIMEOUT));
// download destination buffer
std::cout << "download destination buffer" << std::endl;
RT_CHECK(vx_copy_from_dev(staging_buf, kernel_arg.dst_addr, buf_size, 0));
RT_CHECK(vx_copy_from_dev(device, staging_buf.data(), kernel_arg.dst_addr, buf_size));
// verify result
std::cout << "verify result" << std::endl;
{
int errors = 0;
auto buf_ptr = (int32_t*)vx_host_ptr(staging_buf);
auto buf_ptr = (int32_t*)staging_buf.data();
for (uint32_t i = 0; i < num_points; ++i) {
int ref = ref_data.at(i);
int cur = buf_ptr[i];
@@ -135,7 +135,7 @@ int run_test(const kernel_arg_t& kernel_arg,
}
int main(int argc, char *argv[]) {
size_t value;
uint64_t value;
// parse command arguments
parse_args(argc, argv);
@@ -152,19 +152,18 @@ int main(int argc, char *argv[]) {
uint32_t num_points = count;
RT_CHECK(vx_mem_alloc(device, NUM_ADDRS * sizeof(uint32_t), &usr_test_mem));
RT_CHECK(vx_mem_alloc(device, NUM_ADDRS * sizeof(int32_t), VX_MEM_TYPE_GLOBAL, &usr_test_mem));
// generate input data
gen_input_data(num_points);
gen_src_addrs(num_points);
// generate reference data
gen_ref_data(num_points);
uint32_t src_buf_size = src_data.size() * sizeof(int32_t);
uint32_t dst_buf_size = src_data.size() * sizeof(int32_t);
uint32_t src_buf_size = num_points * sizeof(uint64_t);
uint32_t dst_buf_size = num_points * sizeof(int32_t);
std::cout << "number of points: " << num_points << std::endl;
std::cout << "buffer size: " << dst_buf_size << " bytes" << std::endl;
std::cout << "number of points: " << std::dec << num_points << std::endl;
// upload program
std::cout << "upload program" << std::endl;
@@ -173,61 +172,59 @@ int main(int argc, char *argv[]) {
// allocate device memory
std::cout << "allocate device memory" << std::endl;
RT_CHECK(vx_mem_alloc(device, src_buf_size, &value));
RT_CHECK(vx_mem_alloc(device, src_buf_size, VX_MEM_TYPE_GLOBAL, &value));
kernel_arg.src_addr = value;
RT_CHECK(vx_mem_alloc(device, dst_buf_size, &value));
RT_CHECK(vx_mem_alloc(device, dst_buf_size, VX_MEM_TYPE_GLOBAL, &value));
kernel_arg.dst_addr = value;
kernel_arg.num_points = num_points;
std::cout << "dev_src=" << std::hex << kernel_arg.src_addr << std::endl;
std::cout << "dev_dst=" << std::hex << kernel_arg.dst_addr << std::endl;
std::cout << "dev_src=0x" << std::hex << kernel_arg.src_addr << std::endl;
std::cout << "dev_dst=0x" << std::hex << kernel_arg.dst_addr << std::endl;
// allocate shared memory
std::cout << "allocate shared memory" << std::endl;
uint32_t staging_buf_size = std::max<uint32_t>(NUM_ADDRS * sizeof(uint32_t),
// allocate staging buffer
std::cout << "allocate staging buffer" << std::endl;
uint32_t staging_buf_size = std::max<uint32_t>(NUM_ADDRS * sizeof(uint64_t),
std::max<uint32_t>(src_buf_size,
std::max<uint32_t>(dst_buf_size,
sizeof(kernel_arg_t))));
RT_CHECK(vx_buf_alloc(device, staging_buf_size, &staging_buf));
staging_buf.resize(staging_buf_size);
// upload kernel argument
std::cout << "upload kernel argument" << std::endl;
// upload kernel argument
{
auto buf_ptr = (int*)vx_host_ptr(staging_buf);
std::cout << "upload kernel argument" << std::endl;
auto buf_ptr = (int*)staging_buf.data();
memcpy(buf_ptr, &kernel_arg, sizeof(kernel_arg_t));
RT_CHECK(vx_copy_to_dev(staging_buf, KERNEL_ARG_DEV_MEM_ADDR, sizeof(kernel_arg_t), 0));
RT_CHECK(vx_copy_to_dev(device, KERNEL_ARG_DEV_MEM_ADDR, staging_buf.data(), sizeof(kernel_arg_t)));
}
// upload test address data
{
auto buf_ptr = (int32_t*)vx_host_ptr(staging_buf);
std::cout << "upload test address data" << std::endl;
auto buf_ptr = (int32_t*)staging_buf.data();
for (uint32_t i = 0; i < NUM_ADDRS; ++i) {
buf_ptr[i] = i * i;
}
RT_CHECK(vx_copy_to_dev(device, io_base_addr, staging_buf.data(), NUM_ADDRS * sizeof(int32_t)));
RT_CHECK(vx_copy_to_dev(device, usr_test_mem, staging_buf.data(), NUM_ADDRS * sizeof(int32_t)));
}
RT_CHECK(vx_copy_to_dev(staging_buf, 0xFF000000, NUM_ADDRS * sizeof(uint32_t), 0));
RT_CHECK(vx_copy_to_dev(staging_buf, usr_test_mem, NUM_ADDRS * sizeof(uint32_t), 0));
// upload source buffer
{
auto buf_ptr = (int32_t*)vx_host_ptr(staging_buf);
for (uint32_t i = 0; i < num_points; ++i) {
buf_ptr[i] = src_data.at(i);
}
std::cout << "upload source buffer" << std::endl;
auto buf_ptr = (uint64_t*)staging_buf.data();
memcpy(buf_ptr, src_addrs.data(), src_buf_size);
RT_CHECK(vx_copy_to_dev(device, kernel_arg.src_addr, staging_buf.data(), src_buf_size));
}
std::cout << "upload source buffer" << std::endl;
RT_CHECK(vx_copy_to_dev(staging_buf, kernel_arg.src_addr, src_buf_size, 0));
// clear destination buffer
{
auto buf_ptr = (int32_t*)vx_host_ptr(staging_buf);
std::cout << "clear destination buffer" << std::endl;
auto buf_ptr = (int32_t*)staging_buf.data();
for (uint32_t i = 0; i < num_points; ++i) {
buf_ptr[i] = 0xdeadbeef;
}
}
RT_CHECK(vx_copy_to_dev(device, kernel_arg.dst_addr, staging_buf.data(), dst_buf_size));
}
std::cout << "clear destination buffer" << std::endl;
RT_CHECK(vx_copy_to_dev(staging_buf, kernel_arg.dst_addr, dst_buf_size, 0));
// run tests
std::cout << "run tests" << std::endl;

View File

@@ -1,78 +1,9 @@
XLEN ?= 32
RISCV_TOOLCHAIN_PATH ?= /opt/riscv-gnu-toolchain
VORTEX_DRV_PATH ?= $(realpath ../../../driver)
VORTEX_RT_PATH ?= $(realpath ../../../runtime)
OPTS ?= -n64
VX_CC = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-gcc
VX_CXX = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-g++
VX_DP = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-objdump
VX_CP = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-objcopy
VX_CFLAGS += -march=rv32imf -mabi=ilp32f -O3 -Wstack-usage=1024 -ffreestanding -nostartfiles -fdata-sections -ffunction-sections
VX_CFLAGS += -I$(VORTEX_RT_PATH)/include -I$(VORTEX_RT_PATH)/../hw
VX_LDFLAGS += -Wl,-Bstatic,-T,$(VORTEX_RT_PATH)/linker/vx_link$(XLEN).ld -Wl,--gc-sections $(VORTEX_RT_PATH)/libvortexrt.a
VX_SRCS = kernel.c
CXXFLAGS += -std=c++11 -Wall -Wextra -pedantic -Wfatal-errors
CXXFLAGS += -I$(VORTEX_DRV_PATH)/include
LDFLAGS += -L$(VORTEX_DRV_PATH)/stub -lvortex
# Debugigng
ifdef DEBUG
CXXFLAGS += -g -O0
else
CXXFLAGS += -O2 -DNDEBUG
endif
PROJECT = mstress
SRCS = main.cpp
all: $(PROJECT) kernel.bin kernel.dump
kernel.dump: kernel.elf
$(VX_DP) -D kernel.elf > kernel.dump
VX_SRCS = kernel.cpp
kernel.bin: kernel.elf
$(VX_CP) -O binary kernel.elf kernel.bin
OPTS ?= -n64
kernel.elf: $(VX_SRCS)
$(VX_CC) $(VX_CFLAGS) $(VX_SRCS) $(VX_LDFLAGS) -o kernel.elf
$(PROJECT): $(SRCS)
$(CXX) $(CXXFLAGS) $^ $(LDFLAGS) -o $@
run-simx: $(PROJECT) kernel.bin
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/simx:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
run-fpga: $(PROJECT) kernel.bin
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/fpga:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
run-asesim: $(PROJECT) kernel.bin
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/asesim:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
run-vlsim: $(PROJECT) kernel.bin
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/vlsim:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
run-rtlsim: $(PROJECT) kernel.bin
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/rtlsim:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
.depend: $(SRCS)
$(CXX) $(CXXFLAGS) -MM $^ > .depend;
clean:
rm -rf $(PROJECT) *.o .depend
clean-all: clean
rm -rf *.elf *.bin *.dump
ifneq ($(MAKECMDGOALS),clean)
-include .depend
endif
include ../common.mk

View File

@@ -9,9 +9,9 @@ typedef struct {
uint32_t num_tasks;
uint32_t size;
uint32_t stride;
uint32_t src0_addr;
uint32_t src1_addr;
uint32_t dst_addr;
uint64_t src0_addr;
uint64_t src1_addr;
uint64_t dst_addr;
} kernel_arg_t;
#endif

View File

@@ -3,7 +3,7 @@
#include <vx_spawn.h>
#include "common.h"
void kernel_body(int task_id, kernel_arg_t* arg) {
void kernel_body(int task_id, kernel_arg_t* __UNIFORM__ arg) {
uint32_t stride = arg->stride;
uint32_t* addr_ptr = (uint32_t*)arg->src0_addr;
float* src_ptr = (float*)arg->src1_addr;
@@ -22,7 +22,8 @@ void kernel_body(int task_id, kernel_arg_t* arg) {
}
}
void main() {
int main() {
kernel_arg_t* arg = (kernel_arg_t*)KERNEL_ARG_DEV_MEM_ADDR;
vx_spawn_tasks(arg->num_tasks, (vx_spawn_tasks_cb)kernel_body, arg);
}
return 0;
}

View File

@@ -72,8 +72,8 @@ std::vector<float> test_data;
std::vector<uint32_t> addr_table;
vx_device_h device = nullptr;
vx_buffer_h staging_buf = nullptr;
kernel_arg_t kernel_arg;
std::vector<uint8_t> staging_buf;
kernel_arg_t kernel_arg = {};
static void show_usage() {
std::cout << "Vortex Test." << std::endl;
@@ -103,9 +103,6 @@ static void parse_args(int argc, char **argv) {
}
void cleanup() {
if (staging_buf) {
vx_buf_free(staging_buf);
}
if (device) {
vx_mem_free(device, kernel_arg.src0_addr);
vx_mem_free(device, kernel_arg.src1_addr);
@@ -140,17 +137,17 @@ int run_test(const kernel_arg_t& kernel_arg,
// wait for completion
std::cout << "wait for completion" << std::endl;
RT_CHECK(vx_ready_wait(device, MAX_TIMEOUT));
RT_CHECK(vx_ready_wait(device, VX_MAX_TIMEOUT));
// download destination buffer
std::cout << "download destination buffer" << std::endl;
RT_CHECK(vx_copy_from_dev(staging_buf, kernel_arg.dst_addr, dst_buf_size, 0));
RT_CHECK(vx_copy_from_dev(device, staging_buf.data(), kernel_arg.dst_addr, dst_buf_size));
// verify result
std::cout << "verify result" << std::endl;
{
int errors = 0;
auto buf_ptr = (float*)vx_host_ptr(staging_buf);
auto buf_ptr = (float*)staging_buf.data();
for (uint32_t i = 0; i < num_points; ++i) {
@@ -181,9 +178,7 @@ int run_test(const kernel_arg_t& kernel_arg,
return 0;
}
int main(int argc, char *argv[]) {
size_t value;
int main(int argc, char *argv[]) {
// parse command arguments
parse_args(argc, argv);
@@ -197,12 +192,12 @@ int main(int argc, char *argv[]) {
std::cout << "open device connection" << std::endl;
RT_CHECK(vx_dev_open(&device));
uint64_t max_cores, max_warps, max_threads;
RT_CHECK(vx_dev_caps(device, VX_CAPS_MAX_CORES, &max_cores));
RT_CHECK(vx_dev_caps(device, VX_CAPS_MAX_WARPS, &max_warps));
RT_CHECK(vx_dev_caps(device, VX_CAPS_MAX_THREADS, &max_threads));
uint64_t num_cores, num_warps, num_threads;
RT_CHECK(vx_dev_caps(device, VX_CAPS_NUM_CORES, &num_cores));
RT_CHECK(vx_dev_caps(device, VX_CAPS_NUM_WARPS, &num_warps));
RT_CHECK(vx_dev_caps(device, VX_CAPS_NUM_THREADS, &num_threads));
uint32_t num_tasks = max_cores * max_warps * max_threads;
uint32_t num_tasks = num_cores * num_warps * num_threads;
uint32_t num_points = count * num_tasks;
// generate input data
@@ -220,67 +215,59 @@ int main(int argc, char *argv[]) {
RT_CHECK(vx_upload_kernel_file(device, kernel_file));
// allocate device memory
std::cout << "allocate device memory" << std::endl;
RT_CHECK(vx_mem_alloc(device, addr_buf_size, &value));
kernel_arg.src0_addr = value;
RT_CHECK(vx_mem_alloc(device, src_buf_size, &value));
kernel_arg.src1_addr = value;
RT_CHECK(vx_mem_alloc(device, dst_buf_size, &value));
kernel_arg.dst_addr = value;
std::cout << "allocate device memory" << std::endl;
RT_CHECK(vx_mem_alloc(device, addr_buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.src0_addr));
RT_CHECK(vx_mem_alloc(device, src_buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.src1_addr));
RT_CHECK(vx_mem_alloc(device, dst_buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.dst_addr));
kernel_arg.num_tasks = num_tasks;
kernel_arg.stride = count;
std::cout << "dev_addr=" << std::hex << kernel_arg.src0_addr << std::endl;
std::cout << "dev_src=" << std::hex << kernel_arg.src1_addr << std::endl;
std::cout << "dev_dst=" << std::hex << kernel_arg.dst_addr << std::endl;
std::cout << "dev_addr=0x" << std::hex << kernel_arg.src0_addr << std::endl;
std::cout << "dev_src=0x" << std::hex << kernel_arg.src1_addr << std::endl;
std::cout << "dev_dst=0x" << std::hex << kernel_arg.dst_addr << std::endl;
// allocate shared memory
std::cout << "allocate shared memory" << std::endl;
// allocate staging buffer
std::cout << "allocate staging buffer" << std::endl;
uint32_t staging_buf_size = std::max<uint32_t>(src_buf_size,
std::max<uint32_t>(addr_buf_size,
std::max<uint32_t>(dst_buf_size,
sizeof(kernel_arg_t))));
RT_CHECK(vx_buf_alloc(device, staging_buf_size, &staging_buf));
staging_buf.resize(staging_buf_size);
// upload kernel argument
std::cout << "upload kernel argument" << std::endl;
// upload kernel argument
{
auto buf_ptr = (int*)vx_host_ptr(staging_buf);
std::cout << "upload kernel argument" << std::endl;
auto buf_ptr = (int*)staging_buf.data();
memcpy(buf_ptr, &kernel_arg, sizeof(kernel_arg_t));
RT_CHECK(vx_copy_to_dev(staging_buf, KERNEL_ARG_DEV_MEM_ADDR, sizeof(kernel_arg_t), 0));
RT_CHECK(vx_copy_to_dev(device, KERNEL_ARG_DEV_MEM_ADDR, staging_buf.data(), sizeof(kernel_arg_t)));
}
// upload source buffer0
{
auto buf_ptr = (int32_t*)vx_host_ptr(staging_buf);
for (uint32_t i = 0; i < addr_table.size(); ++i) {
buf_ptr[i] = addr_table.at(i);
}
std::cout << "upload address buffer" << std::endl;
auto buf_ptr = staging_buf.data();
memcpy(buf_ptr, addr_table.data(), addr_table.size() * sizeof(int32_t));
RT_CHECK(vx_copy_to_dev(device, kernel_arg.src0_addr, staging_buf.data(), addr_buf_size));
}
std::cout << "upload address buffer" << std::endl;
RT_CHECK(vx_copy_to_dev(staging_buf, kernel_arg.src0_addr, addr_buf_size, 0));
// upload source buffer1
{
auto buf_ptr = (int32_t*)vx_host_ptr(staging_buf);
for (uint32_t i = 0; i < test_data.size(); ++i) {
buf_ptr[i] = test_data.at(i);
}
std::cout << "upload source buffer" << std::endl;
auto buf_ptr = staging_buf.data();
memcpy(buf_ptr, test_data.data(), test_data.size() * sizeof(int32_t));
RT_CHECK(vx_copy_to_dev(device, kernel_arg.src1_addr, staging_buf.data(), src_buf_size));
}
std::cout << "upload source buffer" << std::endl;
RT_CHECK(vx_copy_to_dev(staging_buf, kernel_arg.src1_addr, src_buf_size, 0));
// clear destination buffer
{
auto buf_ptr = (int32_t*)vx_host_ptr(staging_buf);
std::cout << "clear destination buffer" << std::endl;
auto buf_ptr = (int32_t*)staging_buf.data();
for (uint32_t i = 0; i < test_data.size(); ++i) {
buf_ptr[i] = 0xdeadbeef;
}
}
RT_CHECK(vx_copy_to_dev(device, kernel_arg.dst_addr, staging_buf.data(), dst_buf_size));
}
std::cout << "clear destination buffer" << std::endl;
RT_CHECK(vx_copy_to_dev(staging_buf, kernel_arg.dst_addr, dst_buf_size, 0));
// run tests
std::cout << "run tests" << std::endl;

View File

@@ -1,78 +1,9 @@
XLEN ?= 32
RISCV_TOOLCHAIN_PATH ?= /opt/riscv-gnu-toolchain
VORTEX_DRV_PATH ?= $(realpath ../../../driver)
VORTEX_RT_PATH ?= $(realpath ../../../runtime)
OPTS ?= -n8
VX_CC = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-gcc
VX_CXX = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-g++
VX_DP = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-objdump
VX_CP = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-objcopy
VX_CFLAGS += -march=rv32imf -mabi=ilp32f -O3 -Wstack-usage=1024 -ffreestanding -nostartfiles -fdata-sections -ffunction-sections
VX_CFLAGS += -I$(VORTEX_RT_PATH)/include -I$(VORTEX_RT_PATH)/../hw
VX_LDFLAGS += -Wl,-Bstatic,-T,$(VORTEX_RT_PATH)/linker/vx_link$(XLEN).ld -Wl,--gc-sections $(VORTEX_RT_PATH)/libvortexrt.a
VX_SRCS = kernel.c
CXXFLAGS += -std=c++11 -Wall -Wextra -pedantic -Wfatal-errors
CXXFLAGS += -I$(VORTEX_DRV_PATH)/include
LDFLAGS += -L$(VORTEX_DRV_PATH)/stub -lvortex
# Debugigng
ifdef DEBUG
CXXFLAGS += -g -O0
else
CXXFLAGS += -O2 -DNDEBUG
endif
PROJECT = no_mf_ext
SRCS = main.cpp
all: $(PROJECT) kernel.bin kernel.dump
kernel.dump: kernel.elf
$(VX_DP) -D kernel.elf > kernel.dump
VX_SRCS = kernel.cpp
kernel.bin: kernel.elf
$(VX_CP) -O binary kernel.elf kernel.bin
OPTS ?= -n8
kernel.elf: $(VX_SRCS)
$(VX_CC) $(VX_CFLAGS) $(VX_SRCS) $(VX_LDFLAGS) -o kernel.elf
$(PROJECT): $(SRCS)
$(CXX) $(CXXFLAGS) $^ $(LDFLAGS) -o $@
run-simx: $(PROJECT) kernel.bin
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/simx:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
run-fpga: $(PROJECT) kernel.bin
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/fpga:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
run-asesim: $(PROJECT) kernel.bin
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/asesim:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
run-vlsim: $(PROJECT) kernel.bin
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/vlsim:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
run-rtlsim: $(PROJECT) kernel.bin
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/rtlsim:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
.depend: $(SRCS)
$(CXX) $(CXXFLAGS) -MM $^ > .depend;
clean:
rm -rf $(PROJECT) *.o .depend
clean-all: clean
rm -rf *.elf *.bin *.dump
ifneq ($(MAKECMDGOALS),clean)
-include .depend
endif
include ../common.mk

View File

@@ -5,8 +5,8 @@
typedef struct {
uint32_t size;
uint32_t src_addr;
uint32_t dst_addr;
uint64_t src_addr;
uint64_t dst_addr;
} kernel_arg_t;
#endif

View File

@@ -3,7 +3,7 @@
#include <vx_spawn.h>
#include "common.h"
void main() {
int main() {
kernel_arg_t* arg = (kernel_arg_t*)KERNEL_ARG_DEV_MEM_ADDR;
uint32_t size = arg->size;
@@ -13,4 +13,6 @@ void main() {
for (uint32_t i = 0; i < size; ++i) {
dst_ptr[i] = src_ptr[i];
}
}
return 0;
}

View File

@@ -1,6 +1,7 @@
#include <iostream>
#include <unistd.h>
#include <string.h>
#include <vector>
#include <vortex.h>
#include "common.h"
@@ -20,8 +21,8 @@ const char* kernel_file = "kernel.bin";
uint32_t count = 0;
vx_device_h device = nullptr;
vx_buffer_h staging_buf = nullptr;
kernel_arg_t kernel_arg;
std::vector<uint8_t> staging_buf;
kernel_arg_t kernel_arg = {};
static void show_usage() {
std::cout << "Vortex Test." << std::endl;
@@ -51,9 +52,6 @@ static void parse_args(int argc, char **argv) {
}
void cleanup() {
if (staging_buf) {
vx_buf_free(staging_buf);
}
if (device) {
vx_mem_free(device, kernel_arg.src_addr);
vx_mem_free(device, kernel_arg.dst_addr);
@@ -70,17 +68,17 @@ int run_test(const kernel_arg_t& kernel_arg,
// wait for completion
std::cout << "wait for completion" << std::endl;
RT_CHECK(vx_ready_wait(device, MAX_TIMEOUT));
RT_CHECK(vx_ready_wait(device, VX_MAX_TIMEOUT));
// download destination buffer
std::cout << "download destination buffer" << std::endl;
RT_CHECK(vx_copy_from_dev(staging_buf, kernel_arg.dst_addr, buf_size, 0));
RT_CHECK(vx_copy_from_dev(device, staging_buf.data(), kernel_arg.dst_addr, buf_size));
// verify result
std::cout << "verify result" << std::endl;
{
int errors = 0;
auto buf_ptr = (int32_t*)vx_host_ptr(staging_buf);
auto buf_ptr = (int32_t*)staging_buf.data();
for (uint32_t i = 0; i < num_points; ++i) {
int ref = i-1;
int cur = buf_ptr[i];
@@ -100,9 +98,7 @@ int run_test(const kernel_arg_t& kernel_arg,
return 0;
}
int main(int argc, char *argv[]) {
size_t value;
int main(int argc, char *argv[]) {
// parse command arguments
parse_args(argc, argv);
@@ -125,51 +121,47 @@ int main(int argc, char *argv[]) {
RT_CHECK(vx_upload_kernel_file(device, kernel_file));
// allocate device memory
std::cout << "allocate device memory" << std::endl;
RT_CHECK(vx_mem_alloc(device, buf_size, &value));
kernel_arg.src_addr = value;
RT_CHECK(vx_mem_alloc(device, buf_size, &value));
kernel_arg.dst_addr = value;
std::cout << "allocate device memory" << std::endl;
RT_CHECK(vx_mem_alloc(device, buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.src_addr));
RT_CHECK(vx_mem_alloc(device, buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.dst_addr));
kernel_arg.size = num_points;
std::cout << "dev_src=" << std::hex << kernel_arg.src_addr << std::endl;
std::cout << "dev_dst=" << std::hex << kernel_arg.dst_addr << std::endl;
std::cout << "dev_src=0x" << std::hex << kernel_arg.src_addr << std::endl;
std::cout << "dev_dst=0x" << std::hex << kernel_arg.dst_addr << std::endl;
// allocate shared memory
std::cout << "allocate shared memory" << std::endl;
// allocate staging buffer
std::cout << "allocate staging buffer" << std::endl;
uint32_t alloc_size = std::max<uint32_t>(buf_size, sizeof(kernel_arg_t));
RT_CHECK(vx_buf_alloc(device, alloc_size, &staging_buf));
staging_buf.resize(alloc_size);
// upload kernel argument
std::cout << "upload kernel argument" << std::endl;
// upload kernel argument
{
auto buf_ptr = (int*)vx_host_ptr(staging_buf);
std::cout << "upload kernel argument" << std::endl;
auto buf_ptr = (int*)staging_buf.data();
memcpy(buf_ptr, &kernel_arg, sizeof(kernel_arg_t));
RT_CHECK(vx_copy_to_dev(staging_buf, KERNEL_ARG_DEV_MEM_ADDR, sizeof(kernel_arg_t), 0));
RT_CHECK(vx_copy_to_dev(device, KERNEL_ARG_DEV_MEM_ADDR, staging_buf.data(), sizeof(kernel_arg_t)));
}
// upload source buffer0
{
auto buf_ptr = (int32_t*)vx_host_ptr(staging_buf);
std::cout << "upload source buffer" << std::endl;
auto buf_ptr = (int32_t*)staging_buf.data();
for (uint32_t i = 0; i < num_points; ++i) {
buf_ptr[i] = i-1;
}
}
RT_CHECK(vx_copy_to_dev(device, kernel_arg.src_addr, staging_buf.data(), buf_size));
}
std::cout << "upload source buffer" << std::endl;
RT_CHECK(vx_copy_to_dev(staging_buf, kernel_arg.src_addr, buf_size, 0));
// clear destination buffer
{
auto buf_ptr = (int32_t*)vx_host_ptr(staging_buf);
std::cout << "clear destination buffer" << std::endl;
auto buf_ptr = (int32_t*)staging_buf.data();
for (uint32_t i = 0; i < num_points; ++i) {
buf_ptr[i] = 0xdeadbeef;
}
}
RT_CHECK(vx_copy_to_dev(device, kernel_arg.dst_addr, staging_buf.data(), buf_size));
}
std::cout << "clear destination buffer" << std::endl;
RT_CHECK(vx_copy_to_dev(staging_buf, kernel_arg.dst_addr, buf_size, 0));
// run tests
std::cout << "run tests" << std::endl;

View File

@@ -1,80 +1,17 @@
XLEN ?= 32
RISCV_TOOLCHAIN_PATH ?= /opt/riscv-gnu-toolchain
VORTEX_DRV_PATH ?= $(realpath ../../../driver)
VORTEX_RT_PATH ?= $(realpath ../../../runtime)
PROJECT = no_smem
OPTS ?= -n8
VX_CC = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-gcc
VX_CXX = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-g++
VX_DP = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-objdump
VX_CP = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-objcopy
VX_CFLAGS += -march=rv32imf -mabi=ilp32f -O3 -Wstack-usage=1024 -ffreestanding -nostartfiles -fdata-sections -ffunction-sections
VX_CFLAGS += -DSM_ENABLE=0 -I$(VORTEX_RT_PATH)/include -I$(VORTEX_RT_PATH)/../hw
VX_LDFLAGS += -Wl,-Bstatic,-T,$(VORTEX_RT_PATH)/linker/vx_link$(XLEN).ld -Wl,--gc-sections,--defsym=__stack_top=0xfefff000
VX_RUNTIME = $(VORTEX_RT_PATH)/src/vx_start.S $(VORTEX_RT_PATH)/src/vx_perf.c $(VORTEX_RT_PATH)/src/vx_syscalls.c
VX_SRCS = kernel.c $(VX_RUNTIME)
CXXFLAGS += -std=c++11 -Wall -Wextra -pedantic -Wfatal-errors
CXXFLAGS += -I$(VORTEX_DRV_PATH)/include
LDFLAGS += -L$(VORTEX_DRV_PATH)/stub -lvortex
# Debugigng
ifdef DEBUG
CXXFLAGS += -g -O0
else
CXXFLAGS += -O2 -DNDEBUG
endif
PROJECT = no_smem
VX_SRCS = kernel.cpp $(VORTEX_KN_PATH)/src/vx_perf.c $(VORTEX_KN_PATH)/src/vx_syscalls.c $(VORTEX_KN_PATH)/src/vx_print.S $(VORTEX_KN_PATH)/src/vx_start.S
SRCS = main.cpp
all: $(PROJECT) kernel.bin kernel.dump
kernel.dump: kernel.elf
$(VX_DP) -D kernel.elf > kernel.dump
include ../common.mk
kernel.bin: kernel.elf
$(VX_CP) -O binary kernel.elf kernel.bin
VX_CFLAGS += -DSM_DISABLE
VX_LDFLAGS = -Wl,-Bstatic,--gc-sections,-T,$(VORTEX_KN_PATH)/linker/vx_link$(XLEN).ld,--defsym=STARTUP_ADDR=$(STARTUP_ADDR)
kernel.elf: $(VX_SRCS)
$(VX_CC) $(VX_CFLAGS) $(VX_SRCS) $(VX_LDFLAGS) -o kernel.elf
$(PROJECT): $(SRCS)
$(CXX) $(CXXFLAGS) $^ $(LDFLAGS) -o $@
run-simx: $(PROJECT) kernel.bin
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/simx:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
run-fpga: $(PROJECT) kernel.bin
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/fpga:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
run-asesim: $(PROJECT) kernel.bin
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/asesim:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
run-vlsim: $(PROJECT) kernel.bin
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/vlsim:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
run-rtlsim: $(PROJECT) kernel.bin
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/rtlsim:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
.depend: $(SRCS)
$(CXX) $(CXXFLAGS) -MM $^ > .depend;
clean:
rm -rf $(PROJECT) *.o .depend
clean-all: clean
rm -rf *.elf *.bin *.dump
ifneq ($(MAKECMDGOALS),clean)
-include .depend
endif
VX_CC = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)-gcc
VX_CXX = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)-g++
VX_DP = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)-objdump
VX_CP = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)-objcopy

View File

@@ -5,8 +5,8 @@
typedef struct {
uint32_t size;
uint32_t src_addr;
uint32_t dst_addr;
uint64_t src_addr;
uint64_t dst_addr;
} kernel_arg_t;
#endif

View File

@@ -3,7 +3,7 @@
#include <vx_spawn.h>
#include "common.h"
void main() {
int main() {
kernel_arg_t* arg = (kernel_arg_t*)KERNEL_ARG_DEV_MEM_ADDR;
uint32_t size = arg->size;
@@ -13,4 +13,6 @@ void main() {
for (uint32_t i = 0; i < size; ++i) {
dst_ptr[i] = src_ptr[i];
}
}
return 0;
}

View File

@@ -1,6 +1,7 @@
#include <iostream>
#include <unistd.h>
#include <string.h>
#include <vector>
#include <vortex.h>
#include "common.h"
@@ -20,8 +21,8 @@ const char* kernel_file = "kernel.bin";
uint32_t count = 0;
vx_device_h device = nullptr;
vx_buffer_h staging_buf = nullptr;
kernel_arg_t kernel_arg;
std::vector<uint8_t> staging_buf;
kernel_arg_t kernel_arg = {};
static void show_usage() {
std::cout << "Vortex Test." << std::endl;
@@ -51,9 +52,6 @@ static void parse_args(int argc, char **argv) {
}
void cleanup() {
if (staging_buf) {
vx_buf_free(staging_buf);
}
if (device) {
vx_mem_free(device, kernel_arg.src_addr);
vx_mem_free(device, kernel_arg.dst_addr);
@@ -70,17 +68,17 @@ int run_test(const kernel_arg_t& kernel_arg,
// wait for completion
std::cout << "wait for completion" << std::endl;
RT_CHECK(vx_ready_wait(device, MAX_TIMEOUT));
RT_CHECK(vx_ready_wait(device, VX_MAX_TIMEOUT));
// download destination buffer
std::cout << "download destination buffer" << std::endl;
RT_CHECK(vx_copy_from_dev(staging_buf, kernel_arg.dst_addr, buf_size, 0));
RT_CHECK(vx_copy_from_dev(device, staging_buf.data(), kernel_arg.dst_addr, buf_size));
// verify result
std::cout << "verify result" << std::endl;
{
int errors = 0;
auto buf_ptr = (int32_t*)vx_host_ptr(staging_buf);
auto buf_ptr = (int32_t*)staging_buf.data();
for (uint32_t i = 0; i < num_points; ++i) {
int ref = i-1;
int cur = buf_ptr[i];
@@ -100,9 +98,7 @@ int run_test(const kernel_arg_t& kernel_arg,
return 0;
}
int main(int argc, char *argv[]) {
size_t value;
int main(int argc, char *argv[]) {
// parse command arguments
parse_args(argc, argv);
@@ -125,51 +121,47 @@ int main(int argc, char *argv[]) {
RT_CHECK(vx_upload_kernel_file(device, kernel_file));
// allocate device memory
std::cout << "allocate device memory" << std::endl;
RT_CHECK(vx_mem_alloc(device, buf_size, &value));
kernel_arg.src_addr = value;
RT_CHECK(vx_mem_alloc(device, buf_size, &value));
kernel_arg.dst_addr = value;
std::cout << "allocate device memory" << std::endl;
RT_CHECK(vx_mem_alloc(device, buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.src_addr));
RT_CHECK(vx_mem_alloc(device, buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.dst_addr));
kernel_arg.size = num_points;
std::cout << "dev_src=" << std::hex << kernel_arg.src_addr << std::endl;
std::cout << "dev_dst=" << std::hex << kernel_arg.dst_addr << std::endl;
std::cout << "dev_src=0x" << std::hex << kernel_arg.src_addr << std::endl;
std::cout << "dev_dst=0x" << std::hex << kernel_arg.dst_addr << std::endl;
// allocate shared memory
std::cout << "allocate shared memory" << std::endl;
// allocate staging buffer
std::cout << "allocate staging buffer" << std::endl;
uint32_t alloc_size = std::max<uint32_t>(buf_size, sizeof(kernel_arg_t));
RT_CHECK(vx_buf_alloc(device, alloc_size, &staging_buf));
staging_buf.resize(alloc_size);
// upload kernel argument
std::cout << "upload kernel argument" << std::endl;
{
auto buf_ptr = (int*)vx_host_ptr(staging_buf);
auto buf_ptr = (int*)staging_buf.data();
memcpy(buf_ptr, &kernel_arg, sizeof(kernel_arg_t));
RT_CHECK(vx_copy_to_dev(staging_buf, KERNEL_ARG_DEV_MEM_ADDR, sizeof(kernel_arg_t), 0));
RT_CHECK(vx_copy_to_dev(device, KERNEL_ARG_DEV_MEM_ADDR, staging_buf.data(), sizeof(kernel_arg_t)));
}
// upload source buffer0
{
auto buf_ptr = (int32_t*)vx_host_ptr(staging_buf);
std::cout << "upload source buffer" << std::endl;
auto buf_ptr = (int32_t*)staging_buf.data();
for (uint32_t i = 0; i < num_points; ++i) {
buf_ptr[i] = i-1;
}
}
RT_CHECK(vx_copy_to_dev(device, kernel_arg.src_addr, staging_buf.data(), buf_size));
}
std::cout << "upload source buffer" << std::endl;
RT_CHECK(vx_copy_to_dev(staging_buf, kernel_arg.src_addr, buf_size, 0));
// clear destination buffer
{
auto buf_ptr = (int32_t*)vx_host_ptr(staging_buf);
std::cout << "clear destination buffer" << std::endl;
auto buf_ptr = (int32_t*)staging_buf.data();
for (uint32_t i = 0; i < num_points; ++i) {
buf_ptr[i] = 0xdeadbeef;
}
RT_CHECK(vx_copy_to_dev(device, kernel_arg.dst_addr, staging_buf.data(), buf_size));
}
std::cout << "clear destination buffer" << std::endl;
RT_CHECK(vx_copy_to_dev(staging_buf, kernel_arg.dst_addr, buf_size, 0));
// run tests
std::cout << "run tests" << std::endl;

View File

@@ -1,78 +0,0 @@
XLEN ?= 32
RISCV_TOOLCHAIN_PATH ?= /opt/riscv-gnu-toolchain
VORTEX_DRV_PATH ?= $(realpath ../../../driver)
VORTEX_RT_PATH ?= $(realpath ../../../runtime)
OPTS ?= -n32
VX_CC = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-gcc
VX_CXX = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-g++
VX_DP = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-objdump
VX_CP = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-objcopy
VX_CFLAGS += -march=rv32imf -mabi=ilp32f -O3 -Wstack-usage=1024 -ffreestanding -nostartfiles -fdata-sections -ffunction-sections
VX_CFLAGS += -I$(VORTEX_RT_PATH)/include -I$(VORTEX_RT_PATH)/../hw
VX_LDFLAGS += -Wl,-Bstatic,-T,$(VORTEX_RT_PATH)/linker/vx_link$(XLEN).ld -Wl,--gc-sections $(VORTEX_RT_PATH)/libvortexrt.a
VX_SRCS = kernel.c
CXXFLAGS += -std=c++11 -Wall -Wextra -pedantic -Wfatal-errors
CXXFLAGS += -I$(VORTEX_DRV_PATH)/include
LDFLAGS += -L$(VORTEX_DRV_PATH)/stub -lvortex
# Debugigng
ifdef DEBUG
CXXFLAGS += -g -O0
else
CXXFLAGS += -O2 -DNDEBUG
endif
PROJECT = prefetch
SRCS = main.cpp
all: $(PROJECT) kernel.bin kernel.dump
kernel.dump: kernel.elf
$(VX_DP) -D kernel.elf > kernel.dump
kernel.bin: kernel.elf
$(VX_CP) -O binary kernel.elf kernel.bin
kernel.elf: $(VX_SRCS)
$(VX_CC) $(VX_CFLAGS) $(VX_SRCS) $(VX_LDFLAGS) -o kernel.elf
$(PROJECT): $(SRCS)
$(CXX) $(CXXFLAGS) $^ $(LDFLAGS) -o $@
run-simx: $(PROJECT) kernel.bin
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/simx:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
run-fpga: $(PROJECT) kernel.bin
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/fpga:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
run-asesim: $(PROJECT) kernel.bin
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/asesim:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
run-vlsim: $(PROJECT) kernel.bin
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/vlsim:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
run-rtlsim: $(PROJECT) kernel.bin
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/rtlsim:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
.depend: $(SRCS)
$(CXX) $(CXXFLAGS) -MM $^ > .depend;
clean:
rm -rf $(PROJECT) *.o .depend
clean-all: clean
rm -rf *.elf *.bin *.dump
ifneq ($(MAKECMDGOALS),clean)
-include .depend
endif

View File

@@ -1,14 +0,0 @@
#ifndef _COMMON_H_
#define _COMMON_H_
#define KERNEL_ARG_DEV_MEM_ADDR 0x7ffff000
typedef struct {
uint32_t num_tasks;
uint32_t task_size;
uint32_t src0_addr;
uint32_t src1_addr;
uint32_t dst_addr;
} kernel_arg_t;
#endif

View File

@@ -1,43 +0,0 @@
#include <stdint.h>
#include <vx_intrinsics.h>
#include <vx_spawn.h>
#include <vx_print.h>
#include "common.h"
#define BLOCK_SIZE 64
void kernel_body(int task_id, kernel_arg_t* arg) {
uint32_t count = arg->task_size;
uint32_t offset = task_id * count;
uint32_t num_blocks = (count * 4 + BLOCK_SIZE-1) / BLOCK_SIZE;
int32_t* src0_ptr = (int32_t*)arg->src0_addr + offset;
int32_t* src1_ptr = (int32_t*)arg->src1_addr + offset;
int32_t* dst_ptr = (int32_t*)arg->dst_addr + offset;
uint32_t src0_end = (uint32_t)(src0_ptr + count);
uint32_t src1_end = (uint32_t)(src1_ptr + count);
for (uint32_t i = 0; i < count; ++i) {
dst_ptr[i] = src0_ptr[i] + src1_ptr[i];
uint32_t src0_mask = ((uint32_t)(src0_ptr + i)) % BLOCK_SIZE;
uint32_t src0_next = (uint32_t)(src0_ptr + i + BLOCK_SIZE/4);
if (src0_mask == 0 && src0_next < src0_end) {
//vx_printf("src0_next=%d\n", src0_next);
vx_prefetch(src0_next);
}
uint32_t src1_mask = ((uint32_t)(src1_ptr + i)) % BLOCK_SIZE;
uint32_t src1_next = (uint32_t)(src1_ptr + i + BLOCK_SIZE/4);
if (src1_mask == 0 && src1_next < src1_end) {
//vx_printf("src1_next=%d\n", src1_next);
vx_prefetch(src1_next);
}
}
}
void main() {
kernel_arg_t* arg = (kernel_arg_t*)KERNEL_ARG_DEV_MEM_ADDR;
vx_spawn_tasks(arg->num_tasks, (vx_spawn_tasks_cb)kernel_body, arg);
}

View File

@@ -1,205 +0,0 @@
#include <iostream>
#include <unistd.h>
#include <string.h>
#include <vortex.h>
#include "common.h"
#define RT_CHECK(_expr) \
do { \
int _ret = _expr; \
if (0 == _ret) \
break; \
printf("Error: '%s' returned %d!\n", #_expr, (int)_ret); \
cleanup(); \
exit(-1); \
} while (false)
///////////////////////////////////////////////////////////////////////////////
const char* kernel_file = "kernel.bin";
uint32_t count = 0;
vx_device_h device = nullptr;
vx_buffer_h staging_buf = nullptr;
kernel_arg_t kernel_arg;
static void show_usage() {
std::cout << "Vortex Test." << std::endl;
std::cout << "Usage: [-k: kernel] [-n words] [-h: help]" << std::endl;
}
static void parse_args(int argc, char **argv) {
int c;
while ((c = getopt(argc, argv, "n:k:h?")) != -1) {
switch (c) {
case 'n':
count = atoi(optarg);
break;
case 'k':
kernel_file = optarg;
break;
case 'h':
case '?': {
show_usage();
exit(0);
} break;
default:
show_usage();
exit(-1);
}
}
}
void cleanup() {
if (staging_buf) {
vx_buf_free(staging_buf);
}
if (device) {
vx_mem_free(device, kernel_arg.src0_addr);
vx_mem_free(device, kernel_arg.src1_addr);
vx_mem_free(device, kernel_arg.dst_addr);
vx_dev_close(device);
}
}
int run_test(const kernel_arg_t& kernel_arg,
uint32_t buf_size,
uint32_t num_points) {
// start device
std::cout << "start device" << std::endl;
RT_CHECK(vx_start(device));
// wait for completion
std::cout << "wait for completion" << std::endl;
RT_CHECK(vx_ready_wait(device, -1));
// download destination buffer
std::cout << "download destination buffer" << std::endl;
RT_CHECK(vx_copy_from_dev(staging_buf, kernel_arg.dst_addr, buf_size, 0));
// verify result
std::cout << "verify result" << std::endl;
{
int errors = 0;
auto buf_ptr = (int32_t*)vx_host_ptr(staging_buf);
for (uint32_t i = 0; i < num_points; ++i) {
int ref = i + i;
int cur = buf_ptr[i];
if (cur != ref) {
std::cout << "error at result #" << std::dec << i
<< std::hex << ": actual 0x" << cur << ", expected 0x" << ref << std::endl;
++errors;
}
}
if (errors != 0) {
std::cout << "Found " << std::dec << errors << " errors!" << std::endl;
std::cout << "FAILED!" << std::endl;
return 1;
}
}
return 0;
}
int main(int argc, char *argv[]) {
size_t value;
// parse command arguments
parse_args(argc, argv);
if (count == 0) {
count = 1;
}
// open device connection
std::cout << "open device connection" << std::endl;
RT_CHECK(vx_dev_open(&device));
uint64_t max_cores, max_warps, max_threads;
RT_CHECK(vx_dev_caps(device, VX_CAPS_MAX_CORES, &max_cores));
RT_CHECK(vx_dev_caps(device, VX_CAPS_MAX_WARPS, &max_warps));
RT_CHECK(vx_dev_caps(device, VX_CAPS_MAX_THREADS, &max_threads));
uint32_t num_tasks = max_cores * max_warps * max_threads;
uint32_t num_points = count * num_tasks;
uint32_t buf_size = num_points * sizeof(int32_t);
std::cout << "number of points: " << num_points << std::endl;
std::cout << "buffer size: " << buf_size << " bytes" << std::endl;
// upload program
std::cout << "upload program" << std::endl;
RT_CHECK(vx_upload_kernel_file(device, kernel_file));
// allocate device memory
std::cout << "allocate device memory" << std::endl;
RT_CHECK(vx_mem_alloc(device, buf_size, &value));
kernel_arg.src0_addr = value;
RT_CHECK(vx_mem_alloc(device, buf_size, &value));
kernel_arg.src1_addr = value;
RT_CHECK(vx_mem_alloc(device, buf_size, &value));
kernel_arg.dst_addr = value;
kernel_arg.num_tasks = num_tasks;
kernel_arg.task_size = count;
std::cout << "dev_src0=" << std::hex << kernel_arg.src0_addr << std::endl;
std::cout << "dev_src1=" << std::hex << kernel_arg.src1_addr << std::endl;
std::cout << "dev_dst=" << std::hex << kernel_arg.dst_addr << std::endl;
// allocate shared memory
std::cout << "allocate shared memory" << std::endl;
uint32_t alloc_size = std::max<uint32_t>(buf_size, sizeof(kernel_arg_t));
RT_CHECK(vx_buf_alloc(device, alloc_size, &staging_buf));
// upload kernel argument
std::cout << "upload kernel argument" << std::endl;
{
auto buf_ptr = (int*)vx_host_ptr(staging_buf);
memcpy(buf_ptr, &kernel_arg, sizeof(kernel_arg_t));
RT_CHECK(vx_copy_to_dev(staging_buf, KERNEL_ARG_DEV_MEM_ADDR, sizeof(kernel_arg_t), 0));
}
// upload source buffer0
{
auto buf_ptr = (int32_t*)vx_host_ptr(staging_buf);
for (uint32_t i = 0; i < num_points; ++i) {
buf_ptr[i] = i-1;
}
}
std::cout << "upload source buffer0" << std::endl;
RT_CHECK(vx_copy_to_dev(staging_buf, kernel_arg.src0_addr, buf_size, 0));
// upload source buffer1
{
auto buf_ptr = (int32_t*)vx_host_ptr(staging_buf);
for (uint32_t i = 0; i < num_points; ++i) {
buf_ptr[i] = i+1;
}
}
std::cout << "upload source buffer1" << std::endl;
RT_CHECK(vx_copy_to_dev(staging_buf, kernel_arg.src1_addr, buf_size, 0));
// clear destination buffer
{
auto buf_ptr = (int32_t*)vx_host_ptr(staging_buf);
for (uint32_t i = 0; i < num_points; ++i) {
buf_ptr[i] = 0xdeadbeef;
}
}
std::cout << "clear destination buffer" << std::endl;
RT_CHECK(vx_copy_to_dev(staging_buf, kernel_arg.dst_addr, buf_size, 0));
// run tests
std::cout << "run tests" << std::endl;
RT_CHECK(run_test(kernel_arg, buf_size, num_points));
// cleanup
std::cout << "cleanup" << std::endl;
cleanup();
std::cout << "PASSED!" << std::endl;
return 0;
}

View File

@@ -1,78 +1,9 @@
XLEN ?= 32
RISCV_TOOLCHAIN_PATH ?= /opt/riscv-gnu-toolchain
VORTEX_DRV_PATH ?= $(realpath ../../../driver)
VORTEX_RT_PATH ?= $(realpath ../../../runtime)
OPTS ?= -n4
VX_CC = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-gcc
VX_CXX = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-g++
VX_DP = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-objdump
VX_CP = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-objcopy
VX_CFLAGS += -march=rv32imf -mabi=ilp32f -O3 -Wstack-usage=1024 -ffreestanding -nostartfiles -fdata-sections -ffunction-sections
VX_CFLAGS += -I$(VORTEX_RT_PATH)/include -I$(VORTEX_RT_PATH)/../hw
VX_LDFLAGS += -Wl,-Bstatic,-T,$(VORTEX_RT_PATH)/linker/vx_link$(XLEN).ld -Wl,--gc-sections $(VORTEX_RT_PATH)/libvortexrt.a
VX_SRCS = kernel.c
CXXFLAGS += -std=c++11 -Wall -Wextra -pedantic -Wfatal-errors
CXXFLAGS += -I$(VORTEX_DRV_PATH)/include
LDFLAGS += -L$(VORTEX_DRV_PATH)/stub -lvortex
# Debugigng
ifdef DEBUG
CXXFLAGS += -g -O0
else
CXXFLAGS += -O2 -DNDEBUG
endif
PROJECT = printf
SRCS = main.cpp
all: $(PROJECT) kernel.bin kernel.dump
kernel.dump: kernel.elf
$(VX_DP) -D kernel.elf > kernel.dump
VX_SRCS = kernel.cpp
kernel.bin: kernel.elf
$(VX_CP) -O binary kernel.elf kernel.bin
OPTS ?= -n4
kernel.elf: $(VX_SRCS)
$(VX_CC) $(VX_CFLAGS) $(VX_SRCS) $(VX_LDFLAGS) -o kernel.elf
$(PROJECT): $(SRCS)
$(CXX) $(CXXFLAGS) $^ $(LDFLAGS) -o $@
run-simx: $(PROJECT) kernel.bin
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/simx:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
run-fpga: $(PROJECT) kernel.bin
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/fpga:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
run-asesim: $(PROJECT) kernel.bin
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/asesim:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
run-vlsim: $(PROJECT) kernel.bin
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/vlsim:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
run-rtlsim: $(PROJECT) kernel.bin
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/rtlsim:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
.depend: $(SRCS)
$(CXX) $(CXXFLAGS) -MM $^ > .depend;
clean:
rm -rf $(PROJECT) *.o .depend
clean-all: clean
rm -rf *.elf *.bin *.dump
ifneq ($(MAKECMDGOALS),clean)
-include .depend
endif
include ../common.mk

View File

@@ -5,7 +5,7 @@
typedef struct {
uint32_t num_points;
uint32_t src_addr;
uint64_t src_addr;
} kernel_arg_t;
#endif

View File

@@ -4,13 +4,15 @@
#include <vx_spawn.h>
#include "common.h"
void kernel_body(int task_id, kernel_arg_t* arg) {
void kernel_body(int task_id, kernel_arg_t* __UNIFORM__ arg) {
int cid = vx_core_id();
int* src_ptr = (int*)arg->src_addr;
char value = 'A' + src_ptr[task_id];
vx_printf("task=%d, value=%c\n", task_id, value);
vx_printf("cid=%d: task=%d, value=%c\n", cid, task_id, value);
}
void main() {
int main() {
kernel_arg_t* arg = (kernel_arg_t*)KERNEL_ARG_DEV_MEM_ADDR;
vx_spawn_tasks(arg->num_points, (vx_spawn_tasks_cb)kernel_body, arg);
}
return 0;
}

View File

@@ -1,6 +1,7 @@
#include <iostream>
#include <unistd.h>
#include <string.h>
#include <vector>
#include <vortex.h>
#include "common.h"
@@ -20,8 +21,8 @@ const char* kernel_file = "kernel.bin";
uint32_t count = 4;
vx_device_h device = nullptr;
vx_buffer_h staging_buf = nullptr;
kernel_arg_t kernel_arg;
std::vector<uint8_t> staging_buf;
kernel_arg_t kernel_arg = {};
static void show_usage() {
std::cout << "Vortex Test." << std::endl;
@@ -51,9 +52,6 @@ static void parse_args(int argc, char **argv) {
}
void cleanup() {
if (staging_buf) {
vx_buf_free(staging_buf);
}
if (device) {
vx_mem_free(device, kernel_arg.src_addr);
vx_dev_close(device);
@@ -67,14 +65,12 @@ int run_test() {
// wait for completion
std::cout << "wait for completion" << std::endl;
RT_CHECK(vx_ready_wait(device, MAX_TIMEOUT));
RT_CHECK(vx_ready_wait(device, VX_MAX_TIMEOUT));
return 0;
}
int main(int argc, char *argv[]) {
size_t value;
int main(int argc, char *argv[]) {
// parse command arguments
parse_args(argc, argv);
@@ -86,10 +82,9 @@ int main(int argc, char *argv[]) {
std::cout << "open device connection" << std::endl;
RT_CHECK(vx_dev_open(&device));
uint64_t max_cores, max_warps, max_threads;
RT_CHECK(vx_dev_caps(device, VX_CAPS_MAX_CORES, &max_cores));
RT_CHECK(vx_dev_caps(device, VX_CAPS_MAX_WARPS, &max_warps));
RT_CHECK(vx_dev_caps(device, VX_CAPS_MAX_THREADS, &max_threads));
uint64_t num_warps, num_threads;
RT_CHECK(vx_dev_caps(device, VX_CAPS_NUM_WARPS, &num_warps));
RT_CHECK(vx_dev_caps(device, VX_CAPS_NUM_THREADS, &num_threads));
uint32_t num_points = count;
uint32_t buf_size = count * sizeof(int32_t);
@@ -102,37 +97,35 @@ int main(int argc, char *argv[]) {
RT_CHECK(vx_upload_kernel_file(device, kernel_file));
// allocate device memory
std::cout << "allocate device memory" << std::endl;
RT_CHECK(vx_mem_alloc(device, buf_size, &value));
kernel_arg.src_addr = value;
std::cout << "allocate device memory" << std::endl;
RT_CHECK(vx_mem_alloc(device, buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.src_addr));
kernel_arg.num_points = num_points;
std::cout << "dev_src=" << std::hex << kernel_arg.src_addr << std::endl;
std::cout << "dev_src=0x" << std::hex << kernel_arg.src_addr << std::endl;
// allocate shared memory
std::cout << "allocate shared memory" << std::endl;
// allocate staging buffer
std::cout << "allocate staging buffer" << std::endl;
uint32_t alloc_size = std::max<uint32_t>(buf_size, sizeof(kernel_arg_t));
RT_CHECK(vx_buf_alloc(device, alloc_size, &staging_buf));
staging_buf.resize(alloc_size);
// upload kernel argument
std::cout << "upload kernel argument" << std::endl;
// upload kernel argument
{
auto buf_ptr = (void*)vx_host_ptr(staging_buf);
std::cout << "upload kernel argument" << std::endl;
auto buf_ptr = (void*)staging_buf.data();
memcpy(buf_ptr, &kernel_arg, sizeof(kernel_arg_t));
RT_CHECK(vx_copy_to_dev(staging_buf, KERNEL_ARG_DEV_MEM_ADDR, sizeof(kernel_arg_t), 0));
RT_CHECK(vx_copy_to_dev(device, KERNEL_ARG_DEV_MEM_ADDR, staging_buf.data(), sizeof(kernel_arg_t)));
}
// upload source buffer0
{
auto buf_ptr = (int*)vx_host_ptr(staging_buf);
std::cout << "upload source buffer" << std::endl;
auto buf_ptr = (int32_t*)staging_buf.data();
for (uint32_t i = 0; i < num_points; ++i) {
buf_ptr[i] = i;
}
}
RT_CHECK(vx_copy_to_dev(device, kernel_arg.src_addr, staging_buf.data(), buf_size));
}
std::cout << "upload source buffer" << std::endl;
RT_CHECK(vx_copy_to_dev(staging_buf, kernel_arg.src_addr, buf_size, 0));
// run tests
std::cout << "run tests" << std::endl;

View File

@@ -1,86 +1,9 @@
XLEN ?= 32
RISCV_TOOLCHAIN_PATH ?= /opt/riscv-gnu-toolchain
VORTEX_DRV_PATH ?= $(realpath ../../../driver)
VORTEX_RT_PATH ?= $(realpath ../../../runtime)
VORTEX_HW_PATH ?= $(realpath ../../../hw)
LLVM_PREFIX ?= /opt/llvm-riscv
SYSROOT=${RISCV_TOOLCHAIN_PATH}/riscv32-unknown-elf
OPTS ?= -n16
VX_CC = ${LLVM_PREFIX}/bin/clang
VX_CXX = ${LLVM_PREFIX}/bin/clang++
VX_DP = ${LLVM_PREFIX}/bin/llvm-objdump
VX_CP = ${LLVM_PREFIX}/bin/llvm-objcopy
VX_CFLAGS += -O3 -march=rv32imf -mabi=ilp32f -fno-rtti -fno-exceptions -ffreestanding -nostartfiles -fdata-sections -ffunction-sections
VX_CFLAGS += -Xclang -target-feature -Xclang +vortex
VX_CFLAGS += --sysroot=${SYSROOT} --gcc-toolchain=${RISCV_TOOLCHAIN_PATH}
VX_CFLAGS += -I${VORTEX_HW_PATH} -I${VORTEX_RT_PATH}/include
VX_LDFLAGS += -Wl,-Bstatic,-T${VORTEX_RT_PATH}/linker/vx_link$(XLEN).ld,--gc-sections ${VORTEX_RT_PATH}/libvortexrt.a
VX_DPFLAGS = -arch=riscv32 -mcpu=generic-rv32 -mattr=+m,+f -mattr=+vortex
VX_SRCS = kernel.c
CXXFLAGS += -std=c++11 -Wall -Wextra -pedantic -Wfatal-errors
CXXFLAGS += -I$(VORTEX_DRV_PATH)/include
LDFLAGS += -L$(VORTEX_DRV_PATH)/stub -lvortex
# Debugigng
ifdef DEBUG
CXXFLAGS += -g -O0
else
CXXFLAGS += -O2 -DNDEBUG
endif
PROJECT = sort
SRCS = main.cpp
all: $(PROJECT) kernel.bin kernel.dump
kernel.dump: kernel.elf
$(VX_DP) $(VX_DPFLAGS) -D kernel.elf > kernel.dump
VX_SRCS = kernel.cpp
kernel.bin: kernel.elf
$(VX_CP) -O binary kernel.elf kernel.bin
OPTS ?= -n16
kernel.elf: $(VX_SRCS)
$(VX_CC) $(VX_CFLAGS) $(VX_SRCS) $(VX_LDFLAGS) -o kernel.elf
$(PROJECT): $(SRCS)
$(CXX) $(CXXFLAGS) $^ $(LDFLAGS) -o $@
run-simx: $(PROJECT) kernel.bin
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/simx:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
run-fpga: $(PROJECT) kernel.bin
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/fpga:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
run-asesim: $(PROJECT) kernel.bin
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/asesim:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
run-vlsim: $(PROJECT) kernel.bin
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/vlsim:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
run-rtlsim: $(PROJECT) kernel.bin
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/rtlsim:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
.depend: $(SRCS)
$(CXX) $(CXXFLAGS) -MM $^ > .depend;
clean:
rm -rf $(PROJECT) *.o .depend
clean-all: clean
rm -rf *.elf *.bin *.dump
ifneq ($(MAKECMDGOALS),clean)
-include .depend
endif
include ../common.mk

View File

@@ -3,10 +3,18 @@
#define KERNEL_ARG_DEV_MEM_ADDR 0x7ffff000
#define FP_ENABLE
#ifdef FP_ENABLE
#define TYPE float
#else
#define TYPE int
#endif
typedef struct {
uint32_t num_points;
uint32_t src_addr;
uint32_t dst_addr;
uint64_t src_addr;
uint64_t dst_addr;
} kernel_arg_t;
#endif

View File

@@ -3,22 +3,23 @@
#include <vx_spawn.h>
#include "common.h"
void kernel_body(int __DIVERGENT__ task_id, kernel_arg_t* arg) {
void kernel_body(int task_id, kernel_arg_t* __UNIFORM__ arg) {
uint32_t num_points = arg->num_points;
int32_t* src_ptr = (int32_t*)arg->src_addr;
int32_t* dst_ptr = (int32_t*)arg->dst_addr;
TYPE* src_ptr = (TYPE*)arg->src_addr;
TYPE* dst_ptr = (TYPE*)arg->dst_addr;
int32_t ref_value = src_ptr[task_id];
TYPE ref_value = src_ptr[task_id];
uint32_t pos = 0;
for (uint32_t i = 0; i < num_points; ++i) {
int32_t cur_value = src_ptr[i];
TYPE cur_value = src_ptr[i];
pos += (cur_value < ref_value) || ((cur_value == ref_value) && (i < task_id));
}
dst_ptr[pos] = ref_value;
}
void main() {
int main() {
kernel_arg_t* arg = (kernel_arg_t*)KERNEL_ARG_DEV_MEM_ADDR;
vx_spawn_tasks(arg->num_points, (vx_spawn_tasks_cb)kernel_body, arg);
}
return 0;
}

View File

@@ -20,12 +20,12 @@
const char* kernel_file = "kernel.bin";
uint32_t count = 0;
std::vector<int32_t> src_data;
std::vector<int32_t> ref_data;
std::vector<TYPE> src_data;
std::vector<TYPE> ref_data;
vx_device_h device = nullptr;
vx_buffer_h staging_buf = nullptr;
kernel_arg_t kernel_arg;
std::vector<uint8_t> staging_buf;
kernel_arg_t kernel_arg = {};
static void show_usage() {
std::cout << "Vortex Test." << std::endl;
@@ -55,9 +55,6 @@ static void parse_args(int argc, char **argv) {
}
void cleanup() {
if (staging_buf) {
vx_buf_free(staging_buf);
}
if (device) {
vx_mem_free(device, kernel_arg.src_addr);
vx_mem_free(device, kernel_arg.dst_addr);
@@ -70,9 +67,9 @@ void gen_input_data(uint32_t num_points) {
for (uint32_t i = 0; i < num_points; ++i) {
float r = static_cast<float>(std::rand()) / RAND_MAX;
int32_t value = r * num_points;
TYPE value = r * num_points;
src_data[i] = value;
std::cout << std::dec << i << ": value=0x" << std::hex << value << std::endl;
std::cout << std::dec << i << ": value=" << value << std::endl;
}
}
@@ -80,13 +77,11 @@ void gen_ref_data(uint32_t num_points) {
ref_data.resize(num_points);
for (uint32_t i = 0; i < num_points; ++i) {
int32_t ref_value = src_data.at(i);
TYPE ref_value = src_data.at(i);
uint32_t pos = 0;
for (uint32_t j = 0; j < num_points; ++j) {
int32_t cur_value = src_data.at(j);
int is_smaller = (cur_value < ref_value)
|| (cur_value == ref_value && j < i);
pos += is_smaller;
TYPE cur_value = src_data.at(j);
pos += (cur_value < ref_value) || (cur_value == ref_value && j < i);
}
ref_data.at(pos) = ref_value;
}
@@ -101,23 +96,23 @@ int run_test(const kernel_arg_t& kernel_arg,
// wait for completion
std::cout << "wait for completion" << std::endl;
RT_CHECK(vx_ready_wait(device, MAX_TIMEOUT));
RT_CHECK(vx_ready_wait(device, VX_MAX_TIMEOUT));
// download destination buffer
std::cout << "download destination buffer" << std::endl;
RT_CHECK(vx_copy_from_dev(staging_buf, kernel_arg.dst_addr, buf_size, 0));
RT_CHECK(vx_copy_from_dev(device, staging_buf.data(), kernel_arg.dst_addr, buf_size));
// verify result
std::cout << "verify result" << std::endl;
{
int errors = 0;
auto buf_ptr = (int32_t*)vx_host_ptr(staging_buf);
auto buf_ptr = (TYPE*)staging_buf.data();
for (uint32_t i = 0; i < num_points; ++i) {
int ref = ref_data.at(i);
int cur = buf_ptr[i];
TYPE ref = ref_data.at(i);
TYPE cur = buf_ptr[i];
if (cur != ref) {
std::cout << "error at result #" << std::dec << i
<< std::hex << ": actual 0x" << cur << ", expected 0x" << ref << std::endl;
<< std::hex << ": actual=" << cur << ", expected=" << ref << std::endl;
++errors;
}
}
@@ -131,9 +126,7 @@ int run_test(const kernel_arg_t& kernel_arg,
return 0;
}
int main(int argc, char *argv[]) {
size_t value;
int main(int argc, char *argv[]) {
// parse command arguments
parse_args(argc, argv);
@@ -166,52 +159,49 @@ int main(int argc, char *argv[]) {
RT_CHECK(vx_upload_kernel_file(device, kernel_file));
// allocate device memory
std::cout << "allocate device memory" << std::endl;
RT_CHECK(vx_mem_alloc(device, src_buf_size, &value));
kernel_arg.src_addr = value;
RT_CHECK(vx_mem_alloc(device, dst_buf_size, &value));
kernel_arg.dst_addr = value;
std::cout << "allocate device memory" << std::endl;
RT_CHECK(vx_mem_alloc(device, src_buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.src_addr));
RT_CHECK(vx_mem_alloc(device, dst_buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.dst_addr));
kernel_arg.num_points = num_points;
std::cout << "dev_src=" << std::hex << kernel_arg.src_addr << std::endl;
std::cout << "dev_dst=" << std::hex << kernel_arg.dst_addr << std::endl;
std::cout << "dev_src=0x" << std::hex << kernel_arg.src_addr << std::endl;
std::cout << "dev_dst=0x" << std::hex << kernel_arg.dst_addr << std::endl;
// allocate shared memory
std::cout << "allocate shared memory" << std::endl;
uint32_t staging_buf_size = std::max<uint32_t>(src_buf_size,
std::max<uint32_t>(dst_buf_size,
sizeof(kernel_arg_t)));
RT_CHECK(vx_buf_alloc(device, staging_buf_size, &staging_buf));
// upload kernel argument
std::cout << "upload kernel argument" << std::endl;
// allocate staging buffer
{
auto buf_ptr = (int*)vx_host_ptr(staging_buf);
std::cout << "allocate staging buffer" << std::endl;
uint32_t staging_buf_size = std::max<uint32_t>(src_buf_size,
std::max<uint32_t>(dst_buf_size,
sizeof(kernel_arg_t)));
staging_buf.resize(staging_buf_size);
}
// upload kernel argument
{
std::cout << "upload kernel argument" << std::endl;
auto buf_ptr = staging_buf.data();
memcpy(buf_ptr, &kernel_arg, sizeof(kernel_arg_t));
RT_CHECK(vx_copy_to_dev(staging_buf, KERNEL_ARG_DEV_MEM_ADDR, sizeof(kernel_arg_t), 0));
RT_CHECK(vx_copy_to_dev(device, KERNEL_ARG_DEV_MEM_ADDR, staging_buf.data(), sizeof(kernel_arg_t)));
}
// upload source buffer
{
auto buf_ptr = (int32_t*)vx_host_ptr(staging_buf);
for (uint32_t i = 0; i < num_points; ++i) {
buf_ptr[i] = src_data.at(i);
}
std::cout << "upload source buffer" << std::endl;
auto buf_ptr = staging_buf.data();
memcpy(buf_ptr, src_data.data(), num_points * sizeof(TYPE));
RT_CHECK(vx_copy_to_dev(device, kernel_arg.src_addr, staging_buf.data(), src_buf_size));
}
std::cout << "upload source buffer" << std::endl;
RT_CHECK(vx_copy_to_dev(staging_buf, kernel_arg.src_addr, src_buf_size, 0));
// clear destination buffer
{
auto buf_ptr = (int32_t*)vx_host_ptr(staging_buf);
std::cout << "clear destination buffer" << std::endl;
auto buf_ptr = (int32_t*)staging_buf.data();
for (uint32_t i = 0; i < num_points; ++i) {
buf_ptr[i] = 0xdeadbeef;
}
}
RT_CHECK(vx_copy_to_dev(device, kernel_arg.dst_addr, staging_buf.data(), dst_buf_size));
}
std::cout << "clear destination buffer" << std::endl;
RT_CHECK(vx_copy_to_dev(staging_buf, kernel_arg.dst_addr, dst_buf_size, 0));
// run tests
std::cout << "run tests" << std::endl;

View File

@@ -1,78 +0,0 @@
XLEN ?= 32
RISCV_TOOLCHAIN_PATH ?= /opt/riscv-gnu-toolchain
VORTEX_DRV_PATH ?= $(realpath ../../../driver)
VORTEX_RT_PATH ?= $(wildcard ../../../runtime)
OPTS ?= -g1
VX_CC = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-gcc
VX_CXX = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-g++
VX_DP = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-objdump
VX_CP = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-objcopy
VX_CFLAGS += -std=c++11 -march=rv32imf -mabi=ilp32f -O3 -Wstack-usage=1024 -ffreestanding -nostartfiles -fdata-sections -ffunction-sections
VX_CFLAGS += -DENABLE_SW -I$(VORTEX_RT_PATH)/include -I$(VORTEX_RT_PATH)/../hw -I$(VORTEX_RT_PATH)/../sim/common -I$(VORTEX_RT_PATH)/../third_party
VX_LDFLAGS += -Wl,-Bstatic,-T,$(VORTEX_RT_PATH)/linker/vx_link$(XLEN).ld -Wl,--gc-sections $(VORTEX_RT_PATH)/libvortexrt.a
VX_SRCS = kernel.c
CXXFLAGS += -std=c++11 -Wall -Wextra -Wfatal-errors
CXXFLAGS += -I$(VORTEX_DRV_PATH)/include -I$(VORTEX_RT_PATH)/../hw -I$(VORTEX_RT_PATH)/../sim/common -I$(VORTEX_RT_PATH)/../third_party
LDFLAGS += -L$(VORTEX_DRV_PATH)/stub -lvortex $(VORTEX_RT_PATH)/../third_party/cocogfx/libcocogfx.a -lz
# Debugigng
ifdef DEBUG
CXXFLAGS += -g -O0
else
CXXFLAGS += -O2 -DNDEBUG
endif
PROJECT = tex
SRCS = main.cpp utils.cpp
all: $(PROJECT) kernel.bin kernel.dump
kernel.dump: kernel.elf
$(VX_DP) -D kernel.elf > kernel.dump
kernel.bin: kernel.elf
$(VX_CP) -O binary kernel.elf kernel.bin
kernel.elf: $(VX_SRCS)
$(VX_CXX) $(VX_CFLAGS) $(VX_SRCS) $(VX_LDFLAGS) -o kernel.elf
$(PROJECT): $(SRCS)
$(CXX) $(CXXFLAGS) $^ $(LDFLAGS) -o $@
run-simx: $(PROJECT) kernel.bin
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/simx:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
run-fpga: $(PROJECT) kernel.bin
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/fpga:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
run-asesim: $(PROJECT) kernel.bin
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/asesim:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
run-vlsim: $(PROJECT) kernel.bin
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/vlsim:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
run-rtlsim: $(PROJECT) kernel.bin
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/rtlsim:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
.depend: $(SRCS)
$(CXX) $(CXXFLAGS) -MM $^ > .depend;
clean:
rm -rf $(PROJECT) *.o .depend
clean-all: clean
rm -rf *.elf *.bin *.dump
ifneq ($(MAKECMDGOALS),clean)
-include .depend
endif

View File

@@ -1,26 +0,0 @@
#ifndef _COMMON_H_
#define _COMMON_H_
#include <VX_config.h>
#define KERNEL_ARG_DEV_MEM_ADDR 0x7ffff000
typedef struct {
bool use_sw;
uint32_t num_tasks;
uint8_t format;
uint8_t filter;
uint8_t wrapu;
uint8_t wrapv;
uint8_t src_logwidth;
uint8_t src_logheight;
uint32_t src_addr;
uint32_t mip_offs[TEX_LOD_MAX+1];
uint32_t dst_width;
uint32_t dst_height;
uint8_t dst_stride;
uint32_t dst_pitch;
uint32_t dst_addr;
} kernel_arg_t;
#endif

Binary file not shown.

Before

Width:  |  Height:  |  Size: 14 MiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 4.2 MiB

View File

@@ -1,98 +0,0 @@
#include <stdint.h>
#include <vx_intrinsics.h>
#include <vx_spawn.h>
#include <vx_print.h>
#include "texsw.h"
typedef struct {
kernel_arg_t* state;
uint32_t tile_width;
uint32_t tile_height;
float deltaX;
float deltaY;
float minification;
} tile_arg_t;
template <typename T, T Start, T End>
struct static_for_t {
template <typename Fn>
inline void operator()(const Fn& callback) const {
callback(Start);
static_for_t<T, Start+1, End>()(callback);
}
};
template <typename T, T N>
struct static_for_t<T, N, N> {
template <typename Fn>
inline void operator()(const Fn& callback) const {}
};
void kernel_body(int task_id, tile_arg_t* arg) {
kernel_arg_t* state = arg->state;
uint32_t xoffset = 0;
uint32_t yoffset = task_id * arg->tile_height;
uint8_t* dst_ptr = (uint8_t*)(state->dst_addr + xoffset * state->dst_stride + yoffset * state->dst_pitch);
Fixed<16> xj(arg->minification);
/*vx_printf("task_id=%d, tile_width=%d, tile_height=%d, deltaX=%f, deltaY=%f, minification=%f\n",
task_id, arg->tile_width, arg->tile_height, arg->deltaX, arg->deltaY, arg->minification);*/
float fv = (yoffset + 0.5f) * arg->deltaY;
for (uint32_t y = 0; y < arg->tile_height; ++y) {
uint32_t* dst_row = (uint32_t*)dst_ptr;
float fu = (xoffset + 0.5f) * arg->deltaX;
for (uint32_t x = 0; x < arg->tile_width; ++x) {
Fixed<TEX_FXD_FRAC> xu(fu);
Fixed<TEX_FXD_FRAC> xv(fv);
uint32_t color = tex_load(state, xu, xv, xj);
//vx_printf("task_id=%d, x=%d, y=%d, fu=%f, fv=%f, xu=0x%x, xv=0x%x, color=0x%x\n", task_id, x, y, fu, fv, xu.data(), xv.data(), color);
dst_row[x] = color;
fu += arg->deltaX;
}
dst_ptr += state->dst_pitch;
fv += arg->deltaY;
}
}
int main() {
kernel_arg_t* arg = (kernel_arg_t*)KERNEL_ARG_DEV_MEM_ADDR;
// configure texture unit
csr_write(CSR_TEX_UNIT, 0);
csr_write(CSR_TEX_WIDTH, arg->src_logwidth);
csr_write(CSR_TEX_HEIGHT, arg->src_logheight);
csr_write(CSR_TEX_FORMAT, arg->format);
csr_write(CSR_TEX_WRAPU, arg->wrapu);
csr_write(CSR_TEX_WRAPV, arg->wrapv);
csr_write(CSR_TEX_FILTER, (arg->filter ? 1 : 0));
csr_write(CSR_TEX_ADDR, arg->src_addr);
static_for_t<int, 0, TEX_LOD_MAX+1>()([&](int i) {
csr_write(CSR_TEX_MIPOFF(i), arg->mip_offs[i]);
});
tile_arg_t targ;
targ.state = arg;
targ.tile_width = arg->dst_width;
targ.tile_height = (arg->dst_height + arg->num_tasks - 1) / arg->num_tasks;
targ.deltaX = 1.0f / arg->dst_width;
targ.deltaY = 1.0f / arg->dst_height;
{
uint32_t src_width = (1 << arg->src_logwidth);
uint32_t src_height = (1 << arg->src_logheight);
float width_ratio = float(src_width) / arg->dst_width;
float height_ratio = float(src_height) / arg->dst_height;
targ.minification = std::max<float>(width_ratio, height_ratio);
}
vx_spawn_tasks(arg->num_tasks, (vx_spawn_tasks_cb)kernel_body, &targ);
/*for (uint32_t t=0; t < arg->num_tasks; ++t) {
kernel_body(t, &targ);
}*/
return 0;
}

View File

@@ -1,274 +0,0 @@
#include <iostream>
#include <vector>
#include <unistd.h>
#include <string.h>
#include <chrono>
#include <cmath>
#include <assert.h>
#include <vortex.h>
#include "common.h"
#include "utils.h"
using namespace cocogfx;
#define RT_CHECK(_expr) \
do { \
int _ret = _expr; \
if (0 == _ret) \
break; \
printf("Error: '%s' returned %d!\n", #_expr, (int)_ret); \
cleanup(); \
exit(-1); \
} while (false)
///////////////////////////////////////////////////////////////////////////////
const char* kernel_file = "kernel.bin";
const char* input_file = "palette64.png";
const char* output_file = "output.png";
int wrap = 0;
int filter = 0; // 0-> point, 1->bilinear, 2->trilinear
float scale = 1.0f;
int format = 0;
bool use_sw = false;
ePixelFormat eformat = FORMAT_A8R8G8B8;
vx_device_h device = nullptr;
vx_buffer_h buffer = nullptr;
kernel_arg_t kernel_arg;
static void show_usage() {
std::cout << "Vortex Texture Test." << std::endl;
std::cout << "Usage: [-k: kernel] [-i image] [-o image] [-s scale] [-w wrap] [-f format] [-g filter] [-z no_hw] [-h: help]" << std::endl;
}
static void parse_args(int argc, char **argv) {
int c;
while ((c = getopt(argc, argv, "zi:o:k:w:f:g:s:h?")) != -1) {
switch (c) {
case 'i':
input_file = optarg;
break;
case 'o':
output_file = optarg;
break;
case 's':
scale = std::stof(optarg, NULL);
break;
case 'w':
wrap = std::atoi(optarg);
break;
case 'z':
use_sw = true;
break;
case 'f': {
format = std::atoi(optarg);
switch (format) {
case 0: eformat = FORMAT_A8R8G8B8; break;
case 1: eformat = FORMAT_R5G6B5; break;
case 2: eformat = FORMAT_A1R5G5B5; break;
case 3: eformat = FORMAT_A4R4G4B4; break;
case 4: eformat = FORMAT_A8L8; break;
case 5: eformat = FORMAT_L8; break;
case 6: eformat = FORMAT_A8; break;
default:
std::cout << "Error: invalid format: " << format << std::endl;
exit(1);
}
} break;
case 'g':
filter = std::atoi(optarg);
break;
case 'k':
kernel_file = optarg;
break;
case 'h':
case '?': {
show_usage();
exit(0);
} break;
default:
show_usage();
exit(-1);
}
}
}
void cleanup() {
if (buffer) {
vx_buf_free(buffer);
}
if (device) {
vx_mem_free(device, kernel_arg.src_addr);
vx_mem_free(device, kernel_arg.dst_addr);
vx_dev_close(device);
}
}
int run_test(const kernel_arg_t& kernel_arg,
uint32_t buf_size,
uint32_t width,
uint32_t height,
uint32_t bpp) {
(void)bpp;
auto time_start = std::chrono::high_resolution_clock::now();
// start device
std::cout << "start device" << std::endl;
RT_CHECK(vx_start(device));
// wait for completion
std::cout << "wait for completion" << std::endl;
RT_CHECK(vx_ready_wait(device, MAX_TIMEOUT));
auto time_end = std::chrono::high_resolution_clock::now();
double elapsed = std::chrono::duration_cast<std::chrono::milliseconds>(time_end - time_start).count();
printf("Elapsed time: %lg ms\n", elapsed);
// download destination buffer
std::cout << "download destination buffer" << std::endl;
RT_CHECK(vx_copy_from_dev(buffer, kernel_arg.dst_addr, buf_size, 0));
std::vector<uint8_t> dst_pixels(buf_size);
auto buf_ptr = (uint8_t*)vx_host_ptr(buffer);
for (uint32_t i = 0; i < buf_size; ++i) {
dst_pixels[i] = buf_ptr[i];
}
// save output image
std::cout << "save output image" << std::endl;
//dump_image(dst_pixels, width, height, bpp);
RT_CHECK(SaveImage(output_file, FORMAT_A8R8G8B8, dst_pixels, width, height));
return 0;
}
int main(int argc, char *argv[]) {
std::vector<uint8_t> src_pixels;
std::vector<uint32_t> mip_offsets;
uint32_t src_width;
uint32_t src_height;
// parse command arguments
parse_args(argc, argv);
{
std::vector<uint8_t> staging;
RT_CHECK(LoadImage(input_file, eformat, staging, &src_width, &src_height));
uint32_t src_bpp = GetInfo(eformat).BytePerPixel;
//dump_image(staging, src_width, src_height, src_bpp);
RT_CHECK(GenerateMipmaps(src_pixels, mip_offsets, staging, eformat, src_width, src_height, src_width * src_bpp));
}
// check power of two support
if (!ispow2(src_width) || !ispow2(src_height)) {
std::cout << "Error: only power of two textures supported: width=" << src_width << ", heigth=" << src_height << std::endl;
return -1;
}
uint32_t src_logwidth = log2ceil(src_width);
uint32_t src_logheight = log2ceil(src_height);
uint32_t src_bufsize = src_pixels.size();
uint32_t dst_width = (uint32_t)(src_width * scale);
uint32_t dst_height = (uint32_t)(src_height * scale);
uint32_t dst_bpp = 4;
uint32_t dst_bufsize = dst_bpp * dst_width * dst_height;
// open device connection
std::cout << "open device connection" << std::endl;
RT_CHECK(vx_dev_open(&device));
uint64_t max_cores, max_warps, max_threads;
RT_CHECK(vx_dev_caps(device, VX_CAPS_MAX_CORES, &max_cores));
RT_CHECK(vx_dev_caps(device, VX_CAPS_MAX_WARPS, &max_warps));
RT_CHECK(vx_dev_caps(device, VX_CAPS_MAX_THREADS, &max_threads));
uint32_t num_tasks = max_cores * max_warps * max_threads;
std::cout << "number of tasks: " << std::dec << num_tasks << std::endl;
std::cout << "source buffer: width=" << src_width << ", heigth=" << src_height << ", size=" << src_bufsize << " bytes" << std::endl;
std::cout << "destination buffer: width=" << dst_width << ", heigth=" << dst_height << ", size=" << dst_bufsize << " bytes" << std::endl;
// upload program
std::cout << "upload program" << std::endl;
RT_CHECK(vx_upload_kernel_file(device, kernel_file));
// allocate device memory
std::cout << "allocate device memory" << std::endl;
uint64_t src_addr, dst_addr;
RT_CHECK(vx_mem_alloc(device, src_bufsize, &src_addr));
RT_CHECK(vx_mem_alloc(device, dst_bufsize, &dst_addr));
std::cout << "src_addr=0x" << std::hex << src_addr << std::endl;
std::cout << "dst_addr=0x" << std::hex << dst_addr << std::endl;
// allocate staging shared memory
std::cout << "allocate shared memory" << std::endl;
uint32_t alloc_size = std::max<uint32_t>(sizeof(kernel_arg_t),
std::max<uint32_t>(src_bufsize, dst_bufsize));
RT_CHECK(vx_buf_alloc(device, alloc_size, &buffer));
// upload kernel argument
std::cout << "upload kernel argument" << std::endl;
{
kernel_arg.use_sw = use_sw;
kernel_arg.num_tasks = std::min<uint32_t>(num_tasks, dst_height);
kernel_arg.format = format;
kernel_arg.filter = filter;
kernel_arg.wrapu = wrap;
kernel_arg.wrapv = wrap;
kernel_arg.src_logwidth = src_logwidth;
kernel_arg.src_logheight = src_logheight;
kernel_arg.src_addr = src_addr;
for (uint32_t i = 0; i < mip_offsets.size(); ++i) {
assert(i < TEX_LOD_MAX);
kernel_arg.mip_offs[i] = mip_offsets.at(i);
}
kernel_arg.dst_width = dst_width;
kernel_arg.dst_height = dst_height;
kernel_arg.dst_stride = dst_bpp;
kernel_arg.dst_pitch = dst_bpp * dst_width;
kernel_arg.dst_addr = dst_addr;
auto buf_ptr = (uint8_t*)vx_host_ptr(buffer);
memcpy(buf_ptr, &kernel_arg, sizeof(kernel_arg_t));
RT_CHECK(vx_copy_to_dev(buffer, KERNEL_ARG_DEV_MEM_ADDR, sizeof(kernel_arg_t), 0));
}
// upload source buffer
std::cout << "upload source buffer" << std::endl;
{
auto buf_ptr = (uint8_t*)vx_host_ptr(buffer);
for (uint32_t i = 0; i < src_bufsize; ++i) {
buf_ptr[i] = src_pixels[i];
}
RT_CHECK(vx_copy_to_dev(buffer, kernel_arg.src_addr, src_bufsize, 0));
}
// clear destination buffer
std::cout << "clear destination buffer" << std::endl;
{
auto buf_ptr = (uint32_t*)vx_host_ptr(buffer);
for (uint32_t i = 0; i < (dst_bufsize/4); ++i) {
buf_ptr[i] = 0xdeadbeef;
}
RT_CHECK(vx_copy_to_dev(buffer, kernel_arg.dst_addr, dst_bufsize, 0));
}
// run tests
std::cout << "run tests" << std::endl;
RT_CHECK(run_test(kernel_arg, dst_bufsize, dst_width, dst_height, dst_bpp));
// cleanup
std::cout << "cleanup" << std::endl;
cleanup();
std::cout << "PASSED!" << std::endl;
return 0;
}

Binary file not shown.

Before

Width:  |  Height:  |  Size: 543 B

Binary file not shown.

Before

Width:  |  Height:  |  Size: 534 B

Binary file not shown.

Before

Width:  |  Height:  |  Size: 1.3 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 1.1 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 5.4 KiB

View File

@@ -1,125 +0,0 @@
#pragma once
#include <vx_intrinsics.h>
#include <texturing.h>
#include "common.h"
using namespace cocogfx;
inline void texel_read(uint32_t* texels,
uint8_t** addresses,
uint32_t count,
uint32_t stride) {
switch (stride) {
case 1:
for (uint32_t i = 0; i < count; ++i) {
texels[i] = *(uint8_t*)addresses[i];
}
break;
case 2:
for (uint32_t i = 0; i < count; ++i) {
texels[i] = *(uint16_t*)addresses[i];
}
break;
case 4:
for (uint32_t i = 0; i < count; ++i) {
texels[i] = *(uint32_t*)addresses[i];
}
break;
default:
std::abort();
}
}
inline uint32_t vx_tex_sw(kernel_arg_t* state,
Fixed<TEX_FXD_FRAC> xu,
Fixed<TEX_FXD_FRAC> xv,
uint32_t lod) {
uint8_t* base_addr = ((uint8_t*)state->src_addr) + state->mip_offs[lod];
uint32_t log_width = std::max<int32_t>(state->src_logwidth - lod, 0);
uint32_t log_height = std::max<int32_t>(state->src_logheight - lod, 0);
auto format = (TexFormat)state->format;
auto wrapu = (WrapMode)state->wrapu;
auto wrapv = (WrapMode)state->wrapv;
auto filter = state->filter;
auto stride = Stride(format);
uint32_t color;
if (filter) {
// addressing
uint32_t offset00, offset01, offset10, offset11;
uint32_t alpha, beta;
uint8_t* addr[4];
uint32_t texel[4];
TexAddressLinear(xu, xv, log_width, log_height, wrapu, wrapv,
&offset00, &offset01, &offset10, &offset11, &alpha, &beta);
addr[0] = base_addr + offset00 * stride;
addr[1] = base_addr + offset01 * stride;
addr[2] = base_addr + offset10 * stride;
addr[3] = base_addr + offset11 * stride;
// memory fetch
texel_read(texel, addr, 4, stride);
// filtering
color = TexFilterLinear(
format, texel[0], texel[1], texel[2], texel[3], alpha, beta);
} else {
// addressing
uint32_t offset;
uint8_t* addr;
uint32_t texel;
TexAddressPoint(xu, xv, log_width, log_height, wrapu, wrapv, &offset);
addr = base_addr + offset * stride;
// memory fetch
texel_read(&texel, &addr, 1, stride);
// filtering
color = TexFilterPoint(format, texel);
}
return color;
}
inline uint32_t tex_load(kernel_arg_t* state,
Fixed<TEX_FXD_FRAC> xu,
Fixed<TEX_FXD_FRAC> xv,
Fixed<16> xj) {
uint32_t color;
uint32_t j = std::max<int32_t>(xj.data(), Fixed<16>::ONE);
uint32_t l = std::min<uint32_t>(log2floor(j) - 16, TEX_LOD_MAX);
if (state->filter == 2) {
uint32_t ln = std::min<uint32_t>(l + 1, TEX_LOD_MAX);
uint32_t f = (j - (1 << (l + 16))) >> (l + 16 - 8);
uint32_t texel0, texel1;
if (state->use_sw) {
texel0 = vx_tex_sw(state, xu, xv, l);
texel1 = vx_tex_sw(state, xu, xv, ln);
} else {
texel0 = vx_tex(0, xu.data(), xv.data(), l);
texel1 = vx_tex(0, xu.data(), xv.data(), ln);
}
uint32_t cl, ch;
{
uint32_t c0l, c0h, c1l, c1h;
Unpack8888(texel0, &c0l, &c0h);
Unpack8888(texel1, &c1l, &c1h);
cl = Lerp8888(c0l, c1l, f);
ch = Lerp8888(c0h, c1h, f);
}
color = Pack8888(cl, ch);
//vx_printf("j=0x%x, l=%d, ln=%d, f=%d, texel0=0x%x, texel1=0x%x, color=0x%x\n", j, l, ln, f, texel0, texel1, color);
} else {
if (state->use_sw) {
color = vx_tex_sw(state, xu, xv, l);
} else {
color = vx_tex(0, xu.data(), xv.data(), l);
}
}
return color;
}

Binary file not shown.

Before

Width:  |  Height:  |  Size: 6.2 KiB

View File

@@ -1,122 +0,0 @@
#include "utils.h"
#include <assert.h>
#include <string>
#include <iostream>
#include <iomanip>
#include <cocogfx/include/tga.h>
#include <cocogfx/include/png.h>
using namespace cocogfx;
std::string getFileExt(const std::string& str) {
auto i = str.rfind('.');
if (i != std::string::npos) {
return str.substr(i+1);
}
return("");
}
bool iequals(const std::string& a, const std::string& b) {
auto sz = a.size();
if (b.size() != sz)
return false;
for (size_t i = 0; i < sz; ++i) {
if (tolower(a[i]) != tolower(b[i]))
return false;
}
return true;
}
int LoadImage(const char *filename,
ePixelFormat format,
std::vector<uint8_t> &pixels,
uint32_t *width,
uint32_t *height) {
uint32_t img_width;
uint32_t img_height;
uint32_t img_bpp;
auto ext = getFileExt(filename);
if (iequals(ext, "tga")) {
int ret = LoadTGA(filename, pixels, &img_width, &img_height, &img_bpp);
if (ret)
return ret;
} else
if (iequals(ext, "png")) {
int ret = LoadPNG(filename, pixels, &img_width, &img_height, &img_bpp);
if (ret)
return ret;
} else {
std::cerr << "invalid file extension: " << ext << "!" << std::endl;
return -1;
}
ePixelFormat img_format;
switch (img_bpp) {
case 1:
img_format = FORMAT_A8;
break;
case 2:
img_format = FORMAT_A1R5G5B5;
break;
case 3:
img_format = FORMAT_R8G8B8;
break;
case 4:
img_format = FORMAT_A8R8G8B8;
break;
default:
std::abort();
}
if (img_format != format) {
// format conversion to RGBA
std::vector<uint8_t> staging;
int ret = ConvertImage(staging, format, pixels, img_format, img_width, img_height, img_width * img_bpp);
if (ret)
return ret;
pixels.swap(staging);
}
*width = img_width;
*height = img_height;
return 0;
}
int SaveImage(const char *filename,
ePixelFormat format,
const std::vector<uint8_t> &pixels,
uint32_t width,
uint32_t height) {
uint32_t bpp = GetInfo(format).BytePerPixel;
auto ext = getFileExt(filename);
if (iequals(ext, "tga")) {
return SaveTGA(filename, pixels, width, height, bpp);
} else
if (iequals(ext, "png")) {
return SavePNG(filename, pixels, width, height, bpp);
} else {
std::cerr << "invalid file extension: " << ext << "!" << std::endl;
return -1;
}
return 0;
}
void dump_image(const std::vector<uint8_t>& pixels, uint32_t width, uint32_t height, uint32_t bpp) {
assert(width * height * bpp == pixels.size());
const uint8_t* pixel_bytes = pixels.data();
for (uint32_t y = 0; y < height; ++y) {
for (uint32_t x = 0; x < width; ++x) {
uint32_t pixel32 = 0;
for (uint32_t b = 0; b < bpp; ++b) {
uint32_t pixel8 = *pixel_bytes++;
pixel32 |= pixel8 << (b * 8);
}
if (x) std::cout << ", ";
std::cout << std::hex << std::setw(bpp * 2) << std::setfill('0') << pixel32;
}
std::cout << std::endl;
}
}

View File

@@ -1,22 +0,0 @@
#include <cstdint>
#include <vector>
#include <bitmanip.h>
#include <cocogfx/include/format.h>
#include <cocogfx/include/blitter.h>
int LoadImage(const char *filename,
cocogfx::ePixelFormat format,
std::vector<uint8_t> &pixels,
uint32_t *width,
uint32_t *height);
int SaveImage(const char *filename,
cocogfx::ePixelFormat format,
const std::vector<uint8_t> &pixels,
uint32_t width,
uint32_t height);
void dump_image(const std::vector<uint8_t>& pixels,
uint32_t width,
uint32_t height,
uint32_t bpp);