diff --git a/ci/regression.sh b/ci/regression.sh index 73ce8e94..680ae78a 100755 --- a/ci/regression.sh +++ b/ci/regression.sh @@ -8,7 +8,8 @@ make -s coverage() { -# coverage tests +echo "begin coverage tests..." + make -C tests/runtime run-rtlsim make -C tests/riscv/isa run-rtlsim make -C tests/regression run-vlsim @@ -17,10 +18,14 @@ make -C tests/runtime run-simx make -C tests/riscv/isa run-simx make -C tests/regression run-simx make -C tests/opencl run-simx + +echo "coverage tests done!" } cluster() { +echo "begin clustering tests..." + # warp/threads configurations ./ci/travis_run.py ./ci/blackbox.sh --driver=rtlsim --cores=1 --warps=2 --threads=8 --app=demo ./ci/travis_run.py ./ci/blackbox.sh --driver=rtlsim --cores=1 --warps=8 --threads=2 --app=demo @@ -33,26 +38,33 @@ cluster() ./ci/travis_run.py ./ci/blackbox.sh --driver=rtlsim --cores=2 --l2cache --app=demo --args="-n1" ./ci/travis_run.py ./ci/blackbox.sh --driver=rtlsim --cores=2 --clusters=2 --l3cache --app=demo --args="-n1" ./ci/travis_run.py ./ci/blackbox.sh --driver=rtlsim --cores=2 --clusters=2 --l2cache --l3cache --app=io_addr --args="-n1" + +echo "clustering tests done!" } debug() { -# debugging +echo "begin debugging tests..." + ./ci/travis_run.py ./ci/blackbox.sh --driver=vlsim --cores=1 --perf --app=demo --args="-n1" ./ci/travis_run.py ./ci/blackbox.sh --driver=vlsim --cores=1 --debug --app=demo --args="-n1" ./ci/travis_run.py ./ci/blackbox.sh --driver=vlsim --cores=1 --scope --app=basic --args="-t0 -n1" + +echo "debugging tests done!" } config() { +echo "begin configuration tests..." 
+ # disabling M extension -CONFIGS=-DEXT_M_DISABLE make -C hw/simulate +CONFIGS=-DEXT_M_DISABLE ./ci/blackbox.sh --driver=rtlsim --cores=1 --app=no_mf_ext # disabling F extension -CONFIGS=-DEXT_F_DISABLE make -C hw/simulate +CONFIGS=-DEXT_F_DISABLE ./ci/blackbox.sh --driver=rtlsim --cores=1 --app=no_mf_ext # disable shared memory -CONFIGS=-DSM_ENABLE=0 make -C hw/simulate +CONFIGS=-DSM_ENABLE=0 ./ci/blackbox.sh --driver=rtlsim --cores=1 --app=no_smem # using Default FPU core FPU_CORE=FPU_DEFAULT ./ci/blackbox.sh --driver=rtlsim --cores=1 --app=dogfood @@ -86,12 +98,17 @@ CONFIGS="-DVERILATOR_RESET_VALUE=1" ./ci/blackbox.sh --driver=vlsim --cores=4 -- # test long memory latency CONFIGS="-DMEM_LATENCY=100 -DMEM_RQ_SIZE=4 -DMEM_STALLS_MODULO=4" ./ci/blackbox.sh --driver=vlsim --cores=1 --app=demo + +echo "configuration tests done!" } stress() { -# test pipeline stress +echo "begin stress tests..." + ./ci/travis_run.py ./ci/blackbox.sh --driver=rtlsim --cores=2 --l2cache --clusters=2 --l3cache --app=sgemm --args="-n256" + +echo "stress tests done!" 
} usage() diff --git a/hw/rtl/VX_config.vh b/hw/rtl/VX_config.vh index ea0ee9dc..a294a6ff 100644 --- a/hw/rtl/VX_config.vh +++ b/hw/rtl/VX_config.vh @@ -318,9 +318,10 @@ // SM Configurable Knobs ////////////////////////////////////////////////////// // per thread stack size -`ifndef STACK_SIZE -`define STACK_SIZE 1024 +`ifndef STACK_LOG2_SIZE +`define STACK_LOG2_SIZE 10 `endif +`define STACK_SIZE (1 << `STACK_LOG2_SIZE) // Size of cache in bytes `ifndef SMEM_SIZE diff --git a/tests/regression/Makefile b/tests/regression/Makefile index 714e3f27..890bf333 100644 --- a/tests/regression/Makefile +++ b/tests/regression/Makefile @@ -7,6 +7,8 @@ all: $(MAKE) -C printf $(MAKE) -C diverge $(MAKE) -C fence + $(MAKE) -C no_mf_ext + $(MAKE) -C no_smem run-simx: $(MAKE) -C basic run-simx @@ -17,6 +19,20 @@ run-simx: $(MAKE) -C printf run-simx $(MAKE) -C diverge run-simx $(MAKE) -C fence run-simx + $(MAKE) -C no_mf_ext run-simx + $(MAKE) -C no_smem run-simx + +run-rtlsim: + $(MAKE) -C basic run-rtlsim + $(MAKE) -C demo run-rtlsim + $(MAKE) -C dogfood run-rtlsim + $(MAKE) -C mstress run-rtlsim + $(MAKE) -C io_addr run-rtlsim + $(MAKE) -C printf run-rtlsim + $(MAKE) -C diverge run-rtlsim + $(MAKE) -C fence run-rtlsim + $(MAKE) -C no_mf_ext run-rtlsim + $(MAKE) -C no_smem run-rtlsim run-vlsim: $(MAKE) -C basic run-vlsim @@ -27,6 +43,8 @@ run-vlsim: $(MAKE) -C printf run-vlsim $(MAKE) -C diverge run-vlsim $(MAKE) -C fence run-vlsim + $(MAKE) -C no_mf_ext run-vlsim + $(MAKE) -C no_smem run-vlsim clean: $(MAKE) -C basic clean @@ -37,6 +55,8 @@ clean: $(MAKE) -C printf clean $(MAKE) -C diverge clean $(MAKE) -C fence clean + $(MAKE) -C no_mf_ext clean + $(MAKE) -C no_smem clean clean-all: $(MAKE) -C basic clean-all @@ -47,4 +67,6 @@ clean-all: $(MAKE) -C printf clean-all $(MAKE) -C diverge clean-all $(MAKE) -C fence clean-all + $(MAKE) -C no_mf_ext clean-all + $(MAKE) -C no_smem clean-all diff --git a/tests/regression/no_mf_ext/Makefile b/tests/regression/no_mf_ext/Makefile new file 
mode 100644
--- /dev/null
+++ b/tests/regression/no_mf_ext/Makefile
@@ -0,0 +1,75 @@
+RISCV_TOOLCHAIN_PATH ?= /opt/riscv-gnu-toolchain
+VORTEX_DRV_PATH ?= $(realpath ../../../driver)
+VORTEX_RT_PATH ?= $(realpath ../../../runtime)
+
+OPTS ?= -n8
+
+VX_CC  = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-gcc
+VX_CXX = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-g++
+VX_DP  = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-objdump
+VX_CP  = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-objcopy
+
+VX_CFLAGS += -march=rv32imf -mabi=ilp32f -O3 -Wstack-usage=1024 -ffreestanding -nostartfiles -fdata-sections -ffunction-sections
+VX_CFLAGS += -I$(VORTEX_RT_PATH)/include -I$(VORTEX_RT_PATH)/../hw
+
+VX_LDFLAGS += -Wl,-Bstatic,-T,$(VORTEX_RT_PATH)/linker/vx_link.ld -Wl,--gc-sections $(VORTEX_RT_PATH)/libvortexrt.a
+
+VX_SRCS = kernel.c
+
+#CXXFLAGS += -std=c++11 -O2 -Wall -Wextra -pedantic -Wfatal-errors
+CXXFLAGS += -std=c++11 -O0 -g -Wall -Wextra -pedantic -Wfatal-errors
+
+CXXFLAGS += -I$(VORTEX_DRV_PATH)/include
+
+LDFLAGS += -L$(VORTEX_DRV_PATH)/stub -lvortex
+
+PROJECT = no_mf_ext
+
+SRCS = main.cpp
+
+# Command-style goals produce no file of that name; drop half-written outputs on failure.
+.PHONY: all run-simx run-fpga run-asesim run-vlsim run-rtlsim clean clean-all
+.DELETE_ON_ERROR:
+
+all: $(PROJECT) kernel.bin kernel.dump
+
+kernel.dump: kernel.elf
+	$(VX_DP) -D $< > $@
+
+kernel.bin: kernel.elf
+	$(VX_CP) -O binary $< $@
+
+kernel.elf: $(VX_SRCS)
+	$(VX_CC) $(VX_CFLAGS) $^ $(VX_LDFLAGS) -o $@
+
+$(PROJECT): $(SRCS)
+	$(CXX) $(CXXFLAGS) $^ $(LDFLAGS) -o $@
+
+run-simx: $(PROJECT) kernel.bin
+	LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/simx:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
+
+run-fpga: $(PROJECT) kernel.bin
+	LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/opae:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
+
+run-asesim: $(PROJECT) kernel.bin
+	LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/opae/ase:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
+
+run-vlsim: $(PROJECT) kernel.bin
+	LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/opae/vlsim:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
+
+run-rtlsim: $(PROJECT) kernel.bin
+	LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/rtlsim:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
+
+.depend: $(SRCS)
+	$(CXX) $(CXXFLAGS) -MM $^ > $@
+
+clean:
+	rm -rf $(PROJECT) *.o .depend
+
+clean-all: clean
+	rm -rf *.elf *.bin *.dump
+
+# Do not (re)generate .depend when the requested goals are clean/clean-all.
+ifeq ($(filter clean clean-all,$(MAKECMDGOALS)),)
+-include .depend
+endif
diff --git a/tests/regression/no_mf_ext/common.h b/tests/regression/no_mf_ext/common.h
new file mode 100644
--- /dev/null
+++ b/tests/regression/no_mf_ext/common.h
@@ -0,0 +1,16 @@
+#ifndef _COMMON_H_
+#define _COMMON_H_
+
+#include <stdint.h>
+
+// Device address at which the host stores the kernel_arg_t read by the kernel.
+#define KERNEL_ARG_DEV_MEM_ADDR 0x7ffff000
+
+// Argument block shared between main.cpp (host) and kernel.c (device).
+struct kernel_arg_t {
+  uint32_t size;    // number of 32-bit words to copy
+  uint32_t src_ptr; // device address of the source buffer
+  uint32_t dst_ptr; // device address of the destination buffer
+};
+
+#endif
diff --git a/tests/regression/no_mf_ext/kernel.c b/tests/regression/no_mf_ext/kernel.c
new file mode 100644
--- /dev/null
+++ b/tests/regression/no_mf_ext/kernel.c
@@ -0,0 +1,15 @@
+#include <stdint.h>
+#include "common.h"
+
+// Copies arg->size 32-bit words from the source buffer to the destination buffer.
+void main() {
+  struct kernel_arg_t* arg = (struct kernel_arg_t*)KERNEL_ARG_DEV_MEM_ADDR;
+
+  uint32_t size    = arg->size;
+  int32_t* src_ptr = (int32_t*)arg->src_ptr;
+  int32_t* dst_ptr = (int32_t*)arg->dst_ptr;
+
+  for (uint32_t i = 0; i < size; ++i) {
+    dst_ptr[i] = src_ptr[i];
+  }
+}
diff --git a/tests/regression/no_mf_ext/main.cpp b/tests/regression/no_mf_ext/main.cpp
new file mode 100644
--- /dev/null
+++ b/tests/regression/no_mf_ext/main.cpp
@@ -0,0 +1,190 @@
+#include <algorithm>
+#include <iostream>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+// NOTE(review): driver API header providing the vx_* calls; name assumed -- confirm.
+#include <vortex.h>
+#include "common.h"
+
+// Evaluates a runtime call; on a non-zero result prints the failing expression,
+// releases the device resources via cleanup() and exits with -1.
+#define RT_CHECK(_expr)                                         \
+   do {                                                         \
+     int _ret = (_expr);                                        \
+     if (0 == _ret)                                             \
+       break;                                                   \
+     printf("Error: '%s' returned %d!\n", #_expr, (int)_ret);   \
+     cleanup();                                                 \
+     exit(-1);                                                  \
+   } while (false)
+
+///////////////////////////////////////////////////////////////////////////////
+
+const char* kernel_file = "kernel.bin";
+uint32_t count = 0;
+
+vx_device_h device = nullptr;
+vx_buffer_h staging_buf = nullptr;
+
+static void show_usage() {
+  std::cout << "Vortex Test." << std::endl;
+  std::cout << "Usage: [-k: kernel] [-n words] [-h: help]" << std::endl;
+}
+
+static void parse_args(int argc, char **argv) {
+  int c;
+  while ((c = getopt(argc, argv, "n:k:h?")) != -1) {
+    switch (c) {
+    case 'n':
+      count = atoi(optarg);
+      break;
+    case 'k':
+      kernel_file = optarg;
+      break;
+    case 'h':
+    case '?': {
+      show_usage();
+      exit(0);
+    } break;
+    default:
+      show_usage();
+      exit(-1);
+    }
+  }
+}
+
+void cleanup() {
+  if (staging_buf) {
+    vx_buf_release(staging_buf);
+  }
+  if (device) {
+    vx_dev_close(device);
+  }
+}
+
+int run_test(const kernel_arg_t& kernel_arg,
+             uint32_t buf_size,
+             uint32_t num_points) {
+  // start device
+  std::cout << "start device" << std::endl;
+  RT_CHECK(vx_start(device));
+
+  // wait for completion
+  std::cout << "wait for completion" << std::endl;
+  RT_CHECK(vx_ready_wait(device, -1));
+
+  // download destination buffer
+  std::cout << "download destination buffer" << std::endl;
+  RT_CHECK(vx_copy_from_dev(staging_buf, kernel_arg.dst_ptr, buf_size, 0));
+
+  // verify result: the kernel must have copied the source pattern (i-1) unchanged
+  std::cout << "verify result" << std::endl;
+  {
+    int errors = 0;
+    auto buf_ptr = (int32_t*)vx_host_ptr(staging_buf);
+    for (uint32_t i = 0; i < num_points; ++i) {
+      int ref = i-1;
+      int cur = buf_ptr[i];
+      if (cur != ref) {
+        std::cout << "error at result #" << std::dec << i
+                  << std::hex << ": actual 0x" << cur << ", expected 0x" << ref << std::endl;
+        ++errors;
+      }
+    }
+    if (errors != 0) {
+      std::cout << "Found " << std::dec << errors << " errors!" << std::endl;
+      std::cout << "FAILED!" << std::endl;
+      return 1;
+    }
+  }
+
+  return 0;
+}
+
+int main(int argc, char *argv[]) {
+  size_t value;
+  kernel_arg_t kernel_arg;
+
+  // parse command arguments
+  parse_args(argc, argv);
+
+  if (count == 0) {
+    count = 1;
+  }
+
+  // open device connection
+  std::cout << "open device connection" << std::endl;
+  RT_CHECK(vx_dev_open(&device));
+
+  uint32_t num_points = count;
+  uint32_t buf_size = num_points * sizeof(int32_t);
+
+  std::cout << "number of points: " << num_points << std::endl;
+  std::cout << "buffer size: " << buf_size << " bytes" << std::endl;
+
+  // upload program
+  std::cout << "upload program" << std::endl;
+  RT_CHECK(vx_upload_kernel_file(device, kernel_file));
+
+  // allocate device memory
+  std::cout << "allocate device memory" << std::endl;
+
+  RT_CHECK(vx_alloc_dev_mem(device, buf_size, &value));
+  kernel_arg.src_ptr = value;
+
+  RT_CHECK(vx_alloc_dev_mem(device, buf_size, &value));
+  kernel_arg.dst_ptr = value;
+
+  kernel_arg.size = num_points;
+
+  std::cout << "dev_src=" << std::hex << kernel_arg.src_ptr << std::endl;
+  std::cout << "dev_dst=" << std::hex << kernel_arg.dst_ptr << std::endl;
+
+  // allocate shared memory (staging buffer must hold the data or the argument block)
+  std::cout << "allocate shared memory" << std::endl;
+  uint32_t alloc_size = std::max<uint32_t>(buf_size, sizeof(kernel_arg_t));
+  RT_CHECK(vx_alloc_shared_mem(device, alloc_size, &staging_buf));
+
+  // upload kernel argument
+  std::cout << "upload kernel argument" << std::endl;
+  {
+    auto buf_ptr = (int*)vx_host_ptr(staging_buf);
+    memcpy(buf_ptr, &kernel_arg, sizeof(kernel_arg_t));
+    RT_CHECK(vx_copy_to_dev(staging_buf, KERNEL_ARG_DEV_MEM_ADDR, sizeof(kernel_arg_t), 0));
+  }
+
+  // upload source buffer
+  {
+    auto buf_ptr = (int32_t*)vx_host_ptr(staging_buf);
+    for (uint32_t i = 0; i < num_points; ++i) {
+      buf_ptr[i] = i-1;
+    }
+  }
+  std::cout << "upload source buffer" << std::endl;
+  RT_CHECK(vx_copy_to_dev(staging_buf, kernel_arg.src_ptr, buf_size, 0));
+
+  // clear destination buffer
+  {
+    auto buf_ptr = (int32_t*)vx_host_ptr(staging_buf);
+    for (uint32_t i = 0; i < num_points; ++i) {
+      buf_ptr[i] = 0xdeadbeef;
+    }
+  }
+  std::cout << "clear destination buffer" << std::endl;
+  RT_CHECK(vx_copy_to_dev(staging_buf, kernel_arg.dst_ptr, buf_size, 0));
+
+  // run tests
+  std::cout << "run tests" << std::endl;
+  RT_CHECK(run_test(kernel_arg, buf_size, num_points));
+
+  // cleanup
+  std::cout << "cleanup" << std::endl;
+  cleanup();
+
+  std::cout << "PASSED!" << std::endl;
+
+  return 0;
+}
diff --git a/tests/regression/no_smem/Makefile b/tests/regression/no_smem/Makefile
new file mode 100644
--- /dev/null
+++ b/tests/regression/no_smem/Makefile
@@ -0,0 +1,77 @@
+RISCV_TOOLCHAIN_PATH ?= /opt/riscv-gnu-toolchain
+VORTEX_DRV_PATH ?= $(realpath ../../../driver)
+VORTEX_RT_PATH ?= $(realpath ../../../runtime)
+
+OPTS ?= -n8
+
+VX_CC  = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-gcc
+VX_CXX = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-g++
+VX_DP  = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-objdump
+VX_CP  = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-objcopy
+
+VX_CFLAGS += -march=rv32imf -mabi=ilp32f -O3 -Wstack-usage=1024 -ffreestanding -nostartfiles -fdata-sections -ffunction-sections
+VX_CFLAGS += -DSM_ENABLE=0 -I$(VORTEX_RT_PATH)/include -I$(VORTEX_RT_PATH)/../hw
+
+VX_LDFLAGS += -Wl,-Bstatic,-T,$(VORTEX_RT_PATH)/linker/vx_link.ld -Wl,--gc-sections,--defsym=__stack_top=0xfefff000
+
+VX_RUNTIME = $(VORTEX_RT_PATH)/src/vx_start.S $(VORTEX_RT_PATH)/src/vx_perf.c
+
+VX_SRCS = kernel.c $(VX_RUNTIME)
+
+#CXXFLAGS += -std=c++11 -O2 -Wall -Wextra -pedantic -Wfatal-errors
+CXXFLAGS += -std=c++11 -O0 -g -Wall -Wextra -pedantic -Wfatal-errors
+
+CXXFLAGS += -I$(VORTEX_DRV_PATH)/include
+
+LDFLAGS += -L$(VORTEX_DRV_PATH)/stub -lvortex
+
+PROJECT = no_smem
+
+SRCS = main.cpp
+
+# Command-style goals produce no file of that name; drop half-written outputs on failure.
+.PHONY: all run-simx run-fpga run-asesim run-vlsim run-rtlsim clean clean-all
+.DELETE_ON_ERROR:
+
+all: $(PROJECT) kernel.bin kernel.dump
+
+kernel.dump: kernel.elf
+	$(VX_DP) -D $< > $@
+
+kernel.bin: kernel.elf
+	$(VX_CP) -O binary $< $@
+
+kernel.elf: $(VX_SRCS)
+	$(VX_CC) $(VX_CFLAGS) $^ $(VX_LDFLAGS) -o $@
+
+$(PROJECT): $(SRCS)
+	$(CXX) $(CXXFLAGS) $^ $(LDFLAGS) -o $@
+
+run-simx: $(PROJECT) kernel.bin
+	LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/simx:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
+
+run-fpga: $(PROJECT) kernel.bin
+	LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/opae:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
+
+run-asesim: $(PROJECT) kernel.bin
+	LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/opae/ase:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
+
+run-vlsim: $(PROJECT) kernel.bin
+	LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/opae/vlsim:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
+
+run-rtlsim: $(PROJECT) kernel.bin
+	LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/rtlsim:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
+
+.depend: $(SRCS)
+	$(CXX) $(CXXFLAGS) -MM $^ > $@
+
+clean:
+	rm -rf $(PROJECT) *.o .depend
+
+clean-all: clean
+	rm -rf *.elf *.bin *.dump
+
+# Do not (re)generate .depend when the requested goals are clean/clean-all.
+ifeq ($(filter clean clean-all,$(MAKECMDGOALS)),)
+-include .depend
+endif
diff --git a/tests/regression/no_smem/common.h b/tests/regression/no_smem/common.h
new file mode 100644
--- /dev/null
+++ b/tests/regression/no_smem/common.h
@@ -0,0 +1,16 @@
+#ifndef _COMMON_H_
+#define _COMMON_H_
+
+#include <stdint.h>
+
+// Device address at which the host stores the kernel_arg_t read by the kernel.
+#define KERNEL_ARG_DEV_MEM_ADDR 0x7ffff000
+
+// Argument block shared between main.cpp (host) and kernel.c (device).
+struct kernel_arg_t {
+  uint32_t size;    // number of 32-bit words to copy
+  uint32_t src_ptr; // device address of the source buffer
+  uint32_t dst_ptr; // device address of the destination buffer
+};
+
+#endif
diff --git a/tests/regression/no_smem/kernel.c b/tests/regression/no_smem/kernel.c
new file mode 100644
--- /dev/null
+++ b/tests/regression/no_smem/kernel.c
@@ -0,0 +1,15 @@
+#include <stdint.h>
+#include "common.h"
+
+// Copies arg->size 32-bit words from the source buffer to the destination buffer.
+void main() {
+  struct kernel_arg_t* arg = (struct kernel_arg_t*)KERNEL_ARG_DEV_MEM_ADDR;
+
+  uint32_t size    = arg->size;
+  int32_t* src_ptr = (int32_t*)arg->src_ptr;
+  int32_t* dst_ptr = (int32_t*)arg->dst_ptr;
+
+  for (uint32_t i = 0; i < size; ++i) {
+    dst_ptr[i] = src_ptr[i];
+  }
+}
diff --git a/tests/regression/no_smem/main.cpp b/tests/regression/no_smem/main.cpp
new file mode 100644
--- /dev/null
+++ b/tests/regression/no_smem/main.cpp
@@ -0,0 +1,190 @@
+#include <algorithm>
+#include <iostream>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+// NOTE(review): driver API header providing the vx_* calls; name assumed -- confirm.
+#include <vortex.h>
+#include "common.h"
+
+// Evaluates a runtime call; on a non-zero result prints the failing expression,
+// releases the device resources via cleanup() and exits with -1.
+#define RT_CHECK(_expr)                                         \
+   do {                                                         \
+     int _ret = (_expr);                                        \
+     if (0 == _ret)                                             \
+       break;                                                   \
+     printf("Error: '%s' returned %d!\n", #_expr, (int)_ret);   \
+     cleanup();                                                 \
+     exit(-1);                                                  \
+   } while (false)
+
+///////////////////////////////////////////////////////////////////////////////
+
+const char* kernel_file = "kernel.bin";
+uint32_t count = 0;
+
+vx_device_h device = nullptr;
+vx_buffer_h staging_buf = nullptr;
+
+static void show_usage() {
+  std::cout << "Vortex Test." << std::endl;
+  std::cout << "Usage: [-k: kernel] [-n words] [-h: help]" << std::endl;
+}
+
+static void parse_args(int argc, char **argv) {
+  int c;
+  while ((c = getopt(argc, argv, "n:k:h?")) != -1) {
+    switch (c) {
+    case 'n':
+      count = atoi(optarg);
+      break;
+    case 'k':
+      kernel_file = optarg;
+      break;
+    case 'h':
+    case '?': {
+      show_usage();
+      exit(0);
+    } break;
+    default:
+      show_usage();
+      exit(-1);
+    }
+  }
+}
+
+void cleanup() {
+  if (staging_buf) {
+    vx_buf_release(staging_buf);
+  }
+  if (device) {
+    vx_dev_close(device);
+  }
+}
+
+int run_test(const kernel_arg_t& kernel_arg,
+             uint32_t buf_size,
+             uint32_t num_points) {
+  // start device
+  std::cout << "start device" << std::endl;
+  RT_CHECK(vx_start(device));
+
+  // wait for completion
+  std::cout << "wait for completion" << std::endl;
+  RT_CHECK(vx_ready_wait(device, -1));
+
+  // download destination buffer
+  std::cout << "download destination buffer" << std::endl;
+  RT_CHECK(vx_copy_from_dev(staging_buf, kernel_arg.dst_ptr, buf_size, 0));
+
+  // verify result: the kernel must have copied the source pattern (i-1) unchanged
+  std::cout << "verify result" << std::endl;
+  {
+    int errors = 0;
+    auto buf_ptr = (int32_t*)vx_host_ptr(staging_buf);
+    for (uint32_t i = 0; i < num_points; ++i) {
+      int ref = i-1;
+      int cur = buf_ptr[i];
+      if (cur != ref) {
+        std::cout << "error at result #" << std::dec << i
+                  << std::hex << ": actual 0x" << cur << ", expected 0x" << ref << std::endl;
+        ++errors;
+      }
+    }
+    if (errors != 0) {
+      std::cout << "Found " << std::dec << errors << " errors!" << std::endl;
+      std::cout << "FAILED!" << std::endl;
+      return 1;
+    }
+  }
+
+  return 0;
+}
+
+int main(int argc, char *argv[]) {
+  size_t value;
+  kernel_arg_t kernel_arg;
+
+  // parse command arguments
+  parse_args(argc, argv);
+
+  if (count == 0) {
+    count = 1;
+  }
+
+  // open device connection
+  std::cout << "open device connection" << std::endl;
+  RT_CHECK(vx_dev_open(&device));
+
+  uint32_t num_points = count;
+  uint32_t buf_size = num_points * sizeof(int32_t);
+
+  std::cout << "number of points: " << num_points << std::endl;
+  std::cout << "buffer size: " << buf_size << " bytes" << std::endl;
+
+  // upload program
+  std::cout << "upload program" << std::endl;
+  RT_CHECK(vx_upload_kernel_file(device, kernel_file));
+
+  // allocate device memory
+  std::cout << "allocate device memory" << std::endl;
+
+  RT_CHECK(vx_alloc_dev_mem(device, buf_size, &value));
+  kernel_arg.src_ptr = value;
+
+  RT_CHECK(vx_alloc_dev_mem(device, buf_size, &value));
+  kernel_arg.dst_ptr = value;
+
+  kernel_arg.size = num_points;
+
+  std::cout << "dev_src=" << std::hex << kernel_arg.src_ptr << std::endl;
+  std::cout << "dev_dst=" << std::hex << kernel_arg.dst_ptr << std::endl;
+
+  // allocate shared memory (staging buffer must hold the data or the argument block)
+  std::cout << "allocate shared memory" << std::endl;
+  uint32_t alloc_size = std::max<uint32_t>(buf_size, sizeof(kernel_arg_t));
+  RT_CHECK(vx_alloc_shared_mem(device, alloc_size, &staging_buf));
+
+  // upload kernel argument
+  std::cout << "upload kernel argument" << std::endl;
+  {
+    auto buf_ptr = (int*)vx_host_ptr(staging_buf);
+    memcpy(buf_ptr, &kernel_arg, sizeof(kernel_arg_t));
+    RT_CHECK(vx_copy_to_dev(staging_buf, KERNEL_ARG_DEV_MEM_ADDR, sizeof(kernel_arg_t), 0));
+  }
+
+  // upload source buffer
+  {
+    auto buf_ptr = (int32_t*)vx_host_ptr(staging_buf);
+    for (uint32_t i = 0; i < num_points; ++i) {
+      buf_ptr[i] = i-1;
+    }
+  }
+  std::cout << "upload source buffer" << std::endl;
+  RT_CHECK(vx_copy_to_dev(staging_buf, kernel_arg.src_ptr, buf_size, 0));
+
+  // clear destination buffer
+  {
+    auto buf_ptr = (int32_t*)vx_host_ptr(staging_buf);
+    for (uint32_t i = 0; i < num_points; ++i) {
+      buf_ptr[i] = 0xdeadbeef;
+    }
+  }
+  std::cout << "clear destination buffer" << std::endl;
+  RT_CHECK(vx_copy_to_dev(staging_buf, kernel_arg.dst_ptr, buf_size, 0));
+
+  // run tests
+  std::cout << "run tests" << std::endl;
+  RT_CHECK(run_test(kernel_arg, buf_size, num_points));
+
+  // cleanup
+  std::cout << "cleanup" << std::endl;
+  cleanup();
+
+  std::cout << "PASSED!" << std::endl;
+
+  return 0;
+}