regression restructure

This commit is contained in:
Richard Yan
2025-01-29 18:28:09 -08:00
parent 3de51577ef
commit 5ba132e87b
101 changed files with 1052 additions and 7795 deletions

View File

@@ -1,89 +1,17 @@
all:
$(MAKE) -C basic
$(MAKE) -C demo
$(MAKE) -C dogfood
$(MAKE) -C mstress
$(MAKE) -C io_addr
$(MAKE) -C printf
$(MAKE) -C diverge
$(MAKE) -C sort
$(MAKE) -C fence
$(MAKE) -C no_mf_ext
$(MAKE) -C no_smem
$(MAKE) -C vecaddx
$(MAKE) -C sgemmx
# Find all subdirectories containing a Makefile
SUBDIRS := $(shell find . -mindepth 1 -maxdepth 1 -type d -exec test -e {}/Makefile \; -print)
run-simx:
$(MAKE) -C basic run-simx
$(MAKE) -C demo run-simx
$(MAKE) -C dogfood run-simx
$(MAKE) -C mstress run-simx
$(MAKE) -C io_addr run-simx
$(MAKE) -C printf run-simx
$(MAKE) -C diverge run-simx
$(MAKE) -C sort run-simx
$(MAKE) -C fence run-simx
$(MAKE) -C no_mf_ext run-simx
$(MAKE) -C no_smem run-simx
$(MAKE) -C vecaddx run-simx
$(MAKE) -C sgemmx run-simx
.PHONY: all $(SUBDIRS) clean clean-all
run-rtlsim:
$(MAKE) -C basic run-rtlsim
$(MAKE) -C demo run-rtlsim
$(MAKE) -C dogfood run-rtlsim
$(MAKE) -C mstress run-rtlsim
$(MAKE) -C io_addr run-rtlsim
$(MAKE) -C printf run-rtlsim
$(MAKE) -C diverge run-rtlsim
$(MAKE) -C sort run-rtlsim
$(MAKE) -C fence run-rtlsim
$(MAKE) -C no_mf_ext run-rtlsim
$(MAKE) -C no_smem run-rtlsim
$(MAKE) -C vecaddx run-rtlsim
$(MAKE) -C sgemmx run-rtlsim
# Default target: run make in all subdirectories
all: $(SUBDIRS)
run-opae:
$(MAKE) -C basic run-opae
$(MAKE) -C demo run-opae
$(MAKE) -C dogfood run-opae
$(MAKE) -C mstress run-opae
$(MAKE) -C io_addr run-opae
$(MAKE) -C printf run-opae
$(MAKE) -C diverge run-opae
$(MAKE) -C sort run-opae
$(MAKE) -C fence run-opae
$(MAKE) -C no_mf_ext run-opae
$(MAKE) -C no_smem run-opae
$(MAKE) -C vecaddx run-opae
$(MAKE) -C sgemmx run-opae
$(SUBDIRS):
$(MAKE) -C $@
# Clean target: run make clean in all subdirectories
clean:
$(MAKE) -C basic clean
$(MAKE) -C demo clean
$(MAKE) -C dogfood clean
$(MAKE) -C mstress clean
$(MAKE) -C io_addr clean
$(MAKE) -C printf clean
$(MAKE) -C diverge clean
$(MAKE) -C sort clean
$(MAKE) -C fence clean
$(MAKE) -C no_mf_ext clean
$(MAKE) -C no_smem clean
$(MAKE) -C vecaddx clean
$(MAKE) -C sgemmx clean
for dir in $(SUBDIRS); do $(MAKE) -C $$dir clean; done
clean-all:
$(MAKE) -C basic clean-all
$(MAKE) -C demo clean-all
$(MAKE) -C dogfood clean-all
$(MAKE) -C mstress clean-all
$(MAKE) -C io_addr clean-all
$(MAKE) -C printf clean-all
$(MAKE) -C diverge clean-all
$(MAKE) -C sort clean-all
$(MAKE) -C fence clean-all
$(MAKE) -C no_mf_ext clean-all
$(MAKE) -C no_smem clean-all
$(MAKE) -C vecaddx clean-all
$(MAKE) -C sgemmx clean-all
for dir in $(SUBDIRS); do $(MAKE) -C $$dir clean-all; done

View File

@@ -1,7 +1,5 @@
PROJECT = bad_apple
SRCS = main.cpp common.h
VX_SRCS = kernel.cpp
OPTS ?= -n16

Binary file not shown.

View File

@@ -1,274 +0,0 @@
#include <iostream>
#include <fstream>
#include <unistd.h>
#include <string.h>
#include <vortex.h>
#include <vector>
#include "common.h"
#define RT_CHECK(_expr) \
do { \
int _ret = _expr; \
if (0 == _ret) \
break; \
printf("Error: '%s' returned %d!\n", #_expr, (int)_ret); \
cleanup(); \
exit(-1); \
} while (false)
///////////////////////////////////////////////////////////////////////////////
const char* kernel_file = "kernel.bin";
uint32_t count = 0;
std::vector<float> src_a_data;
std::vector<float> src_b_data;
std::vector<float> ref_data;
vx_device_h device = nullptr;
std::vector<uint8_t> staging_buf;
kernel_arg_t kernel_arg = {};
static void show_usage() {
std::cout << "Vortex Test." << std::endl;
std::cout << "Usage: [-k: kernel] [-n words] [-h: help]" << std::endl;
}
static void parse_args(int argc, char **argv) {
int c;
while ((c = getopt(argc, argv, "n:k:h?")) != -1) {
switch (c) {
case 'n':
count = atoi(optarg);
break;
case 'k':
kernel_file = optarg;
break;
case 'h':
case '?': {
show_usage();
exit(0);
} break;
default:
show_usage();
exit(-1);
}
}
}
void cleanup() {
if (device) {
vx_mem_free(device, kernel_arg.addr_a);
vx_mem_free(device, kernel_arg.addr_b);
vx_mem_free(device, kernel_arg.addr_c);
vx_dev_close(device);
}
}
void generate_source_matrix(uint32_t dim_m, uint32_t dim_n, uint32_t dim_k) {
src_a_data.resize(dim_m * dim_k);
src_b_data.resize(dim_k * dim_n);
for (uint32_t i = 0; i < src_a_data.size(); ++i) {
src_a_data[i] = static_cast<float>(i);
std::cout << "A: " << i << ": value=" << src_a_data[i] << std::endl;
}
for (uint32_t i = 0; i < src_b_data.size(); ++i) {
src_b_data[i] = static_cast<float>(i);
std::cout << "B: " << i << ": value=" << src_b_data[i] << std::endl;
}
}
void generate_reference_matmul(uint32_t dim_m, uint32_t dim_n, uint32_t dim_k) {
ref_data.resize(dim_m * dim_n);
for (uint32_t i = 0; i < dim_m; ++i) {
for (uint32_t j = 0; j < dim_n; ++j) {
float ref = 0.0f;
for (uint32_t k = 0; k < dim_k; ++k) {
ref += src_a_data[dim_k * i + k] * src_b_data[dim_n * k + j];
}
ref_data.at(dim_n * i + j) = ref;
}
}
}
int run_test(const kernel_arg_t& kernel_arg,
uint32_t buf_size,
uint32_t dim_m, uint32_t dim_n) {
// start device
std::cout << "start device" << std::endl;
RT_CHECK(vx_start(device));
// wait for completion
std::cout << "wait for completion" << std::endl;
RT_CHECK(vx_ready_wait(device, VX_MAX_TIMEOUT));
// download destination buffer
std::cout << "download destination buffer" << std::endl;
RT_CHECK(vx_copy_from_dev(device, staging_buf.data(), kernel_arg.addr_c, buf_size));
// verify result
std::cout << "verify result" << std::endl;
{
int errors = 0;
auto buf_ptr = (float*)staging_buf.data();
for (uint32_t i = 0; i < dim_m * dim_n; ++i) {
float ref = ref_data.at(i);
float cur = buf_ptr[i];
if (std::abs((cur - ref) / ref) > 1e-6) {
std::cout << "error at result #" << std::dec << i
<< std::hex << ": actual=" << cur << ", expected=" << ref << std::endl;
++errors;
}
}
if (errors != 0) {
std::cout << "Found " << std::dec << errors << " errors!" << std::endl;
std::cout << "FAILED!" << std::endl;
return 1;
}
}
return 0;
}
int main(int argc, char *argv[]) {
// parse command arguments
parse_args(argc, argv);
if (count == 0) {
count = 1;
}
std::srand(50);
// open device connection
std::cout << "open device connection" << std::endl;
RT_CHECK(vx_dev_open(&device));
// FIXME: hardcoded
uint32_t dim_m = 64;
uint32_t dim_n = 64;
uint32_t dim_k = 64;
generate_source_matrix(dim_m, dim_n, dim_k);
generate_reference_matmul(dim_m, dim_n, dim_k);
uint32_t src_a_buf_size = src_a_data.size() * sizeof(src_a_data[0]);
uint32_t src_b_buf_size = src_b_data.size() * sizeof(src_b_data[0]);
uint32_t dst_buf_size = ref_data.size() * sizeof(src_a_data[0]);
std::cout << "buffer size: " << dst_buf_size << " bytes" << std::endl;
// upload program
std::cout << "upload program" << std::endl;
RT_CHECK(vx_upload_kernel_file(device, kernel_file));
// allocate device memory
std::cout << "allocate device memory" << std::endl;
RT_CHECK(vx_mem_alloc(device, src_a_buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.addr_a));
RT_CHECK(vx_mem_alloc(device, src_b_buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.addr_b));
RT_CHECK(vx_mem_alloc(device, dst_buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.addr_c));
kernel_arg.dim_m = dim_m;
kernel_arg.dim_n = dim_n;
kernel_arg.dim_k = dim_k;
std::cout << "dev_addr_a=0x" << std::hex << kernel_arg.addr_a << std::endl;
std::cout << "dev_addr_b=0x" << std::hex << kernel_arg.addr_b << std::endl;
std::cout << "dev_addr_c=0x" << std::hex << kernel_arg.addr_c << std::endl;
// allocate staging buffer
{
std::cout << "allocate staging buffer" << std::endl;
uint32_t staging_buf_size = std::max<uint32_t>(
src_a_buf_size,
std::max<uint32_t>(
src_b_buf_size,
std::max<uint32_t>(dst_buf_size, sizeof(kernel_arg_t))));
staging_buf.resize(staging_buf_size);
}
// upload kernel argument
{
std::cout << "upload kernel argument" << std::endl;
auto buf_ptr = staging_buf.data();
kernel_arg.addr_a = (uint64_t) 0x20000;
kernel_arg.addr_b = (uint64_t) 0x28000;
kernel_arg.addr_c = (uint64_t) 0xc0000000ULL;
memcpy(buf_ptr, &kernel_arg, sizeof(kernel_arg_t));
std::cout << "uploading argument buffer to device, device mem address="
<< std::hex << KERNEL_ARG_DEV_MEM_ADDR << ", size=" << std::dec
<< sizeof(kernel_arg_t) << " bytes\n";
std::ofstream file("args.bin", std::ios::binary | std::ios::out);
if (!file) {
std::cerr << "error: failed to open args.bin for writing\n";
exit(EXIT_FAILURE);
}
file.write(reinterpret_cast<char *>(staging_buf.data()),
sizeof(kernel_arg_t));
file.close();
RT_CHECK(vx_copy_to_dev(device, KERNEL_ARG_DEV_MEM_ADDR, staging_buf.data(), sizeof(kernel_arg_t)));
}
// upload source buffer
{
{
auto buf_ptr = staging_buf.data();
memcpy(buf_ptr, src_a_data.data(), src_a_data.size() * sizeof(float));
RT_CHECK(vx_copy_to_dev(device, kernel_arg.addr_a, staging_buf.data(),
src_a_buf_size));
std::cout << "uploading source A matrix to device, device mem address="
<< std::hex << kernel_arg.addr_a << ", size=" << std::dec
<< src_a_buf_size << " bytes\n";
std::ofstream file("input.a.bin", std::ios::binary | std::ios::out);
if (!file) {
std::cerr << "error: failed to open args.bin for writing\n";
exit(EXIT_FAILURE);
}
file.write(reinterpret_cast<char *>(buf_ptr), src_a_buf_size);
file.close();
}
{
auto buf_ptr = staging_buf.data();
memcpy(buf_ptr, src_b_data.data(), src_b_data.size() * sizeof(float));
RT_CHECK(vx_copy_to_dev(device, kernel_arg.addr_b, staging_buf.data(),
src_b_buf_size));
std::cout << "uploading source B matrix to device, device mem address="
<< std::hex << kernel_arg.addr_b << ", size=" << std::dec
<< src_b_buf_size << " bytes\n";
std::ofstream file("input.b.bin", std::ios::binary | std::ios::out);
if (!file) {
std::cerr << "error: failed to open args.bin for writing\n";
exit(EXIT_FAILURE);
}
file.write(reinterpret_cast<char *>(buf_ptr), src_b_buf_size);
file.close();
}
}
// clear destination buffer
{
std::cout << "clear destination buffer" << std::endl;
auto buf_ptr = (int32_t*)staging_buf.data();
for (uint32_t i = 0; i < ref_data.size(); ++i) {
buf_ptr[i] = 0xdeadbeef;
}
RT_CHECK(vx_copy_to_dev(device, kernel_arg.addr_c, staging_buf.data(), dst_buf_size));
}
// run tests
std::cout << "run tests" << std::endl;
RT_CHECK(run_test(kernel_arg, dst_buf_size, kernel_arg.dim_m, kernel_arg.dim_n));
std::cout << "PASSED!" << std::endl;
// cleanup
std::cout << "cleanup" << std::endl;
cleanup();
return 0;
}

View File

@@ -1,16 +0,0 @@
PROJECT = basic
SRCS = main.cpp
VX_SRCS = kernel.cpp ../../../kernel/src/vx_perf.c start.S
OPTS ?= -n256
include ../common.mk
VX_LDFLAGS = -Wl,-Bstatic,--gc-sections,-T,$(VORTEX_KN_PATH)/linker/vx_link$(XLEN).ld,--defsym=STARTUP_ADDR=$(STARTUP_ADDR)
VX_CC = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)-gcc
VX_CXX = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)-g++
VX_DP = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)-objdump
VX_CP = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)-objcopy

View File

@@ -1,12 +0,0 @@
#ifndef _COMMON_H_
#define _COMMON_H_
#define KERNEL_ARG_DEV_MEM_ADDR 0x7ffff000
typedef struct {
uint32_t count;
uint64_t src_addr;
uint64_t dst_addr;
} kernel_arg_t;
#endif

View File

@@ -1,18 +0,0 @@
#include <stdint.h>
#include <vx_intrinsics.h>
#include "common.h"
int main() {
kernel_arg_t* __UNIFORM__ arg = (kernel_arg_t*)KERNEL_ARG_DEV_MEM_ADDR;
uint32_t count = arg->count;
int32_t* src_ptr = (int32_t*)arg->src_addr;
int32_t* dst_ptr = (int32_t*)arg->dst_addr;
uint32_t offset = vx_core_id() * count;
for (uint32_t i = 0; i < count; ++i) {
dst_ptr[offset + i] = src_ptr[offset + i];
}
return 0;
}

View File

@@ -1,279 +0,0 @@
#include <iostream>
#include <unistd.h>
#include <string.h>
#include <vortex.h>
#include <chrono>
#include <vector>
#include "common.h"
#define RT_CHECK(_expr) \
do { \
int _ret = _expr; \
if (0 == _ret) \
break; \
printf("Error: '%s' returned %d!\n", #_expr, (int)_ret); \
cleanup(); \
exit(-1); \
} while (false)
///////////////////////////////////////////////////////////////////////////////
const char* kernel_file = "kernel.bin";
int test = -1;
uint32_t count = 0;
vx_device_h device = nullptr;
std::vector<uint8_t> staging_buf;
kernel_arg_t kernel_arg = {};
static void show_usage() {
std::cout << "Vortex Test." << std::endl;
std::cout << "Usage: [-t testno][-k: kernel][-n words][-h: help]" << std::endl;
}
static void parse_args(int argc, char **argv) {
int c;
while ((c = getopt(argc, argv, "n:t:k:h?")) != -1) {
switch (c) {
case 'n':
count = atoi(optarg);
break;
case 't':
test = atoi(optarg);
break;
case 'k':
kernel_file = optarg;
break;
case 'h':
case '?': {
show_usage();
exit(0);
} break;
default:
show_usage();
exit(-1);
}
}
}
void cleanup() {
if (device) {
vx_mem_free(device, kernel_arg.src_addr);
vx_mem_free(device, kernel_arg.dst_addr);
vx_dev_close(device);
}
}
uint64_t shuffle(int i, uint64_t value) {
return (value << i) | (value & ((1 << i)-1));;
}
int run_memcopy_test(uint32_t dev_addr, uint64_t value, int num_blocks) {
int errors = 0;
auto time_start = std::chrono::high_resolution_clock::now();
int num_blocks_8 = (64 * num_blocks) / 8;
// update source buffer
for (int i = 0; i < num_blocks_8; ++i) {
((uint64_t*)staging_buf.data())[i] = shuffle(i, value);
}
/*for (int i = 0; i < num_blocks; ++i) {
std::cout << "data[" << i << "]=0x";
for (int j = 7; j >= 0; --j) {
std::cout << std::hex << ((uint64_t*)staging_buf.data())[i * 8 +j];
}
std::cout << std::endl;
}*/
// write source buffer to local memory
std::cout << "write source buffer to local memory" << std::endl;
auto t0 = std::chrono::high_resolution_clock::now();
RT_CHECK(vx_copy_to_dev(device, dev_addr, staging_buf.data(), 64 * num_blocks));
auto t1 = std::chrono::high_resolution_clock::now();
// clear destination buffer
for (int i = 0; i < num_blocks_8; ++i) {
((uint64_t*)staging_buf.data())[i] = 0;
}
// read destination buffer from local memory
std::cout << "read destination buffer from local memory" << std::endl;
auto t2 = std::chrono::high_resolution_clock::now();
RT_CHECK(vx_copy_from_dev(device, staging_buf.data(), dev_addr, 64 * num_blocks));
auto t3 = std::chrono::high_resolution_clock::now();
// verify result
std::cout << "verify result" << std::endl;
for (int i = 0; i < num_blocks_8; ++i) {
auto curr = ((uint64_t*)staging_buf.data())[i];
auto ref = shuffle(i, value);
if (curr != ref) {
std::cout << "error at 0x" << std::hex << (dev_addr + 8 * i)
<< ": actual 0x" << curr << ", expected 0x" << ref << std::endl;
++errors;
}
}
if (errors != 0) {
std::cout << "Found " << std::dec << errors << " errors!" << std::endl;
std::cout << "FAILED!" << std::endl;
return 1;
}
auto time_end = std::chrono::high_resolution_clock::now();
double elapsed;
elapsed = std::chrono::duration_cast<std::chrono::milliseconds>(t1 - t0).count();
printf("upload time: %lg ms\n", elapsed);
elapsed = std::chrono::duration_cast<std::chrono::milliseconds>(t3 - t2).count();
printf("download time: %lg ms\n", elapsed);
elapsed = std::chrono::duration_cast<std::chrono::milliseconds>(time_end - time_start).count();
printf("Total elapsed time: %lg ms\n", elapsed);
return 0;
}
int run_kernel_test(const kernel_arg_t& kernel_arg,
uint32_t buf_size,
uint32_t num_points) {
int errors = 0;
auto time_start = std::chrono::high_resolution_clock::now();
// update source buffer
{
std::cout << "upload source buffer" << std::endl;
auto buf_ptr = (int32_t*)staging_buf.data();
for (uint32_t i = 0; i < num_points; ++i) {
buf_ptr[i] = i;
}
}
auto t0 = std::chrono::high_resolution_clock::now();
RT_CHECK(vx_copy_to_dev(device, kernel_arg.src_addr, staging_buf.data(), buf_size));
auto t1 = std::chrono::high_resolution_clock::now();
// clear destination buffer
{
std::cout << "clear destination buffer" << std::endl;
auto buf_ptr = (int32_t*)staging_buf.data();
for (uint32_t i = 0; i < num_points; ++i) {
buf_ptr[i] = 0xdeadbeef;
}
RT_CHECK(vx_copy_to_dev(device, kernel_arg.dst_addr, staging_buf.data(), buf_size));
}
// start device
std::cout << "start execution" << std::endl;
auto t2 = std::chrono::high_resolution_clock::now();
RT_CHECK(vx_start(device));
RT_CHECK(vx_ready_wait(device, VX_MAX_TIMEOUT));
auto t3 = std::chrono::high_resolution_clock::now();
// read destination buffer from local memory
std::cout << "read destination buffer from local memory" << std::endl;
auto t4 = std::chrono::high_resolution_clock::now();
RT_CHECK(vx_copy_from_dev(device, staging_buf.data(), kernel_arg.dst_addr, buf_size));
auto t5 = std::chrono::high_resolution_clock::now();
// verify result
std::cout << "verify result" << std::endl;
for (uint32_t i = 0; i < num_points; ++i) {
int32_t curr = ((int32_t*)staging_buf.data())[i];
int32_t ref = i;
if (curr != ref) {
std::cout << "error at result #" << std::dec << i
<< std::hex << ": actual 0x" << curr << ", expected 0x" << ref << std::endl;
++errors;
}
}
if (errors != 0) {
std::cout << "Found " << std::dec << errors << " errors!" << std::endl;
std::cout << "FAILED!" << std::endl;
return 1;
}
auto time_end = std::chrono::high_resolution_clock::now();
double elapsed;
elapsed = std::chrono::duration_cast<std::chrono::milliseconds>(t1 - t0).count();
printf("upload time: %lg ms\n", elapsed);
elapsed = std::chrono::duration_cast<std::chrono::milliseconds>(t3 - t2).count();
printf("execute time: %lg ms\n", elapsed);
elapsed = std::chrono::duration_cast<std::chrono::milliseconds>(t5 - t4).count();
printf("download time: %lg ms\n", elapsed);
elapsed = std::chrono::duration_cast<std::chrono::milliseconds>(time_end - time_start).count();
printf("Total elapsed time: %lg ms\n", elapsed);
return 0;
}
int main(int argc, char *argv[]) {
// parse command arguments
parse_args(argc, argv);
if (count == 0) {
count = 1;
}
// open device connection
std::cout << "open device connection" << std::endl;
RT_CHECK(vx_dev_open(&device));
uint64_t num_cores;
RT_CHECK(vx_dev_caps(device, VX_CAPS_NUM_CORES, &num_cores));
uint32_t num_points = count * num_cores;
uint32_t num_blocks = (num_points * sizeof(int32_t) + 63) / 64;
uint32_t buf_size = num_blocks * 64;
std::cout << "number of points: " << num_points << std::endl;
std::cout << "buffer size: " << buf_size << " bytes" << std::endl;
// allocate device memory
std::cout << "allocate device memory" << std::endl;
RT_CHECK(vx_mem_alloc(device, buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.src_addr));
RT_CHECK(vx_mem_alloc(device, buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.dst_addr));
kernel_arg.count = num_points;
std::cout << "dev_src=0x" << std::hex << kernel_arg.src_addr << std::endl;
std::cout << "dev_dst=0x" << std::hex << kernel_arg.dst_addr << std::endl;
// allocate staging buffer
std::cout << "allocate staging buffer" << std::endl;
uint32_t alloc_size = std::max<uint32_t>(buf_size, sizeof(kernel_arg_t));
staging_buf.resize(alloc_size);
// run tests
if (0 == test || -1 == test) {
std::cout << "run memcopy test" << std::endl;
RT_CHECK(run_memcopy_test(kernel_arg.src_addr, 0x0badf00d40ff40ff, num_blocks));
}
if (1 == test || -1 == test) {
// upload program
std::cout << "upload program" << std::endl;
RT_CHECK(vx_upload_kernel_file(device, kernel_file));
// upload kernel argument
std::cout << "upload kernel argument" << std::endl;
memcpy(staging_buf.data(), &kernel_arg, sizeof(kernel_arg_t));
RT_CHECK(vx_copy_to_dev(device, KERNEL_ARG_DEV_MEM_ADDR, staging_buf.data(), sizeof(kernel_arg_t)));
std::cout << "run kernel test" << std::endl;
RT_CHECK(run_kernel_test(kernel_arg, buf_size, num_points));
}
// cleanup
std::cout << "cleanup" << std::endl;
cleanup();
std::cout << "Test PASSED" << std::endl;
return 0;
}

View File

@@ -1,13 +0,0 @@
.section .init, "ax"
.global _start
.type _start, @function
_start:
# call main routine
call main
# dump perf counter
call vx_perf_dump
# end execution
.insn r 0x0b, 0, 0, x0, x0, x0
.size _start, .-_start

View File

@@ -2,11 +2,6 @@ XLEN ?= 32
TOOLDIR ?= /opt
TARGET ?= opaesim
XRT_SYN_DIR ?= ../../../hw/syn/xilinx/xrt
XRT_DEVICE_INDEX ?= 0
ifeq ($(XLEN),64)
RISCV_TOOLCHAIN_PATH ?= $(TOOLDIR)/riscv64-gnu-toolchain
VX_CFLAGS += -march=rv64imafd -mabi=lp64d
@@ -24,13 +19,12 @@ VORTEX_RT_PATH ?= $(realpath ../../../runtime)
VORTEX_KN_PATH ?= $(realpath ../../../kernel)
GEMMINI_SW_PATH ?= $(realpath ../../../gemmini)
FPGA_BIN_DIR ?= $(VORTEX_RT_PATH)/opae
LLVM_VORTEX ?= $(TOOLDIR)/llvm-vortex
LLVM_CFLAGS += --sysroot=$(RISCV_SYSROOT)
LLVM_CFLAGS += --gcc-toolchain=$(RISCV_TOOLCHAIN_PATH)
LLVM_CFLAGS += -Xclang -target-feature -Xclang +vortex
#LLVM_CFLAGS += -mllvm -vortex-branch-divergence=2
#LLVM_CFLAGS += -mllvm -print-after-all
#LLVM_CFLAGS += -I$(RISCV_SYSROOT)/include/c++/9.2.0/$(RISCV_PREFIX)
@@ -53,14 +47,14 @@ VX_CFLAGS += -mcmodel=medany -fno-rtti -fno-exceptions -nostartfiles -fdata-sect
# comment out below for regression/basic, which uses GCC that doesn't
# understand these flags
VX_CFLAGS += -mllvm -inline-threshold=262144
VX_CFLAGS += -I$(VORTEX_KN_PATH)/include -I$(VORTEX_KN_PATH)/../hw -I$(GEMMINI_SW_PATH)
VX_CFLAGS += -I$(VORTEX_KN_PATH)/include -I$(GEMMINI_SW_PATH)
VX_CFLAGS += -DNDEBUG -DLLVM_VORTEX
# VX_LDFLAGS += -Wl,-Bstatic,--gc-sections,-T,$(VORTEX_KN_PATH)/linker/vx_link$(XLEN).ld,--defsym=STARTUP_ADDR=$(STARTUP_ADDR) $(VORTEX_KN_PATH)/libvortexrt.a
VX_LDFLAGS += -Wl,-Bstatic,-T,$(VORTEX_KN_PATH)/linker/vx_link$(XLEN).ld,--defsym=STARTUP_ADDR=$(STARTUP_ADDR) $(VORTEX_KN_PATH)/libvortexrt.a $(VORTEX_KN_PATH)/tohost.S
CXXFLAGS += -std=c++17 -Wall -Wextra -pedantic -Wfatal-errors
CXXFLAGS += -I$(VORTEX_RT_PATH)/include -I$(VORTEX_KN_PATH)/../hw
CXXFLAGS += -I$(VORTEX_RT_PATH)/include
LDFLAGS += -L$(VORTEX_RT_PATH)/stub -lvortex
@@ -71,53 +65,22 @@ else
CXXFLAGS += -O2 -DNDEBUG
endif
ifeq ($(TARGET), fpga)
OPAE_DRV_PATHS ?= libopae-c.so
else
ifeq ($(TARGET), asesim)
OPAE_DRV_PATHS ?= libopae-c-ase.so
else
ifeq ($(TARGET), opaesim)
OPAE_DRV_PATHS ?= libopae-c-sim.so
endif
endif
endif
# CONFIG is supplied from the command line to differentiate ELF files with custom suffixes
CONFIGEXT = $(if $(CONFIG),.$(CONFIG),)
all: $(PROJECT) kernel.bin kernel.dump kernel.radiance.dump kernel$(CONFIGEXT).dump kernel.radiance$(CONFIGEXT).dump
kernel.dump: kernel.elf
$(VX_DP) -D kernel.elf > kernel.dump
all: kernel.radiance.dump kernel.radiance$(CONFIGEXT).dump
kernel.radiance.dump: kernel.radiance.elf
$(VX_DP) -D kernel.radiance.elf > kernel.radiance.dump
ifneq ($(CONFIG),)
kernel$(CONFIGEXT).dump: kernel$(CONFIGEXT).elf
$(VX_DP) -D kernel$(CONFIGEXT).elf > kernel$(CONFIGEXT).dump
kernel.radiance$(CONFIGEXT).dump: kernel.radiance$(CONFIGEXT).elf
$(VX_DP) -D kernel.radiance$(CONFIGEXT).elf > kernel.radiance$(CONFIGEXT).dump
endif
kernel.bin: kernel.elf kernel.radiance.elf
$(VX_CP) -O binary kernel.elf kernel.bin
OBJCOPY ?= $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)-objcopy
OBJCOPY_FLAGS ?= "LOAD,ALLOC,DATA,CONTENTS"
BINFILES := args.bin input.a.bin input.b.bin input.c.bin
kernel.elf: $(VX_SRCS) $(VX_INCLUDES) $(BINFILES)
$(VX_CXX) $(VX_CFLAGS) -o $@ $(VX_SRCS) $(VX_LDFLAGS)
$(OBJCOPY) --set-section-flags .operand.a=$(OBJCOPY_FLAGS) $@
$(OBJCOPY) --set-section-flags .operand.b=$(OBJCOPY_FLAGS) $@
$(OBJCOPY) --set-section-flags .operand.c=$(OBJCOPY_FLAGS) $@
$(OBJCOPY) --set-section-flags .args=$(OBJCOPY_FLAGS) $@
$(OBJCOPY) --update-section .operand.a=input.a.bin $@ || true
$(OBJCOPY) --update-section .operand.b=input.b.bin $@ || true
$(OBJCOPY) --update-section .operand.c=input.c.bin $@ || true
$(OBJCOPY) --update-section .args=args.bin $@ || true
kernel.radiance.elf: $(VX_SRCS) $(VX_INCLUDES) $(BINFILES)
$(VX_CXX) $(VX_CFLAGS) $(VX_SRCS) $(VX_LDFLAGS) -DRADIANCE -o $@
@@ -131,41 +94,12 @@ kernel.radiance.elf: $(VX_SRCS) $(VX_INCLUDES) $(BINFILES)
$(OBJCOPY) --update-section .args=args.bin $@ || true
ifneq ($(CONFIG),)
kernel$(CONFIGEXT).elf: kernel.elf
cp $< $@
kernel.radiance$(CONFIGEXT).elf: kernel.radiance.elf
cp $< $@
endif
$(PROJECT): $(SRCS)
$(CXX) $(CXXFLAGS) $^ $(LDFLAGS) -o $@
run-simx: $(PROJECT) kernel.bin
LD_LIBRARY_PATH=$(VORTEX_RT_PATH)/simx:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
run-opae: $(PROJECT) kernel.bin
SCOPE_JSON_PATH=$(FPGA_BIN_DIR)/scope.json OPAE_DRV_PATHS=$(OPAE_DRV_PATHS) LD_LIBRARY_PATH=$(VORTEX_RT_PATH)/opae:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
run-rtlsim: $(PROJECT) kernel.bin
LD_LIBRARY_PATH=$(VORTEX_RT_PATH)/rtlsim:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
run-xrt: $(PROJECT) kernel.bin
ifeq ($(TARGET), hw)
SCOPE_JSON_PATH=$(FPGA_BIN_DIR)/scope.json XRT_INI_PATH=$(XRT_SYN_DIR)/xrt.ini EMCONFIG_PATH=$(FPGA_BIN_DIR) XRT_DEVICE_INDEX=$(XRT_DEVICE_INDEX) XRT_XCLBIN_PATH=$(FPGA_BIN_DIR)/vortex_afu.xclbin LD_LIBRARY_PATH=$(XILINX_XRT)/lib:$(VORTEX_RT_PATH)/xrt:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
else
XCL_EMULATION_MODE=$(TARGET) XRT_INI_PATH=$(XRT_SYN_DIR)/xrt.ini EMCONFIG_PATH=$(FPGA_BIN_DIR) XRT_DEVICE_INDEX=$(XRT_DEVICE_INDEX) XRT_XCLBIN_PATH=$(FPGA_BIN_DIR)/vortex_afu.xclbin LD_LIBRARY_PATH=$(XILINX_XRT)/lib:$(VORTEX_RT_PATH)/xrt:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
endif
.depend: $(SRCS)
$(CXX) $(CXXFLAGS) -MM $^ > .depend;
clean:
rm -rf $(PROJECT) *.o .depend
rm -rf *.o
clean-all: clean
rm -rf kernel.elf kernel.dump
ifneq ($(MAKECMDGOALS),clean)
-include .depend
endif
rm -rf kernel*.elf kernel*.dump

View File

@@ -0,0 +1,145 @@
XLEN ?= 32
TOOLDIR ?= /opt
ifeq ($(XLEN),64)
RISCV_TOOLCHAIN_PATH ?= $(TOOLDIR)/riscv64-gnu-toolchain
VX_CFLAGS += -march=rv64imafd -mabi=lp64d
STARTUP_ADDR ?= 0x180000000
else
RISCV_TOOLCHAIN_PATH ?= $(TOOLDIR)/riscv-gnu-toolchain
VX_CFLAGS += -march=rv32imafd -mabi=ilp32f
STARTUP_ADDR ?= 0x80000000
endif
RISCV_PREFIX ?= riscv$(XLEN)-unknown-elf
RISCV_SYSROOT ?= $(RISCV_TOOLCHAIN_PATH)/$(RISCV_PREFIX)
VORTEX_RT_PATH ?= $(realpath ../../../runtime)
VORTEX_KN_PATH ?= $(realpath ../../../kernel)
GEMMINI_SW_PATH ?= $(realpath ../../../third_party/gemmini-rocc-tests)
LLVM_VORTEX ?= $(TOOLDIR)/llvm-vortex
LLVM_MUON ?= $(TOOLDIR)/llvm-muon
LLVM_MUON_32R ?= $(TOOLDIR)/llvm-muon-baseline
LLVM_CFLAGS += --sysroot=$(RISCV_SYSROOT)
LLVM_CFLAGS += --gcc-toolchain=$(RISCV_TOOLCHAIN_PATH)
LLVM_CFLAGS += -Xclang -target-feature -Xclang +vortex
#LLVM_CFLAGS += -mllvm -vortex-branch-divergence=2
#LLVM_CFLAGS += -mllvm -print-after-all
#LLVM_CFLAGS += -I$(RISCV_SYSROOT)/include/c++/9.2.0/$(RISCV_PREFIX)
#LLVM_CFLAGS += -I$(RISCV_SYSROOT)/include/c++/9.2.0
#LLVM_CFLAGS += -Wl,-L$(RISCV_TOOLCHAIN_PATH)/lib/gcc/$(RISCV_PREFIX)/9.2.0
#LLVM_CFLAGS += --rtlib=libgcc
VX_CC = $(LLVM_VORTEX)/bin/clang $(LLVM_CFLAGS)
VX_CXX = $(LLVM_VORTEX)/bin/clang++ $(LLVM_CFLAGS)
VX_DP = $(LLVM_VORTEX)/bin/llvm-objdump
VX_CP = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-objcopy
MU_CC = $(LLVM_MUON)/bin/clang $(LLVM_CFLAGS)
MU_CXX = $(LLVM_MUON)/bin/clang++ $(LLVM_CFLAGS)
MU_DP = $(LLVM_MUON)/bin/llvm-objdump
MU_CP = $(LLVM_MUON)/bin/llvm-objcopy
MU_32R_CC = $(LLVM_MUON_32R)/bin/clang $(LLVM_CFLAGS)
MU_32R_CXX = $(LLVM_MUON_32R)/bin/clang++ $(LLVM_CFLAGS)
MU_32R_DP = $(LLVM_MUON_32R)/bin/llvm-objdump
MU_32R_CP = $(LLVM_MUON_32R)/bin/llvm-objcopy
VX_CFLAGS += -v -O2 -std=c++17
VX_CFLAGS += -mcmodel=medany -fno-rtti -fno-exceptions -nostartfiles -fdata-sections -ffunction-sections
# comment out below for regression/basic, which uses GCC that doesn't
# understand these flags
VX_CFLAGS += -mllvm -inline-threshold=262144
VX_CFLAGS += -I$(VORTEX_KN_PATH)/include -I$(GEMMINI_SW_PATH)
VX_CFLAGS += -DNDEBUG -DLLVM_VORTEX
MU_CFLAGS := $(VX_CFLAGS)
MU_CFLAGS += -fuse-ld=lld
VX_LDFLAGS += -Wl,-Bstatic,-T,$(VORTEX_KN_PATH)/linker/vx_link$(XLEN).ld,--defsym=STARTUP_ADDR=$(STARTUP_ADDR)
MU_LDFLAGS := $(VX_LDFLAGS)
VX_LDFLAGS += $(VORTEX_KN_PATH)/libvortexrt.a
MU_LDFLAGS += $(VORTEX_KN_PATH)/libvortexrtmuon.a $(VORTEX_KN_PATH)/tohost.S
CXXFLAGS += -std=c++17 -Wall -Wextra -pedantic -Wfatal-errors
CXXFLAGS += -I$(VORTEX_RT_PATH)/include
LDFLAGS += -L$(VORTEX_RT_PATH)/stub -lvortex
# Debugigng
ifdef DEBUG
CXXFLAGS += -g -O0
else
CXXFLAGS += -O2 -DNDEBUG
endif
# CONFIG is supplied from the command line to differentiate ELF files with custom suffixes
CONFIGEXT = $(if $(CONFIG),.$(CONFIG),)
all: kernel.radiance.dump kernel.radiance$(CONFIGEXT).dump kernel.radiance.32r.dump kernel.vortex.dump
kernel.vortex.dump: kernel.vortex.elf
$(VX_DP) -D kernel.vortex.elf > kernel.vortex.dump
kernel.radiance.dump: kernel.radiance.elf
$(MU_DP) -D kernel.radiance.elf > kernel.radiance.dump
kernel.radiance.32r.dump: kernel.radiance.32r.elf
$(MU_32R_DP) -D kernel.radiance.32r.elf > kernel.radiance.32r.dump
ifneq ($(CONFIG),)
kernel.radiance$(CONFIGEXT).dump: kernel.radiance$(CONFIGEXT).elf
$(MU_DP) -D kernel.radiance$(CONFIGEXT).elf > kernel.radiance$(CONFIGEXT).dump
endif
OBJCOPY_FLAGS ?= "LOAD,ALLOC,DATA,CONTENTS"
BINFILES := args.bin input.a.bin input.b.bin input.c.bin
kernel.vortex.elf: $(VX_SRCS) $(VX_INCLUDES) $(BINFILES)
$(VX_CXX) $(VX_CFLAGS) $(VX_SRCS) $(VX_LDFLAGS) -DRADIANCE -o $@
$(VX_CP) --set-section-flags .operand.a=$(OBJCOPY_FLAGS) $@
$(VX_CP) --set-section-flags .operand.b=$(OBJCOPY_FLAGS) $@
$(VX_CP) --set-section-flags .operand.c=$(OBJCOPY_FLAGS) $@
$(VX_CP) --set-section-flags .args=$(OBJCOPY_FLAGS) $@
$(VX_CP) --update-section .operand.a=input.a.bin $@ || true
$(VX_CP) --update-section .operand.b=input.b.bin $@ || true
$(VX_CP) --update-section .operand.c=input.c.bin $@ || true
$(VX_CP) --update-section .args=args.bin $@ || true
kernel.radiance.elf: $(VX_SRCS) $(VX_INCLUDES) $(BINFILES)
$(MU_CXX) $(MU_CFLAGS) $(VX_SRCS) $(MU_LDFLAGS) -DRADIANCE -S
$(MU_CXX) $(MU_CFLAGS) $(VX_SRCS) $(MU_LDFLAGS) -DRADIANCE -c
$(MU_CXX) $(MU_CFLAGS) $(VX_SRCS) $(MU_LDFLAGS) -DRADIANCE -o $@
# $(MU_CP) --set-section-flags .operand.a=$(OBJCOPY_FLAGS) $@
# $(MU_CP) --set-section-flags .operand.b=$(OBJCOPY_FLAGS) $@
# $(MU_CP) --set-section-flags .operand.c=$(OBJCOPY_FLAGS) $@
# $(MU_CP) --set-section-flags .args=$(OBJCOPY_FLAGS) $@
# $(MU_CP) --update-section .operand.a=input.a.bin $@ || true
# $(MU_CP) --update-section .operand.b=input.b.bin $@ || true
# $(MU_CP) --update-section .operand.c=input.c.bin $@ || true
# $(MU_CP) --update-section .args=args.bin $@ || true
kernel.radiance.32r.elf: $(VX_SRCS) $(VX_INCLUDES) $(BINFILES)
$(MU_32R_CXX) $(MU_CFLAGS) $(VX_SRCS) $(MU_LDFLAGS) -DRADIANCE -o $@
$(MU_32R_CP) --set-section-flags .operand.a=$(OBJCOPY_FLAGS) $@
$(MU_32R_CP) --set-section-flags .operand.b=$(OBJCOPY_FLAGS) $@
$(MU_32R_CP) --set-section-flags .operand.c=$(OBJCOPY_FLAGS) $@
$(MU_32R_CP) --set-section-flags .args=$(OBJCOPY_FLAGS) $@
$(MU_32R_CP) --update-section .operand.a=input.a.bin $@ || true
$(MU_32R_CP) --update-section .operand.b=input.b.bin $@ || true
$(MU_32R_CP) --update-section .operand.c=input.c.bin $@ || true
$(MU_32R_CP) --update-section .args=args.bin $@ || true
ifneq ($(CONFIG),)
kernel.radiance$(CONFIGEXT).elf: kernel.radiance.elf
cp $< $@
endif
clean:
rm -rf *.o
clean-all: clean
rm -rf kernel*.elf kernel*.dump

View File

@@ -1,9 +0,0 @@
PROJECT = demo
SRCS = main.cpp
VX_SRCS = kernel.cpp
OPTS ?= -n64
include ../common.mk

View File

@@ -1,18 +0,0 @@
#ifndef _COMMON_H_
#define _COMMON_H_
#define KERNEL_ARG_DEV_MEM_ADDR 0x7ffff000
#ifndef TYPE
#define TYPE float
#endif
typedef struct {
uint32_t num_tasks;
uint32_t task_size;
uint64_t src0_addr;
uint64_t src1_addr;
uint64_t dst_addr;
} kernel_arg_t;
#endif

View File

@@ -1,23 +0,0 @@
#include <stdint.h>
#include <vx_intrinsics.h>
#include <vx_spawn.h>
#include "common.h"
void kernel_body(int task_id, kernel_arg_t* __UNIFORM__ arg) {
auto src0_ptr = reinterpret_cast<TYPE*>(arg->src0_addr);
auto src1_ptr = reinterpret_cast<TYPE*>(arg->src1_addr);
auto dst_ptr = reinterpret_cast<TYPE*>(arg->dst_addr);
uint32_t count = arg->task_size;
uint32_t offset = task_id * count;
for (uint32_t i = 0; i < count; ++i) {
dst_ptr[offset+i] = src0_ptr[offset+i] + src1_ptr[offset+i];
}
}
int main() {
kernel_arg_t* arg = (kernel_arg_t*)KERNEL_ARG_DEV_MEM_ADDR;
vx_spawn_tasks(arg->num_tasks, (vx_spawn_tasks_cb)kernel_body, arg);
return 0;
}

View File

@@ -1,245 +0,0 @@
#include <iostream>
#include <unistd.h>
#include <string.h>
#include <vector>
#include <vortex.h>
#include "common.h"
#define FLOAT_ULP 6
#define RT_CHECK(_expr) \
do { \
int _ret = _expr; \
if (0 == _ret) \
break; \
printf("Error: '%s' returned %d!\n", #_expr, (int)_ret); \
cleanup(); \
exit(-1); \
} while (false)
///////////////////////////////////////////////////////////////////////////////
template <typename Type>
class Comparator {};
template <>
class Comparator<int> {
public:
static const char* type_str() {
return "integer";
}
static int generate() {
return rand();
}
static bool compare(int a, int b, int index, int errors) {
if (a != b) {
if (errors < 100) {
printf("*** error: [%d] expected=%d, actual=%d\n", index, a, b);
}
return false;
}
return true;
}
};
template <>
class Comparator<float> {
private:
union Float_t { float f; int i; };
public:
static const char* type_str() {
return "float";
}
static int generate() {
return static_cast<float>(rand()) / RAND_MAX;
}
static bool compare(float a, float b, int index, int errors) {
union fi_t { float f; int32_t i; };
fi_t fa, fb;
fa.f = a;
fb.f = b;
auto d = std::abs(fa.i - fb.i);
if (d > FLOAT_ULP) {
if (errors < 100) {
printf("*** error: [%d] expected=%f, actual=%f\n", index, a, b);
}
return false;
}
return true;
}
};
const char* kernel_file = "kernel.bin";
uint32_t count = 16;
vx_device_h device = nullptr;
std::vector<TYPE> source_data;
std::vector<uint8_t> staging_buf;
kernel_arg_t kernel_arg = {};
static void show_usage() {
std::cout << "Vortex Test." << std::endl;
std::cout << "Usage: [-k: kernel] [-n words] [-h: help]" << std::endl;
}
static void parse_args(int argc, char **argv) {
int c;
while ((c = getopt(argc, argv, "n:k:h?")) != -1) {
switch (c) {
case 'n':
count = atoi(optarg);
break;
case 'k':
kernel_file = optarg;
break;
case 'h':
case '?': {
show_usage();
exit(0);
} break;
default:
show_usage();
exit(-1);
}
}
}
void cleanup() {
if (device) {
vx_mem_free(device, kernel_arg.src0_addr);
vx_mem_free(device, kernel_arg.src1_addr);
vx_mem_free(device, kernel_arg.dst_addr);
vx_dev_close(device);
}
}
int run_test(const kernel_arg_t& kernel_arg,
uint32_t buf_size,
uint32_t num_points) {
// start device
std::cout << "start device" << std::endl;
RT_CHECK(vx_start(device));
// wait for completion
std::cout << "wait for completion" << std::endl;
RT_CHECK(vx_ready_wait(device, VX_MAX_TIMEOUT));
// download destination buffer
std::cout << "download destination buffer" << std::endl;
RT_CHECK(vx_copy_from_dev(device, staging_buf.data(), kernel_arg.dst_addr, buf_size));
// verify result
std::cout << "verify result" << std::endl;
{
int errors = 0;
auto buf_ptr = (TYPE*)staging_buf.data();
for (uint32_t i = 0; i < num_points; ++i) {
auto ref = source_data[2 * i + 0] + source_data[2 * i + 1];
auto cur = buf_ptr[i];
if (!Comparator<TYPE>::compare(cur, ref, i, errors)) {
++errors;
}
}
if (errors != 0) {
std::cout << "Found " << std::dec << errors << " errors!" << std::endl;
std::cout << "FAILED!" << std::endl;
return 1;
}
}
return 0;
}
int main(int argc, char *argv[]) {
// parse command arguments
parse_args(argc, argv);
std::srand(50);
// open device connection
std::cout << "open device connection" << std::endl;
RT_CHECK(vx_dev_open(&device));
uint64_t num_cores, num_warps, num_threads;
RT_CHECK(vx_dev_caps(device, VX_CAPS_NUM_CORES, &num_cores));
RT_CHECK(vx_dev_caps(device, VX_CAPS_NUM_WARPS, &num_warps));
RT_CHECK(vx_dev_caps(device, VX_CAPS_NUM_THREADS, &num_threads));
uint32_t num_tasks = num_cores * num_warps * num_threads;
uint32_t num_points = count * num_tasks;
uint32_t buf_size = num_points * sizeof(TYPE);
std::cout << "data type: " << Comparator<TYPE>::type_str() << std::endl;
std::cout << "number of points: " << num_points << std::endl;
std::cout << "buffer size: " << buf_size << " bytes" << std::endl;
// upload program
std::cout << "upload program" << std::endl;
RT_CHECK(vx_upload_kernel_file(device, kernel_file));
// allocate device memory
std::cout << "allocate device memory" << std::endl;
RT_CHECK(vx_mem_alloc(device, buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.src0_addr));
RT_CHECK(vx_mem_alloc(device, buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.src1_addr));
RT_CHECK(vx_mem_alloc(device, buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.dst_addr));
kernel_arg.num_tasks = num_tasks;
kernel_arg.task_size = count;
std::cout << "dev_src0=0x" << std::hex << kernel_arg.src0_addr << std::endl;
std::cout << "dev_src1=0x" << std::hex << kernel_arg.src1_addr << std::endl;
std::cout << "dev_dst=0x" << std::hex << kernel_arg.dst_addr << std::endl;
// allocate staging buffer
std::cout << "allocate staging buffer" << std::endl;
uint32_t alloc_size = std::max<uint32_t>(buf_size, sizeof(kernel_arg_t));
staging_buf.resize(alloc_size);
// upload kernel argument
std::cout << "upload kernel argument" << std::endl;
memcpy(staging_buf.data(), &kernel_arg, sizeof(kernel_arg_t));
RT_CHECK(vx_copy_to_dev(device, KERNEL_ARG_DEV_MEM_ADDR, staging_buf.data(), sizeof(kernel_arg_t)));
// generate source data
source_data.resize(2 * num_points);
for (uint32_t i = 0; i < source_data.size(); ++i) {
source_data[i] = Comparator<TYPE>::generate();
}
// upload source buffer0
{
std::cout << "upload source buffer0" << std::endl;
auto buf_ptr = (TYPE*)staging_buf.data();
for (uint32_t i = 0; i < num_points; ++i) {
buf_ptr[i] = source_data[2 * i + 0];
}
RT_CHECK(vx_copy_to_dev(device, kernel_arg.src0_addr, staging_buf.data(), buf_size));
}
// upload source buffer1
{
std::cout << "upload source buffer1" << std::endl;
auto buf_ptr = (TYPE*)staging_buf.data();
for (uint32_t i = 0; i < num_points; ++i) {
buf_ptr[i] = source_data[2 * i + 1];
}
RT_CHECK(vx_copy_to_dev(device, kernel_arg.src1_addr, staging_buf.data(), buf_size));
}
// clear destination buffer
std::cout << "clear destination buffer" << std::endl;
memset(staging_buf.data(), 0, num_points * sizeof(TYPE));
RT_CHECK(vx_copy_to_dev(device, kernel_arg.dst_addr, staging_buf.data(), buf_size));
// run tests
std::cout << "run tests" << std::endl;
RT_CHECK(run_test(kernel_arg, buf_size, num_points));
// cleanup
std::cout << "cleanup" << std::endl;
cleanup();
std::cout << "PASSED!" << std::endl;
return 0;
}

View File

@@ -1,9 +0,0 @@
PROJECT = diverge
SRCS = main.cpp
VX_SRCS = kernel.cpp
OPTS ?= -n16
include ../common.mk

View File

@@ -1,12 +0,0 @@
#ifndef _COMMON_H_
#define _COMMON_H_
#define KERNEL_ARG_DEV_MEM_ADDR 0x7ffff000
typedef struct {
uint32_t num_points;
uint64_t src_addr;
uint64_t dst_addr;
} kernel_arg_t;
#endif

View File

@@ -1,83 +0,0 @@
#include <stdint.h>
#include <assert.h>
#include <algorithm>
#include <vx_intrinsics.h>
#include <vx_spawn.h>
#include "common.h"
// Parallel Selection sort
void kernel_body(int task_id, kernel_arg_t* __UNIFORM__ arg) {
int32_t* src_ptr = (int32_t*)arg->src_addr;
int32_t* dst_ptr = (int32_t*)arg->dst_addr;
int value = src_ptr[task_id];
// none taken
if (task_id >= 0x7fffffff) {
value = 0;
} else {
value += 2;
}
// diverge
if (task_id > 1) {
if (task_id > 2) {
value += 6;
} else {
value += 5;
}
} else {
if (task_id > 0) {
value += 4;
} else {
value += 3;
}
}
// all taken
if (task_id >= 0) {
value += 7;
} else {
value = 0;
}
// loop
for (int i = 0, n = task_id; i < n; ++i) {
value += src_ptr[i];
}
// switch
switch (task_id) {
case 0:
value += 1;
break;
case 1:
value -= 1;
break;
case 2:
value *= 3;
break;
case 3:
value *= 5;
break;
default:
assert(task_id < arg->num_points);
break;
}
// select
value += (task_id >= 0) ? ((task_id > 5) ? src_ptr[0] : task_id) : ((task_id < 5) ? src_ptr[1] : -task_id);
// min/max
value += std::min(src_ptr[task_id], value);
value += std::max(src_ptr[task_id], value);
dst_ptr[task_id] = value;
}
int main() {
kernel_arg_t* arg = (kernel_arg_t*)KERNEL_ARG_DEV_MEM_ADDR;
vx_spawn_tasks(arg->num_points, (vx_spawn_tasks_cb)kernel_body, arg);
return 0;
}

View File

@@ -1,268 +0,0 @@
#include <iostream>
#include <unistd.h>
#include <string.h>
#include <vortex.h>
#include <vector>
#include <assert.h>
#include "common.h"
#define RT_CHECK(_expr) \
do { \
int _ret = _expr; \
if (0 == _ret) \
break; \
printf("Error: '%s' returned %d!\n", #_expr, (int)_ret); \
cleanup(); \
exit(-1); \
} while (false)
///////////////////////////////////////////////////////////////////////////////
const char* kernel_file = "kernel.bin";
uint32_t count = 0;
std::vector<int> src_data;
std::vector<int> ref_data;
vx_device_h device = nullptr;
std::vector<uint8_t> staging_buf;
kernel_arg_t kernel_arg = {};
static void show_usage() {
std::cout << "Vortex Test." << std::endl;
std::cout << "Usage: [-k: kernel] [-n words] [-h: help]" << std::endl;
}
static void parse_args(int argc, char **argv) {
int c;
while ((c = getopt(argc, argv, "n:k:h?")) != -1) {
switch (c) {
case 'n':
count = atoi(optarg);
break;
case 'k':
kernel_file = optarg;
break;
case 'h':
case '?': {
show_usage();
exit(0);
} break;
default:
show_usage();
exit(-1);
}
}
}
void cleanup() {
if (device) {
vx_mem_free(device, kernel_arg.src_addr);
vx_mem_free(device, kernel_arg.dst_addr);
vx_dev_close(device);
}
}
void gen_input_data(uint32_t num_points) {
src_data.resize(num_points);
for (uint32_t i = 0; i < src_data.size(); ++i) {
int value = std::rand();
src_data[i] = value;
//std::cout << std::dec << i << ": value=0x" << std::hex << value << std::endl;
}
}
void gen_ref_data(uint32_t num_points) {
ref_data.resize(num_points);
for (int i = 0; i < (int)ref_data.size(); ++i) {
int value = src_data.at(i);
// none taken
if (i >= 0x7fffffff) {
value = 0;
} else {
value += 2;
}
// diverge
if (i > 1) {
if (i > 2) {
value += 6;
} else {
value += 5;
}
} else {
if (i > 0) {
value += 4;
} else {
value += 3;
}
}
// all taken
if (i >= 0) {
value += 7;
} else {
value = 0;
}
// loop
for (int j = 0, n = i; j < n; ++j) {
value += src_data.at(j);
}
// switch
switch (i) {
case 0:
value += 1;
break;
case 1:
value -= 1;
break;
case 2:
value *= 3;
break;
case 3:
value *= 5;
break;
default:
assert(i < (int)num_points);
break;
}
// select
value += (i >= 0) ? ((i > 5) ? src_data.at(0) : i) : ((i < 5) ? src_data.at(1) : -i);
// min/max
value += std::min(src_data.at(i), value);
value += std::max(src_data.at(i), value);
ref_data[i] = value;
}
}
int run_test(const kernel_arg_t& kernel_arg,
uint32_t buf_size,
uint32_t num_points) {
// start device
std::cout << "start device" << std::endl;
RT_CHECK(vx_start(device));
// wait for completion
std::cout << "wait for completion" << std::endl;
RT_CHECK(vx_ready_wait(device, VX_MAX_TIMEOUT));
// download destination buffer
std::cout << "download destination buffer" << std::endl;
RT_CHECK(vx_copy_from_dev(device, staging_buf.data(), kernel_arg.dst_addr, buf_size));
// verify result
std::cout << "verify result" << std::endl;
{
int errors = 0;
auto buf_ptr = (int32_t*)staging_buf.data();
for (uint32_t i = 0; i < num_points; ++i) {
int ref = ref_data.at(i);
int cur = buf_ptr[i];
if (cur != ref) {
std::cout << "error at result #" << std::dec << i
<< std::hex << ": actual 0x" << cur << ", expected 0x" << ref << std::endl;
++errors;
}
}
if (errors != 0) {
std::cout << "Found " << std::dec << errors << " errors!" << std::endl;
std::cout << "FAILED!" << std::endl;
return 1;
}
}
return 0;
}
int main(int argc, char *argv[]) {
// parse command arguments
parse_args(argc, argv);
if (count == 0) {
count = 1;
}
std::srand(50);
// open device connection
std::cout << "open device connection" << std::endl;
RT_CHECK(vx_dev_open(&device));
uint32_t num_points = count;
// generate input data
gen_input_data(num_points);
// generate reference data
gen_ref_data(num_points);
uint32_t src_buf_size = src_data.size() * sizeof(int32_t);
uint32_t dst_buf_size = ref_data.size() * sizeof(int32_t);
std::cout << "number of points: " << num_points << std::endl;
std::cout << "buffer size: " << dst_buf_size << " bytes" << std::endl;
// upload program
std::cout << "upload program" << std::endl;
RT_CHECK(vx_upload_kernel_file(device, kernel_file));
// allocate device memory
std::cout << "allocate device memory" << std::endl;
RT_CHECK(vx_mem_alloc(device, src_buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.src_addr));
RT_CHECK(vx_mem_alloc(device, dst_buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.dst_addr));
kernel_arg.num_points = num_points;
std::cout << "dev_src=0x" << std::hex << kernel_arg.src_addr << std::endl;
std::cout << "dev_dst=0x" << std::hex << kernel_arg.dst_addr << std::endl;
// allocate staging buffer
std::cout << "allocate staging buffer" << std::endl;
uint32_t staging_buf_size = std::max<uint32_t>(src_buf_size,
std::max<uint32_t>(dst_buf_size,
sizeof(kernel_arg_t)));
staging_buf.resize(staging_buf_size);
// upload kernel argument
std::cout << "upload kernel argument" << std::endl;
memcpy(staging_buf.data(), &kernel_arg, sizeof(kernel_arg_t));
RT_CHECK(vx_copy_to_dev(device, KERNEL_ARG_DEV_MEM_ADDR, staging_buf.data(), sizeof(kernel_arg_t)));
// upload source buffer
{
std::cout << "upload source buffer" << std::endl;
auto buf_ptr = staging_buf.data();
memcpy(buf_ptr, src_data.data(), num_points * sizeof(int32_t));
RT_CHECK(vx_copy_to_dev(device, kernel_arg.src_addr, staging_buf.data(), src_buf_size));
}
// clear destination buffer
{
std::cout << "clear destination buffer" << std::endl;
auto buf_ptr = (int32_t*)staging_buf.data();
for (uint32_t i = 0; i < num_points; ++i) {
buf_ptr[i] = 0xdeadbeef;
}
RT_CHECK(vx_copy_to_dev(device, kernel_arg.dst_addr, staging_buf.data(), dst_buf_size));
}
// run tests
std::cout << "run tests" << std::endl;
RT_CHECK(run_test(kernel_arg, dst_buf_size, num_points));
// cleanup
std::cout << "cleanup" << std::endl;
cleanup();
std::cout << "PASSED!" << std::endl;
return 0;
}

View File

@@ -1,9 +0,0 @@
PROJECT = dogfood
SRCS = main.cpp
VX_SRCS = kernel.cpp
OPTS ?= -n64 -x19 -x20
include ../common.mk

View File

@@ -1,15 +0,0 @@
#ifndef _COMMON_H_
#define _COMMON_H_
#define KERNEL_ARG_DEV_MEM_ADDR 0x7ffff000
typedef struct {
uint32_t testid;
uint32_t num_tasks;
uint32_t task_size;
uint64_t src0_addr;
uint64_t src1_addr;
uint64_t dst_addr;
} kernel_arg_t;
#endif

View File

@@ -1,396 +0,0 @@
#include <stdint.h>
#include <math.h>
#include <vx_intrinsics.h>
#include <vx_spawn.h>
#include "common.h"
typedef void (*PFN_Kernel)(int task_id, kernel_arg_t* __UNIFORM__ arg);
inline float __ieee754_sqrtf (float x) {
asm ("fsqrt.s %0, %1" : "=f" (x) : "f" (x));
return x;
}
void kernel_iadd(int task_id, kernel_arg_t* __UNIFORM__ arg) {
auto count = arg->task_size;
auto src0_ptr = (int32_t*)arg->src0_addr;
auto src1_ptr = (int32_t*)arg->src1_addr;
auto dst_ptr = (int32_t*)arg->dst_addr;
auto offset = task_id * count;
for (uint32_t i = 0; i < count; ++i) {
int32_t a = src0_ptr[offset+i];
int32_t b = src1_ptr[offset+i];
int32_t c = a + b;
dst_ptr[offset+i] = c;
}
}
void kernel_imul(int task_id, kernel_arg_t* __UNIFORM__ arg) {
auto count = arg->task_size;
auto src0_ptr = (int32_t*)arg->src0_addr;
auto src1_ptr = (int32_t*)arg->src1_addr;
auto dst_ptr = (int32_t*)arg->dst_addr;
auto offset = task_id * count;
for (uint32_t i = 0; i < count; ++i) {
auto a = src0_ptr[offset+i];
auto b = src1_ptr[offset+i];
auto c = a * b;
dst_ptr[offset+i] = c;
}
}
void kernel_idiv(int task_id, kernel_arg_t* __UNIFORM__ arg) {
auto count = arg->task_size;
auto src0_ptr = (int32_t*)arg->src0_addr;
auto src1_ptr = (int32_t*)arg->src1_addr;
auto dst_ptr = (int32_t*)arg->dst_addr;
auto offset = task_id * count;
for (uint32_t i = 0; i < count; ++i) {
auto a = src0_ptr[offset+i];
auto b = src1_ptr[offset+i];
auto c = a / b;
dst_ptr[offset+i] = c;
}
}
void kernel_idiv_mul(int task_id, kernel_arg_t* __UNIFORM__ arg) {
auto count = arg->task_size;
auto src0_ptr = (int32_t*)arg->src0_addr;
auto src1_ptr = (int32_t*)arg->src1_addr;
auto dst_ptr = (int32_t*)arg->dst_addr;
auto offset = task_id * count;
for (uint32_t i = 0; i < count; ++i) {
auto a = src0_ptr[offset+i];
auto b = src1_ptr[offset+i];
auto c = a / b;
auto d = a * b;
auto e = c + d;
dst_ptr[offset+i] = e;
}
}
void kernel_fadd(int task_id, kernel_arg_t* __UNIFORM__ arg) {
auto count = arg->task_size;
auto src0_ptr = (float*)arg->src0_addr;
auto src1_ptr = (float*)arg->src1_addr;
auto dst_ptr = (float*)arg->dst_addr;
auto offset = task_id * count;
for (uint32_t i = 0; i < count; ++i) {
float a = src0_ptr[offset+i];
float b = src1_ptr[offset+i];
float c = a + b;
dst_ptr[offset+i] = c;
}
}
void kernel_fsub(int task_id, kernel_arg_t* __UNIFORM__ arg) {
auto count = arg->task_size;
auto src0_ptr = (float*)arg->src0_addr;
auto src1_ptr = (float*)arg->src1_addr;
auto dst_ptr = (float*)arg->dst_addr;
auto offset = task_id * count;
for (uint32_t i = 0; i < count; ++i) {
auto a = src0_ptr[offset+i];
auto b = src1_ptr[offset+i];
auto c = a - b;
dst_ptr[offset+i] = c;
}
}
void kernel_fmul(int task_id, kernel_arg_t* __UNIFORM__ arg) {
auto count = arg->task_size;
auto src0_ptr = (float*)arg->src0_addr;
auto src1_ptr = (float*)arg->src1_addr;
auto dst_ptr = (float*)arg->dst_addr;
auto offset = task_id * count;
for (uint32_t i = 0; i < count; ++i) {
auto a = src0_ptr[offset+i];
auto b = src1_ptr[offset+i];
auto c = a * b;
dst_ptr[offset+i] = c;
}
}
void kernel_fmadd(int task_id, kernel_arg_t* __UNIFORM__ arg) {
auto count = arg->task_size;
auto src0_ptr = (float*)arg->src0_addr;
auto src1_ptr = (float*)arg->src1_addr;
auto dst_ptr = (float*)arg->dst_addr;
auto offset = task_id * count;
for (uint32_t i = 0; i < count; ++i) {
auto a = src0_ptr[offset+i];
auto b = src1_ptr[offset+i];
auto c = a * b + b;
dst_ptr[offset+i] = c;
}
}
void kernel_fmsub(int task_id, kernel_arg_t* __UNIFORM__ arg) {
auto count = arg->task_size;
auto src0_ptr = (float*)arg->src0_addr;
auto src1_ptr = (float*)arg->src1_addr;
auto dst_ptr = (float*)arg->dst_addr;
auto offset = task_id * count;
for (uint32_t i = 0; i < count; ++i) {
auto a = src0_ptr[offset+i];
auto b = src1_ptr[offset+i];
auto c = a * b - b;
dst_ptr[offset+i] = c;
}
}
void kernel_fnmadd(int task_id, kernel_arg_t* __UNIFORM__ arg) {
auto count = arg->task_size;
auto src0_ptr = (float*)arg->src0_addr;
auto src1_ptr = (float*)arg->src1_addr;
auto dst_ptr = (float*)arg->dst_addr;
auto offset = task_id * count;
for (uint32_t i = 0; i < count; ++i) {
auto a = src0_ptr[offset+i];
auto b = src1_ptr[offset+i];
auto c =-a * b - b;
dst_ptr[offset+i] = c;
}
}
void kernel_fnmsub(int task_id, kernel_arg_t* __UNIFORM__ arg) {
auto count = arg->task_size;
auto src0_ptr = (float*)arg->src0_addr;
auto src1_ptr = (float*)arg->src1_addr;
auto dst_ptr = (float*)arg->dst_addr;
auto offset = task_id * count;
for (uint32_t i = 0; i < count; ++i) {
auto a = src0_ptr[offset+i];
auto b = src1_ptr[offset+i];
auto c =-a * b + b;
dst_ptr[offset+i] = c;
}
}
void kernel_fnmadd_madd(int task_id, kernel_arg_t* __UNIFORM__ arg) {
auto count = arg->task_size;
auto src0_ptr = (float*)arg->src0_addr;
auto src1_ptr = (float*)arg->src1_addr;
auto dst_ptr = (float*)arg->dst_addr;
auto offset = task_id * count;
for (uint32_t i = 0; i < count; ++i) {
auto a = src0_ptr[offset+i];
auto b = src1_ptr[offset+i];
auto c =-a * b - b;
auto d = a * b + b;
auto e = c + d;
dst_ptr[offset+i] = e;
}
}
void kernel_fdiv(int task_id, kernel_arg_t* __UNIFORM__ arg) {
auto count = arg->task_size;
auto src0_ptr = (float*)arg->src0_addr;
auto src1_ptr = (float*)arg->src1_addr;
auto dst_ptr = (float*)arg->dst_addr;
auto offset = task_id * count;
for (uint32_t i = 0; i < count; ++i) {
auto a = src0_ptr[offset+i];
auto b = src1_ptr[offset+i];
auto c = a / b;
dst_ptr[offset+i] = c;
}
}
void kernel_fdiv2(int task_id, kernel_arg_t* __UNIFORM__ arg) {
auto count = arg->task_size;
auto src0_ptr = (float*)arg->src0_addr;
auto src1_ptr = (float*)arg->src1_addr;
auto dst_ptr = (float*)arg->dst_addr;
auto offset = task_id * count;
for (uint32_t i = 0; i < count; ++i) {
auto a = src0_ptr[offset+i];
auto b = src1_ptr[offset+i];
auto c = a / b;
auto d = b / a;
auto e = c + d;
dst_ptr[offset+i] = e;
}
}
void kernel_fsqrt(int task_id, kernel_arg_t* __UNIFORM__ arg) {
auto count = arg->task_size;
auto src0_ptr = (float*)arg->src0_addr;
auto src1_ptr = (float*)arg->src1_addr;
auto dst_ptr = (float*)arg->dst_addr;
auto offset = task_id * count;
for (uint32_t i = 0; i < count; ++i) {
auto a = src0_ptr[offset+i];
auto b = src1_ptr[offset+i];
auto c = __ieee754_sqrtf(a * b);
dst_ptr[offset+i] = c;
}
}
void kernel_ftoi(int task_id, kernel_arg_t* __UNIFORM__ arg) {
auto count = arg->task_size;
auto src0_ptr = (float*)arg->src0_addr;
auto src1_ptr = (float*)arg->src1_addr;
auto dst_ptr = (int32_t*)arg->dst_addr;
auto offset = task_id * count;
for (uint32_t i = 0; i < count; ++i) {
auto a = src0_ptr[offset+i];
auto b = src1_ptr[offset+i];
auto c = a + b;
auto d = (int32_t)c;
dst_ptr[offset+i] = d;
}
}
void kernel_ftou(int task_id, kernel_arg_t* __UNIFORM__ arg) {
auto count = arg->task_size;
auto src0_ptr = (float*)arg->src0_addr;
auto src1_ptr = (float*)arg->src1_addr;
auto dst_ptr = (uint32_t*)arg->dst_addr;
auto offset = task_id * count;
for (uint32_t i = 0; i < count; ++i) {
auto a = src0_ptr[offset+i];
auto b = src1_ptr[offset+i];
auto c = a + b;
auto d = (uint32_t)c;
dst_ptr[offset+i] = d;
}
}
void kernel_itof(int task_id, kernel_arg_t* __UNIFORM__ arg) {
auto count = arg->task_size;
auto src0_ptr = (int32_t*)arg->src0_addr;
auto src1_ptr = (int32_t*)arg->src1_addr;
auto dst_ptr = (float*)arg->dst_addr;
auto offset = task_id * count;
for (uint32_t i = 0; i < count; ++i) {
auto a = src0_ptr[offset+i];
auto b = src1_ptr[offset+i];
auto c = a + b;
auto d = (float)c;
dst_ptr[offset+i] = d;
}
}
void kernel_utof(int task_id, kernel_arg_t* __UNIFORM__ arg) {
auto count = arg->task_size;
auto src0_ptr = (int32_t*)arg->src0_addr;
auto src1_ptr = (int32_t*)arg->src1_addr;
auto dst_ptr = (float*)arg->dst_addr;
auto offset = task_id * count;
for (uint32_t i = 0; i < count; ++i) {
auto a = src0_ptr[offset+i];
auto b = src1_ptr[offset+i];
auto c = a + b;
auto d = (float)c;
dst_ptr[offset+i] = d;
}
}
void kernel_bar(int task_id, kernel_arg_t* __UNIFORM__ arg) {
auto num_warps = vx_num_warps();
auto num_threads = vx_num_threads();
auto cid = vx_core_id();
auto wid = vx_warp_id();
auto tid = vx_thread_id();
auto src0_ptr = (uint32_t*)arg->src0_addr;
auto dst_ptr = (uint32_t*)arg->dst_addr;
// per warp delay
uint32_t barrier_stall = 0;
for (int i = 0; i <= wid; ++i) {
barrier_stall += src0_ptr[0] * src0_ptr[i];
}
// memory fence
vx_fence();
// local barrier
vx_barrier(0, num_warps);
// update destination
auto src_idx = (cid * num_warps + (num_warps - 1 - wid)) * num_threads + tid;
dst_ptr[task_id] = src0_ptr[src_idx] + barrier_stall;
}
void kernel_gbar(int task_id, kernel_arg_t* __UNIFORM__ arg) {
auto num_cores = vx_num_cores();
auto num_warps = vx_num_warps();
auto num_threads = vx_num_threads();
auto cid = vx_core_id();
auto wid = vx_warp_id();
auto tid = vx_thread_id();
auto src0_ptr = (uint32_t*)arg->src0_addr;
auto dst_ptr = (uint32_t*)arg->dst_addr;
// per core delay
uint32_t barrier_stall = 0;
for (int i = 0; i <= cid; ++i) {
for (int j = 0; j <= wid; ++j) {
barrier_stall += src0_ptr[0] * src0_ptr[i + j];
}
}
// memory fence
vx_fence();
// global barrier
vx_barrier(0x80000000, num_cores);
// update destination
auto src_idx = ((num_cores - 1 - cid) * num_warps + (num_warps - 1 - wid)) * num_threads + tid;
dst_ptr[task_id] = src0_ptr[src_idx] + barrier_stall;
}
static const PFN_Kernel sc_tests[] = {
kernel_iadd,
kernel_imul,
kernel_idiv,
kernel_idiv_mul,
kernel_fadd,
kernel_fsub,
kernel_fmul,
kernel_fmadd,
kernel_fmsub,
kernel_fnmadd,
kernel_fnmsub,
kernel_fnmadd_madd,
kernel_fdiv,
kernel_fdiv2,
kernel_fsqrt,
kernel_ftoi,
kernel_ftou,
kernel_itof,
kernel_utof,
kernel_bar,
kernel_gbar
};
int main() {
auto arg = (kernel_arg_t*)KERNEL_ARG_DEV_MEM_ADDR;
vx_spawn_tasks(arg->num_tasks, (vx_spawn_tasks_cb)sc_tests[arg->testid], arg);
return 0;
}

View File

@@ -1,218 +0,0 @@
#include <iostream>
#include <vector>
#include <unordered_set>
#include <unistd.h>
#include <string.h>
#include <vector>
#include <vortex.h>
#include "testcases.h"
#include "common.h"
///////////////////////////////////////////////////////////////////////////////
TestSuite* testSuite = nullptr;
const char* kernel_file = "kernel.bin";
int count = 0;
std::unordered_set<int> included;
std::unordered_set<int> excluded;
int testid_s = 0;
int testid_e = 0;
bool stop_on_error = true;
vx_device_h device = nullptr;
std::vector<uint8_t> arg_buf;
std::vector<uint8_t> src1_buf;
std::vector<uint8_t> src2_buf;
std::vector<uint8_t> dst_buf;
kernel_arg_t kernel_arg = {};
static void show_usage() {
std::cout << "Vortex Test." << std::endl;
std::cout << "Usage: [-t<testid>: selected test] [-s<testid>: start test] [-e<testid>: end test] [-x<testid>: excluded tests]" << std::endl;
std::cout << " [-k<kernel>] [-n<words>] [-c] [-h: help]" << std::endl;
}
static void parse_args(int argc, char **argv) {
int c;
while ((c = getopt(argc, argv, "n:t:x:s:e:k:ch?")) != -1) {
switch (c) {
case 'n':
count = atoi(optarg);
break;
case 't':
included.insert(atoi(optarg));
break;
case 'x':
excluded.insert(atoi(optarg));
break;
case 's':
testid_s = atoi(optarg);
break;
case 'e':
testid_e = atoi(optarg);
break;
case 'k':
kernel_file = optarg;
break;
case 'c':
stop_on_error = false;
break;
case 'h':
case '?': {
show_usage();
exit(0);
} break;
default:
show_usage();
exit(-1);
}
}
}
void cleanup() {
if (testSuite) {
delete testSuite;
}
if (device) {
vx_mem_free(device, kernel_arg.src0_addr);
vx_mem_free(device, kernel_arg.src1_addr);
vx_mem_free(device, kernel_arg.dst_addr);
vx_dev_close(device);
}
}
int main(int argc, char *argv[]) {
int exitcode = 0;
// parse command arguments
parse_args(argc, argv);
if (count == 0) {
count = 1;
}
std::cout << std::dec;
std::cout << "test ids: " << testid_s << " - " << testid_e << std::endl;
std::cout << "workitem size: " << count << std::endl;
std::cout << "using kernel: " << kernel_file << std::endl;
// open device connection
std::cout << "open device connection" << std::endl;
RT_CHECK(vx_dev_open(&device));
uint64_t num_cores, num_warps, num_threads;
RT_CHECK(vx_dev_caps(device, VX_CAPS_NUM_CORES, &num_cores));
RT_CHECK(vx_dev_caps(device, VX_CAPS_NUM_WARPS, &num_warps));
RT_CHECK(vx_dev_caps(device, VX_CAPS_NUM_THREADS, &num_threads));
int num_tasks = num_cores * num_warps * num_threads;
int num_points = count * num_tasks;
size_t buf_size = num_points * sizeof(uint32_t);
std::cout << "number of points: " << num_points << std::endl;
std::cout << "buffer size: " << buf_size << " bytes" << std::endl;
// upload program
std::cout << "upload kernel" << std::endl;
RT_CHECK(vx_upload_kernel_file(device, kernel_file));
// allocate device memory
std::cout << "allocate device memory" << std::endl;
RT_CHECK(vx_mem_alloc(device, buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.src0_addr));
RT_CHECK(vx_mem_alloc(device, buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.src1_addr));
RT_CHECK(vx_mem_alloc(device, buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.dst_addr));
kernel_arg.num_tasks = num_tasks;
kernel_arg.task_size = count;
std::cout << "dev_src0=0x" << std::hex << kernel_arg.src0_addr << std::dec << std::endl;
std::cout << "dev_src1=0x" << std::hex << kernel_arg.src1_addr << std::dec << std::endl;
std::cout << "dev_dst=0x" << std::hex << kernel_arg.dst_addr << std::dec << std::endl;
// allocate staging buffer
std::cout << "allocate staging buffer" << std::endl;
arg_buf.resize(sizeof(kernel_arg_t));
src1_buf.resize(buf_size);
src2_buf.resize(buf_size);
dst_buf.resize(buf_size);
// allocate test suite
testSuite = new TestSuite(device);
if (testid_e == 0) {
testid_e = (testSuite->size() - 1);
}
// execute tests
for (int t = testid_s; t <= testid_e; ++t) {
if (!included.empty()) {
if (included.count(t) == 0)
continue;
}
if (!excluded.empty()) {
if (excluded.count(t) != 0)
continue;
}
auto test = testSuite->get_test(t);
auto name = test->name();
std::cout << "Test" << t << ": " << name << std::endl;
// upload kernel argument
std::cout << "upload kernel argument" << std::endl;
kernel_arg.testid = t;
memcpy(arg_buf.data(), &kernel_arg, sizeof(kernel_arg_t));
RT_CHECK(vx_copy_to_dev(device, KERNEL_ARG_DEV_MEM_ADDR, arg_buf.data(), sizeof(kernel_arg_t)));
// get test arguments
std::cout << "get test arguments" << std::endl;
RT_CHECK(test->setup(num_points, (void*)src1_buf.data(), (void*)src2_buf.data()));
// upload source buffer0
std::cout << "upload source buffer0" << std::endl;
RT_CHECK(vx_copy_to_dev(device, kernel_arg.src0_addr, src1_buf.data(), buf_size));
// upload source buffer1
std::cout << "upload source buffer1" << std::endl;
RT_CHECK(vx_copy_to_dev(device, kernel_arg.src1_addr, src2_buf.data(), buf_size));
// clear destination buffer
std::cout << "clear destination buffer" << std::endl;
for (int i = 0; i < num_points; ++i) {
((uint32_t*)dst_buf.data())[i] = 0xdeadbeef;
}
RT_CHECK(vx_copy_to_dev(device, kernel_arg.dst_addr, dst_buf.data(), buf_size));
// start device
std::cout << "start device" << std::endl;
RT_CHECK(vx_start(device));
// wait for completion
std::cout << "wait for completion" << std::endl;
RT_CHECK(vx_ready_wait(device, VX_MAX_TIMEOUT));
// download destination buffer
std::cout << "download destination buffer" << std::endl;
RT_CHECK(vx_copy_from_dev(device, dst_buf.data(), kernel_arg.dst_addr, buf_size));
// verify destination
std::cout << "verify test result" << std::endl;
int errors = test->verify(num_points, dst_buf.data(), src1_buf.data(), src2_buf.data());
if (errors != 0) {
std::cout << "found " << std::dec << errors << " errors!" << std::endl;
std::cout << "Test" << t << "-" << name << " FAILED!" << std::endl << std::flush;
if (stop_on_error) {
cleanup();
exit(1);
}
exitcode = 1;
} else {
std::cout << "Test" << t << "-" << name << " PASSED!" << std::endl << std::flush;
}
}
// cleanup
std::cout << "cleanup" << std::endl;
cleanup();
return exitcode;
}

View File

@@ -1,821 +0,0 @@
#pragma once
#include <iostream>
#include <math.h>
#include <limits>
#include <assert.h>
void cleanup();
#define RT_CHECK(_expr) \
do { \
int _ret = _expr; \
if (0 == _ret) \
break; \
printf("Error: '%s' returned %d!\n", #_expr, (int)_ret); \
cleanup(); \
exit(-1); \
} while (false)
union Float_t {
float f;
int i;
struct {
uint32_t man : 23;
uint32_t exp : 8;
uint32_t sign : 1;
} parts;
};
inline float fround(float x, int32_t precision = 8) {
auto power_of_10 = std::pow(10, precision);
return std::round(x * power_of_10) / power_of_10;
}
inline bool almost_equal_eps(float a, float b, int ulp = 128) {
auto eps = std::numeric_limits<float>::epsilon() * (std::max(fabs(a), fabs(b)) * ulp);
auto d = fabs(a - b);
if (d > eps) {
std::cout << "*** almost_equal_eps: d=" << d << ", eps=" << eps << std::endl;
return false;
}
return true;
}
inline bool almost_equal_ulp(float a, float b, int32_t ulp = 6) {
Float_t fa{a}, fb{b};
auto d = std::abs(fa.i - fb.i);
if (d > ulp) {
std::cout << "*** almost_equal_ulp: a=" << a << ", b=" << b << ", ulp=" << d << ", ia=" << std::hex << fa.i << ", ib=" << fb.i << std::endl;
return false;
}
return true;
}
inline bool almost_equal(float a, float b) {
if (a == b)
return true;
/*if (almost_equal_eps(a, b))
return true;*/
return almost_equal_ulp(a, b);
}
class ITestCase;
class TestSuite {
public:
TestSuite(vx_device_h device);
~TestSuite();
ITestCase* get_test(int testid) const;
void add_test(ITestCase* test);
size_t size() const;
vx_device_h device() const;
private:
std::vector<ITestCase*> _tests;
vx_device_h device_;
};
class ITestCase {
public:
ITestCase(TestSuite* suite, const char* name)
: suite_(suite)
, name_(name)
{}
virtual ~ITestCase() {}
TestSuite* suite() const {
return suite_;
}
const char* name() const {
return name_;
}
virtual int setup(uint32_t n, void* src1, void* src2) = 0;
virtual int verify(uint32_t n, void* dst, const void* src1, const void* src2) = 0;
protected:
TestSuite* suite_;
const char* const name_;
};
class Test_IADD : public ITestCase {
public:
Test_IADD(TestSuite* suite) : ITestCase(suite, "iadd") {}
int setup(uint32_t n, void* src1, void* src2) override {
auto a = (int32_t*)src1;
auto b = (int32_t*)src2;
for (uint32_t i = 0; i < n; ++i) {
a[i] = n/2 - i;
b[i] = n/2 + i;
}
return 0;
}
int verify(uint32_t n, void* dst, const void* src1, const void* src2) override {
int errors = 0;
auto a = (int32_t*)src1;
auto b = (int32_t*)src2;
auto c = (int32_t*)dst;
for (uint32_t i = 0; i < n; ++i) {
auto ref = a[i] + b[i];
if (c[i] != ref) {
std::cout << "error at result #" << i << ": expected=" << ref << ", actual=" << c[i] << ", a=" << a[i] << ", b=" << b[i] << std::endl;
++errors;
}
}
return errors;
}
};
class Test_IMUL : public ITestCase {
public:
Test_IMUL(TestSuite* suite) : ITestCase(suite, "imul") {}
int setup(uint32_t n, void* src1, void* src2) override {
auto a = (int32_t*)src1;
auto b = (int32_t*)src2;
for (uint32_t i = 0; i < n; ++i) {
a[i] = n/2 - i;
b[i] = n/2 + i;
}
return 0;
}
int verify(uint32_t n, void* dst, const void* src1, const void* src2) override {
int errors = 0;
auto a = (int32_t*)src1;
auto b = (int32_t*)src2;
auto c = (int32_t*)dst;
for (uint32_t i = 0; i < n; ++i) {
auto ref = a[i] * b[i];
if (c[i] != ref) {
std::cout << "error at result #" << i << ": expected=" << ref << ", actual=" << c[i] << ", a=" << a[i] << ", b=" << b[i] << std::endl;
++errors;
}
}
return errors;
}
};
class Test_IDIV : public ITestCase {
public:
Test_IDIV(TestSuite* suite) : ITestCase(suite, "idiv") {}
int setup(uint32_t n, void* src1, void* src2) override {
auto a = (int32_t*)src1;
auto b = (int32_t*)src2;
for (uint32_t i = 0; i < n; ++i) {
a[i] = n/2 - i;
b[i] = n/2 + i;
}
return 0;
}
int verify(uint32_t n, void* dst, const void* src1, const void* src2) override {
int errors = 0;
auto a = (int32_t*)src1;
auto b = (int32_t*)src2;
auto c = (int32_t*)dst;
for (uint32_t i = 0; i < n; ++i) {
auto ref = a[i] / b[i];
if (c[i] != ref) {
std::cout << "error at result #" << i << ": expected=" << ref << ", actual=" << c[i] << ", a=" << a[i] << ", b=" << b[i] << std::endl;
++errors;
}
}
return errors;
}
};
class Test_IDIV_MUL : public ITestCase {
public:
Test_IDIV_MUL(TestSuite* suite) : ITestCase(suite, "idiv-mul") {}
int setup(uint32_t n, void* src1, void* src2) override {
auto a = (int32_t*)src1;
auto b = (int32_t*)src2;
for (uint32_t i = 0; i < n; ++i) {
a[i] = n/2 - i;
b[i] = n/2 + i;
}
return 0;
}
int verify(uint32_t n, void* dst, const void* src1, const void* src2) override {
int errors = 0;
auto a = (int32_t*)src1;
auto b = (int32_t*)src2;
auto c = (int32_t*)dst;
for (uint32_t i = 0; i < n; ++i) {
auto x = a[i] / b[i];
auto y = a[i] * b[i];
auto ref = x + y;
if (c[i] != ref) {
std::cout << "error at result #" << i << ": expected=" << ref << ", actual=" << c[i] << ", a=" << a[i] << ", b=" << b[i] << std::endl;
++errors;
}
}
return errors;
}
};
class Test_FADD : public ITestCase {
public:
Test_FADD(TestSuite* suite) : ITestCase(suite, "fadd") {}
int setup(uint32_t n, void* src1, void* src2) override {
auto a = (float*)src1;
auto b = (float*)src2;
for (uint32_t i = 0; i < n; ++i) {
a[i] = fround((n - i) * (1.0f/n));
b[i] = fround((n + i) * (1.0f/n));
}
return 0;
}
int verify(uint32_t n, void* dst, const void* src1, const void* src2) override {
int errors = 0;
auto a = (float*)src1;
auto b = (float*)src2;
auto c = (float*)dst;
for (uint32_t i = 0; i < n; ++i) {
auto ref = a[i] + b[i];
if (!almost_equal(c[i], ref)) {
std::cout << "error at result #" << i << ": expected=" << ref << ", actual=" << c[i] << ", a=" << a[i] << ", b=" << b[i] << std::endl;
++errors;
}
}
return errors;
}
};
class Test_FSUB : public ITestCase {
public:
Test_FSUB(TestSuite* suite) : ITestCase(suite, "fsub") {}
int setup(uint32_t n, void* src1, void* src2) override {
auto a = (float*)src1;
auto b = (float*)src2;
for (uint32_t i = 0; i < n; ++i) {
a[i] = fround((n - i) * (1.0f/n));
b[i] = fround((n + i) * (1.0f/n));
}
return 0;
}
int verify(uint32_t n, void* dst, const void* src1, const void* src2) override {
int errors = 0;
auto a = (float*)src1;
auto b = (float*)src2;
auto c = (float*)dst;
for (uint32_t i = 0; i < n; ++i) {
auto ref = a[i] - b[i];
if (!almost_equal(c[i], ref)) {
std::cout << "error at result #" << i << ": expected=" << ref << ", actual=" << c[i] << ", a=" << a[i] << ", b=" << b[i] << std::endl;
++errors;
}
}
return errors;
}
};
class Test_FMUL : public ITestCase {
public:
Test_FMUL(TestSuite* suite) : ITestCase(suite, "fmul") {}
int setup(uint32_t n, void* src1, void* src2) override {
auto a = (float*)src1;
auto b = (float*)src2;
for (uint32_t i = 0; i < n; ++i) {
a[i] = fround((n - i) * (1.0f/n));
b[i] = fround((n + i) * (1.0f/n));
}
return 0;
}
int verify(uint32_t n, void* dst, const void* src1, const void* src2) override {
int errors = 0;
auto a = (float*)src1;
auto b = (float*)src2;
auto c = (float*)dst;
for (uint32_t i = 0; i < n; ++i) {
auto ref = a[i] * b[i];
if (!almost_equal(c[i], ref)) {
std::cout << "error at result #" << i << ": expected=" << ref << ", actual=" << c[i] << ", a=" << a[i] << ", b=" << b[i] << std::endl;
++errors;
}
}
return errors;
}
};
class Test_FMADD : public ITestCase {
public:
Test_FMADD(TestSuite* suite) : ITestCase(suite, "fmadd") {}
int setup(uint32_t n, void* src1, void* src2) override {
auto a = (float*)src1;
auto b = (float*)src2;
for (uint32_t i = 0; i < n; ++i) {
a[i] = fround((n - i) * (1.0f/n));
b[i] = fround((n + i) * (1.0f/n));
}
return 0;
}
int verify(uint32_t n, void* dst, const void* src1, const void* src2) override {
int errors = 0;
auto a = (float*)src1;
auto b = (float*)src2;
auto c = (float*)dst;
for (uint32_t i = 0; i < n; ++i) {
auto ref = a[i] * b[i] + b[i];
if (!almost_equal(c[i], ref)) {
std::cout << "error at result #" << i << ": expected=" << ref << ", actual=" << c[i] << ", a=" << a[i] << ", b=" << b[i] << std::endl;
++errors;
}
}
return errors;
}
};
class Test_FMSUB : public ITestCase {
public:
Test_FMSUB(TestSuite* suite) : ITestCase(suite, "fmsub") {}
int setup(uint32_t n, void* src1, void* src2) override {
auto a = (float*)src1;
auto b = (float*)src2;
for (uint32_t i = 0; i < n; ++i) {
a[i] = fround((n - i) * (1.0f/n));
b[i] = fround((n + i) * (1.0f/n));
}
return 0;
}
int verify(uint32_t n, void* dst, const void* src1, const void* src2) override {
int errors = 0;
auto a = (float*)src1;
auto b = (float*)src2;
auto c = (float*)dst;
for (uint32_t i = 0; i < n; ++i) {
auto ref = a[i] * b[i] - b[i];
if (!almost_equal(c[i], ref)) {
std::cout << "error at result #" << i << ": expected=" << ref << ", actual=" << c[i] << ", a=" << a[i] << ", b=" << b[i] << std::endl;
++errors;
}
}
return errors;
}
};
class Test_FNMADD : public ITestCase {
public:
Test_FNMADD(TestSuite* suite) : ITestCase(suite, "fnmadd") {}
int setup(uint32_t n, void* src1, void* src2) override {
auto a = (float*)src1;
auto b = (float*)src2;
for (uint32_t i = 0; i < n; ++i) {
a[i] = fround((n - i) * (1.0f/n));
b[i] = fround((n + i) * (1.0f/n));
}
return 0;
}
int verify(uint32_t n, void* dst, const void* src1, const void* src2) override {
int errors = 0;
auto a = (float*)src1;
auto b = (float*)src2;
auto c = (float*)dst;
for (uint32_t i = 0; i < n; ++i) {
auto ref = -a[i] * b[i] - b[i];
if (!almost_equal(c[i], ref)) {
std::cout << "error at result #" << i << ": expected=" << ref << ", actual=" << c[i] << ", a=" << a[i] << ", b=" << b[i] << std::endl;
++errors;
}
}
return errors;
}
};
class Test_FNMSUB : public ITestCase {
public:
Test_FNMSUB(TestSuite* suite) : ITestCase(suite, "fnmsub") {}
int setup(uint32_t n, void* src1, void* src2) override {
auto a = (float*)src1;
auto b = (float*)src2;
for (uint32_t i = 0; i < n; ++i) {
a[i] = fround((n - i) * (1.0f/n));
b[i] = fround((n + i) * (1.0f/n));
}
return 0;
}
int verify(uint32_t n, void* dst, const void* src1, const void* src2) override {
int errors = 0;
auto a = (float*)src1;
auto b = (float*)src2;
auto c = (float*)dst;
for (uint32_t i = 0; i < n; ++i) {
auto ref = -a[i] * b[i] + b[i];
if (!almost_equal(c[i], ref)) {
std::cout << "error at result #" << i << ": expected=" << ref << ", actual=" << c[i] << ", a=" << a[i] << ", b=" << b[i] << std::endl;
++errors;
}
}
return errors;
}
};
class Test_FNMADD_MADD : public ITestCase {
public:
Test_FNMADD_MADD(TestSuite* suite) : ITestCase(suite, "fnmadd-madd") {}
int setup(uint32_t n, void* src1, void* src2) override {
auto a = (float*)src1;
auto b = (float*)src2;
for (uint32_t i = 0; i < n; ++i) {
a[i] = fround((n - i) * (1.0f/n));
b[i] = fround((n + i) * (1.0f/n));
}
return 0;
}
int verify(uint32_t n, void* dst, const void* src1, const void* src2) override {
int errors = 0;
auto a = (float*)src1;
auto b = (float*)src2;
auto c = (float*)dst;
for (uint32_t i = 0; i < n; ++i) {
auto x = -a[i] * b[i] - b[i];
auto y = a[i] * b[i] + b[i];
auto ref = x + y;
if (!almost_equal(c[i], ref)) {
std::cout << "error at result #" << i << ": expected=" << ref << ", actual=" << c[i] << ", a=" << a[i] << ", b=" << b[i] << std::endl;
++errors;
}
}
return errors;
}
};
class Test_FDIV : public ITestCase {
public:
Test_FDIV(TestSuite* suite) : ITestCase(suite, "fdiv") {}
int setup(uint32_t n, void* src1, void* src2) override {
auto a = (float*)src1;
auto b = (float*)src2;
for (uint32_t i = 0; i < n; ++i) {
a[i] = fround((n - i) * (1.0f/n));
b[i] = fround((n + i) * (1.0f/n));
}
return 0;
}
int verify(uint32_t n, void* dst, const void* src1, const void* src2) override {
int errors = 0;
auto a = (float*)src1;
auto b = (float*)src2;
auto c = (float*)dst;
for (uint32_t i = 0; i < n; ++i) {
auto ref = a[i] / b[i];
if (!almost_equal(c[i], ref)) {
std::cout << "error at result #" << i << ": expected=" << ref << ", actual=" << c[i] << ", a=" << a[i] << ", b=" << b[i] << std::endl;
++errors;
}
}
return errors;
}
};
class Test_FDIV2 : public ITestCase {
public:
Test_FDIV2(TestSuite* suite) : ITestCase(suite, "fdiv2") {}
int setup(uint32_t n, void* src1, void* src2) override {
auto a = (float*)src1;
auto b = (float*)src2;
for (uint32_t i = 0; i < n; ++i) {
a[i] = fround((n - i) * (1.0f/n));
b[i] = fround((n + i) * (1.0f/n));
}
return 0;
}
int verify(uint32_t n, void* dst, const void* src1, const void* src2) override {
int errors = 0;
auto a = (float*)src1;
auto b = (float*)src2;
auto c = (float*)dst;
for (uint32_t i = 0; i < n; ++i) {
auto x = a[i] / b[i];
auto y = b[i] / a[i];
auto ref = x + y;
if (!almost_equal(c[i], ref)) {
std::cout << "error at result #" << i << ": expected=" << ref << ", actual=" << c[i] << ", a=" << a[i] << ", b=" << b[i] << std::endl;
++errors;
}
}
return errors;
}
};
class Test_FSQRT : public ITestCase {
public:
Test_FSQRT(TestSuite* suite) : ITestCase(suite, "fsqrt") {}
int setup(uint32_t n, void* src1, void* src2) override {
auto a = (float*)src1;
auto b = (float*)src2;
for (uint32_t i = 0; i < n; ++i) {
float q = 1.0f + (i % 64);
a[i] = q;
b[i] = q;
}
return 0;
}
int verify(uint32_t n, void* dst, const void* src1, const void* src2) override {
int errors = 0;
auto a = (float*)src1;
auto b = (float*)src2;
auto c = (float*)dst;
for (uint32_t i = 0; i < n; ++i) {
auto ref = sqrt(a[i] * b[i]);
if (!almost_equal(c[i], ref)) {
std::cout << "error at result #" << i << ": expected=" << ref << ", actual=" << c[i] << ", a=" << a[i] << ", b=" << b[i] << std::endl;
++errors;
}
}
return errors;
}
};
class Test_FTOI : public ITestCase {
public:
Test_FTOI(TestSuite* suite) : ITestCase(suite, "ftoi") {}
int setup(uint32_t n, void* src1, void* src2) override {
auto a = (float*)src1;
auto b = (float*)src2;
for (uint32_t i = 0; i < n; ++i) {
float q = fround(float(n/2) - i + (float(i) / n));
a[i] = q;
b[i] = q;
}
return 0;
}
int verify(uint32_t n, void* dst, const void* src1, const void* src2) override {
int errors = 0;
auto a = (float*)src1;
auto b = (float*)src2;
auto c = (int32_t*)dst;
for (uint32_t i = 0; i < n; ++i) {
auto x = a[i] + b[i];
auto ref = (int32_t)x;
if (c[i] != ref) {
std::cout << "error at result #" << i << ": expected=" << ref << ", actual=" << c[i] << ", a=" << a[i] << ", b=" << b[i] << std::endl;
++errors;
}
}
return errors;
}
};
class Test_FTOU : public ITestCase {
public:
Test_FTOU(TestSuite* suite) : ITestCase(suite, "ftou") {}
int setup(uint32_t n, void* src1, void* src2) override {
auto a = (float*)src1;
auto b = (float*)src2;
for (uint32_t i = 0; i < n; ++i) {
float q = fround(i + (float(i) / n));
a[i] = q;
b[i] = q;
}
return 0;
}
int verify(uint32_t n, void* dst, const void* src1, const void* src2) override {
int errors = 0;
auto a = (float*)src1;
auto b = (float*)src2;
auto c = (uint32_t*)dst;
for (uint32_t i = 0; i < n; ++i) {
auto x = a[i] + b[i];
auto ref = (uint32_t)x;
if (c[i] != ref) {
std::cout << "error at result #" << i << ": expected=" << ref << ", actual=" << c[i] << ", a=" << a[i] << ", b=" << b[i] << std::endl;
++errors;
}
}
return errors;
}
};
class Test_ITOF : public ITestCase {
public:
Test_ITOF(TestSuite* suite) : ITestCase(suite, "itof") {}
int setup(uint32_t n, void* src1, void* src2) override {
auto a = (int32_t*)src1;
auto b = (int32_t*)src2;
for (uint32_t i = 0; i < n; ++i) {
a[i] = n/2 - i;
b[i] = n/2 - i;
}
return 0;
}
int verify(uint32_t n, void* dst, const void* src1, const void* src2) override {
int errors = 0;
auto a = (int32_t*)src1;
auto b = (int32_t*)src2;
auto c = (float*)dst;
for (uint32_t i = 0; i < n; ++i) {
auto x = a[i] + b[i];
auto ref = (float)x;
if (!almost_equal(c[i], ref)) {
std::cout << "error at result #" << i << ": expected=" << ref << ", actual=" << c[i] << ", a=" << a[i] << ", b=" << b[i] << std::endl;
++errors;
}
}
return errors;
}
};
class Test_UTOF : public ITestCase {
public:
Test_UTOF(TestSuite* suite) : ITestCase(suite, "utof") {}
int setup(uint32_t n, void* src1, void* src2) override {
auto a = (uint32_t*)src1;
auto b = (uint32_t*)src2;
for (uint32_t i = 0; i < n; ++i) {
a[i] = i;
b[i] = i;
}
return 0;
}
int verify(uint32_t n, void* dst, const void* src1, const void* src2) override {
int errors = 0;
auto a = (uint32_t*)src1;
auto b = (uint32_t*)src2;
auto c = (float*)dst;
for (uint32_t i = 0; i < n; ++i) {
auto x = a[i] + b[i];
auto ref = (float)x;
if (!almost_equal(c[i], ref)) {
std::cout << "error at result #" << i << ": expected=" << ref << ", actual=" << c[i] << ", a=" << a[i] << ", b=" << b[i] << std::endl;
++errors;
}
}
return errors;
}
};
class Test_BAR : public ITestCase {
public:
Test_BAR(TestSuite* suite) : ITestCase(suite, "bar") {}
int setup(uint32_t n, void* src1, void* /*src2*/) override {
RT_CHECK(vx_dev_caps(suite_->device(), VX_CAPS_NUM_WARPS, &num_warps_));
if (num_warps_ == 1) {
std::cout << "Error: multiple warps configuration required!" << std::endl;
return -1;
}
RT_CHECK(vx_dev_caps(suite_->device(), VX_CAPS_NUM_THREADS, &num_threads_));
auto a = (uint32_t*)src1;
for (uint32_t i = 0; i < n; ++i) {
a[i] = i;
}
return 0;
}
int verify(uint32_t n, void* dst, const void* src1, const void* /*src2*/) override {
int errors = 0;
auto a = (uint32_t*)src1;
auto c = (uint32_t*)dst;
for (uint32_t i = 0; i < n; ++i) {
auto tid = i % num_threads_;
auto wid = (i / num_threads_) % num_warps_;
auto cid = i / (num_warps_ * num_threads_);
auto src_idx = (cid * num_warps_ + (num_warps_ - 1 - wid)) * num_threads_ + tid;
uint32_t ref = a[src_idx];
if (c[i] != ref) {
std::cout << "error at result #" << i << ": expected=" << std::hex << ref << ", actual=" << c[i] << std::endl;
++errors;
}
}
return errors;
}
uint64_t num_warps_;
uint64_t num_threads_;
};
class Test_GBAR : public ITestCase {
public:
Test_GBAR(TestSuite* suite) : ITestCase(suite, "gbar") {}
int setup(uint32_t n, void* src1, void* /*src2*/) override {
RT_CHECK(vx_dev_caps(suite_->device(), VX_CAPS_NUM_CORES, &num_cores_));
if (num_cores_ == 1) {
std::cout << "Error: multiple cores configuration required!" << std::endl;
return -1;
}
RT_CHECK(vx_dev_caps(suite_->device(), VX_CAPS_NUM_WARPS, &num_warps_));
RT_CHECK(vx_dev_caps(suite_->device(), VX_CAPS_NUM_THREADS, &num_threads_));
auto a = (uint32_t*)src1;
for (uint32_t i = 0; i < n; ++i) {
a[i] = i;
}
return 0;
}
int verify(uint32_t n, void* dst, const void* src1, const void* /*src2*/) override {
int errors = 0;
auto a = (uint32_t*)src1;
auto c = (uint32_t*)dst;
for (uint32_t i = 0; i < n; ++i) {
auto tid = i % num_threads_;
auto wid = (i / num_threads_) % num_warps_;
auto cid = i / (num_warps_ * num_threads_);
auto src_idx = ((num_cores_ - 1 - cid) * num_warps_ + (num_warps_ - 1 - wid)) * num_threads_ + tid;
uint32_t ref = a[src_idx];
if (c[i] != ref) {
std::cout << "error at result #" << i << ": expected=" << std::hex << ref << ", actual=" << c[i] << std::endl;
++errors;
}
}
return errors;
}
uint64_t num_cores_;
uint64_t num_warps_;
uint64_t num_threads_;
};
///////////////////////////////////////////////////////////////////////////////
TestSuite::TestSuite(vx_device_h device)
: device_(device) {
this->add_test(new Test_IADD(this));
this->add_test(new Test_IMUL(this));
this->add_test(new Test_IDIV(this));
this->add_test(new Test_IDIV_MUL(this));
this->add_test(new Test_FADD(this));
this->add_test(new Test_FSUB(this));
this->add_test(new Test_FMUL(this));
this->add_test(new Test_FMADD(this));
this->add_test(new Test_FMSUB(this));
this->add_test(new Test_FNMADD(this));
this->add_test(new Test_FNMSUB(this));
this->add_test(new Test_FNMADD_MADD(this));
this->add_test(new Test_FDIV(this));
this->add_test(new Test_FDIV2(this));
this->add_test(new Test_FSQRT(this));
this->add_test(new Test_FTOI(this));
this->add_test(new Test_FTOU(this));
this->add_test(new Test_ITOF(this));
this->add_test(new Test_UTOF(this));
this->add_test(new Test_BAR(this));
this->add_test(new Test_GBAR(this));
}
TestSuite::~TestSuite() {
for (size_t i = 0; i < _tests.size(); ++i) {
delete _tests[i];
}
}
ITestCase* TestSuite::get_test(int testid) const {
return _tests.at(testid);
}
void TestSuite::add_test(ITestCase* test) {
_tests.push_back(test);
}
size_t TestSuite::size() const {
return _tests.size();
}
vx_device_h TestSuite::device() const {
return device_;
}

View File

@@ -1,9 +0,0 @@
PROJECT = fence
SRCS = main.cpp
VX_SRCS = kernel.cpp
OPTS ?= -n64
include ../common.mk

View File

@@ -1,14 +0,0 @@
#ifndef _COMMON_H_
#define _COMMON_H_
#define KERNEL_ARG_DEV_MEM_ADDR 0x7ffff000
typedef struct {
uint32_t num_tasks;
uint32_t task_size;
uint64_t src0_addr;
uint64_t src1_addr;
uint64_t dst_addr;
} kernel_arg_t;
#endif

View File

@@ -1,24 +0,0 @@
#include <stdint.h>
#include <vx_intrinsics.h>
#include <vx_spawn.h>
#include "common.h"
void kernel_body(int task_id, kernel_arg_t* __UNIFORM__ arg) {
uint32_t count = arg->task_size;
int32_t* src0_ptr = (int32_t*)arg->src0_addr;
int32_t* src1_ptr = (int32_t*)arg->src1_addr;
int32_t* dst_ptr = (int32_t*)arg->dst_addr;
uint32_t offset = task_id * count;
for (uint32_t i = 0; i < count; ++i) {
dst_ptr[offset+i] = src0_ptr[offset+i] + src1_ptr[offset+i];
}
vx_fence();
}
int main() {
kernel_arg_t* arg = (kernel_arg_t*)KERNEL_ARG_DEV_MEM_ADDR;
vx_spawn_tasks(arg->num_tasks, (vx_spawn_tasks_cb)kernel_body, arg);
return 0;
}

View File

@@ -1,194 +0,0 @@
#include <iostream>
#include <unistd.h>
#include <string.h>
#include <vector>
#include <vortex.h>
#include "common.h"
#define RT_CHECK(_expr) \
do { \
int _ret = _expr; \
if (0 == _ret) \
break; \
printf("Error: '%s' returned %d!\n", #_expr, (int)_ret); \
cleanup(); \
exit(-1); \
} while (false)
///////////////////////////////////////////////////////////////////////////////
const char* kernel_file = "kernel.bin";
uint32_t count = 0;
vx_device_h device = nullptr;
std::vector<uint8_t> staging_buf;
kernel_arg_t kernel_arg = {};
static void show_usage() {
std::cout << "Vortex Test." << std::endl;
std::cout << "Usage: [-k: kernel] [-n words] [-h: help]" << std::endl;
}
static void parse_args(int argc, char **argv) {
int c;
while ((c = getopt(argc, argv, "n:k:h?")) != -1) {
switch (c) {
case 'n':
count = atoi(optarg);
break;
case 'k':
kernel_file = optarg;
break;
case 'h':
case '?': {
show_usage();
exit(0);
} break;
default:
show_usage();
exit(-1);
}
}
}
void cleanup() {
if (device) {
vx_mem_free(device, kernel_arg.src0_addr);
vx_mem_free(device, kernel_arg.src1_addr);
vx_mem_free(device, kernel_arg.dst_addr);
vx_dev_close(device);
}
}
int run_test(const kernel_arg_t& kernel_arg,
uint32_t buf_size,
uint32_t num_points) {
// start device
std::cout << "start device" << std::endl;
RT_CHECK(vx_start(device));
// wait for completion
std::cout << "wait for completion" << std::endl;
RT_CHECK(vx_ready_wait(device, VX_MAX_TIMEOUT));
// download destination buffer
std::cout << "download destination buffer" << std::endl;
RT_CHECK(vx_copy_from_dev(device, staging_buf.data(), kernel_arg.dst_addr, buf_size));
// verify result
std::cout << "verify result" << std::endl;
{
int errors = 0;
auto buf_ptr = (int32_t*)staging_buf.data();
for (uint32_t i = 0; i < num_points; ++i) {
int ref = i + i;
int cur = buf_ptr[i];
if (cur != ref) {
std::cout << "error at result #" << std::dec << i
<< std::hex << ": actual 0x" << cur << ", expected 0x" << ref << std::endl;
++errors;
}
}
if (errors != 0) {
std::cout << "Found " << std::dec << errors << " errors!" << std::endl;
std::cout << "FAILED!" << std::endl;
return 1;
}
}
return 0;
}
int main(int argc, char *argv[]) {
// parse command arguments
parse_args(argc, argv);
if (count == 0) {
count = 1;
}
// open device connection
std::cout << "open device connection" << std::endl;
RT_CHECK(vx_dev_open(&device));
uint64_t num_cores, num_warps, num_threads;
RT_CHECK(vx_dev_caps(device, VX_CAPS_NUM_CORES, &num_cores));
RT_CHECK(vx_dev_caps(device, VX_CAPS_NUM_WARPS, &num_warps));
RT_CHECK(vx_dev_caps(device, VX_CAPS_NUM_THREADS, &num_threads));
uint32_t num_tasks = num_cores * num_warps * num_threads;
uint32_t num_points = count * num_tasks;
uint32_t buf_size = num_points * sizeof(int32_t);
std::cout << "number of points: " << num_points << std::endl;
std::cout << "buffer size: " << buf_size << " bytes" << std::endl;
// upload program
std::cout << "upload program" << std::endl;
RT_CHECK(vx_upload_kernel_file(device, kernel_file));
// allocate device memory
std::cout << "allocate device memory" << std::endl;
RT_CHECK(vx_mem_alloc(device, buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.src0_addr));
RT_CHECK(vx_mem_alloc(device, buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.src1_addr));
RT_CHECK(vx_mem_alloc(device, buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.dst_addr));
kernel_arg.num_tasks = num_tasks;
kernel_arg.task_size = count;
std::cout << "dev_src0=0x" << std::hex << kernel_arg.src0_addr << std::endl;
std::cout << "dev_src1=0x" << std::hex << kernel_arg.src1_addr << std::endl;
std::cout << "dev_dst=0x" << std::hex << kernel_arg.dst_addr << std::endl;
// allocate staging buffer
std::cout << "allocate staging buffer" << std::endl;
uint32_t alloc_size = std::max<uint32_t>(buf_size, sizeof(kernel_arg_t));
staging_buf.resize(alloc_size);
// upload kernel argument
std::cout << "upload kernel argument" << std::endl;
memcpy(staging_buf.data(), &kernel_arg, sizeof(kernel_arg_t));
RT_CHECK(vx_copy_to_dev(device, KERNEL_ARG_DEV_MEM_ADDR, staging_buf.data(), sizeof(kernel_arg_t)));
// upload source buffer0
{
std::cout << "upload source buffer0" << std::endl;
auto buf_ptr = (int32_t*)staging_buf.data();
for (uint32_t i = 0; i < num_points; ++i) {
buf_ptr[i] = i-1;
}
RT_CHECK(vx_copy_to_dev(device, kernel_arg.src0_addr, staging_buf.data(), buf_size));
}
// upload source buffer1
{
std::cout << "upload source buffer1" << std::endl;
auto buf_ptr = (int32_t*)staging_buf.data();
for (uint32_t i = 0; i < num_points; ++i) {
buf_ptr[i] = i+1;
}
RT_CHECK(vx_copy_to_dev(device, kernel_arg.src1_addr, staging_buf.data(), buf_size));
}
// clear destination buffer
{
std::cout << "clear destination buffer" << std::endl;
auto buf_ptr = (int32_t*)staging_buf.data();
for (uint32_t i = 0; i < num_points; ++i) {
buf_ptr[i] = 0xdeadbeef;
}
RT_CHECK(vx_copy_to_dev(device, kernel_arg.dst_addr, staging_buf.data(), buf_size));
}
// run tests
std::cout << "run tests" << std::endl;
RT_CHECK(run_test(kernel_arg, buf_size, num_points));
// cleanup
std::cout << "cleanup" << std::endl;
cleanup();
std::cout << "PASSED!" << std::endl;
return 0;
}

View File

@@ -1,7 +1,5 @@
PROJECT = flash_attention
SRCS = main.cpp common.h
# VX_SRCS = kernel.cpp
# VX_SRCS = kernel.gemmini.warpspec.cpp
VX_SRCS = kernel.gemmini.cpp

View File

@@ -1,166 +0,0 @@
#include <iostream>
#include <fstream>
#include <unistd.h>
#include <string.h>
#include <vortex.h>
#include <vector>
#include <cassert>
#include "common.h"
#include "half.hpp"
using half_float::half;
using half_float::half_cast;
#define RT_CHECK(_expr) \
do { \
int _ret = _expr; \
if (0 == _ret) \
break; \
printf("Error: '%s' returned %d!\n", #_expr, (int)_ret); \
cleanup(); \
exit(-1); \
} while (false)
///////////////////////////////////////////////////////////////////////////////
const char* kernel_file = "kernel.bin";
uint32_t count = 0;
std::vector<float> ref_data;
vx_device_h device = nullptr;
std::vector<uint8_t> staging_buf;
kernel_arg_t kernel_arg = {};
static void show_usage() {
std::cout << "Vortex Test." << std::endl;
std::cout << "Usage: [-k: kernel] [-n words] [-h: help]" << std::endl;
}
static void parse_args(int argc, char **argv) {
int c;
while ((c = getopt(argc, argv, "n:k:h?")) != -1) {
switch (c) {
case 'n':
count = atoi(optarg);
break;
case 'k':
kernel_file = optarg;
break;
case 'h':
case '?': {
show_usage();
exit(0);
} break;
default:
show_usage();
exit(-1);
}
}
}
void cleanup() {
if (device) {
// vx_mem_free(device, kernel_arg.addr_a);
// vx_mem_free(device, kernel_arg.addr_b);
// vx_mem_free(device, kernel_arg.addr_c);
vx_dev_close(device);
}
}
int run_test(const kernel_arg_t& kernel_arg,
uint32_t buf_size) {
// start device
std::cout << "start device" << std::endl;
RT_CHECK(vx_start(device));
// wait for completion
std::cout << "wait for completion" << std::endl;
RT_CHECK(vx_ready_wait(device, VX_MAX_TIMEOUT));
// download destination buffer
std::cout << "download destination buffer" << std::endl;
RT_CHECK(vx_copy_from_dev(device, staging_buf.data(), kernel_arg.addr_o, buf_size));
return 0;
}
int main(int argc, char *argv[]) {
// parse command arguments
parse_args(argc, argv);
if (count == 0) {
count = 1;
}
std::srand(50);
// open device connection
std::cout << "open device connection" << std::endl;
RT_CHECK(vx_dev_open(&device));
uint32_t dim_seqlen = 128;
uint32_t dim_headdim = 64;
using float_type = half;
uint32_t dst_buf_size =
dim_seqlen * dim_headdim * sizeof(ref_data[0]);
// upload program
std::cout << "upload program" << std::endl;
RT_CHECK(vx_upload_kernel_file(device, kernel_file));
// allocate device memory
std::cout << "allocate device memory" << std::endl;
kernel_arg.addr_q = 0xa0000000;
kernel_arg.addr_k = 0xa1000000;
kernel_arg.addr_v = 0xa2000000;
kernel_arg.addr_o = 0xc0000000;
kernel_arg.dim_seqlen = dim_seqlen;
kernel_arg.dim_headdim = dim_headdim;
std::cout << "dev_addr_q=0x" << std::hex << kernel_arg.addr_q << std::endl;
std::cout << "dev_addr_k=0x" << std::hex << kernel_arg.addr_k << std::endl;
std::cout << "dev_addr_v=0x" << std::hex << kernel_arg.addr_v << std::endl;
std::cout << "dev_addr_o=0x" << std::hex << kernel_arg.addr_o << std::endl;
// allocate staging buffer
{
std::cout << "allocate staging buffer" << std::endl;
uint32_t staging_buf_size = sizeof(kernel_arg_t);
staging_buf.resize(staging_buf_size);
}
// upload kernel argument
{
std::cout << "upload kernel argument" << std::endl;
auto buf_ptr = staging_buf.data();
memcpy(buf_ptr, &kernel_arg, sizeof(kernel_arg_t));
RT_CHECK(vx_copy_to_dev(device, KERNEL_ARG_DEV_MEM_ADDR, staging_buf.data(), sizeof(kernel_arg_t)));
std::cout << "uploading argument buffer to device, device mem address="
<< std::hex << KERNEL_ARG_DEV_MEM_ADDR << ", size=" << std::dec
<< sizeof(kernel_arg_t) << " bytes\n";
std::ofstream file("args.bin", std::ios::binary | std::ios::out);
if (!file) {
std::cerr << "error: failed to open args.bin for writing\n";
exit(EXIT_FAILURE);
}
file.write(reinterpret_cast<char *>(staging_buf.data()),
sizeof(kernel_arg_t));
file.close();
}
// run tests
std::cout << "run tests" << std::endl;
RT_CHECK(run_test(kernel_arg, dst_buf_size));
std::cout << "PASSED!" << std::endl;
// cleanup
std::cout << "cleanup" << std::endl;
cleanup();
return 0;
}

View File

@@ -1,5 +0,0 @@
*.bin
*.dump
*.elf
flops
.depend

View File

@@ -1,9 +0,0 @@
PROJECT = flops
SRCS = main.cpp common.h
VX_SRCS = kernel.cpp
OPTS ?= -n16
include ../common.mk

View File

@@ -1,15 +0,0 @@
#ifndef _COMMON_H_
#define _COMMON_H_
#include <cstdint>
#define KERNEL_ARG_DEV_MEM_ADDR 0x7fff0000
#define DEV_SMEM_START_ADDR 0xff000000
typedef struct {
uint32_t size;
uint32_t addr_src;
uint32_t addr_dst;
} kernel_arg_t;
#endif

Binary file not shown.

View File

@@ -1,41 +0,0 @@
#include <stdint.h>
#include <vx_intrinsics.h>
#include <vx_spawn.h>
#include "common.h"
void kernel_body(int task_id, kernel_arg_t *__UNIFORM__ arg) {
const float *A = (const float *)arg->addr_src;
float *C = (float *)arg->addr_dst;
int incr = A[task_id];
float sum = 0.0f;
float sum1 = 0.0f;
float sum2 = 0.0f;
float sum3 = 0.0f;
float sum4 = 0.0f;
float sum5 = 0.0f;
#pragma unroll 8
for (int i = 0; i < 5000; i++) {
sum1 = sum2 + 5.0f;
sum2 = sum3 + 5.0f;
sum3 = sum4 + 5.0f;
sum4 = sum5 + 5.0f;
sum5 = sum1 + 5.0f;
}
sum = sum1 + sum2 + sum3 + sum4 + sum5;
C[task_id] = static_cast<float>(sum);
}
int main() {
kernel_arg_t *arg = (kernel_arg_t *)KERNEL_ARG_DEV_MEM_ADDR;
const uint32_t grid_size = arg->size;
#ifdef RADIANCE
vx_spawn_tasks_cluster(grid_size, (vx_spawn_tasks_cb)kernel_body, arg);
#else
// NOTE: This kernel assumes contiguous thread scheduling for efficient shared
// memory allocation, and therefore does not work with original vx_spawn_tasks
vx_spawn_tasks_contiguous(grid_size, (vx_spawn_tasks_cb)kernel_body, arg);
#endif
return 0;
}

View File

@@ -1,252 +0,0 @@
#include <iostream>
#include <fstream>
#include <unistd.h>
#include <string.h>
#include <vortex.h>
#include <vector>
#include "common.h"
#define RT_CHECK(_expr) \
do { \
int _ret = _expr; \
if (0 == _ret) \
break; \
printf("Error: '%s' returned %d!\n", #_expr, (int)_ret); \
cleanup(); \
exit(-1); \
} while (false)
///////////////////////////////////////////////////////////////////////////////
const char* kernel_file = "kernel.bin";
uint32_t count = 0;
std::vector<float> src_data;
std::vector<float> ref_data;
vx_device_h device = nullptr;
std::vector<uint8_t> staging_buf;
kernel_arg_t kernel_arg = {};
static void show_usage() {
std::cout << "Vortex Test." << std::endl;
std::cout << "Usage: [-k: kernel] [-n words] [-h: help]" << std::endl;
}
static void parse_args(int argc, char **argv) {
int c;
while ((c = getopt(argc, argv, "n:k:h?")) != -1) {
switch (c) {
case 'n':
count = atoi(optarg);
break;
case 'k':
kernel_file = optarg;
break;
case 'h':
case '?': {
show_usage();
exit(0);
} break;
default:
show_usage();
exit(-1);
}
}
}
void cleanup() {
if (device) {
// vx_mem_free(device, kernel_arg.addr_a);
// vx_mem_free(device, kernel_arg.addr_b);
// vx_mem_free(device, kernel_arg.addr_c);
vx_dev_close(device);
}
}
void generate_source_data(size_t size) {
src_data.resize(size);
for (uint32_t i = 0; i < src_data.size(); ++i) {
src_data[i] = static_cast<float>(i);
}
}
void generate_reference_data(size_t size) {
ref_data.resize(size);
for (uint32_t i = 0; i < ref_data.size(); ++i) {
ref_data[i] = static_cast<float>(i) * 1000.0f;
}
}
int run_test(const kernel_arg_t& kernel_arg,
uint32_t buf_size,
uint32_t size) {
// start device
std::cout << "start device" << std::endl;
RT_CHECK(vx_start(device));
// wait for completion
std::cout << "wait for completion" << std::endl;
RT_CHECK(vx_ready_wait(device, VX_MAX_TIMEOUT));
// download destination buffer
std::cout << "download destination buffer" << std::endl;
RT_CHECK(vx_copy_from_dev(device, staging_buf.data(), kernel_arg.addr_dst, buf_size));
std::cout << "downloading result C matrix from device, device mem address="
<< std::hex << kernel_arg.addr_dst << ", size=" << std::dec
<< buf_size << " bytes\n";
std::ofstream file("output.bin", std::ios::binary | std::ios::out);
if (!file) {
std::cerr << "error: failed to open output.bin for writing\n";
exit(EXIT_FAILURE);
}
file.write(reinterpret_cast<char *>(staging_buf.data()), buf_size);
file.close();
std::ofstream ref_file("reference.bin", std::ios::binary | std::ios::out);
if (!ref_file) {
std::cerr << "error: failed to open reference.bin for writing\n";
exit(EXIT_FAILURE);
}
ref_file.write(reinterpret_cast<char *>(ref_data.data()), buf_size);
ref_file.close();
// verify result
std::cout << "verify result" << std::endl;
{
int errors = 0;
auto buf_ptr = (float*)staging_buf.data();
for (uint32_t i = 0; i < size; ++i) {
float ref = ref_data.at(i);
float cur = buf_ptr[i];
if (std::abs((cur - ref) / ref) > 1e-6) {
std::cout << "error at result #" << std::dec << i
<< std::hex << ": actual=" << cur << ", expected=" << ref << std::endl;
++errors;
}
}
if (errors != 0) {
std::cout << "Found " << std::dec << errors << " errors!" << std::endl;
std::cout << "FAILED!" << std::endl;
return 1;
}
}
return 0;
}
int main(int argc, char *argv[]) {
// parse command arguments
parse_args(argc, argv);
if (count == 0) {
count = 1;
}
std::srand(50);
// open device connection
std::cout << "open device connection" << std::endl;
RT_CHECK(vx_dev_open(&device));
size_t size = 64;
generate_source_data(size);
generate_reference_data(size);
uint32_t src_buf_size = src_data.size() * sizeof(src_data[0]);
uint32_t dst_buf_size = ref_data.size() * sizeof(ref_data[0]);
std::cout << "buffer size: " << dst_buf_size << " bytes" << std::endl;
// upload program
std::cout << "upload program" << std::endl;
RT_CHECK(vx_upload_kernel_file(device, kernel_file));
// allocate device memory
std::cout << "allocate device memory" << std::endl;
// RT_CHECK(vx_mem_alloc(device, src_buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.addr_src));
// RT_CHECK(vx_mem_alloc(device, dst_buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.addr_dst));
kernel_arg.addr_src = 0x20000UL;
kernel_arg.addr_dst = 0xc0000000UL;
kernel_arg.size = size;
std::cout << "dev_addr_src=0x" << std::hex << kernel_arg.addr_src << std::endl;
std::cout << "dev_addr_dst=0x" << std::hex << kernel_arg.addr_dst << std::endl;
// allocate staging buffer
{
std::cout << "allocate staging buffer" << std::endl;
uint32_t staging_buf_size = std::max<uint32_t>(
src_buf_size,
std::max<uint32_t>(
src_buf_size,
std::max<uint32_t>(dst_buf_size, sizeof(kernel_arg_t))));
staging_buf.resize(staging_buf_size);
}
// upload kernel argument
{
std::cout << "upload kernel argument" << std::endl;
auto buf_ptr = staging_buf.data();
memcpy(buf_ptr, &kernel_arg, sizeof(kernel_arg_t));
RT_CHECK(vx_copy_to_dev(device, KERNEL_ARG_DEV_MEM_ADDR, staging_buf.data(), sizeof(kernel_arg_t)));
std::cout << "uploading argument buffer to device, device mem address="
<< std::hex << KERNEL_ARG_DEV_MEM_ADDR << ", size=" << std::dec
<< sizeof(kernel_arg_t) << " bytes\n";
std::ofstream file("args.bin", std::ios::binary | std::ios::out);
if (!file) {
std::cerr << "error: failed to open args.bin for writing\n";
exit(EXIT_FAILURE);
}
file.write(reinterpret_cast<char *>(staging_buf.data()),
sizeof(kernel_arg_t));
file.close();
}
// upload source buffer
{
{
auto buf_ptr = staging_buf.data();
memcpy(buf_ptr, src_data.data(), src_data.size() * sizeof(float));
RT_CHECK(vx_copy_to_dev(device, kernel_arg.addr_src, staging_buf.data(),
src_buf_size));
std::cout << "uploading source data to device, device mem address="
<< std::hex << kernel_arg.addr_src << ", size=" << std::dec
<< src_buf_size << " bytes\n";
std::ofstream file("input.a.bin", std::ios::binary | std::ios::out);
if (!file) {
std::cerr << "error: failed to open input.a.bin for writing\n";
exit(EXIT_FAILURE);
}
file.write(reinterpret_cast<char *>(buf_ptr), src_buf_size);
file.close();
}
}
// clear destination buffer
{
std::cout << "clear destination buffer" << std::endl;
auto buf_ptr = (int32_t*)staging_buf.data();
for (uint32_t i = 0; i < ref_data.size(); ++i) {
buf_ptr[i] = 0xdeadbeef;
}
RT_CHECK(vx_copy_to_dev(device, kernel_arg.addr_dst, staging_buf.data(), dst_buf_size));
}
// run tests
std::cout << "run tests" << std::endl;
RT_CHECK(run_test(kernel_arg, dst_buf_size, kernel_arg.size));
std::cout << "PASSED!" << std::endl;
// cleanup
std::cout << "cleanup" << std::endl;
cleanup();
return 0;
}

View File

@@ -1,7 +1,5 @@
PROJECT = idle
SRCS = main.cpp common.h
VX_SRCS = kernel.cpp
OPTS ?= -n16

View File

@@ -1,274 +0,0 @@
#include <iostream>
#include <fstream>
#include <unistd.h>
#include <string.h>
#include <vortex.h>
#include <vector>
#include "common.h"
#define RT_CHECK(_expr) \
do { \
int _ret = _expr; \
if (0 == _ret) \
break; \
printf("Error: '%s' returned %d!\n", #_expr, (int)_ret); \
cleanup(); \
exit(-1); \
} while (false)
///////////////////////////////////////////////////////////////////////////////
const char* kernel_file = "kernel.bin";
uint32_t count = 0;
std::vector<float> src_a_data;
std::vector<float> src_b_data;
std::vector<float> ref_data;
vx_device_h device = nullptr;
std::vector<uint8_t> staging_buf;
kernel_arg_t kernel_arg = {};
static void show_usage() {
std::cout << "Vortex Test." << std::endl;
std::cout << "Usage: [-k: kernel] [-n words] [-h: help]" << std::endl;
}
static void parse_args(int argc, char **argv) {
int c;
while ((c = getopt(argc, argv, "n:k:h?")) != -1) {
switch (c) {
case 'n':
count = atoi(optarg);
break;
case 'k':
kernel_file = optarg;
break;
case 'h':
case '?': {
show_usage();
exit(0);
} break;
default:
show_usage();
exit(-1);
}
}
}
void cleanup() {
if (device) {
vx_mem_free(device, kernel_arg.addr_a);
vx_mem_free(device, kernel_arg.addr_b);
vx_mem_free(device, kernel_arg.addr_c);
vx_dev_close(device);
}
}
void generate_source_matrix(uint32_t dim_m, uint32_t dim_n, uint32_t dim_k) {
src_a_data.resize(dim_m * dim_k);
src_b_data.resize(dim_k * dim_n);
for (uint32_t i = 0; i < src_a_data.size(); ++i) {
src_a_data[i] = static_cast<float>(i);
std::cout << "A: " << i << ": value=" << src_a_data[i] << std::endl;
}
for (uint32_t i = 0; i < src_b_data.size(); ++i) {
src_b_data[i] = static_cast<float>(i);
std::cout << "B: " << i << ": value=" << src_b_data[i] << std::endl;
}
}
void generate_reference_matmul(uint32_t dim_m, uint32_t dim_n, uint32_t dim_k) {
ref_data.resize(dim_m * dim_n);
for (uint32_t i = 0; i < dim_m; ++i) {
for (uint32_t j = 0; j < dim_n; ++j) {
float ref = 0.0f;
for (uint32_t k = 0; k < dim_k; ++k) {
ref += src_a_data[dim_k * i + k] * src_b_data[dim_n * k + j];
}
ref_data.at(dim_n * i + j) = ref;
}
}
}
int run_test(const kernel_arg_t& kernel_arg,
uint32_t buf_size,
uint32_t dim_m, uint32_t dim_n) {
// start device
std::cout << "start device" << std::endl;
RT_CHECK(vx_start(device));
// wait for completion
std::cout << "wait for completion" << std::endl;
RT_CHECK(vx_ready_wait(device, VX_MAX_TIMEOUT));
// download destination buffer
std::cout << "download destination buffer" << std::endl;
RT_CHECK(vx_copy_from_dev(device, staging_buf.data(), kernel_arg.addr_c, buf_size));
// verify result
std::cout << "verify result" << std::endl;
{
int errors = 0;
auto buf_ptr = (float*)staging_buf.data();
for (uint32_t i = 0; i < dim_m * dim_n; ++i) {
float ref = ref_data.at(i);
float cur = buf_ptr[i];
if (std::abs((cur - ref) / ref) > 1e-6) {
std::cout << "error at result #" << std::dec << i
<< std::hex << ": actual=" << cur << ", expected=" << ref << std::endl;
++errors;
}
}
if (errors != 0) {
std::cout << "Found " << std::dec << errors << " errors!" << std::endl;
std::cout << "FAILED!" << std::endl;
return 1;
}
}
return 0;
}
int main(int argc, char *argv[]) {
// parse command arguments
parse_args(argc, argv);
if (count == 0) {
count = 1;
}
std::srand(50);
// open device connection
std::cout << "open device connection" << std::endl;
RT_CHECK(vx_dev_open(&device));
// FIXME: hardcoded
uint32_t dim_m = 64;
uint32_t dim_n = 64;
uint32_t dim_k = 64;
generate_source_matrix(dim_m, dim_n, dim_k);
generate_reference_matmul(dim_m, dim_n, dim_k);
uint32_t src_a_buf_size = src_a_data.size() * sizeof(src_a_data[0]);
uint32_t src_b_buf_size = src_b_data.size() * sizeof(src_b_data[0]);
uint32_t dst_buf_size = ref_data.size() * sizeof(src_a_data[0]);
std::cout << "buffer size: " << dst_buf_size << " bytes" << std::endl;
// upload program
std::cout << "upload program" << std::endl;
RT_CHECK(vx_upload_kernel_file(device, kernel_file));
// allocate device memory
std::cout << "allocate device memory" << std::endl;
RT_CHECK(vx_mem_alloc(device, src_a_buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.addr_a));
RT_CHECK(vx_mem_alloc(device, src_b_buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.addr_b));
RT_CHECK(vx_mem_alloc(device, dst_buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.addr_c));
kernel_arg.dim_m = dim_m;
kernel_arg.dim_n = dim_n;
kernel_arg.dim_k = dim_k;
std::cout << "dev_addr_a=0x" << std::hex << kernel_arg.addr_a << std::endl;
std::cout << "dev_addr_b=0x" << std::hex << kernel_arg.addr_b << std::endl;
std::cout << "dev_addr_c=0x" << std::hex << kernel_arg.addr_c << std::endl;
// allocate staging buffer
{
std::cout << "allocate staging buffer" << std::endl;
uint32_t staging_buf_size = std::max<uint32_t>(
src_a_buf_size,
std::max<uint32_t>(
src_b_buf_size,
std::max<uint32_t>(dst_buf_size, sizeof(kernel_arg_t))));
staging_buf.resize(staging_buf_size);
}
// upload kernel argument
{
std::cout << "upload kernel argument" << std::endl;
auto buf_ptr = staging_buf.data();
kernel_arg.addr_a = (uint64_t) 0xa0000000ULL;
kernel_arg.addr_b = (uint64_t) 0xa1000000ULL;
kernel_arg.addr_c = (uint64_t) 0xc0000000ULL;
memcpy(buf_ptr, &kernel_arg, sizeof(kernel_arg_t));
std::cout << "uploading argument buffer to device, device mem address="
<< std::hex << KERNEL_ARG_DEV_MEM_ADDR << ", size=" << std::dec
<< sizeof(kernel_arg_t) << " bytes\n";
std::ofstream file("args.bin", std::ios::binary | std::ios::out);
if (!file) {
std::cerr << "error: failed to open args.bin for writing\n";
exit(EXIT_FAILURE);
}
file.write(reinterpret_cast<char *>(staging_buf.data()),
sizeof(kernel_arg_t));
file.close();
RT_CHECK(vx_copy_to_dev(device, KERNEL_ARG_DEV_MEM_ADDR, staging_buf.data(), sizeof(kernel_arg_t)));
}
// upload source buffer
{
{
auto buf_ptr = staging_buf.data();
memcpy(buf_ptr, src_a_data.data(), src_a_data.size() * sizeof(float));
RT_CHECK(vx_copy_to_dev(device, kernel_arg.addr_a, staging_buf.data(),
src_a_buf_size));
std::cout << "uploading source A matrix to device, device mem address="
<< std::hex << kernel_arg.addr_a << ", size=" << std::dec
<< src_a_buf_size << " bytes\n";
std::ofstream file("input.a.bin", std::ios::binary | std::ios::out);
if (!file) {
std::cerr << "error: failed to open args.bin for writing\n";
exit(EXIT_FAILURE);
}
file.write(reinterpret_cast<char *>(buf_ptr), src_a_buf_size);
file.close();
}
{
auto buf_ptr = staging_buf.data();
memcpy(buf_ptr, src_b_data.data(), src_b_data.size() * sizeof(float));
RT_CHECK(vx_copy_to_dev(device, kernel_arg.addr_b, staging_buf.data(),
src_b_buf_size));
std::cout << "uploading source B matrix to device, device mem address="
<< std::hex << kernel_arg.addr_b << ", size=" << std::dec
<< src_b_buf_size << " bytes\n";
std::ofstream file("input.b.bin", std::ios::binary | std::ios::out);
if (!file) {
std::cerr << "error: failed to open args.bin for writing\n";
exit(EXIT_FAILURE);
}
file.write(reinterpret_cast<char *>(buf_ptr), src_b_buf_size);
file.close();
}
}
// clear destination buffer
{
std::cout << "clear destination buffer" << std::endl;
auto buf_ptr = (int32_t*)staging_buf.data();
for (uint32_t i = 0; i < ref_data.size(); ++i) {
buf_ptr[i] = 0xdeadbeef;
}
RT_CHECK(vx_copy_to_dev(device, kernel_arg.addr_c, staging_buf.data(), dst_buf_size));
}
// run tests
std::cout << "run tests" << std::endl;
RT_CHECK(run_test(kernel_arg, dst_buf_size, kernel_arg.dim_m, kernel_arg.dim_n));
std::cout << "PASSED!" << std::endl;
// cleanup
std::cout << "cleanup" << std::endl;
cleanup();
return 0;
}

View File

@@ -1,9 +0,0 @@
PROJECT = io_addr
SRCS = main.cpp
VX_SRCS = kernel.cpp
OPTS ?= -n16
include ../common.mk

View File

@@ -1,12 +0,0 @@
#ifndef _COMMON_H_
#define _COMMON_H_
#define KERNEL_ARG_DEV_MEM_ADDR 0x7ffff000
typedef struct {
uint32_t num_points;
uint64_t src_addr;
uint64_t dst_addr;
} kernel_arg_t;
#endif

View File

@@ -1,19 +0,0 @@
#include <stdint.h>
#include <vx_intrinsics.h>
#include <vx_spawn.h>
#include "common.h"
void kernel_body(int task_id, kernel_arg_t* __UNIFORM__ arg) {
uint64_t* src_ptr = (uint64_t*)arg->src_addr;
uint32_t* dst_ptr = (uint32_t*)arg->dst_addr;
int32_t* addr_ptr = (int32_t*)(src_ptr[task_id]);
dst_ptr[task_id] = *addr_ptr;
}
int main() {
kernel_arg_t* arg = (kernel_arg_t*)KERNEL_ARG_DEV_MEM_ADDR;
vx_spawn_tasks(arg->num_points, (vx_spawn_tasks_cb)kernel_body, arg);
return 0;
}

View File

@@ -1,237 +0,0 @@
#include <iostream>
#include <unistd.h>
#include <string.h>
#include <vortex.h>
#include <vector>
#include <VX_config.h>
#include "common.h"
#define NUM_ADDRS 16
#define RT_CHECK(_expr) \
do { \
int _ret = _expr; \
if (0 == _ret) \
break; \
printf("Error: '%s' returned %d!\n", #_expr, (int)_ret); \
cleanup(); \
exit(-1); \
} while (false)
///////////////////////////////////////////////////////////////////////////////
const char* kernel_file = "kernel.bin";
uint32_t count = 0;
static uint64_t io_base_addr = IO_CSR_ADDR + IO_CSR_SIZE;
uint64_t usr_test_mem;
std::vector<uint64_t> src_addrs;
std::vector<int32_t> ref_data;
vx_device_h device = nullptr;
std::vector<uint8_t> staging_buf;
kernel_arg_t kernel_arg = {};
static void show_usage() {
std::cout << "Vortex Test." << std::endl;
std::cout << "Usage: [-k: kernel] [-n words] [-h: help]" << std::endl;
}
static void parse_args(int argc, char **argv) {
int c;
while ((c = getopt(argc, argv, "n:k:h?")) != -1) {
switch (c) {
case 'n':
count = atoi(optarg);
break;
case 'k':
kernel_file = optarg;
break;
case 'h':
case '?': {
show_usage();
exit(0);
} break;
default:
show_usage();
exit(-1);
}
}
}
void cleanup() {
if (device) {
vx_mem_free(device, kernel_arg.src_addr);
vx_mem_free(device, kernel_arg.dst_addr);
vx_mem_free(device, usr_test_mem);
vx_dev_close(device);
}
}
void gen_src_addrs(uint32_t num_points) {
src_addrs.resize(num_points);
uint32_t u = 0, k = 0;
for (uint32_t i = 0; i < num_points; ++i) {
if (0 ==(i % 4)) {
k = (i + u) % NUM_ADDRS;
++u;
}
uint32_t j = i % NUM_ADDRS;
uint64_t a = ((j == k) ? usr_test_mem : io_base_addr) + j * sizeof(uint32_t);
std::cout << std::dec << i << "," << k << ": value=0x" << std::hex << a << std::endl;
src_addrs[i] = a;
}
}
void gen_ref_data(uint32_t num_points) {
ref_data.resize(num_points);
for (uint32_t i = 0; i < num_points; ++i) {
int32_t j = i % NUM_ADDRS;
ref_data[i] = j * j;
}
}
int run_test(const kernel_arg_t& kernel_arg,
uint32_t buf_size,
uint32_t num_points) {
// start device
std::cout << "start device" << std::endl;
RT_CHECK(vx_start(device));
// wait for completion
std::cout << "wait for completion" << std::endl;
RT_CHECK(vx_ready_wait(device, VX_MAX_TIMEOUT));
// download destination buffer
std::cout << "download destination buffer" << std::endl;
RT_CHECK(vx_copy_from_dev(device, staging_buf.data(), kernel_arg.dst_addr, buf_size));
// verify result
std::cout << "verify result" << std::endl;
{
int errors = 0;
auto buf_ptr = (int32_t*)staging_buf.data();
for (uint32_t i = 0; i < num_points; ++i) {
int ref = ref_data.at(i);
int cur = buf_ptr[i];
if (cur != ref) {
std::cout << "error at result #" << std::dec << i
<< std::hex << ": actual 0x" << cur << ", expected 0x" << ref << std::endl;
++errors;
}
}
if (errors != 0) {
std::cout << "Found " << std::dec << errors << " errors!" << std::endl;
std::cout << "FAILED!" << std::endl;
return 1;
}
}
return 0;
}
int main(int argc, char *argv[]) {
uint64_t value;
// parse command arguments
parse_args(argc, argv);
if (count == 0) {
count = 1;
}
std::srand(50);
// open device connection
std::cout << "open device connection" << std::endl;
RT_CHECK(vx_dev_open(&device));
uint32_t num_points = count;
RT_CHECK(vx_mem_alloc(device, NUM_ADDRS * sizeof(int32_t), VX_MEM_TYPE_GLOBAL, &usr_test_mem));
// generate input data
gen_src_addrs(num_points);
// generate reference data
gen_ref_data(num_points);
uint32_t src_buf_size = num_points * sizeof(uint64_t);
uint32_t dst_buf_size = num_points * sizeof(int32_t);
std::cout << "number of points: " << std::dec << num_points << std::endl;
// upload program
std::cout << "upload program" << std::endl;
RT_CHECK(vx_upload_kernel_file(device, kernel_file));
// allocate device memory
std::cout << "allocate device memory" << std::endl;
RT_CHECK(vx_mem_alloc(device, src_buf_size, VX_MEM_TYPE_GLOBAL, &value));
kernel_arg.src_addr = value;
RT_CHECK(vx_mem_alloc(device, dst_buf_size, VX_MEM_TYPE_GLOBAL, &value));
kernel_arg.dst_addr = value;
kernel_arg.num_points = num_points;
std::cout << "dev_src=0x" << std::hex << kernel_arg.src_addr << std::endl;
std::cout << "dev_dst=0x" << std::hex << kernel_arg.dst_addr << std::endl;
// allocate staging buffer
std::cout << "allocate staging buffer" << std::endl;
uint32_t staging_buf_size = std::max<uint32_t>(NUM_ADDRS * sizeof(uint64_t),
std::max<uint32_t>(src_buf_size,
std::max<uint32_t>(dst_buf_size,
sizeof(kernel_arg_t))));
staging_buf.resize(staging_buf_size);
// upload kernel argument
std::cout << "upload kernel argument" << std::endl;
memcpy(staging_buf.data(), &kernel_arg, sizeof(kernel_arg_t));
RT_CHECK(vx_copy_to_dev(device, KERNEL_ARG_DEV_MEM_ADDR, staging_buf.data(), sizeof(kernel_arg_t)));
// upload test address data
{
std::cout << "upload test address data" << std::endl;
auto buf_ptr = (int32_t*)staging_buf.data();
for (uint32_t i = 0; i < NUM_ADDRS; ++i) {
buf_ptr[i] = i * i;
}
RT_CHECK(vx_copy_to_dev(device, io_base_addr, staging_buf.data(), NUM_ADDRS * sizeof(int32_t)));
RT_CHECK(vx_copy_to_dev(device, usr_test_mem, staging_buf.data(), NUM_ADDRS * sizeof(int32_t)));
}
// upload source buffer
{
std::cout << "upload source buffer" << std::endl;
auto buf_ptr = (uint64_t*)staging_buf.data();
memcpy(buf_ptr, src_addrs.data(), src_buf_size);
RT_CHECK(vx_copy_to_dev(device, kernel_arg.src_addr, staging_buf.data(), src_buf_size));
}
// clear destination buffer
{
std::cout << "clear destination buffer" << std::endl;
auto buf_ptr = (int32_t*)staging_buf.data();
for (uint32_t i = 0; i < num_points; ++i) {
buf_ptr[i] = 0xdeadbeef;
}
RT_CHECK(vx_copy_to_dev(device, kernel_arg.dst_addr, staging_buf.data(), dst_buf_size));
}
// run tests
std::cout << "run tests" << std::endl;
RT_CHECK(run_test(kernel_arg, dst_buf_size, num_points));
// cleanup
std::cout << "cleanup" << std::endl;
cleanup();
std::cout << "PASSED!" << std::endl;
return 0;
}

View File

@@ -1,9 +0,0 @@
PROJECT = mstress
SRCS = main.cpp
VX_SRCS = kernel.cpp
OPTS ?= -n64
include ../common.mk

View File

@@ -1,17 +0,0 @@
#ifndef _COMMON_H_
#define _COMMON_H_
#define KERNEL_ARG_DEV_MEM_ADDR 0x7ffff000
#define NUM_LOADS 8
typedef struct {
uint32_t num_tasks;
uint32_t size;
uint32_t stride;
uint64_t src0_addr;
uint64_t src1_addr;
uint64_t dst_addr;
} kernel_arg_t;
#endif

View File

@@ -1,29 +0,0 @@
#include <stdint.h>
#include <vx_intrinsics.h>
#include <vx_spawn.h>
#include "common.h"
void kernel_body(int task_id, kernel_arg_t* __UNIFORM__ arg) {
uint32_t stride = arg->stride;
uint32_t* addr_ptr = (uint32_t*)arg->src0_addr;
float* src_ptr = (float*)arg->src1_addr;
float* dst_ptr = (float*)arg->dst_addr;
uint32_t offset = task_id * stride;
for (uint32_t i = 0; i < stride; ++i) {
float value = 0.0f;
for (uint32_t j = 0; j < NUM_LOADS; ++j) {
uint32_t addr = offset + i + j;
uint32_t index = addr_ptr[addr];
value *= src_ptr[index];
}
dst_ptr[offset+i] = value;
}
}
int main() {
kernel_arg_t* arg = (kernel_arg_t*)KERNEL_ARG_DEV_MEM_ADDR;
vx_spawn_tasks(arg->num_tasks, (vx_spawn_tasks_cb)kernel_body, arg);
return 0;
}

View File

@@ -1,280 +0,0 @@
#include <iostream>
#include <unistd.h>
#include <string.h>
#include <vortex.h>
#include "common.h"
#include <assert.h>
#include <limits>
#include <math.h>
#include <vector>
#define RT_CHECK(_expr) \
do { \
int _ret = _expr; \
if (0 == _ret) \
break; \
printf("Error: '%s' returned %d!\n", #_expr, (int)_ret); \
cleanup(); \
exit(-1); \
} while (false)
///////////////////////////////////////////////////////////////////////////////
union Float_t {
float f;
int i;
struct {
uint32_t man : 23;
uint32_t exp : 8;
uint32_t sign : 1;
} parts;
};
inline float fround(float x, int32_t precision = 8) {
auto power_of_10 = std::pow(10, precision);
return std::round(x * power_of_10) / power_of_10;
}
inline bool almost_equal_eps(float a, float b, int ulp = 128) {
auto eps = std::numeric_limits<float>::epsilon() * (std::max(fabs(a), fabs(b)) * ulp);
auto d = fabs(a - b);
if (d > eps) {
std::cout << "*** almost_equal_eps: d=" << d << ", eps=" << eps << std::endl;
return false;
}
return true;
}
inline bool almost_equal_ulp(float a, float b, int32_t ulp = 6) {
Float_t fa{a}, fb{b};
auto d = std::abs(fa.i - fb.i);
if (d > ulp) {
std::cout << "*** almost_equal_ulp: a=" << a << ", b=" << b << ", ulp=" << d << ", ia=" << std::hex << fa.i << ", ib=" << fb.i << std::endl;
return false;
}
return true;
}
inline bool almost_equal(float a, float b) {
if (a == b)
return true;
/*if (almost_equal_eps(a, b))
return true;*/
return almost_equal_ulp(a, b);
}
///////////////////////////////////////////////////////////////////////////////
const char* kernel_file = "kernel.bin";
uint32_t count = 0;
std::vector<float> test_data;
std::vector<uint32_t> addr_table;
vx_device_h device = nullptr;
std::vector<uint8_t> staging_buf;
kernel_arg_t kernel_arg = {};
static void show_usage() {
std::cout << "Vortex Test." << std::endl;
std::cout << "Usage: [-k: kernel] [-n words] [-h: help]" << std::endl;
}
static void parse_args(int argc, char **argv) {
int c;
while ((c = getopt(argc, argv, "n:k:h?")) != -1) {
switch (c) {
case 'n':
count = atoi(optarg);
break;
case 'k':
kernel_file = optarg;
break;
case 'h':
case '?': {
show_usage();
exit(0);
} break;
default:
show_usage();
exit(-1);
}
}
}
void cleanup() {
if (device) {
vx_mem_free(device, kernel_arg.src0_addr);
vx_mem_free(device, kernel_arg.src1_addr);
vx_mem_free(device, kernel_arg.dst_addr);
vx_dev_close(device);
}
}
void gen_input_data(uint32_t num_points) {
test_data.resize(num_points);
addr_table.resize(num_points + NUM_LOADS - 1);
for (uint32_t i = 0; i < num_points; ++i) {
float r = static_cast<float>(std::rand()) / RAND_MAX;
test_data[i] = r;
}
for (uint32_t i = 0; i < addr_table.size(); ++i) {
float r = static_cast<float>(std::rand()) / RAND_MAX;
uint32_t index = static_cast<uint32_t>(r * num_points);
assert(index < num_points);
addr_table[i] = index;
}
}
int run_test(const kernel_arg_t& kernel_arg,
uint32_t dst_buf_size,
uint32_t num_points) {
// start device
std::cout << "start device" << std::endl;
RT_CHECK(vx_start(device));
// wait for completion
std::cout << "wait for completion" << std::endl;
RT_CHECK(vx_ready_wait(device, VX_MAX_TIMEOUT));
// download destination buffer
std::cout << "download destination buffer" << std::endl;
RT_CHECK(vx_copy_from_dev(device, staging_buf.data(), kernel_arg.dst_addr, dst_buf_size));
// verify result
std::cout << "verify result" << std::endl;
{
int errors = 0;
auto buf_ptr = (float*)staging_buf.data();
for (uint32_t i = 0; i < num_points; ++i) {
float ref = 0.0f;
for (uint32_t j = 0; j < NUM_LOADS; ++j) {
uint32_t addr = i + j;
uint32_t index = addr_table.at(addr);
float value = test_data.at(index);
//printf("*** [%d] addr=%d, index=%d, value=%f\n", i, addr, index, value);
ref *= value;
}
float cur = buf_ptr[i];
if (!almost_equal(cur, ref)) {
std::cout << "error at result #" << std::dec << i
<< ": actual " << cur << ", expected " << ref << std::endl;
++errors;
}
}
if (errors != 0) {
std::cout << "Found " << std::dec << errors << " errors!" << std::endl;
std::cout << "FAILED!" << std::endl;
return 1;
}
}
return 0;
}
int main(int argc, char *argv[]) {
// parse command arguments
parse_args(argc, argv);
if (count == 0) {
count = 1;
}
std::srand(50);
// open device connection
std::cout << "open device connection" << std::endl;
RT_CHECK(vx_dev_open(&device));
uint64_t num_cores, num_warps, num_threads;
RT_CHECK(vx_dev_caps(device, VX_CAPS_NUM_CORES, &num_cores));
RT_CHECK(vx_dev_caps(device, VX_CAPS_NUM_WARPS, &num_warps));
RT_CHECK(vx_dev_caps(device, VX_CAPS_NUM_THREADS, &num_threads));
uint32_t num_tasks = num_cores * num_warps * num_threads;
uint32_t num_points = count * num_tasks;
// generate input data
gen_input_data(num_points);
uint32_t addr_buf_size = addr_table.size() * sizeof(int32_t);
uint32_t src_buf_size = test_data.size() * sizeof(int32_t);
uint32_t dst_buf_size = test_data.size() * sizeof(int32_t);
std::cout << "number of points: " << num_points << std::endl;
std::cout << "buffer size: " << dst_buf_size << " bytes" << std::endl;
// upload program
std::cout << "upload program" << std::endl;
RT_CHECK(vx_upload_kernel_file(device, kernel_file));
// allocate device memory
std::cout << "allocate device memory" << std::endl;
RT_CHECK(vx_mem_alloc(device, addr_buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.src0_addr));
RT_CHECK(vx_mem_alloc(device, src_buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.src1_addr));
RT_CHECK(vx_mem_alloc(device, dst_buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.dst_addr));
kernel_arg.num_tasks = num_tasks;
kernel_arg.stride = count;
std::cout << "dev_addr=0x" << std::hex << kernel_arg.src0_addr << std::endl;
std::cout << "dev_src=0x" << std::hex << kernel_arg.src1_addr << std::endl;
std::cout << "dev_dst=0x" << std::hex << kernel_arg.dst_addr << std::endl;
// allocate staging buffer
std::cout << "allocate staging buffer" << std::endl;
uint32_t staging_buf_size = std::max<uint32_t>(src_buf_size,
std::max<uint32_t>(addr_buf_size,
std::max<uint32_t>(dst_buf_size,
sizeof(kernel_arg_t))));
staging_buf.resize(staging_buf_size);
// upload kernel argument
std::cout << "upload kernel argument" << std::endl;
memcpy(staging_buf.data(), &kernel_arg, sizeof(kernel_arg_t));
RT_CHECK(vx_copy_to_dev(device, KERNEL_ARG_DEV_MEM_ADDR, staging_buf.data(), sizeof(kernel_arg_t)));
// upload source buffer0
{
std::cout << "upload address buffer" << std::endl;
auto buf_ptr = staging_buf.data();
memcpy(buf_ptr, addr_table.data(), addr_table.size() * sizeof(int32_t));
RT_CHECK(vx_copy_to_dev(device, kernel_arg.src0_addr, staging_buf.data(), addr_buf_size));
}
// upload source buffer1
{
std::cout << "upload source buffer" << std::endl;
auto buf_ptr = staging_buf.data();
memcpy(buf_ptr, test_data.data(), test_data.size() * sizeof(int32_t));
RT_CHECK(vx_copy_to_dev(device, kernel_arg.src1_addr, staging_buf.data(), src_buf_size));
}
// clear destination buffer
{
std::cout << "clear destination buffer" << std::endl;
auto buf_ptr = (int32_t*)staging_buf.data();
for (uint32_t i = 0; i < test_data.size(); ++i) {
buf_ptr[i] = 0xdeadbeef;
}
RT_CHECK(vx_copy_to_dev(device, kernel_arg.dst_addr, staging_buf.data(), dst_buf_size));
}
// run tests
std::cout << "run tests" << std::endl;
RT_CHECK(run_test(kernel_arg, dst_buf_size, num_points));
// cleanup
std::cout << "cleanup" << std::endl;
cleanup();
std::cout << "PASSED!" << std::endl;
return 0;
}

View File

@@ -1,9 +0,0 @@
PROJECT = no_mf_ext
SRCS = main.cpp
VX_SRCS = kernel.cpp
OPTS ?= -n8
include ../common.mk

View File

@@ -1,12 +0,0 @@
#ifndef _COMMON_H_
#define _COMMON_H_
#define KERNEL_ARG_DEV_MEM_ADDR 0x7ffff000
typedef struct {
uint32_t size;
uint64_t src_addr;
uint64_t dst_addr;
} kernel_arg_t;
#endif

View File

@@ -1,18 +0,0 @@
#include <stdint.h>
#include <vx_intrinsics.h>
#include <vx_spawn.h>
#include "common.h"
int main() {
kernel_arg_t* arg = (kernel_arg_t*)KERNEL_ARG_DEV_MEM_ADDR;
uint32_t size = arg->size;
int32_t* src_ptr = (int32_t*)arg->src_addr;
int32_t* dst_ptr = (int32_t*)arg->dst_addr;
for (uint32_t i = 0; i < size; ++i) {
dst_ptr[i] = src_ptr[i];
}
return 0;
}

View File

@@ -1,174 +0,0 @@
#include <iostream>
#include <unistd.h>
#include <string.h>
#include <vector>
#include <vortex.h>
#include "common.h"
#define RT_CHECK(_expr) \
do { \
int _ret = _expr; \
if (0 == _ret) \
break; \
printf("Error: '%s' returned %d!\n", #_expr, (int)_ret); \
cleanup(); \
exit(-1); \
} while (false)
///////////////////////////////////////////////////////////////////////////////
const char* kernel_file = "kernel.bin";
uint32_t count = 0;
vx_device_h device = nullptr;
std::vector<uint8_t> staging_buf;
kernel_arg_t kernel_arg = {};
static void show_usage() {
std::cout << "Vortex Test." << std::endl;
std::cout << "Usage: [-k: kernel] [-n words] [-h: help]" << std::endl;
}
static void parse_args(int argc, char **argv) {
int c;
while ((c = getopt(argc, argv, "n:k:h?")) != -1) {
switch (c) {
case 'n':
count = atoi(optarg);
break;
case 'k':
kernel_file = optarg;
break;
case 'h':
case '?': {
show_usage();
exit(0);
} break;
default:
show_usage();
exit(-1);
}
}
}
void cleanup() {
if (device) {
vx_mem_free(device, kernel_arg.src_addr);
vx_mem_free(device, kernel_arg.dst_addr);
vx_dev_close(device);
}
}
int run_test(const kernel_arg_t& kernel_arg,
uint32_t buf_size,
uint32_t num_points) {
// start device
std::cout << "start device" << std::endl;
RT_CHECK(vx_start(device));
// wait for completion
std::cout << "wait for completion" << std::endl;
RT_CHECK(vx_ready_wait(device, VX_MAX_TIMEOUT));
// download destination buffer
std::cout << "download destination buffer" << std::endl;
RT_CHECK(vx_copy_from_dev(device, staging_buf.data(), kernel_arg.dst_addr, buf_size));
// verify result
std::cout << "verify result" << std::endl;
{
int errors = 0;
auto buf_ptr = (int32_t*)staging_buf.data();
for (uint32_t i = 0; i < num_points; ++i) {
int ref = i-1;
int cur = buf_ptr[i];
if (cur != ref) {
std::cout << "error at result #" << std::dec << i
<< std::hex << ": actual 0x" << cur << ", expected 0x" << ref << std::endl;
++errors;
}
}
if (errors != 0) {
std::cout << "Found " << std::dec << errors << " errors!" << std::endl;
std::cout << "FAILED!" << std::endl;
return 1;
}
}
return 0;
}
int main(int argc, char *argv[]) {
// parse command arguments
parse_args(argc, argv);
if (count == 0) {
count = 1;
}
// open device connection
std::cout << "open device connection" << std::endl;
RT_CHECK(vx_dev_open(&device));
uint32_t num_points = count;
uint32_t buf_size = num_points * sizeof(int32_t);
std::cout << "number of points: " << num_points << std::endl;
std::cout << "buffer size: " << buf_size << " bytes" << std::endl;
// upload program
std::cout << "upload program" << std::endl;
RT_CHECK(vx_upload_kernel_file(device, kernel_file));
// allocate device memory
std::cout << "allocate device memory" << std::endl;
RT_CHECK(vx_mem_alloc(device, buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.src_addr));
RT_CHECK(vx_mem_alloc(device, buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.dst_addr));
kernel_arg.size = num_points;
std::cout << "dev_src=0x" << std::hex << kernel_arg.src_addr << std::endl;
std::cout << "dev_dst=0x" << std::hex << kernel_arg.dst_addr << std::endl;
// allocate staging buffer
std::cout << "allocate staging buffer" << std::endl;
uint32_t alloc_size = std::max<uint32_t>(buf_size, sizeof(kernel_arg_t));
staging_buf.resize(alloc_size);
// upload kernel argument
std::cout << "upload kernel argument" << std::endl;
memcpy(staging_buf.data(), &kernel_arg, sizeof(kernel_arg_t));
RT_CHECK(vx_copy_to_dev(device, KERNEL_ARG_DEV_MEM_ADDR, staging_buf.data(), sizeof(kernel_arg_t)));
// upload source buffer0
{
std::cout << "upload source buffer" << std::endl;
auto buf_ptr = (int32_t*)staging_buf.data();
for (uint32_t i = 0; i < num_points; ++i) {
buf_ptr[i] = i-1;
}
RT_CHECK(vx_copy_to_dev(device, kernel_arg.src_addr, staging_buf.data(), buf_size));
}
// clear destination buffer
{
std::cout << "clear destination buffer" << std::endl;
auto buf_ptr = (int32_t*)staging_buf.data();
for (uint32_t i = 0; i < num_points; ++i) {
buf_ptr[i] = 0xdeadbeef;
}
RT_CHECK(vx_copy_to_dev(device, kernel_arg.dst_addr, staging_buf.data(), buf_size));
}
// run tests
std::cout << "run tests" << std::endl;
RT_CHECK(run_test(kernel_arg, buf_size, num_points));
// cleanup
std::cout << "cleanup" << std::endl;
cleanup();
std::cout << "PASSED!" << std::endl;
return 0;
}

View File

@@ -1,17 +0,0 @@
PROJECT = no_smem
OPTS ?= -n8
VX_SRCS = kernel.cpp $(VORTEX_KN_PATH)/src/vx_perf.c $(VORTEX_KN_PATH)/src/vx_syscalls.c $(VORTEX_KN_PATH)/src/vx_print.S $(VORTEX_KN_PATH)/src/vx_start.S
SRCS = main.cpp
include ../common.mk
VX_CFLAGS += -DSM_DISABLE
VX_LDFLAGS = -Wl,-Bstatic,--gc-sections,-T,$(VORTEX_KN_PATH)/linker/vx_link$(XLEN).ld,--defsym=STARTUP_ADDR=$(STARTUP_ADDR)
VX_CC = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)-gcc
VX_CXX = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)-g++
VX_DP = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)-objdump
VX_CP = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)-objcopy

View File

@@ -1,12 +0,0 @@
#ifndef _COMMON_H_
#define _COMMON_H_
#define KERNEL_ARG_DEV_MEM_ADDR 0x7ffff000
typedef struct {
uint32_t size;
uint64_t src_addr;
uint64_t dst_addr;
} kernel_arg_t;
#endif

View File

@@ -1,18 +0,0 @@
#include <stdint.h>
#include <vx_intrinsics.h>
#include <vx_spawn.h>
#include "common.h"
int main() {
kernel_arg_t* arg = (kernel_arg_t*)KERNEL_ARG_DEV_MEM_ADDR;
uint32_t size = arg->size;
int32_t* src_ptr = (int32_t*)arg->src_addr;
int32_t* dst_ptr = (int32_t*)arg->dst_addr;
for (uint32_t i = 0; i < size; ++i) {
dst_ptr[i] = src_ptr[i];
}
return 0;
}

View File

@@ -1,174 +0,0 @@
#include <iostream>
#include <unistd.h>
#include <string.h>
#include <vector>
#include <vortex.h>
#include "common.h"
#define RT_CHECK(_expr) \
do { \
int _ret = _expr; \
if (0 == _ret) \
break; \
printf("Error: '%s' returned %d!\n", #_expr, (int)_ret); \
cleanup(); \
exit(-1); \
} while (false)
///////////////////////////////////////////////////////////////////////////////
const char* kernel_file = "kernel.bin";
uint32_t count = 0;
vx_device_h device = nullptr;
std::vector<uint8_t> staging_buf;
kernel_arg_t kernel_arg = {};
static void show_usage() {
std::cout << "Vortex Test." << std::endl;
std::cout << "Usage: [-k: kernel] [-n words] [-h: help]" << std::endl;
}
static void parse_args(int argc, char **argv) {
int c;
while ((c = getopt(argc, argv, "n:k:h?")) != -1) {
switch (c) {
case 'n':
count = atoi(optarg);
break;
case 'k':
kernel_file = optarg;
break;
case 'h':
case '?': {
show_usage();
exit(0);
} break;
default:
show_usage();
exit(-1);
}
}
}
void cleanup() {
if (device) {
vx_mem_free(device, kernel_arg.src_addr);
vx_mem_free(device, kernel_arg.dst_addr);
vx_dev_close(device);
}
}
int run_test(const kernel_arg_t& kernel_arg,
uint32_t buf_size,
uint32_t num_points) {
// start device
std::cout << "start device" << std::endl;
RT_CHECK(vx_start(device));
// wait for completion
std::cout << "wait for completion" << std::endl;
RT_CHECK(vx_ready_wait(device, VX_MAX_TIMEOUT));
// download destination buffer
std::cout << "download destination buffer" << std::endl;
RT_CHECK(vx_copy_from_dev(device, staging_buf.data(), kernel_arg.dst_addr, buf_size));
// verify result
std::cout << "verify result" << std::endl;
{
int errors = 0;
auto buf_ptr = (int32_t*)staging_buf.data();
for (uint32_t i = 0; i < num_points; ++i) {
int ref = i-1;
int cur = buf_ptr[i];
if (cur != ref) {
std::cout << "error at result #" << std::dec << i
<< std::hex << ": actual 0x" << cur << ", expected 0x" << ref << std::endl;
++errors;
}
}
if (errors != 0) {
std::cout << "Found " << std::dec << errors << " errors!" << std::endl;
std::cout << "FAILED!" << std::endl;
return 1;
}
}
return 0;
}
int main(int argc, char *argv[]) {
// parse command arguments
parse_args(argc, argv);
if (count == 0) {
count = 1;
}
// open device connection
std::cout << "open device connection" << std::endl;
RT_CHECK(vx_dev_open(&device));
uint32_t num_points = count;
uint32_t buf_size = num_points * sizeof(int32_t);
std::cout << "number of points: " << num_points << std::endl;
std::cout << "buffer size: " << buf_size << " bytes" << std::endl;
// upload program
std::cout << "upload program" << std::endl;
RT_CHECK(vx_upload_kernel_file(device, kernel_file));
// allocate device memory
std::cout << "allocate device memory" << std::endl;
RT_CHECK(vx_mem_alloc(device, buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.src_addr));
RT_CHECK(vx_mem_alloc(device, buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.dst_addr));
kernel_arg.size = num_points;
std::cout << "dev_src=0x" << std::hex << kernel_arg.src_addr << std::endl;
std::cout << "dev_dst=0x" << std::hex << kernel_arg.dst_addr << std::endl;
// allocate staging buffer
std::cout << "allocate staging buffer" << std::endl;
uint32_t alloc_size = std::max<uint32_t>(buf_size, sizeof(kernel_arg_t));
staging_buf.resize(alloc_size);
// upload kernel argument
std::cout << "upload kernel argument" << std::endl;
memcpy(staging_buf.data(), &kernel_arg, sizeof(kernel_arg_t));
RT_CHECK(vx_copy_to_dev(device, KERNEL_ARG_DEV_MEM_ADDR, staging_buf.data(), sizeof(kernel_arg_t)));
// upload source buffer0
{
std::cout << "upload source buffer" << std::endl;
auto buf_ptr = (int32_t*)staging_buf.data();
for (uint32_t i = 0; i < num_points; ++i) {
buf_ptr[i] = i-1;
}
RT_CHECK(vx_copy_to_dev(device, kernel_arg.src_addr, staging_buf.data(), buf_size));
}
// clear destination buffer
{
std::cout << "clear destination buffer" << std::endl;
auto buf_ptr = (int32_t*)staging_buf.data();
for (uint32_t i = 0; i < num_points; ++i) {
buf_ptr[i] = 0xdeadbeef;
}
RT_CHECK(vx_copy_to_dev(device, kernel_arg.dst_addr, staging_buf.data(), buf_size));
}
// run tests
std::cout << "run tests" << std::endl;
RT_CHECK(run_test(kernel_arg, buf_size, num_points));
// cleanup
std::cout << "cleanup" << std::endl;
cleanup();
std::cout << "PASSED!" << std::endl;
return 0;
}

View File

@@ -1,9 +0,0 @@
PROJECT = printf
SRCS = main.cpp
VX_SRCS = kernel.cpp
OPTS ?= -n4
include ../common.mk

View File

@@ -1,11 +0,0 @@
#ifndef _COMMON_H_
#define _COMMON_H_
#define KERNEL_ARG_DEV_MEM_ADDR 0x7ffff000
typedef struct {
uint32_t num_points;
uint64_t src_addr;
} kernel_arg_t;
#endif

View File

@@ -1,18 +0,0 @@
#include <stdint.h>
#include <vx_intrinsics.h>
#include <vx_print.h>
#include <vx_spawn.h>
#include "common.h"
void kernel_body(int task_id, kernel_arg_t* __UNIFORM__ arg) {
int cid = vx_core_id();
int* src_ptr = (int*)arg->src_addr;
char value = 'A' + src_ptr[task_id];
vx_printf("cid=%d: task=%d, value=%c\n", cid, task_id, value);
}
int main() {
kernel_arg_t* arg = (kernel_arg_t*)KERNEL_ARG_DEV_MEM_ADDR;
vx_spawn_tasks(arg->num_points, (vx_spawn_tasks_cb)kernel_body, arg);
return 0;
}

View File

@@ -1,138 +0,0 @@
#include <iostream>
#include <unistd.h>
#include <string.h>
#include <vector>
#include <vortex.h>
#include "common.h"
#define RT_CHECK(_expr) \
do { \
int _ret = _expr; \
if (0 == _ret) \
break; \
printf("Error: '%s' returned %d!\n", #_expr, (int)_ret); \
cleanup(); \
exit(-1); \
} while (false)
///////////////////////////////////////////////////////////////////////////////
const char* kernel_file = "kernel.bin";
uint32_t count = 4;
vx_device_h device = nullptr;
std::vector<uint8_t> staging_buf;
kernel_arg_t kernel_arg = {};
static void show_usage() {
std::cout << "Vortex Test." << std::endl;
std::cout << "Usage: [-k: kernel] [-n words] [-h: help]" << std::endl;
}
static void parse_args(int argc, char **argv) {
int c;
while ((c = getopt(argc, argv, "n:k:h?")) != -1) {
switch (c) {
case 'n':
count = atoi(optarg);
break;
case 'k':
kernel_file = optarg;
break;
case 'h':
case '?': {
show_usage();
exit(0);
} break;
default:
show_usage();
exit(-1);
}
}
}
void cleanup() {
if (device) {
vx_mem_free(device, kernel_arg.src_addr);
vx_dev_close(device);
}
}
int run_test() {
// start device
std::cout << "start device" << std::endl;
RT_CHECK(vx_start(device));
// wait for completion
std::cout << "wait for completion" << std::endl;
RT_CHECK(vx_ready_wait(device, VX_MAX_TIMEOUT));
return 0;
}
int main(int argc, char *argv[]) {
// parse command arguments
parse_args(argc, argv);
if (count == 0) {
count = 1;
}
// open device connection
std::cout << "open device connection" << std::endl;
RT_CHECK(vx_dev_open(&device));
uint64_t num_warps, num_threads;
RT_CHECK(vx_dev_caps(device, VX_CAPS_NUM_WARPS, &num_warps));
RT_CHECK(vx_dev_caps(device, VX_CAPS_NUM_THREADS, &num_threads));
uint32_t num_points = count;
uint32_t buf_size = count * sizeof(int32_t);
std::cout << "number of points: " << count << std::endl;
std::cout << "buffer size: " << buf_size << " bytes" << std::endl;
// upload program
std::cout << "upload program" << std::endl;
RT_CHECK(vx_upload_kernel_file(device, kernel_file));
// allocate device memory
std::cout << "allocate device memory" << std::endl;
RT_CHECK(vx_mem_alloc(device, buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.src_addr));
kernel_arg.num_points = num_points;
std::cout << "dev_src=0x" << std::hex << kernel_arg.src_addr << std::endl;
// allocate staging buffer
std::cout << "allocate staging buffer" << std::endl;
uint32_t alloc_size = std::max<uint32_t>(buf_size, sizeof(kernel_arg_t));
staging_buf.resize(alloc_size);
// upload kernel argument
std::cout << "upload kernel argument" << std::endl;
memcpy(staging_buf.data(), &kernel_arg, sizeof(kernel_arg_t));
RT_CHECK(vx_copy_to_dev(device, KERNEL_ARG_DEV_MEM_ADDR, staging_buf.data(), sizeof(kernel_arg_t)));
// upload source buffer0
{
std::cout << "upload source buffer" << std::endl;
auto buf_ptr = (int32_t*)staging_buf.data();
for (uint32_t i = 0; i < num_points; ++i) {
buf_ptr[i] = i;
}
RT_CHECK(vx_copy_to_dev(device, kernel_arg.src_addr, staging_buf.data(), buf_size));
}
// run tests
std::cout << "run tests" << std::endl;
RT_CHECK(run_test());
// cleanup
std::cout << "cleanup" << std::endl;
cleanup();
std::cout << "PASSED!" << std::endl;
return 0;
}

View File

@@ -1,7 +1,5 @@
PROJECT = rickroll
SRCS = main.cpp common.h
VX_SRCS = kernel.cpp
OPTS ?= -n16

View File

@@ -1,274 +0,0 @@
#include <iostream>
#include <fstream>
#include <unistd.h>
#include <string.h>
#include <vortex.h>
#include <vector>
#include "common.h"
#define RT_CHECK(_expr) \
do { \
int _ret = _expr; \
if (0 == _ret) \
break; \
printf("Error: '%s' returned %d!\n", #_expr, (int)_ret); \
cleanup(); \
exit(-1); \
} while (false)
///////////////////////////////////////////////////////////////////////////////
const char* kernel_file = "kernel.bin";
uint32_t count = 0;
std::vector<float> src_a_data;
std::vector<float> src_b_data;
std::vector<float> ref_data;
vx_device_h device = nullptr;
std::vector<uint8_t> staging_buf;
kernel_arg_t kernel_arg = {};
static void show_usage() {
std::cout << "Vortex Test." << std::endl;
std::cout << "Usage: [-k: kernel] [-n words] [-h: help]" << std::endl;
}
static void parse_args(int argc, char **argv) {
int c;
while ((c = getopt(argc, argv, "n:k:h?")) != -1) {
switch (c) {
case 'n':
count = atoi(optarg);
break;
case 'k':
kernel_file = optarg;
break;
case 'h':
case '?': {
show_usage();
exit(0);
} break;
default:
show_usage();
exit(-1);
}
}
}
void cleanup() {
if (device) {
vx_mem_free(device, kernel_arg.addr_a);
vx_mem_free(device, kernel_arg.addr_b);
vx_mem_free(device, kernel_arg.addr_c);
vx_dev_close(device);
}
}
void generate_source_matrix(uint32_t dim_m, uint32_t dim_n, uint32_t dim_k) {
src_a_data.resize(dim_m * dim_k);
src_b_data.resize(dim_k * dim_n);
for (uint32_t i = 0; i < src_a_data.size(); ++i) {
src_a_data[i] = static_cast<float>(i);
std::cout << "A: " << i << ": value=" << src_a_data[i] << std::endl;
}
for (uint32_t i = 0; i < src_b_data.size(); ++i) {
src_b_data[i] = static_cast<float>(i);
std::cout << "B: " << i << ": value=" << src_b_data[i] << std::endl;
}
}
void generate_reference_matmul(uint32_t dim_m, uint32_t dim_n, uint32_t dim_k) {
ref_data.resize(dim_m * dim_n);
for (uint32_t i = 0; i < dim_m; ++i) {
for (uint32_t j = 0; j < dim_n; ++j) {
float ref = 0.0f;
for (uint32_t k = 0; k < dim_k; ++k) {
ref += src_a_data[dim_k * i + k] * src_b_data[dim_n * k + j];
}
ref_data.at(dim_n * i + j) = ref;
}
}
}
int run_test(const kernel_arg_t& kernel_arg,
uint32_t buf_size,
uint32_t dim_m, uint32_t dim_n) {
// start device
std::cout << "start device" << std::endl;
RT_CHECK(vx_start(device));
// wait for completion
std::cout << "wait for completion" << std::endl;
RT_CHECK(vx_ready_wait(device, VX_MAX_TIMEOUT));
// download destination buffer
std::cout << "download destination buffer" << std::endl;
RT_CHECK(vx_copy_from_dev(device, staging_buf.data(), kernel_arg.addr_c, buf_size));
// verify result
std::cout << "verify result" << std::endl;
{
int errors = 0;
auto buf_ptr = (float*)staging_buf.data();
for (uint32_t i = 0; i < dim_m * dim_n; ++i) {
float ref = ref_data.at(i);
float cur = buf_ptr[i];
if (std::abs((cur - ref) / ref) > 1e-6) {
std::cout << "error at result #" << std::dec << i
<< std::hex << ": actual=" << cur << ", expected=" << ref << std::endl;
++errors;
}
}
if (errors != 0) {
std::cout << "Found " << std::dec << errors << " errors!" << std::endl;
std::cout << "FAILED!" << std::endl;
return 1;
}
}
return 0;
}
int main(int argc, char *argv[]) {
// parse command arguments
parse_args(argc, argv);
if (count == 0) {
count = 1;
}
std::srand(50);
// open device connection
std::cout << "open device connection" << std::endl;
RT_CHECK(vx_dev_open(&device));
// FIXME: hardcoded
uint32_t dim_m = 64;
uint32_t dim_n = 64;
uint32_t dim_k = 64;
generate_source_matrix(dim_m, dim_n, dim_k);
generate_reference_matmul(dim_m, dim_n, dim_k);
uint32_t src_a_buf_size = src_a_data.size() * sizeof(src_a_data[0]);
uint32_t src_b_buf_size = src_b_data.size() * sizeof(src_b_data[0]);
uint32_t dst_buf_size = ref_data.size() * sizeof(src_a_data[0]);
std::cout << "buffer size: " << dst_buf_size << " bytes" << std::endl;
// upload program
std::cout << "upload program" << std::endl;
RT_CHECK(vx_upload_kernel_file(device, kernel_file));
// allocate device memory
std::cout << "allocate device memory" << std::endl;
RT_CHECK(vx_mem_alloc(device, src_a_buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.addr_a));
RT_CHECK(vx_mem_alloc(device, src_b_buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.addr_b));
RT_CHECK(vx_mem_alloc(device, dst_buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.addr_c));
kernel_arg.dim_m = dim_m;
kernel_arg.dim_n = dim_n;
kernel_arg.dim_k = dim_k;
std::cout << "dev_addr_a=0x" << std::hex << kernel_arg.addr_a << std::endl;
std::cout << "dev_addr_b=0x" << std::hex << kernel_arg.addr_b << std::endl;
std::cout << "dev_addr_c=0x" << std::hex << kernel_arg.addr_c << std::endl;
// allocate staging buffer
{
std::cout << "allocate staging buffer" << std::endl;
uint32_t staging_buf_size = std::max<uint32_t>(
src_a_buf_size,
std::max<uint32_t>(
src_b_buf_size,
std::max<uint32_t>(dst_buf_size, sizeof(kernel_arg_t))));
staging_buf.resize(staging_buf_size);
}
// upload kernel argument
{
std::cout << "upload kernel argument" << std::endl;
auto buf_ptr = staging_buf.data();
kernel_arg.addr_a = (uint64_t) 0x20000;
kernel_arg.addr_b = (uint64_t) 0x28000;
kernel_arg.addr_c = (uint64_t) 0xc0000000ULL;
memcpy(buf_ptr, &kernel_arg, sizeof(kernel_arg_t));
std::cout << "uploading argument buffer to device, device mem address="
<< std::hex << KERNEL_ARG_DEV_MEM_ADDR << ", size=" << std::dec
<< sizeof(kernel_arg_t) << " bytes\n";
std::ofstream file("args.bin", std::ios::binary | std::ios::out);
if (!file) {
std::cerr << "error: failed to open args.bin for writing\n";
exit(EXIT_FAILURE);
}
file.write(reinterpret_cast<char *>(staging_buf.data()),
sizeof(kernel_arg_t));
file.close();
RT_CHECK(vx_copy_to_dev(device, KERNEL_ARG_DEV_MEM_ADDR, staging_buf.data(), sizeof(kernel_arg_t)));
}
// upload source buffer
{
{
auto buf_ptr = staging_buf.data();
memcpy(buf_ptr, src_a_data.data(), src_a_data.size() * sizeof(float));
RT_CHECK(vx_copy_to_dev(device, kernel_arg.addr_a, staging_buf.data(),
src_a_buf_size));
std::cout << "uploading source A matrix to device, device mem address="
<< std::hex << kernel_arg.addr_a << ", size=" << std::dec
<< src_a_buf_size << " bytes\n";
std::ofstream file("input.a.bin", std::ios::binary | std::ios::out);
if (!file) {
std::cerr << "error: failed to open args.bin for writing\n";
exit(EXIT_FAILURE);
}
file.write(reinterpret_cast<char *>(buf_ptr), src_a_buf_size);
file.close();
}
{
auto buf_ptr = staging_buf.data();
memcpy(buf_ptr, src_b_data.data(), src_b_data.size() * sizeof(float));
RT_CHECK(vx_copy_to_dev(device, kernel_arg.addr_b, staging_buf.data(),
src_b_buf_size));
std::cout << "uploading source B matrix to device, device mem address="
<< std::hex << kernel_arg.addr_b << ", size=" << std::dec
<< src_b_buf_size << " bytes\n";
std::ofstream file("input.b.bin", std::ios::binary | std::ios::out);
if (!file) {
std::cerr << "error: failed to open args.bin for writing\n";
exit(EXIT_FAILURE);
}
file.write(reinterpret_cast<char *>(buf_ptr), src_b_buf_size);
file.close();
}
}
// clear destination buffer
{
std::cout << "clear destination buffer" << std::endl;
auto buf_ptr = (int32_t*)staging_buf.data();
for (uint32_t i = 0; i < ref_data.size(); ++i) {
buf_ptr[i] = 0xdeadbeef;
}
RT_CHECK(vx_copy_to_dev(device, kernel_arg.addr_c, staging_buf.data(), dst_buf_size));
}
// run tests
std::cout << "run tests" << std::endl;
RT_CHECK(run_test(kernel_arg, dst_buf_size, kernel_arg.dim_m, kernel_arg.dim_n));
std::cout << "PASSED!" << std::endl;
// cleanup
std::cout << "cleanup" << std::endl;
cleanup();
return 0;
}

Binary file not shown.

View File

@@ -1,7 +1,5 @@
PROJECT = sgemm_gemmini
SRCS = main.cpp common.h
VX_SRCS = kernel.cpp
OPTS ?= -n16

View File

@@ -1,274 +0,0 @@
#include <iostream>
#include <fstream>
#include <unistd.h>
#include <string.h>
#include <vortex.h>
#include <vector>
#include "common.h"
#define RT_CHECK(_expr) \
do { \
int _ret = _expr; \
if (0 == _ret) \
break; \
printf("Error: '%s' returned %d!\n", #_expr, (int)_ret); \
cleanup(); \
exit(-1); \
} while (false)
///////////////////////////////////////////////////////////////////////////////
const char* kernel_file = "kernel.bin";
uint32_t count = 0;
std::vector<float> src_a_data;
std::vector<float> src_b_data;
std::vector<float> ref_data;
vx_device_h device = nullptr;
std::vector<uint8_t> staging_buf;
kernel_arg_t kernel_arg = {};
static void show_usage() {
std::cout << "Vortex Test." << std::endl;
std::cout << "Usage: [-k: kernel] [-n words] [-h: help]" << std::endl;
}
static void parse_args(int argc, char **argv) {
int c;
while ((c = getopt(argc, argv, "n:k:h?")) != -1) {
switch (c) {
case 'n':
count = atoi(optarg);
break;
case 'k':
kernel_file = optarg;
break;
case 'h':
case '?': {
show_usage();
exit(0);
} break;
default:
show_usage();
exit(-1);
}
}
}
void cleanup() {
if (device) {
vx_mem_free(device, kernel_arg.addr_a);
vx_mem_free(device, kernel_arg.addr_b);
vx_mem_free(device, kernel_arg.addr_c);
vx_dev_close(device);
}
}
void generate_source_matrix(uint32_t dim_m, uint32_t dim_n, uint32_t dim_k) {
src_a_data.resize(dim_m * dim_k);
src_b_data.resize(dim_k * dim_n);
for (uint32_t i = 0; i < src_a_data.size(); ++i) {
src_a_data[i] = static_cast<float>(i);
std::cout << "A: " << i << ": value=" << src_a_data[i] << std::endl;
}
for (uint32_t i = 0; i < src_b_data.size(); ++i) {
src_b_data[i] = static_cast<float>(i);
std::cout << "B: " << i << ": value=" << src_b_data[i] << std::endl;
}
}
void generate_reference_matmul(uint32_t dim_m, uint32_t dim_n, uint32_t dim_k) {
ref_data.resize(dim_m * dim_n);
for (uint32_t i = 0; i < dim_m; ++i) {
for (uint32_t j = 0; j < dim_n; ++j) {
float ref = 0.0f;
for (uint32_t k = 0; k < dim_k; ++k) {
ref += src_a_data[dim_k * i + k] * src_b_data[dim_n * k + j];
}
ref_data.at(dim_n * i + j) = ref;
}
}
}
int run_test(const kernel_arg_t& kernel_arg,
uint32_t buf_size,
uint32_t dim_m, uint32_t dim_n) {
// start device
std::cout << "start device" << std::endl;
RT_CHECK(vx_start(device));
// wait for completion
std::cout << "wait for completion" << std::endl;
RT_CHECK(vx_ready_wait(device, VX_MAX_TIMEOUT));
// download destination buffer
std::cout << "download destination buffer" << std::endl;
RT_CHECK(vx_copy_from_dev(device, staging_buf.data(), kernel_arg.addr_c, buf_size));
// verify result
std::cout << "verify result" << std::endl;
{
int errors = 0;
auto buf_ptr = (float*)staging_buf.data();
for (uint32_t i = 0; i < dim_m * dim_n; ++i) {
float ref = ref_data.at(i);
float cur = buf_ptr[i];
if (std::abs((cur - ref) / ref) > 1e-6) {
std::cout << "error at result #" << std::dec << i
<< std::hex << ": actual=" << cur << ", expected=" << ref << std::endl;
++errors;
}
}
if (errors != 0) {
std::cout << "Found " << std::dec << errors << " errors!" << std::endl;
std::cout << "FAILED!" << std::endl;
return 1;
}
}
return 0;
}
int main(int argc, char *argv[]) {
// parse command arguments
parse_args(argc, argv);
if (count == 0) {
count = 1;
}
std::srand(50);
// open device connection
std::cout << "open device connection" << std::endl;
RT_CHECK(vx_dev_open(&device));
// FIXME: hardcoded
uint32_t dim_m = 64;
uint32_t dim_n = 64;
uint32_t dim_k = 64;
generate_source_matrix(dim_m, dim_n, dim_k);
generate_reference_matmul(dim_m, dim_n, dim_k);
uint32_t src_a_buf_size = src_a_data.size() * sizeof(src_a_data[0]);
uint32_t src_b_buf_size = src_b_data.size() * sizeof(src_b_data[0]);
uint32_t dst_buf_size = ref_data.size() * sizeof(src_a_data[0]);
std::cout << "buffer size: " << dst_buf_size << " bytes" << std::endl;
// upload program
std::cout << "upload program" << std::endl;
RT_CHECK(vx_upload_kernel_file(device, kernel_file));
// allocate device memory
std::cout << "allocate device memory" << std::endl;
RT_CHECK(vx_mem_alloc(device, src_a_buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.addr_a));
RT_CHECK(vx_mem_alloc(device, src_b_buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.addr_b));
RT_CHECK(vx_mem_alloc(device, dst_buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.addr_c));
kernel_arg.dim_m = dim_m;
kernel_arg.dim_n = dim_n;
kernel_arg.dim_k = dim_k;
std::cout << "dev_addr_a=0x" << std::hex << kernel_arg.addr_a << std::endl;
std::cout << "dev_addr_b=0x" << std::hex << kernel_arg.addr_b << std::endl;
std::cout << "dev_addr_c=0x" << std::hex << kernel_arg.addr_c << std::endl;
// allocate staging buffer
{
std::cout << "allocate staging buffer" << std::endl;
uint32_t staging_buf_size = std::max<uint32_t>(
src_a_buf_size,
std::max<uint32_t>(
src_b_buf_size,
std::max<uint32_t>(dst_buf_size, sizeof(kernel_arg_t))));
staging_buf.resize(staging_buf_size);
}
// upload kernel argument
{
std::cout << "upload kernel argument" << std::endl;
auto buf_ptr = staging_buf.data();
kernel_arg.addr_a = (uint64_t) 0x20000;
kernel_arg.addr_b = (uint64_t) 0x28000;
kernel_arg.addr_c = (uint64_t) 0xc0000000ULL;
memcpy(buf_ptr, &kernel_arg, sizeof(kernel_arg_t));
std::cout << "uploading argument buffer to device, device mem address="
<< std::hex << KERNEL_ARG_DEV_MEM_ADDR << ", size=" << std::dec
<< sizeof(kernel_arg_t) << " bytes\n";
std::ofstream file("args.bin", std::ios::binary | std::ios::out);
if (!file) {
std::cerr << "error: failed to open args.bin for writing\n";
exit(EXIT_FAILURE);
}
file.write(reinterpret_cast<char *>(staging_buf.data()),
sizeof(kernel_arg_t));
file.close();
RT_CHECK(vx_copy_to_dev(device, KERNEL_ARG_DEV_MEM_ADDR, staging_buf.data(), sizeof(kernel_arg_t)));
}
// upload source buffer
{
{
auto buf_ptr = staging_buf.data();
memcpy(buf_ptr, src_a_data.data(), src_a_data.size() * sizeof(float));
RT_CHECK(vx_copy_to_dev(device, kernel_arg.addr_a, staging_buf.data(),
src_a_buf_size));
std::cout << "uploading source A matrix to device, device mem address="
<< std::hex << kernel_arg.addr_a << ", size=" << std::dec
<< src_a_buf_size << " bytes\n";
std::ofstream file("input.a.bin", std::ios::binary | std::ios::out);
if (!file) {
std::cerr << "error: failed to open args.bin for writing\n";
exit(EXIT_FAILURE);
}
file.write(reinterpret_cast<char *>(buf_ptr), src_a_buf_size);
file.close();
}
{
auto buf_ptr = staging_buf.data();
memcpy(buf_ptr, src_b_data.data(), src_b_data.size() * sizeof(float));
RT_CHECK(vx_copy_to_dev(device, kernel_arg.addr_b, staging_buf.data(),
src_b_buf_size));
std::cout << "uploading source B matrix to device, device mem address="
<< std::hex << kernel_arg.addr_b << ", size=" << std::dec
<< src_b_buf_size << " bytes\n";
std::ofstream file("input.b.bin", std::ios::binary | std::ios::out);
if (!file) {
std::cerr << "error: failed to open args.bin for writing\n";
exit(EXIT_FAILURE);
}
file.write(reinterpret_cast<char *>(buf_ptr), src_b_buf_size);
file.close();
}
}
// clear destination buffer
{
std::cout << "clear destination buffer" << std::endl;
auto buf_ptr = (int32_t*)staging_buf.data();
for (uint32_t i = 0; i < ref_data.size(); ++i) {
buf_ptr[i] = 0xdeadbeef;
}
RT_CHECK(vx_copy_to_dev(device, kernel_arg.addr_c, staging_buf.data(), dst_buf_size));
}
// run tests
std::cout << "run tests" << std::endl;
RT_CHECK(run_test(kernel_arg, dst_buf_size, kernel_arg.dim_m, kernel_arg.dim_n));
std::cout << "PASSED!" << std::endl;
// cleanup
std::cout << "cleanup" << std::endl;
cleanup();
return 0;
}

View File

@@ -1,7 +1,5 @@
PROJECT = sgemm_gemmini_dma
SRCS = main.cpp common.h
VX_SRCS = kernel.cpp
OPTS ?= -n16

View File

@@ -0,0 +1,11 @@
rm kernel.radiance.elf
rm -rf binaries
mkdir binaries
for a in args/*; do
cp -f $a args.bin
aa=$(basename "$a")
cp -f input.a/"$aa" input.a.bin
cp -f input.b/"$aa" input.b.bin
make > /dev/null
mv kernel.radiance.elf binaries/gemmini_debug_dma"$aa".elf
done

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

View File

@@ -1,274 +0,0 @@
#include <iostream>
#include <fstream>
#include <unistd.h>
#include <string.h>
#include <vortex.h>
#include <vector>
#include "common.h"
#define RT_CHECK(_expr) \
do { \
int _ret = _expr; \
if (0 == _ret) \
break; \
printf("Error: '%s' returned %d!\n", #_expr, (int)_ret); \
cleanup(); \
exit(-1); \
} while (false)
///////////////////////////////////////////////////////////////////////////////
const char* kernel_file = "kernel.bin";
uint32_t count = 0;
std::vector<float> src_a_data;
std::vector<float> src_b_data;
std::vector<float> ref_data;
vx_device_h device = nullptr;
std::vector<uint8_t> staging_buf;
kernel_arg_t kernel_arg = {};
static void show_usage() {
std::cout << "Vortex Test." << std::endl;
std::cout << "Usage: [-k: kernel] [-n words] [-h: help]" << std::endl;
}
static void parse_args(int argc, char **argv) {
int c;
while ((c = getopt(argc, argv, "n:k:h?")) != -1) {
switch (c) {
case 'n':
count = atoi(optarg);
break;
case 'k':
kernel_file = optarg;
break;
case 'h':
case '?': {
show_usage();
exit(0);
} break;
default:
show_usage();
exit(-1);
}
}
}
void cleanup() {
if (device) {
vx_mem_free(device, kernel_arg.addr_a);
vx_mem_free(device, kernel_arg.addr_b);
vx_mem_free(device, kernel_arg.addr_c);
vx_dev_close(device);
}
}
void generate_source_matrix(uint32_t dim_m, uint32_t dim_n, uint32_t dim_k) {
src_a_data.resize(dim_m * dim_k);
src_b_data.resize(dim_k * dim_n);
for (uint32_t i = 0; i < src_a_data.size(); ++i) {
src_a_data[i] = static_cast<float>(i);
std::cout << "A: " << i << ": value=" << src_a_data[i] << std::endl;
}
for (uint32_t i = 0; i < src_b_data.size(); ++i) {
src_b_data[i] = static_cast<float>(i);
std::cout << "B: " << i << ": value=" << src_b_data[i] << std::endl;
}
}
void generate_reference_matmul(uint32_t dim_m, uint32_t dim_n, uint32_t dim_k) {
ref_data.resize(dim_m * dim_n);
for (uint32_t i = 0; i < dim_m; ++i) {
for (uint32_t j = 0; j < dim_n; ++j) {
float ref = 0.0f;
for (uint32_t k = 0; k < dim_k; ++k) {
ref += src_a_data[dim_k * i + k] * src_b_data[dim_n * k + j];
}
ref_data.at(dim_n * i + j) = ref;
}
}
}
int run_test(const kernel_arg_t& kernel_arg,
uint32_t buf_size,
uint32_t dim_m, uint32_t dim_n) {
// start device
std::cout << "start device" << std::endl;
RT_CHECK(vx_start(device));
// wait for completion
std::cout << "wait for completion" << std::endl;
RT_CHECK(vx_ready_wait(device, VX_MAX_TIMEOUT));
// download destination buffer
std::cout << "download destination buffer" << std::endl;
RT_CHECK(vx_copy_from_dev(device, staging_buf.data(), kernel_arg.addr_c, buf_size));
// verify result
std::cout << "verify result" << std::endl;
{
int errors = 0;
auto buf_ptr = (float*)staging_buf.data();
for (uint32_t i = 0; i < dim_m * dim_n; ++i) {
float ref = ref_data.at(i);
float cur = buf_ptr[i];
if (std::abs((cur - ref) / ref) > 1e-6) {
std::cout << "error at result #" << std::dec << i
<< std::hex << ": actual=" << cur << ", expected=" << ref << std::endl;
++errors;
}
}
if (errors != 0) {
std::cout << "Found " << std::dec << errors << " errors!" << std::endl;
std::cout << "FAILED!" << std::endl;
return 1;
}
}
return 0;
}
int main(int argc, char *argv[]) {
// parse command arguments
parse_args(argc, argv);
if (count == 0) {
count = 1;
}
std::srand(50);
// open device connection
std::cout << "open device connection" << std::endl;
RT_CHECK(vx_dev_open(&device));
// FIXME: hardcoded
uint32_t dim_m = 64;
uint32_t dim_n = 64;
uint32_t dim_k = 64;
generate_source_matrix(dim_m, dim_n, dim_k);
generate_reference_matmul(dim_m, dim_n, dim_k);
uint32_t src_a_buf_size = src_a_data.size() * sizeof(src_a_data[0]);
uint32_t src_b_buf_size = src_b_data.size() * sizeof(src_b_data[0]);
uint32_t dst_buf_size = ref_data.size() * sizeof(src_a_data[0]);
std::cout << "buffer size: " << dst_buf_size << " bytes" << std::endl;
// upload program
std::cout << "upload program" << std::endl;
RT_CHECK(vx_upload_kernel_file(device, kernel_file));
// allocate device memory
std::cout << "allocate device memory" << std::endl;
RT_CHECK(vx_mem_alloc(device, src_a_buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.addr_a));
RT_CHECK(vx_mem_alloc(device, src_b_buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.addr_b));
RT_CHECK(vx_mem_alloc(device, dst_buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.addr_c));
kernel_arg.dim_m = dim_m;
kernel_arg.dim_n = dim_n;
kernel_arg.dim_k = dim_k;
std::cout << "dev_addr_a=0x" << std::hex << kernel_arg.addr_a << std::endl;
std::cout << "dev_addr_b=0x" << std::hex << kernel_arg.addr_b << std::endl;
std::cout << "dev_addr_c=0x" << std::hex << kernel_arg.addr_c << std::endl;
// allocate staging buffer
{
std::cout << "allocate staging buffer" << std::endl;
uint32_t staging_buf_size = std::max<uint32_t>(
src_a_buf_size,
std::max<uint32_t>(
src_b_buf_size,
std::max<uint32_t>(dst_buf_size, sizeof(kernel_arg_t))));
staging_buf.resize(staging_buf_size);
}
// upload kernel argument
{
std::cout << "upload kernel argument" << std::endl;
auto buf_ptr = staging_buf.data();
kernel_arg.addr_a = (uint64_t) 0xa0000000ULL;
kernel_arg.addr_b = (uint64_t) 0xa1000000ULL;
kernel_arg.addr_c = (uint64_t) 0xc0000000ULL;
memcpy(buf_ptr, &kernel_arg, sizeof(kernel_arg_t));
std::cout << "uploading argument buffer to device, device mem address="
<< std::hex << KERNEL_ARG_DEV_MEM_ADDR << ", size=" << std::dec
<< sizeof(kernel_arg_t) << " bytes\n";
std::ofstream file("args.bin", std::ios::binary | std::ios::out);
if (!file) {
std::cerr << "error: failed to open args.bin for writing\n";
exit(EXIT_FAILURE);
}
file.write(reinterpret_cast<char *>(staging_buf.data()),
sizeof(kernel_arg_t));
file.close();
RT_CHECK(vx_copy_to_dev(device, KERNEL_ARG_DEV_MEM_ADDR, staging_buf.data(), sizeof(kernel_arg_t)));
}
// upload source buffer
{
{
auto buf_ptr = staging_buf.data();
memcpy(buf_ptr, src_a_data.data(), src_a_data.size() * sizeof(float));
RT_CHECK(vx_copy_to_dev(device, kernel_arg.addr_a, staging_buf.data(),
src_a_buf_size));
std::cout << "uploading source A matrix to device, device mem address="
<< std::hex << kernel_arg.addr_a << ", size=" << std::dec
<< src_a_buf_size << " bytes\n";
std::ofstream file("input.a.bin", std::ios::binary | std::ios::out);
if (!file) {
std::cerr << "error: failed to open args.bin for writing\n";
exit(EXIT_FAILURE);
}
file.write(reinterpret_cast<char *>(buf_ptr), src_a_buf_size);
file.close();
}
{
auto buf_ptr = staging_buf.data();
memcpy(buf_ptr, src_b_data.data(), src_b_data.size() * sizeof(float));
RT_CHECK(vx_copy_to_dev(device, kernel_arg.addr_b, staging_buf.data(),
src_b_buf_size));
std::cout << "uploading source B matrix to device, device mem address="
<< std::hex << kernel_arg.addr_b << ", size=" << std::dec
<< src_b_buf_size << " bytes\n";
std::ofstream file("input.b.bin", std::ios::binary | std::ios::out);
if (!file) {
std::cerr << "error: failed to open args.bin for writing\n";
exit(EXIT_FAILURE);
}
file.write(reinterpret_cast<char *>(buf_ptr), src_b_buf_size);
file.close();
}
}
// clear destination buffer
{
std::cout << "clear destination buffer" << std::endl;
auto buf_ptr = (int32_t*)staging_buf.data();
for (uint32_t i = 0; i < ref_data.size(); ++i) {
buf_ptr[i] = 0xdeadbeef;
}
RT_CHECK(vx_copy_to_dev(device, kernel_arg.addr_c, staging_buf.data(), dst_buf_size));
}
// run tests
std::cout << "run tests" << std::endl;
RT_CHECK(run_test(kernel_arg, dst_buf_size, kernel_arg.dim_m, kernel_arg.dim_n));
std::cout << "PASSED!" << std::endl;
// cleanup
std::cout << "cleanup" << std::endl;
cleanup();
return 0;
}

View File

@@ -1,7 +1,5 @@
PROJECT = sgemm_gemmini_duo
SRCS = main.cpp common.h
VX_SRCS = kernel.cpp
OPTS ?= -n16

View File

@@ -1,282 +0,0 @@
#include <iostream>
#include <fstream>
#include <unistd.h>
#include <string.h>
#include <vortex.h>
#include <vector>
#include "common.h"
#define RT_CHECK(_expr) \
do { \
int _ret = _expr; \
if (0 == _ret) \
break; \
printf("Error: '%s' returned %d!\n", #_expr, (int)_ret); \
cleanup(); \
exit(-1); \
} while (false)
///////////////////////////////////////////////////////////////////////////////
const char* kernel_file = "kernel.bin";
uint32_t count = 0;
std::vector<float> src_a_data;
std::vector<float> src_b_data;
std::vector<float> ref_data;
vx_device_h device = nullptr;
std::vector<uint8_t> staging_buf;
kernel_arg_t kernel_arg = {};
static void show_usage() {
std::cout << "Vortex Test." << std::endl;
std::cout << "Usage: [-k: kernel] [-n words] [-h: help]" << std::endl;
}
static void parse_args(int argc, char **argv) {
int c;
while ((c = getopt(argc, argv, "n:k:h?")) != -1) {
switch (c) {
case 'n':
count = atoi(optarg);
break;
case 'k':
kernel_file = optarg;
break;
case 'h':
case '?': {
show_usage();
exit(0);
} break;
default:
show_usage();
exit(-1);
}
}
}
void cleanup() {
if (device) {
// vx_mem_free(device, kernel_arg.addr_a);
// vx_mem_free(device, kernel_arg.addr_b);
// vx_mem_free(device, kernel_arg.addr_c);
vx_dev_close(device);
}
}
void generate_source_matrix(uint32_t dim_m, uint32_t dim_n, uint32_t dim_k) {
src_a_data.resize(dim_m * dim_k);
src_b_data.resize(dim_k * dim_n);
for (uint32_t i = 0; i < src_a_data.size(); ++i) {
src_a_data[i] = static_cast<float>(i);
std::cout << "A: " << i << ": value=" << src_a_data[i] << std::endl;
}
for (uint32_t i = 0; i < src_b_data.size(); ++i) {
src_b_data[i] = static_cast<float>(i);
std::cout << "B: " << i << ": value=" << src_b_data[i] << std::endl;
}
}
void generate_reference_matmul(uint32_t dim_m, uint32_t dim_n, uint32_t dim_k) {
ref_data.resize(dim_m * dim_n);
for (uint32_t i = 0; i < dim_m; ++i) {
for (uint32_t j = 0; j < dim_n; ++j) {
float ref = 0.0f;
for (uint32_t k = 0; k < dim_k; ++k) {
ref += src_a_data[dim_k * i + k] * src_b_data[dim_n * k + j];
}
ref_data.at(dim_n * i + j) = ref;
}
}
}
int run_test(const kernel_arg_t& kernel_arg,
uint32_t buf_size,
uint32_t dim_m, uint32_t dim_n) {
// start device
std::cout << "start device" << std::endl;
RT_CHECK(vx_start(device));
// wait for completion
std::cout << "wait for completion" << std::endl;
RT_CHECK(vx_ready_wait(device, VX_MAX_TIMEOUT));
// download destination buffer
std::cout << "download destination buffer" << std::endl;
RT_CHECK(vx_copy_from_dev(device, staging_buf.data(), kernel_arg.addr_c, buf_size));
// verify result
std::cout << "verify result" << std::endl;
{
int errors = 0;
auto buf_ptr = (float*)staging_buf.data();
for (uint32_t i = 0; i < dim_m * dim_n; ++i) {
float ref = ref_data.at(i);
float cur = buf_ptr[i];
if (std::abs((cur - ref) / ref) > 1e-6) {
std::cout << "error at result #" << std::dec << i
<< std::hex << ": actual=" << cur << ", expected=" << ref << std::endl;
++errors;
}
}
if (errors != 0) {
std::cout << "Found " << std::dec << errors << " errors!" << std::endl;
std::cout << "FAILED!" << std::endl;
return 1;
}
}
return 0;
}
int main(int argc, char *argv[]) {
// parse command arguments
parse_args(argc, argv);
if (count == 0) {
count = 1;
}
std::srand(50);
// open device connection
std::cout << "open device connection" << std::endl;
RT_CHECK(vx_dev_open(&device));
// FIXME: hardcoded
uint32_t dim_m = 128;
uint32_t dim_n = 128;
uint32_t dim_k = 128;
generate_source_matrix(dim_m, dim_n, dim_k);
generate_reference_matmul(dim_m, dim_n, dim_k);
std::cout << "write reference output" << std::endl;
std::ofstream ref_file("reference.c.bin", std::ios::binary | std::ios::out);
if (!ref_file) {
std::cerr << "error: failed to open reference.c.bin for writing\n";
exit(EXIT_FAILURE);
}
ref_file.write(reinterpret_cast<char *>(ref_data.data()), ref_data.size() * sizeof(ref_data[0]));
ref_file.close();
uint32_t src_a_buf_size = src_a_data.size() * sizeof(src_a_data[0]);
uint32_t src_b_buf_size = src_b_data.size() * sizeof(src_b_data[0]);
uint32_t dst_buf_size = ref_data.size() * sizeof(src_a_data[0]);
std::cout << "buffer size: " << dst_buf_size << " bytes" << std::endl;
// upload program
std::cout << "upload program" << std::endl;
RT_CHECK(vx_upload_kernel_file(device, kernel_file));
// allocate device memory
std::cout << "allocate device memory" << std::endl;
// RT_CHECK(vx_mem_alloc(device, src_a_buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.addr_a));
// RT_CHECK(vx_mem_alloc(device, src_b_buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.addr_b));
// RT_CHECK(vx_mem_alloc(device, dst_buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.addr_c));
kernel_arg.addr_a = 0xa0000000;
kernel_arg.addr_b = 0xa1000000;
kernel_arg.addr_c = 0xc0000000;
kernel_arg.dim_m = dim_m;
kernel_arg.dim_n = dim_n;
kernel_arg.dim_k = dim_k;
std::cout << "dev_addr_a=0x" << std::hex << kernel_arg.addr_a << std::endl;
std::cout << "dev_addr_b=0x" << std::hex << kernel_arg.addr_b << std::endl;
std::cout << "dev_addr_c=0x" << std::hex << kernel_arg.addr_c << std::endl;
// allocate staging buffer
{
std::cout << "allocate staging buffer" << std::endl;
uint32_t staging_buf_size = std::max<uint32_t>(
src_a_buf_size,
std::max<uint32_t>(
src_b_buf_size,
std::max<uint32_t>(dst_buf_size, sizeof(kernel_arg_t))));
staging_buf.resize(staging_buf_size);
}
// upload kernel argument
{
std::cout << "upload kernel argument" << std::endl;
auto buf_ptr = staging_buf.data();
memcpy(buf_ptr, &kernel_arg, sizeof(kernel_arg_t));
RT_CHECK(vx_copy_to_dev(device, KERNEL_ARG_DEV_MEM_ADDR, staging_buf.data(), sizeof(kernel_arg_t)));
std::cout << "uploading argument buffer to device, device mem address="
<< std::hex << KERNEL_ARG_DEV_MEM_ADDR << ", size=" << std::dec
<< sizeof(kernel_arg_t) << " bytes\n";
std::ofstream file("args.bin", std::ios::binary | std::ios::out);
if (!file) {
std::cerr << "error: failed to open args.bin for writing\n";
exit(EXIT_FAILURE);
}
file.write(reinterpret_cast<char *>(staging_buf.data()),
sizeof(kernel_arg_t));
file.close();
}
// upload source buffer
{
{
auto buf_ptr = staging_buf.data();
memcpy(buf_ptr, src_a_data.data(), src_a_data.size() * sizeof(float));
RT_CHECK(vx_copy_to_dev(device, kernel_arg.addr_a, staging_buf.data(),
src_a_buf_size));
std::cout << "uploading source A matrix to device, device mem address="
<< std::hex << kernel_arg.addr_a << ", size=" << std::dec
<< src_a_buf_size << " bytes\n";
std::ofstream file("input.a.bin", std::ios::binary | std::ios::out);
if (!file) {
std::cerr << "error: failed to open args.bin for writing\n";
exit(EXIT_FAILURE);
}
file.write(reinterpret_cast<char *>(buf_ptr), src_a_buf_size);
file.close();
}
{
auto buf_ptr = staging_buf.data();
memcpy(buf_ptr, src_b_data.data(), src_b_data.size() * sizeof(float));
RT_CHECK(vx_copy_to_dev(device, kernel_arg.addr_b, staging_buf.data(),
src_b_buf_size));
std::cout << "uploading source B matrix to device, device mem address="
<< std::hex << kernel_arg.addr_b << ", size=" << std::dec
<< src_b_buf_size << " bytes\n";
std::ofstream file("input.b.bin", std::ios::binary | std::ios::out);
if (!file) {
std::cerr << "error: failed to open args.bin for writing\n";
exit(EXIT_FAILURE);
}
file.write(reinterpret_cast<char *>(buf_ptr), src_b_buf_size);
file.close();
}
}
// clear destination buffer
{
std::cout << "clear destination buffer" << std::endl;
auto buf_ptr = (int32_t*)staging_buf.data();
for (uint32_t i = 0; i < ref_data.size(); ++i) {
buf_ptr[i] = 0xdeadbeef;
}
RT_CHECK(vx_copy_to_dev(device, kernel_arg.addr_c, staging_buf.data(), dst_buf_size));
}
// run tests
std::cout << "run tests" << std::endl;
RT_CHECK(run_test(kernel_arg, dst_buf_size, kernel_arg.dim_m, kernel_arg.dim_n));
std::cout << "PASSED!" << std::endl;
// cleanup
std::cout << "cleanup" << std::endl;
cleanup();
return 0;
}

View File

@@ -1,7 +1,5 @@
PROJECT = sgemm_tcore
SRCS = main.cpp common.h
VX_SRCS = kernel.cpp
VX_INCLUDES = sgemm_impl.hpp

View File

@@ -1,308 +0,0 @@
#include <iostream>
#include <fstream>
#include <unistd.h>
#include <string.h>
#include <vortex.h>
#include <vector>
#include <cassert>
#include "common.h"
#include "half.hpp"
using half_float::half;
using half_float::half_cast;
#define RT_CHECK(_expr) \
do { \
int _ret = _expr; \
if (0 == _ret) \
break; \
printf("Error: '%s' returned %d!\n", #_expr, (int)_ret); \
cleanup(); \
exit(-1); \
} while (false)
///////////////////////////////////////////////////////////////////////////////
const char* kernel_file = "kernel.bin";
uint32_t count = 0;
template <typename T> std::vector<T> src_a_data;
template <typename T> std::vector<T> src_b_data;
std::vector<float> ref_data;
vx_device_h device = nullptr;
std::vector<uint8_t> staging_buf;
kernel_arg_t kernel_arg = {};
static void show_usage() {
std::cout << "Vortex Test." << std::endl;
std::cout << "Usage: [-k: kernel] [-n words] [-h: help]" << std::endl;
}
static void parse_args(int argc, char **argv) {
int c;
while ((c = getopt(argc, argv, "n:k:h?")) != -1) {
switch (c) {
case 'n':
count = atoi(optarg);
break;
case 'k':
kernel_file = optarg;
break;
case 'h':
case '?': {
show_usage();
exit(0);
} break;
default:
show_usage();
exit(-1);
}
}
}
void cleanup() {
if (device) {
// vx_mem_free(device, kernel_arg.addr_a);
// vx_mem_free(device, kernel_arg.addr_b);
// vx_mem_free(device, kernel_arg.addr_c);
vx_dev_close(device);
}
}
template <typename T>
void generate_source_matrix(uint32_t dim_m, uint32_t dim_n, uint32_t dim_k) {
static_assert(std::is_same_v<half, T> || std::is_same_v<float, T>,
"unsupported floating point datatype");
src_a_data<T>.resize(dim_m * dim_k);
src_b_data<T>.resize(dim_k * dim_n);
for (uint32_t i = 0; i < src_a_data<T>.size(); ++i) {
if constexpr (std::is_same_v<half, T>) {
src_a_data<T>[i] = half_cast<half>(static_cast<float>(i));
} else if (std::is_same_v<float, T>) {
src_a_data<T>[i] = static_cast<float>(i);
}
std::cout << "A: " << i << ": value=" << src_a_data<T>[i] << std::endl;
}
for (uint32_t i = 0; i < src_b_data<T>.size(); ++i) {
if constexpr (std::is_same_v<half, T>) {
src_b_data<T>[i] = half_cast<half>(static_cast<float>(i));
} else if (std::is_same_v<float, T>) {
src_b_data<T>[i] = static_cast<float>(i);
}
std::cout << "B: " << i << ": value=" << src_b_data<T>[i] << std::endl;
}
}
template <typename T>
void generate_reference_matmul(uint32_t dim_m, uint32_t dim_n, uint32_t dim_k) {
static_assert(std::is_same_v<half, T> || std::is_same_v<float, T>,
"unsupported floating point datatype");
ref_data.resize(dim_m * dim_n);
for (uint32_t i = 0; i < dim_m; ++i) {
for (uint32_t j = 0; j < dim_n; ++j) {
float ref = 0.0f;
for (uint32_t k = 0; k < dim_k; ++k) {
ref += static_cast<float>(src_a_data<T>[dim_k * i + k]) *
static_cast<float>(src_b_data<T>[dim_n * k + j]);
}
ref_data.at(dim_n * i + j) = ref;
}
}
}
int run_test(const kernel_arg_t& kernel_arg,
uint32_t buf_size,
uint32_t dim_m, uint32_t dim_n) {
// start device
std::cout << "start device" << std::endl;
RT_CHECK(vx_start(device));
// wait for completion
std::cout << "wait for completion" << std::endl;
RT_CHECK(vx_ready_wait(device, VX_MAX_TIMEOUT));
// download destination buffer
std::cout << "download destination buffer" << std::endl;
RT_CHECK(vx_copy_from_dev(device, staging_buf.data(), kernel_arg.addr_c, buf_size));
// verify result
std::cout << "verify result" << std::endl;
{
int errors = 0;
auto buf_ptr = (float*)staging_buf.data();
for (uint32_t i = 0; i < dim_m * dim_n; ++i) {
float ref = ref_data.at(i);
float cur = buf_ptr[i];
if (std::abs((cur - ref) / ref) > 1e-6) {
std::cout << "error at result #" << std::dec << i
<< std::hex << ": actual=" << cur << ", expected=" << ref << std::endl;
++errors;
}
}
if (errors != 0) {
std::cout << "Found " << std::dec << errors << " errors!" << std::endl;
std::cout << "FAILED!" << std::endl;
return 1;
}
}
return 0;
}
int main(int argc, char *argv[]) {
// parse command arguments
parse_args(argc, argv);
if (count == 0) {
count = 1;
}
std::srand(50);
// open device connection
std::cout << "open device connection" << std::endl;
RT_CHECK(vx_dev_open(&device));
// FIXME: hardcoded
uint32_t dim_m = 64;
uint32_t dim_n = 64;
uint32_t dim_k = 64;
using float_type = half;
generate_source_matrix<float_type>(dim_m, dim_n, dim_k);
generate_reference_matmul<float_type>(dim_m, dim_n, dim_k);
std::cout << "write reference output" << std::endl;
std::ofstream ref_file("reference.c.bin", std::ios::binary | std::ios::out);
if (!ref_file) {
std::cerr << "error: failed to open reference.c.bin for writing\n";
exit(EXIT_FAILURE);
}
ref_file.write(reinterpret_cast<char *>(ref_data.data()), ref_data.size() * sizeof(ref_data[0]));
ref_file.close();
uint32_t src_a_buf_size = src_a_data<float_type>.size() * sizeof(src_a_data<float_type>[0]);
uint32_t src_b_buf_size = src_b_data<float_type>.size() * sizeof(src_b_data<float_type>[0]);
uint32_t dst_buf_size = ref_data.size() * sizeof(src_a_data<float_type>[0]);
std::cout << "buffer size: " << dst_buf_size << " bytes" << std::endl;
// upload program
std::cout << "upload program" << std::endl;
RT_CHECK(vx_upload_kernel_file(device, kernel_file));
// allocate device memory
std::cout << "allocate device memory" << std::endl;
// RT_CHECK(vx_mem_alloc(device, src_a_buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.addr_a));
// RT_CHECK(vx_mem_alloc(device, src_b_buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.addr_b));
// RT_CHECK(vx_mem_alloc(device, dst_buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.addr_c));
kernel_arg.addr_a = 0xa0000000;
kernel_arg.addr_b = 0xa1000000;
kernel_arg.addr_c = 0xc0000000;
kernel_arg.dim_m = dim_m;
kernel_arg.dim_n = dim_n;
kernel_arg.dim_k = dim_k;
std::cout << "dev_addr_a=0x" << std::hex << kernel_arg.addr_a << std::endl;
std::cout << "dev_addr_b=0x" << std::hex << kernel_arg.addr_b << std::endl;
std::cout << "dev_addr_c=0x" << std::hex << kernel_arg.addr_c << std::endl;
// allocate staging buffer
{
std::cout << "allocate staging buffer" << std::endl;
uint32_t staging_buf_size = std::max<uint32_t>(
src_a_buf_size,
std::max<uint32_t>(
src_b_buf_size,
std::max<uint32_t>(dst_buf_size, sizeof(kernel_arg_t))));
staging_buf.resize(staging_buf_size);
}
// upload kernel argument
{
std::cout << "upload kernel argument" << std::endl;
auto buf_ptr = staging_buf.data();
memcpy(buf_ptr, &kernel_arg, sizeof(kernel_arg_t));
RT_CHECK(vx_copy_to_dev(device, KERNEL_ARG_DEV_MEM_ADDR, staging_buf.data(), sizeof(kernel_arg_t)));
std::cout << "uploading argument buffer to device, device mem address="
<< std::hex << KERNEL_ARG_DEV_MEM_ADDR << ", size=" << std::dec
<< sizeof(kernel_arg_t) << " bytes\n";
std::ofstream file("args.bin", std::ios::binary | std::ios::out);
if (!file) {
std::cerr << "error: failed to open args.bin for writing\n";
exit(EXIT_FAILURE);
}
file.write(reinterpret_cast<char *>(staging_buf.data()),
sizeof(kernel_arg_t));
file.close();
}
// upload source buffer
{
{
auto buf_ptr = staging_buf.data();
memcpy(buf_ptr, src_a_data<float_type>.data(),
src_a_data<float_type>.size() * sizeof(float_type));
RT_CHECK(vx_copy_to_dev(device, kernel_arg.addr_a, staging_buf.data(),
src_a_buf_size));
std::cout << "uploading source A matrix to device, device mem address="
<< std::hex << kernel_arg.addr_a << ", size=" << std::dec
<< src_a_buf_size << " bytes\n";
std::ofstream file("input.a.bin", std::ios::binary | std::ios::out);
if (!file) {
std::cerr << "error: failed to open args.bin for writing\n";
exit(EXIT_FAILURE);
}
file.write(reinterpret_cast<char *>(buf_ptr), src_a_buf_size);
file.close();
}
{
auto buf_ptr = staging_buf.data();
memcpy(buf_ptr, src_b_data<float_type>.data(),
src_b_data<float_type>.size() * sizeof(float_type));
RT_CHECK(vx_copy_to_dev(device, kernel_arg.addr_b, staging_buf.data(),
src_b_buf_size));
std::cout << "uploading source B matrix to device, device mem address="
<< std::hex << kernel_arg.addr_b << ", size=" << std::dec
<< src_b_buf_size << " bytes\n";
std::ofstream file("input.b.bin", std::ios::binary | std::ios::out);
if (!file) {
std::cerr << "error: failed to open args.bin for writing\n";
exit(EXIT_FAILURE);
}
file.write(reinterpret_cast<char *>(buf_ptr), src_b_buf_size);
file.close();
}
}
// clear destination buffer
{
std::cout << "clear destination buffer" << std::endl;
auto buf_ptr = (int32_t*)staging_buf.data();
for (uint32_t i = 0; i < ref_data.size(); ++i) {
buf_ptr[i] = 0xdeadbeef;
}
RT_CHECK(vx_copy_to_dev(device, kernel_arg.addr_c, staging_buf.data(), dst_buf_size));
}
// run tests
std::cout << "run tests" << std::endl;
RT_CHECK(run_test(kernel_arg, dst_buf_size, kernel_arg.dim_m, kernel_arg.dim_n));
std::cout << "PASSED!" << std::endl;
// cleanup
std::cout << "cleanup" << std::endl;
cleanup();
return 0;
}

View File

@@ -1,7 +1,5 @@
PROJECT = sgemm_wg
SRCS = main.cpp common.h
VX_SRCS = kernel.cpp
OPTS ?= -n16

View File

@@ -1,292 +0,0 @@
#include <iostream>
#include <fstream>
#include <unistd.h>
#include <string.h>
#include <vortex.h>
#include <vector>
#include "common.h"
#define RT_CHECK(_expr) \
do { \
int _ret = _expr; \
if (0 == _ret) \
break; \
printf("Error: '%s' returned %d!\n", #_expr, (int)_ret); \
cleanup(); \
exit(-1); \
} while (false)
///////////////////////////////////////////////////////////////////////////////
const char* kernel_file = "kernel.bin";
uint32_t count = 0;
std::vector<float> src_a_data;
std::vector<float> src_b_data;
std::vector<float> ref_data;
vx_device_h device = nullptr;
std::vector<uint8_t> staging_buf;
kernel_arg_t kernel_arg = {};
static void show_usage() {
std::cout << "Vortex Test." << std::endl;
std::cout << "Usage: [-k: kernel] [-n words] [-h: help]" << std::endl;
}
static void parse_args(int argc, char **argv) {
int c;
while ((c = getopt(argc, argv, "n:k:h?")) != -1) {
switch (c) {
case 'n':
count = atoi(optarg);
break;
case 'k':
kernel_file = optarg;
break;
case 'h':
case '?': {
show_usage();
exit(0);
} break;
default:
show_usage();
exit(-1);
}
}
}
void cleanup() {
if (device) {
// vx_mem_free(device, kernel_arg.addr_a);
// vx_mem_free(device, kernel_arg.addr_b);
// vx_mem_free(device, kernel_arg.addr_c);
vx_dev_close(device);
}
}
void generate_source_matrix(uint32_t dim_m, uint32_t dim_n, uint32_t dim_k) {
src_a_data.resize(dim_m * dim_k);
src_b_data.resize(dim_k * dim_n);
for (uint32_t i = 0; i < src_a_data.size(); ++i) {
src_a_data[i] = static_cast<float>(i);
std::cout << "A: " << i << ": value=" << src_a_data[i] << std::endl;
}
for (uint32_t i = 0; i < src_b_data.size(); ++i) {
src_b_data[i] = static_cast<float>(i);
std::cout << "B: " << i << ": value=" << src_b_data[i] << std::endl;
}
}
void generate_reference_matmul(uint32_t dim_m, uint32_t dim_n, uint32_t dim_k) {
ref_data.resize(dim_m * dim_n);
for (uint32_t i = 0; i < dim_m; ++i) {
for (uint32_t j = 0; j < dim_n; ++j) {
float ref = 0.0f;
for (uint32_t k = 0; k < dim_k; ++k) {
ref += src_a_data[dim_k * i + k] * src_b_data[dim_n * k + j];
}
ref_data.at(dim_n * i + j) = ref;
}
}
}
int run_test(const kernel_arg_t& kernel_arg,
uint32_t buf_size,
uint32_t dim_m, uint32_t dim_n) {
// start device
std::cout << "start device" << std::endl;
RT_CHECK(vx_start(device));
// wait for completion
std::cout << "wait for completion" << std::endl;
RT_CHECK(vx_ready_wait(device, VX_MAX_TIMEOUT));
// download destination buffer
std::cout << "download destination buffer" << std::endl;
RT_CHECK(vx_copy_from_dev(device, staging_buf.data(), kernel_arg.addr_c, buf_size));
std::cout << "downloading result C matrix from device, device mem address="
<< std::hex << kernel_arg.addr_c << ", size=" << std::dec
<< buf_size << " bytes\n";
std::ofstream file("output.c.bin", std::ios::binary | std::ios::out);
if (!file) {
std::cerr << "error: failed to open output.c.bin for writing\n";
exit(EXIT_FAILURE);
}
file.write(reinterpret_cast<char *>(staging_buf.data()), buf_size);
file.close();
std::ofstream ref_file("reference.c.bin", std::ios::binary | std::ios::out);
if (!ref_file) {
std::cerr << "error: failed to open reference.c.bin for writing\n";
exit(EXIT_FAILURE);
}
ref_file.write(reinterpret_cast<char *>(ref_data.data()), buf_size);
ref_file.close();
// verify result
std::cout << "verify result" << std::endl;
{
int errors = 0;
auto buf_ptr = (float*)staging_buf.data();
for (uint32_t i = 0; i < dim_m * dim_n; ++i) {
float ref = ref_data.at(i);
float cur = buf_ptr[i];
if (std::abs((cur - ref) / ref) > 1e-6) {
std::cout << "error at result #" << std::dec << i
<< std::hex << ": actual=" << cur << ", expected=" << ref << std::endl;
++errors;
}
}
if (errors != 0) {
std::cout << "Found " << std::dec << errors << " errors!" << std::endl;
std::cout << "FAILED!" << std::endl;
return 1;
}
}
return 0;
}
int main(int argc, char *argv[]) {
// parse command arguments
parse_args(argc, argv);
if (count == 0) {
count = 1;
}
std::srand(50);
// open device connection
std::cout << "open device connection" << std::endl;
RT_CHECK(vx_dev_open(&device));
// FIXME: hardcoded
uint32_t dim_m = 128;
uint32_t dim_n = 128;
uint32_t dim_k = 128;
generate_source_matrix(dim_m, dim_n, dim_k);
generate_reference_matmul(dim_m, dim_n, dim_k);
uint32_t src_a_buf_size = src_a_data.size() * sizeof(src_a_data[0]);
uint32_t src_b_buf_size = src_b_data.size() * sizeof(src_b_data[0]);
uint32_t dst_buf_size = ref_data.size() * sizeof(src_a_data[0]);
std::cout << "buffer size: " << dst_buf_size << " bytes" << std::endl;
// upload program
std::cout << "upload program" << std::endl;
RT_CHECK(vx_upload_kernel_file(device, kernel_file));
// allocate device memory
std::cout << "allocate device memory" << std::endl;
// RT_CHECK(vx_mem_alloc(device, src_a_buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.addr_a));
// RT_CHECK(vx_mem_alloc(device, src_b_buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.addr_b));
// RT_CHECK(vx_mem_alloc(device, dst_buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.addr_c));
kernel_arg.addr_a = 0x20000UL;
kernel_arg.addr_b = 0x28000UL;
kernel_arg.addr_c = 0xc0000000UL;
kernel_arg.dim_m = dim_m;
kernel_arg.dim_n = dim_n;
kernel_arg.dim_k = dim_k;
std::cout << "dev_addr_a=0x" << std::hex << kernel_arg.addr_a << std::endl;
std::cout << "dev_addr_b=0x" << std::hex << kernel_arg.addr_b << std::endl;
std::cout << "dev_addr_c=0x" << std::hex << kernel_arg.addr_c << std::endl;
// allocate staging buffer
{
std::cout << "allocate staging buffer" << std::endl;
uint32_t staging_buf_size = std::max<uint32_t>(
src_a_buf_size,
std::max<uint32_t>(
src_b_buf_size,
std::max<uint32_t>(dst_buf_size, sizeof(kernel_arg_t))));
staging_buf.resize(staging_buf_size);
}
// upload kernel argument
{
std::cout << "upload kernel argument" << std::endl;
auto buf_ptr = staging_buf.data();
memcpy(buf_ptr, &kernel_arg, sizeof(kernel_arg_t));
RT_CHECK(vx_copy_to_dev(device, KERNEL_ARG_DEV_MEM_ADDR, staging_buf.data(), sizeof(kernel_arg_t)));
std::cout << "uploading argument buffer to device, device mem address="
<< std::hex << KERNEL_ARG_DEV_MEM_ADDR << ", size=" << std::dec
<< sizeof(kernel_arg_t) << " bytes\n";
std::ofstream file("args.bin", std::ios::binary | std::ios::out);
if (!file) {
std::cerr << "error: failed to open args.bin for writing\n";
exit(EXIT_FAILURE);
}
file.write(reinterpret_cast<char *>(staging_buf.data()),
sizeof(kernel_arg_t));
file.close();
}
// upload source buffer
{
{
auto buf_ptr = staging_buf.data();
memcpy(buf_ptr, src_a_data.data(), src_a_data.size() * sizeof(float));
RT_CHECK(vx_copy_to_dev(device, kernel_arg.addr_a, staging_buf.data(),
src_a_buf_size));
std::cout << "uploading source A matrix to device, device mem address="
<< std::hex << kernel_arg.addr_a << ", size=" << std::dec
<< src_a_buf_size << " bytes\n";
std::ofstream file("input.a.bin", std::ios::binary | std::ios::out);
if (!file) {
std::cerr << "error: failed to open input.a.bin for writing\n";
exit(EXIT_FAILURE);
}
file.write(reinterpret_cast<char *>(buf_ptr), src_a_buf_size);
file.close();
}
{
auto buf_ptr = staging_buf.data();
memcpy(buf_ptr, src_b_data.data(), src_b_data.size() * sizeof(float));
RT_CHECK(vx_copy_to_dev(device, kernel_arg.addr_b, staging_buf.data(),
src_b_buf_size));
std::cout << "uploading source B matrix to device, device mem address="
<< std::hex << kernel_arg.addr_b << ", size=" << std::dec
<< src_b_buf_size << " bytes\n";
std::ofstream file("input.b.bin", std::ios::binary | std::ios::out);
if (!file) {
std::cerr << "error: failed to open input.b.bin for writing\n";
exit(EXIT_FAILURE);
}
file.write(reinterpret_cast<char *>(buf_ptr), src_b_buf_size);
file.close();
}
}
// clear destination buffer
{
std::cout << "clear destination buffer" << std::endl;
auto buf_ptr = (int32_t*)staging_buf.data();
for (uint32_t i = 0; i < ref_data.size(); ++i) {
buf_ptr[i] = 0xdeadbeef;
}
RT_CHECK(vx_copy_to_dev(device, kernel_arg.addr_c, staging_buf.data(), dst_buf_size));
}
// run tests
std::cout << "run tests" << std::endl;
RT_CHECK(run_test(kernel_arg, dst_buf_size, kernel_arg.dim_m, kernel_arg.dim_n));
std::cout << "PASSED!" << std::endl;
// cleanup
std::cout << "cleanup" << std::endl;
cleanup();
return 0;
}

View File

@@ -1,9 +0,0 @@
PROJECT = sgemmx
SRCS = main.cpp
VX_SRCS = kernel.cpp
OPTS ?= -n32
include ../common.mk

View File

@@ -1,18 +0,0 @@
#ifndef _COMMON_H_
#define _COMMON_H_
#define KERNEL_ARG_DEV_MEM_ADDR 0x7ffff000
#ifndef TYPE
#define TYPE float
#endif
typedef struct {
uint32_t num_tasks;
uint32_t size;
uint64_t A_addr;
uint64_t B_addr;
uint64_t C_addr;
} kernel_arg_t;
#endif

View File

@@ -1,41 +0,0 @@
#include <stdint.h>
#include <vx_intrinsics.h>
#include <vx_spawn.h>
#include "common.h"
inline char is_log2(uint32_t x) {
return ((x & (x-1)) == 0);
}
inline uint32_t log2_fast(uint32_t x) {
return 31 - __builtin_clz (x);
}
void kernel_body(uint32_t task_id, kernel_arg_t* __UNIFORM__ arg) {
auto A = reinterpret_cast<TYPE*>(arg->A_addr);
auto B = reinterpret_cast<TYPE*>(arg->B_addr);
auto C = reinterpret_cast<TYPE*>(arg->C_addr);
auto size = arg->size;
uint32_t row, col;
if (is_log2(size)) {
uint32_t log_size = log2_fast(size);
row = task_id >> log_size;
col = task_id & (size-1);
} else {
row = task_id / size;
col = task_id % size;
}
TYPE sum (0);
for (int e = 0; e < size; ++e) {
sum += A[row * size + e] * B[e * size + col];
}
C[row * size + col] = sum;
}
int main() {
kernel_arg_t* arg = (kernel_arg_t*)KERNEL_ARG_DEV_MEM_ADDR;
vx_spawn_tasks(arg->num_tasks, (vx_spawn_tasks_cb)kernel_body, arg);
return 0;
}

View File

@@ -1,251 +0,0 @@
#include <iostream>
#include <unistd.h>
#include <string.h>
#include <vector>
#include <chrono>
#include <vortex.h>
#include "common.h"
#define FLOAT_ULP 6
#define RT_CHECK(_expr) \
do { \
int _ret = _expr; \
if (0 == _ret) \
break; \
printf("Error: '%s' returned %d!\n", #_expr, (int)_ret); \
cleanup(); \
exit(-1); \
} while (false)
///////////////////////////////////////////////////////////////////////////////
template <typename Type>
class Comparator {};
template <>
class Comparator<int> {
public:
static const char* type_str() {
return "integer";
}
static int generate() {
return rand();
}
static bool compare(int a, int b, int index, int errors) {
if (a != b) {
if (errors < 100) {
printf("*** error: [%d] expected=%d, actual=%d\n", index, a, b);
}
return false;
}
return true;
}
};
template <>
class Comparator<float> {
public:
static const char* type_str() {
return "float";
}
static int generate() {
return static_cast<float>(rand()) / RAND_MAX;
}
static bool compare(float a, float b, int index, int errors) {
union fi_t { float f; int32_t i; };
fi_t fa, fb;
fa.f = a;
fb.f = b;
auto d = std::abs(fa.i - fb.i);
if (d > FLOAT_ULP) {
if (errors < 100) {
printf("*** error: [%d] expected=%f, actual=%f\n", index, a, b);
}
return false;
}
return true;
}
};
static void matmul_cpu(TYPE* out, const TYPE* A, const TYPE* B, uint32_t width, uint32_t height) {
for (uint32_t row = 0; row < height; ++row) {
for (uint32_t col = 0; col < width; ++col) {
TYPE sum(0);
for (uint32_t e = 0; e < width; ++e) {
sum += A[row * width + e] * B[e * width + col];
}
out[row * width + col] = sum;
}
}
}
const char* kernel_file = "kernel.bin";
uint32_t size = 32;
vx_device_h device = nullptr;
std::vector<uint8_t> staging_buf;
kernel_arg_t kernel_arg = {};
static void show_usage() {
std::cout << "Vortex Test." << std::endl;
std::cout << "Usage: [-k: kernel] [-n size] [-h: help]" << std::endl;
}
static void parse_args(int argc, char **argv) {
int c;
while ((c = getopt(argc, argv, "n:k:h?")) != -1) {
switch (c) {
case 'n':
size = atoi(optarg);
break;
case 'k':
kernel_file = optarg;
break;
case 'h':
case '?': {
show_usage();
exit(0);
} break;
default:
show_usage();
exit(-1);
}
}
}
void cleanup() {
if (device) {
vx_mem_free(device, kernel_arg.A_addr);
vx_mem_free(device, kernel_arg.B_addr);
vx_mem_free(device, kernel_arg.C_addr);
vx_dev_close(device);
}
}
int main(int argc, char *argv[]) {
// parse command arguments
parse_args(argc, argv);
std::srand(50);
// open device connection
std::cout << "open device connection" << std::endl;
RT_CHECK(vx_dev_open(&device));
uint32_t num_points = size * size;
uint32_t buf_size = num_points * sizeof(TYPE);
std::cout << "data type: " << Comparator<TYPE>::type_str() << std::endl;
std::cout << "matrix size: " << size << "x" << size << std::endl;
std::cout << "buffer size: " << buf_size << " bytes" << std::endl;
// upload program
std::cout << "upload program" << std::endl;
RT_CHECK(vx_upload_kernel_file(device, kernel_file));
// allocate device memory
std::cout << "allocate device memory" << std::endl;
RT_CHECK(vx_mem_alloc(device, buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.A_addr));
RT_CHECK(vx_mem_alloc(device, buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.B_addr));
RT_CHECK(vx_mem_alloc(device, buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.C_addr));
kernel_arg.num_tasks = num_points;
kernel_arg.size = size;
std::cout << "dev_src0=0x" << std::hex << kernel_arg.A_addr << std::endl;
std::cout << "dev_src1=0x" << std::hex << kernel_arg.B_addr << std::endl;
std::cout << "dev_dst=0x" << std::hex << kernel_arg.C_addr << std::endl;
// allocate staging buffer
std::cout << "allocate staging buffer" << std::endl;
uint32_t alloc_size = std::max<uint32_t>(buf_size, sizeof(kernel_arg_t));
staging_buf.resize(alloc_size);
// upload kernel argument
std::cout << "upload kernel argument" << std::endl;
memcpy(staging_buf.data(), &kernel_arg, sizeof(kernel_arg_t));
RT_CHECK(vx_copy_to_dev(device, KERNEL_ARG_DEV_MEM_ADDR, staging_buf.data(), sizeof(kernel_arg_t)));
// generate source data
std::vector<TYPE> src_A(num_points);
std::vector<TYPE> src_B(num_points);
std::vector<TYPE> refs(num_points);
for (uint32_t i = 0; i < num_points; ++i) {
auto a = static_cast<float>(std::rand()) / RAND_MAX;
auto b = static_cast<float>(std::rand()) / RAND_MAX;
src_A[i] = static_cast<TYPE>(a * size);
src_B[i] = static_cast<TYPE>(b * size);
}
matmul_cpu(refs.data(), src_A.data(), src_B.data(), size, size);
// upload source buffer0
{
std::cout << "upload source buffer0" << std::endl;
auto buf_ptr = (TYPE*)staging_buf.data();
for (uint32_t i = 0; i < num_points; ++i) {
buf_ptr[i] = src_A[i];
}
RT_CHECK(vx_copy_to_dev(device, kernel_arg.A_addr, staging_buf.data(), buf_size));
}
// upload source buffer1
{
std::cout << "upload source buffer1" << std::endl;
auto buf_ptr = (TYPE*)staging_buf.data();
for (uint32_t i = 0; i < num_points; ++i) {
buf_ptr[i] = src_B[i];
}
RT_CHECK(vx_copy_to_dev(device, kernel_arg.B_addr, staging_buf.data(), buf_size));
}
// clear destination buffer
std::cout << "clear destination buffer" << std::endl;
memset(staging_buf.data(), 0, num_points * sizeof(TYPE));
RT_CHECK(vx_copy_to_dev(device, kernel_arg.C_addr, staging_buf.data(), buf_size));
auto time_start = std::chrono::high_resolution_clock::now();
// start device
std::cout << "start device" << std::endl;
RT_CHECK(vx_start(device));
// wait for completion
std::cout << "wait for completion" << std::endl;
RT_CHECK(vx_ready_wait(device, VX_MAX_TIMEOUT));
auto time_end = std::chrono::high_resolution_clock::now();
double elapsed = std::chrono::duration_cast<std::chrono::milliseconds>(time_end - time_start).count();
printf("Elapsed time: %lg ms\n", elapsed);
// download destination buffer
std::cout << "download destination buffer" << std::endl;
RT_CHECK(vx_copy_from_dev(device, staging_buf.data(), kernel_arg.C_addr, buf_size));
// verify result
std::cout << "verify result" << std::endl;
{
int errors = 0;
auto buf_ptr = (TYPE*)staging_buf.data();
for (uint32_t i = 0; i < refs.size(); ++i) {
auto ref = refs[i];
auto cur = buf_ptr[i];
if (!Comparator<TYPE>::compare(cur, ref, i, errors)) {
++errors;
}
}
if (errors != 0) {
std::cout << "Found " << std::dec << errors << " errors!" << std::endl;
std::cout << "FAILED!" << std::endl;
return 1;
}
}
// cleanup
std::cout << "cleanup" << std::endl;
cleanup();
std::cout << "PASSED!" << std::endl;
return 0;
}

View File

@@ -1,9 +0,0 @@
PROJECT = sort
SRCS = main.cpp
VX_SRCS = kernel.cpp
OPTS ?= -n16
include ../common.mk

View File

@@ -1,16 +0,0 @@
#ifndef _COMMON_H_
#define _COMMON_H_
#define KERNEL_ARG_DEV_MEM_ADDR 0x7ffff000
#ifndef TYPE
#define TYPE int
#endif
typedef struct {
uint32_t num_points;
uint64_t src_addr;
uint64_t dst_addr;
} kernel_arg_t;
#endif

View File

@@ -1,25 +0,0 @@
#include <stdint.h>
#include <vx_intrinsics.h>
#include <vx_spawn.h>
#include "common.h"
void kernel_body(int task_id, kernel_arg_t* __UNIFORM__ arg) {
uint32_t num_points = arg->num_points;
auto src_ptr = (TYPE*)arg->src_addr;
auto dst_ptr = (TYPE*)arg->dst_addr;
auto ref_value = src_ptr[task_id];
uint32_t pos = 0;
for (uint32_t i = 0; i < num_points; ++i) {
auto cur_value = src_ptr[i];
pos += (cur_value < ref_value) || ((cur_value == ref_value) && (i < task_id));
}
dst_ptr[pos] = ref_value;
}
int main() {
kernel_arg_t* arg = (kernel_arg_t*)KERNEL_ARG_DEV_MEM_ADDR;
vx_spawn_tasks(arg->num_points, (vx_spawn_tasks_cb)kernel_body, arg);
return 0;
}

View File

@@ -1,214 +0,0 @@
#include <iostream>
#include <unistd.h>
#include <string.h>
#include <vortex.h>
#include <vector>
#include "common.h"
#define RT_CHECK(_expr) \
do { \
int _ret = _expr; \
if (0 == _ret) \
break; \
printf("Error: '%s' returned %d!\n", #_expr, (int)_ret); \
cleanup(); \
exit(-1); \
} while (false)
///////////////////////////////////////////////////////////////////////////////
const char* kernel_file = "kernel.bin";
uint32_t count = 0;
std::vector<TYPE> src_data;
std::vector<TYPE> ref_data;
vx_device_h device = nullptr;
std::vector<uint8_t> staging_buf;
kernel_arg_t kernel_arg = {};
static void show_usage() {
std::cout << "Vortex Test." << std::endl;
std::cout << "Usage: [-k: kernel] [-n words] [-h: help]" << std::endl;
}
static void parse_args(int argc, char **argv) {
int c;
while ((c = getopt(argc, argv, "n:k:h?")) != -1) {
switch (c) {
case 'n':
count = atoi(optarg);
break;
case 'k':
kernel_file = optarg;
break;
case 'h':
case '?': {
show_usage();
exit(0);
} break;
default:
show_usage();
exit(-1);
}
}
}
void cleanup() {
if (device) {
vx_mem_free(device, kernel_arg.src_addr);
vx_mem_free(device, kernel_arg.dst_addr);
vx_dev_close(device);
}
}
void gen_input_data(uint32_t num_points) {
src_data.resize(num_points);
for (uint32_t i = 0; i < num_points; ++i) {
auto r = static_cast<float>(std::rand()) / RAND_MAX;
auto value = static_cast<TYPE>(r * num_points);
src_data[i] = value;
std::cout << std::dec << i << ": value=" << value << std::endl;
}
}
void gen_ref_data(uint32_t num_points) {
ref_data.resize(num_points);
for (uint32_t i = 0; i < num_points; ++i) {
TYPE ref_value = src_data.at(i);
uint32_t pos = 0;
for (uint32_t j = 0; j < num_points; ++j) {
TYPE cur_value = src_data.at(j);
pos += (cur_value < ref_value) || (cur_value == ref_value && j < i);
}
ref_data.at(pos) = ref_value;
}
}
int run_test(const kernel_arg_t& kernel_arg,
uint32_t buf_size,
uint32_t num_points) {
// start device
std::cout << "start device" << std::endl;
RT_CHECK(vx_start(device));
// wait for completion
std::cout << "wait for completion" << std::endl;
RT_CHECK(vx_ready_wait(device, VX_MAX_TIMEOUT));
// download destination buffer
std::cout << "download destination buffer" << std::endl;
RT_CHECK(vx_copy_from_dev(device, staging_buf.data(), kernel_arg.dst_addr, buf_size));
// verify result
std::cout << "verify result" << std::endl;
{
int errors = 0;
auto buf_ptr = (TYPE*)staging_buf.data();
for (uint32_t i = 0; i < num_points; ++i) {
TYPE ref = ref_data.at(i);
TYPE cur = buf_ptr[i];
if (cur != ref) {
std::cout << "error at result #" << std::dec << i
<< std::hex << ": actual=" << cur << ", expected=" << ref << std::endl;
++errors;
}
}
if (errors != 0) {
std::cout << "Found " << std::dec << errors << " errors!" << std::endl;
std::cout << "FAILED!" << std::endl;
return 1;
}
}
return 0;
}
int main(int argc, char *argv[]) {
// parse command arguments
parse_args(argc, argv);
if (count == 0) {
count = 1;
}
std::srand(50);
// open device connection
std::cout << "open device connection" << std::endl;
RT_CHECK(vx_dev_open(&device));
uint32_t num_points = count;
// generate input data
gen_input_data(num_points);
// generate reference data
gen_ref_data(num_points);
uint32_t src_buf_size = src_data.size() * sizeof(int32_t);
uint32_t dst_buf_size = ref_data.size() * sizeof(int32_t);
std::cout << "number of points: " << num_points << std::endl;
std::cout << "buffer size: " << dst_buf_size << " bytes" << std::endl;
// upload program
std::cout << "upload program" << std::endl;
RT_CHECK(vx_upload_kernel_file(device, kernel_file));
// allocate device memory
std::cout << "allocate device memory" << std::endl;
RT_CHECK(vx_mem_alloc(device, src_buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.src_addr));
RT_CHECK(vx_mem_alloc(device, dst_buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.dst_addr));
kernel_arg.num_points = num_points;
std::cout << "dev_src=0x" << std::hex << kernel_arg.src_addr << std::endl;
std::cout << "dev_dst=0x" << std::hex << kernel_arg.dst_addr << std::endl;
// allocate staging buffer
{
std::cout << "allocate staging buffer" << std::endl;
uint32_t staging_buf_size = std::max<uint32_t>(src_buf_size,
std::max<uint32_t>(dst_buf_size,
sizeof(kernel_arg_t)));
staging_buf.resize(staging_buf_size);
}
// upload kernel argument
std::cout << "upload kernel argument" << std::endl;
memcpy(staging_buf.data(), &kernel_arg, sizeof(kernel_arg_t));
RT_CHECK(vx_copy_to_dev(device, KERNEL_ARG_DEV_MEM_ADDR, staging_buf.data(), sizeof(kernel_arg_t)));
// upload source buffer
{
std::cout << "upload source buffer" << std::endl;
auto buf_ptr = staging_buf.data();
memcpy(buf_ptr, src_data.data(), num_points * sizeof(TYPE));
RT_CHECK(vx_copy_to_dev(device, kernel_arg.src_addr, staging_buf.data(), src_buf_size));
}
// clear destination buffer
{
std::cout << "clear destination buffer" << std::endl;
auto buf_ptr = (int32_t*)staging_buf.data();
for (uint32_t i = 0; i < num_points; ++i) {
buf_ptr[i] = 0xdeadbeef;
}
RT_CHECK(vx_copy_to_dev(device, kernel_arg.dst_addr, staging_buf.data(), dst_buf_size));
}
// run tests
std::cout << "run tests" << std::endl;
RT_CHECK(run_test(kernel_arg, dst_buf_size, num_points));
// cleanup
std::cout << "cleanup" << std::endl;
cleanup();
std::cout << "PASSED!" << std::endl;
return 0;
}

View File

@@ -1,7 +1,5 @@
PROJECT = unaligned
SRCS = main.cpp common.h
VX_SRCS = kernel.cpp
OPTS ?= -n16

View File

@@ -1,92 +0,0 @@
#include <iostream>
#include <fstream>
#include <unistd.h>
#include <string.h>
#include <vortex.h>
#include <vector>
#include "common.h"
#define RT_CHECK(_expr) \
do { \
int _ret = _expr; \
if (0 == _ret) \
break; \
printf("Error: '%s' returned %d!\n", #_expr, (int)_ret); \
cleanup(); \
exit(-1); \
} while (false)
///////////////////////////////////////////////////////////////////////////////
const char* kernel_file = "kernel.bin";
uint32_t count = 0;
vx_device_h device = nullptr;
std::vector<uint8_t> staging_buf;
kernel_arg_t kernel_arg = {};
static void show_usage() {
std::cout << "Vortex Test." << std::endl;
std::cout << "Usage: [-k: kernel] [-n words] [-h: help]" << std::endl;
}
static void parse_args(int argc, char **argv) {
int c;
while ((c = getopt(argc, argv, "n:k:h?")) != -1) {
switch (c) {
case 'n':
count = atoi(optarg);
break;
case 'k':
kernel_file = optarg;
break;
case 'h':
case '?': {
show_usage();
exit(0);
} break;
default:
show_usage();
exit(-1);
}
}
}
void cleanup() {
if (device) {
vx_dev_close(device);
}
}
int main(int argc, char *argv[]) {
// parse command arguments
parse_args(argc, argv);
if (count == 0) {
count = 1;
}
std::srand(50);
// open device connection
std::cout << "open device connection" << std::endl;
RT_CHECK(vx_dev_open(&device));
// upload program
std::cout << "upload program" << std::endl;
RT_CHECK(vx_upload_kernel_file(device, kernel_file));
// start device
std::cout << "start device" << std::endl;
RT_CHECK(vx_start(device));
// wait for completion
std::cout << "wait for completion" << std::endl;
RT_CHECK(vx_ready_wait(device, VX_MAX_TIMEOUT));
// cleanup
std::cout << "cleanup" << std::endl;
cleanup();
return 0;
}

Binary file not shown.

View File

@@ -1,9 +1,7 @@
PROJECT = vecaddx
SRCS = main.cpp
VX_SRCS = kernel.cpp
OPTS ?= -n64
include ../common.mk
include ../common.mk

View File

@@ -1,275 +0,0 @@
#include <iostream>
#include <fstream>
#include <unistd.h>
#include <string.h>
#include <vector>
#include <vortex.h>
#include "common.h"
#define FLOAT_ULP 6
#define RT_CHECK(_expr) \
do { \
int _ret = _expr; \
if (0 == _ret) \
break; \
printf("Error: '%s' returned %d!\n", #_expr, (int)_ret); \
cleanup(); \
exit(-1); \
} while (false)
///////////////////////////////////////////////////////////////////////////////
template <typename Type>
class Comparator {};
template <>
class Comparator<int> {
public:
static const char* type_str() {
return "integer";
}
static int generate() {
return rand();
}
static bool compare(int a, int b, int index, int errors) {
if (a != b) {
if (errors < 100) {
printf("*** error: [%d] expected=%d, actual=%d\n", index, a, b);
}
return false;
}
return true;
}
};
template <>
class Comparator<float> {
private:
union Float_t { float f; int i; };
public:
static const char* type_str() {
return "float";
}
static int generate() {
return static_cast<float>(rand()) / RAND_MAX;
}
static bool compare(float a, float b, int index, int errors) {
union fi_t { float f; int32_t i; };
fi_t fa, fb;
fa.f = a;
fb.f = b;
auto d = std::abs(fa.i - fb.i);
if (d > FLOAT_ULP) {
if (errors < 100) {
printf("*** error: [%d] expected=%f, actual=%f\n", index, a, b);
}
return false;
}
return true;
}
};
const char* kernel_file = "kernel.bin";
uint32_t size = 16;
vx_device_h device = nullptr;
std::vector<TYPE> source_data;
std::vector<uint8_t> staging_buf;
kernel_arg_t kernel_arg = {};
static void show_usage() {
std::cout << "Vortex Test." << std::endl;
std::cout << "Usage: [-k: kernel] [-n words] [-h: help]" << std::endl;
}
static void parse_args(int argc, char **argv) {
int c;
while ((c = getopt(argc, argv, "n:k:h?")) != -1) {
switch (c) {
case 'n':
size = atoi(optarg);
break;
case 'k':
kernel_file = optarg;
break;
case 'h':
case '?': {
show_usage();
exit(0);
} break;
default:
show_usage();
exit(-1);
}
}
}
void cleanup() {
if (device) {
// vx_mem_free(device, kernel_arg.src0_addr);
// vx_mem_free(device, kernel_arg.src1_addr);
// vx_mem_free(device, kernel_arg.dst_addr);
vx_dev_close(device);
}
}
int run_test(const kernel_arg_t& kernel_arg,
uint32_t buf_size,
uint32_t num_points) {
// start device
std::cout << "start device" << std::endl;
RT_CHECK(vx_start(device));
// wait for completion
std::cout << "wait for completion" << std::endl;
RT_CHECK(vx_ready_wait(device, VX_MAX_TIMEOUT));
// download destination buffer
std::cout << "download destination buffer" << std::endl;
RT_CHECK(vx_copy_from_dev(device, staging_buf.data(), kernel_arg.dst_addr, buf_size));
// verify result
std::cout << "verify result" << std::endl;
{
int errors = 0;
auto buf_ptr = (TYPE*)staging_buf.data();
for (uint32_t i = 0; i < num_points; ++i) {
auto ref = source_data[2 * i + 0] + source_data[2 * i + 1];
auto cur = buf_ptr[i];
if (!Comparator<TYPE>::compare(cur, ref, i, errors)) {
++errors;
}
}
if (errors != 0) {
std::cout << "Found " << std::dec << errors << " errors!" << std::endl;
std::cout << "FAILED!" << std::endl;
return 1;
}
}
return 0;
}
int main(int argc, char *argv[]) {
// parse command arguments
parse_args(argc, argv);
std::srand(50);
// open device connection
std::cout << "open device connection" << std::endl;
RT_CHECK(vx_dev_open(&device));
uint64_t num_cores, num_warps, num_threads;
RT_CHECK(vx_dev_caps(device, VX_CAPS_NUM_CORES, &num_cores));
RT_CHECK(vx_dev_caps(device, VX_CAPS_NUM_WARPS, &num_warps));
RT_CHECK(vx_dev_caps(device, VX_CAPS_NUM_THREADS, &num_threads));
std::cout << "number of cores: " << num_cores << std::endl;
std::cout << "number of warps: " << num_warps << std::endl;
std::cout << "number of threads: " << num_threads << std::endl;
uint32_t num_points = size;
uint32_t buf_size = num_points * sizeof(TYPE);
std::cout << "number of points: " << num_points << std::endl;
std::cout << "data type: " << Comparator<TYPE>::type_str() << std::endl;
std::cout << "buffer size: " << buf_size << " bytes" << std::endl;
// upload program
std::cout << "upload program" << std::endl;
RT_CHECK(vx_upload_kernel_file(device, kernel_file));
// allocate device memory
std::cout << "allocate device memory" << std::endl;
// RT_CHECK(vx_mem_alloc(device, buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.src0_addr));
// RT_CHECK(vx_mem_alloc(device, buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.src1_addr));
// RT_CHECK(vx_mem_alloc(device, buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.dst_addr));
kernel_arg.src0_addr = 0x20000UL;
kernel_arg.src1_addr = 0x28000UL;
kernel_arg.dst_addr = 0xc0000000UL;
kernel_arg.num_points = num_points;
std::cout << "dev_src0=0x" << std::hex << kernel_arg.src0_addr << std::endl;
std::cout << "dev_src1=0x" << std::hex << kernel_arg.src1_addr << std::endl;
std::cout << "dev_dst=0x" << std::hex << kernel_arg.dst_addr << std::endl;
// allocate staging buffer
std::cout << "allocate staging buffer" << std::endl;
uint32_t alloc_size = std::max<uint32_t>(buf_size, sizeof(kernel_arg_t));
staging_buf.resize(alloc_size);
// upload kernel argument
std::cout << "upload kernel argument" << std::endl;
memcpy(staging_buf.data(), &kernel_arg, sizeof(kernel_arg_t));
RT_CHECK(vx_copy_to_dev(device, KERNEL_ARG_DEV_MEM_ADDR, staging_buf.data(), sizeof(kernel_arg_t)));
std::ofstream file("args.bin", std::ios::binary | std::ios::out);
if (!file) {
std::cerr << "error: failed to open args.bin for writing\n";
exit(EXIT_FAILURE);
}
file.write(reinterpret_cast<char *>(staging_buf.data()), sizeof(kernel_arg_t));
file.close();
// generate source data
source_data.resize(2 * num_points);
for (uint32_t i = 0; i < source_data.size(); ++i) {
// source_data[i] = Comparator<TYPE>::generate();
source_data[i] = static_cast<float>(i);
}
// upload source buffer0
{
std::cout << "upload source buffer0" << std::endl;
auto buf_ptr = (TYPE*)staging_buf.data();
for (uint32_t i = 0; i < num_points; ++i) {
buf_ptr[i] = source_data[2 * i + 0];
}
RT_CHECK(vx_copy_to_dev(device, kernel_arg.src0_addr, staging_buf.data(), buf_size));
std::ofstream file("input.a.bin", std::ios::binary | std::ios::out);
if (!file) {
std::cerr << "error: failed to open input.a.bin for writing\n";
exit(EXIT_FAILURE);
}
file.write(reinterpret_cast<char *>(buf_ptr), buf_size);
file.close();
}
// upload source buffer1
{
std::cout << "upload source buffer1" << std::endl;
auto buf_ptr = (TYPE*)staging_buf.data();
for (uint32_t i = 0; i < num_points; ++i) {
buf_ptr[i] = source_data[2 * i + 1];
}
RT_CHECK(vx_copy_to_dev(device, kernel_arg.src1_addr, staging_buf.data(), buf_size));
std::ofstream file("input.b.bin", std::ios::binary | std::ios::out);
if (!file) {
std::cerr << "error: failed to open input.b.bin for writing\n";
exit(EXIT_FAILURE);
}
file.write(reinterpret_cast<char *>(buf_ptr), buf_size);
file.close();
}
// clear destination buffer
std::cout << "clear destination buffer" << std::endl;
memset(staging_buf.data(), 0, num_points * sizeof(TYPE));
RT_CHECK(vx_copy_to_dev(device, kernel_arg.dst_addr, staging_buf.data(), buf_size));
// run tests
std::cout << "run tests" << std::endl;
RT_CHECK(run_test(kernel_arg, buf_size, num_points));
// cleanup
std::cout << "cleanup" << std::endl;
cleanup();
std::cout << "PASSED!" << std::endl;
return 0;
}