Vortex 2.0 changes:

+ Microarchitecture optimizations
+ 64-bit support
+ Xilinx FPGA support
+ LLVM-16 support
+ Refactoring and quality control fixes

minor update

minor update

minor update

minor update

minor update

minor update

cleanup

cleanup

cache bindings and memory perf refactoring

minor update

minor update

hw unit tests fixes

minor update

minor update

minor update

minor update

minor update

minor update

minor update

minor update

minor update

minor update

minor update

minor update

minor update

minor updates

minor updates

minor update

minor update

minor update

minor update

minor update

minor update

minor updates

minor updates

minor updates

minor updates

minor update

minor update
This commit is contained in:
Blaise Tine
2023-10-19 20:51:22 -07:00
parent d69a64c32c
commit c1e168fdbe
1309 changed files with 247412 additions and 311463 deletions

View File

@@ -1,80 +1,9 @@
# Target word size; it selects the linker script vx_link$(XLEN).ld used below.
XLEN ?= 32
# Tool and runtime locations. `?=` only assigns when the variable is not already set,
# so each of these can be overridden from the environment or the command line.
RISCV_TOOLCHAIN_PATH ?= /opt/riscv-gnu-toolchain
VORTEX_DRV_PATH ?= $(realpath ../../../driver)
VORTEX_RT_PATH ?= $(realpath ../../../runtime)
# Command-line options handed to the host program by the run-* targets.
OPTS ?= -n64
# RISC-V cross tools used to build and inspect the device kernel.
VX_CC = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-gcc
VX_CXX = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-g++
VX_DP = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-objdump
VX_CP = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-objcopy
# Device kernel compile and link flags.
VX_CFLAGS += -march=rv32imf -mabi=ilp32f -O3 -Wstack-usage=1024 -ffreestanding -nostartfiles -fdata-sections -ffunction-sections
VX_CFLAGS += -I$(VORTEX_RT_PATH)/include -I$(VORTEX_RT_PATH)/../hw
VX_LDFLAGS += -Wl,-Bstatic,-T,$(VORTEX_RT_PATH)/linker/vx_link$(XLEN).ld -Wl,--gc-sections $(VORTEX_RT_PATH)/libvortexrt.a
VX_LDFLAGS += -lm
VX_SRCS = kernel.c
# Host program compile and link flags.
CXXFLAGS += -std=c++11 -Wall -Wextra -pedantic -Wfatal-errors
CXXFLAGS += -I$(VORTEX_DRV_PATH)/include -I$(VORTEX_RT_PATH)/../hw
LDFLAGS += -L$(VORTEX_DRV_PATH)/stub -lvortex
# Debugging
ifdef DEBUG
CXXFLAGS += -g -O0
else
CXXFLAGS += -O2 -DNDEBUG
endif
PROJECT = dogfood
SRCS = main.cpp
# Default goal: host executable plus the kernel binary and its disassembly.
all: $(PROJECT) kernel.bin kernel.dump
# Disassembly of the kernel ELF.
kernel.dump: kernel.elf
$(VX_DP) -d -r -t kernel.elf > kernel.dump
VX_SRCS = kernel.cpp
# Raw binary image extracted from the kernel ELF.
kernel.bin: kernel.elf
$(VX_CP) -O binary kernel.elf kernel.bin
OPTS ?= -n64 -x19 -x20
# Device kernel ELF built from $(VX_SRCS).
kernel.elf: $(VX_SRCS)
$(VX_CC) $(VX_CFLAGS) $(VX_SRCS) $(VX_LDFLAGS) -o kernel.elf
# Host test program.
$(PROJECT): $(SRCS)
$(CXX) $(CXXFLAGS) $^ $(LDFLAGS) -o $@
# run-<backend>: run the host program with that backend's driver directory
# placed on LD_LIBRARY_PATH.
run-simx: $(PROJECT) kernel.bin
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/simx:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
run-fpga: $(PROJECT) kernel.bin
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/fpga:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
run-asesim: $(PROJECT) kernel.bin
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/asesim:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
run-vlsim: $(PROJECT) kernel.bin
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/vlsim:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
run-rtlsim: $(PROJECT) kernel.bin
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/rtlsim:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
# Header dependencies of the host sources, generated with -MM.
.depend: $(SRCS)
$(CXX) $(CXXFLAGS) -MM $^ > .depend;
# clean removes host artifacts; clean-all also removes the kernel outputs.
clean:
rm -rf $(PROJECT) *.o .depend
clean-all: clean
rm -rf *.elf *.bin *.dump
# Do not pull in .depend when the goal is exactly `clean`.
ifneq ($(MAKECMDGOALS),clean)
-include .depend
endif
include ../common.mk

View File

@@ -7,9 +7,9 @@ typedef struct {
uint32_t testid;
uint32_t num_tasks;
uint32_t task_size;
uint32_t src0_addr;
uint32_t src1_addr;
uint32_t dst_addr;
uint64_t src0_addr;
uint64_t src1_addr;
uint64_t dst_addr;
} kernel_arg_t;
#endif

View File

@@ -1,334 +0,0 @@
#include <stdint.h>
#include <math.h>
#include <vx_intrinsics.h>
#include <vx_spawn.h>
#include "common.h"
typedef void (*PFN_Kernel)(int task_id, kernel_arg_t* arg);
// Single-precision square root issued directly as the fsqrt.s instruction.
// x serves as both the input operand (%1) and the output operand (%0).
inline float __ieee754_sqrtf (float x) {
asm ("fsqrt.s %0, %1" : "=f" (x) : "f" (x));
return x;
}
// Integer add test: dst[k] = src0[k] + src1[k] over this task's slice,
// i.e. the task_size elements starting at task_id * task_size.
void kernel_iadd(int task_id, kernel_arg_t* arg) {
  const uint32_t len   = arg->task_size;
  const uint32_t first = task_id * len;
  const int32_t* lhs = (const int32_t*)arg->src0_addr;
  const int32_t* rhs = (const int32_t*)arg->src1_addr;
  int32_t*       out = (int32_t*)arg->dst_addr;

  for (uint32_t k = 0; k != len; ++k) {
    const uint32_t idx = first + k;
    out[idx] = lhs[idx] + rhs[idx];
  }
}
// Integer multiply test: dst[k] = src0[k] * src1[k] over this task's slice.
void kernel_imul(int task_id, kernel_arg_t* arg) {
  const uint32_t len   = arg->task_size;
  const uint32_t first = task_id * len;
  const int32_t* lhs = (const int32_t*)arg->src0_addr;
  const int32_t* rhs = (const int32_t*)arg->src1_addr;
  int32_t*       out = (int32_t*)arg->dst_addr;

  for (uint32_t k = 0; k != len; ++k) {
    const uint32_t idx = first + k;
    out[idx] = lhs[idx] * rhs[idx];
  }
}
// Integer divide test: dst[k] = src0[k] / src1[k] over this task's slice.
void kernel_idiv(int task_id, kernel_arg_t* arg) {
  const uint32_t len   = arg->task_size;
  const uint32_t first = task_id * len;
  const int32_t* lhs = (const int32_t*)arg->src0_addr;
  const int32_t* rhs = (const int32_t*)arg->src1_addr;
  int32_t*       out = (int32_t*)arg->dst_addr;

  for (uint32_t k = 0; k != len; ++k) {
    const uint32_t idx = first + k;
    out[idx] = lhs[idx] / rhs[idx];
  }
}
// Combined divide/multiply test: dst[k] = (a / b) + (a * b) with
// a = src0[k], b = src1[k], over this task's slice.
void kernel_idiv_mul(int task_id, kernel_arg_t* arg) {
  const uint32_t len   = arg->task_size;
  const uint32_t first = task_id * len;
  const int32_t* lhs = (const int32_t*)arg->src0_addr;
  const int32_t* rhs = (const int32_t*)arg->src1_addr;
  int32_t*       out = (int32_t*)arg->dst_addr;

  for (uint32_t k = 0; k != len; ++k) {
    const uint32_t idx = first + k;
    const int32_t x = lhs[idx];
    const int32_t y = rhs[idx];
    out[idx] = (x / y) + (x * y);
  }
}
// Float add test: dst[k] = src0[k] + src1[k] over this task's slice.
void kernel_fadd(int task_id, kernel_arg_t* arg) {
  const uint32_t len   = arg->task_size;
  const uint32_t first = task_id * len;
  const float* lhs = (const float*)arg->src0_addr;
  const float* rhs = (const float*)arg->src1_addr;
  float*       out = (float*)arg->dst_addr;

  for (uint32_t k = 0; k != len; ++k) {
    const uint32_t idx = first + k;
    out[idx] = lhs[idx] + rhs[idx];
  }
}
// Float subtract test: dst[k] = src0[k] - src1[k] over this task's slice.
void kernel_fsub(int task_id, kernel_arg_t* arg) {
  const uint32_t len   = arg->task_size;
  const uint32_t first = task_id * len;
  const float* lhs = (const float*)arg->src0_addr;
  const float* rhs = (const float*)arg->src1_addr;
  float*       out = (float*)arg->dst_addr;

  for (uint32_t k = 0; k != len; ++k) {
    const uint32_t idx = first + k;
    out[idx] = lhs[idx] - rhs[idx];
  }
}
// Float multiply test: dst[k] = src0[k] * src1[k] over this task's slice.
void kernel_fmul(int task_id, kernel_arg_t* arg) {
  const uint32_t len   = arg->task_size;
  const uint32_t first = task_id * len;
  const float* lhs = (const float*)arg->src0_addr;
  const float* rhs = (const float*)arg->src1_addr;
  float*       out = (float*)arg->dst_addr;

  for (uint32_t k = 0; k != len; ++k) {
    const uint32_t idx = first + k;
    out[idx] = lhs[idx] * rhs[idx];
  }
}
// Multiply-add test: dst[k] = a * b + b with a = src0[k], b = src1[k].
void kernel_fmadd(int task_id, kernel_arg_t* arg) {
  const uint32_t len   = arg->task_size;
  const uint32_t first = task_id * len;
  const float* lhs = (const float*)arg->src0_addr;
  const float* rhs = (const float*)arg->src1_addr;
  float*       out = (float*)arg->dst_addr;

  for (uint32_t k = 0; k != len; ++k) {
    const uint32_t idx = first + k;
    const float x = lhs[idx];
    const float y = rhs[idx];
    out[idx] = x * y + y;
  }
}
// Multiply-subtract test: dst[k] = a * b - b with a = src0[k], b = src1[k].
void kernel_fmsub(int task_id, kernel_arg_t* arg) {
  const uint32_t len   = arg->task_size;
  const uint32_t first = task_id * len;
  const float* lhs = (const float*)arg->src0_addr;
  const float* rhs = (const float*)arg->src1_addr;
  float*       out = (float*)arg->dst_addr;

  for (uint32_t k = 0; k != len; ++k) {
    const uint32_t idx = first + k;
    const float x = lhs[idx];
    const float y = rhs[idx];
    out[idx] = x * y - y;
  }
}
// Negated multiply-add test: dst[k] = -a * b - b with a = src0[k], b = src1[k].
void kernel_fnmadd(int task_id, kernel_arg_t* arg) {
  const uint32_t len   = arg->task_size;
  const uint32_t first = task_id * len;
  const float* lhs = (const float*)arg->src0_addr;
  const float* rhs = (const float*)arg->src1_addr;
  float*       out = (float*)arg->dst_addr;

  for (uint32_t k = 0; k != len; ++k) {
    const uint32_t idx = first + k;
    const float x = lhs[idx];
    const float y = rhs[idx];
    out[idx] = -x * y - y;
  }
}
// Negated multiply-subtract test: dst[k] = -a * b + b with a = src0[k], b = src1[k].
void kernel_fnmsub(int task_id, kernel_arg_t* arg) {
  const uint32_t len   = arg->task_size;
  const uint32_t first = task_id * len;
  const float* lhs = (const float*)arg->src0_addr;
  const float* rhs = (const float*)arg->src1_addr;
  float*       out = (float*)arg->dst_addr;

  for (uint32_t k = 0; k != len; ++k) {
    const uint32_t idx = first + k;
    const float x = lhs[idx];
    const float y = rhs[idx];
    out[idx] = -x * y + y;
  }
}
// Combined test: dst[k] = (-a * b - b) + (a * b + b) with a = src0[k], b = src1[k].
// The two halves are kept as separate statements, as in the reference form.
void kernel_fnmadd_madd(int task_id, kernel_arg_t* arg) {
  const uint32_t len   = arg->task_size;
  const uint32_t first = task_id * len;
  const float* lhs = (const float*)arg->src0_addr;
  const float* rhs = (const float*)arg->src1_addr;
  float*       out = (float*)arg->dst_addr;

  for (uint32_t k = 0; k != len; ++k) {
    const uint32_t idx = first + k;
    const float x = lhs[idx];
    const float y = rhs[idx];
    const float neg_part = -x * y - y;
    const float pos_part = x * y + y;
    const float total = neg_part + pos_part;
    out[idx] = total;
  }
}
// Float divide test: dst[k] = src0[k] / src1[k] over this task's slice.
void kernel_fdiv(int task_id, kernel_arg_t* arg) {
  const uint32_t len   = arg->task_size;
  const uint32_t first = task_id * len;
  const float* lhs = (const float*)arg->src0_addr;
  const float* rhs = (const float*)arg->src1_addr;
  float*       out = (float*)arg->dst_addr;

  for (uint32_t k = 0; k != len; ++k) {
    const uint32_t idx = first + k;
    out[idx] = lhs[idx] / rhs[idx];
  }
}
// Double divide test: dst[k] = (a / b) + (b / a) with a = src0[k], b = src1[k].
void kernel_fdiv2(int task_id, kernel_arg_t* arg) {
  const uint32_t len   = arg->task_size;
  const uint32_t first = task_id * len;
  const float* lhs = (const float*)arg->src0_addr;
  const float* rhs = (const float*)arg->src1_addr;
  float*       out = (float*)arg->dst_addr;

  for (uint32_t k = 0; k != len; ++k) {
    const uint32_t idx = first + k;
    const float x = lhs[idx];
    const float y = rhs[idx];
    const float fwd = x / y;
    const float rev = y / x;
    const float total = fwd + rev;
    out[idx] = total;
  }
}
// Square-root test: dst[k] = sqrt(src0[k] * src1[k]) using __ieee754_sqrtf.
void kernel_fsqrt(int task_id, kernel_arg_t* arg) {
  const uint32_t len   = arg->task_size;
  const uint32_t first = task_id * len;
  const float* lhs = (const float*)arg->src0_addr;
  const float* rhs = (const float*)arg->src1_addr;
  float*       out = (float*)arg->dst_addr;

  for (uint32_t k = 0; k != len; ++k) {
    const uint32_t idx = first + k;
    out[idx] = __ieee754_sqrtf(lhs[idx] * rhs[idx]);
  }
}
// Float-to-signed-int test: dst[k] = (int32_t)(src0[k] + src1[k]).
void kernel_ftoi(int task_id, kernel_arg_t* arg) {
  const uint32_t len   = arg->task_size;
  const uint32_t first = task_id * len;
  const float* lhs = (const float*)arg->src0_addr;
  const float* rhs = (const float*)arg->src1_addr;
  int32_t*     out = (int32_t*)arg->dst_addr;

  for (uint32_t k = 0; k != len; ++k) {
    const uint32_t idx = first + k;
    const float sum = lhs[idx] + rhs[idx];
    out[idx] = (int32_t)sum;
  }
}
// Float-to-unsigned-int test: dst[k] = (uint32_t)(src0[k] + src1[k]).
void kernel_ftou(int task_id, kernel_arg_t* arg) {
  const uint32_t len   = arg->task_size;
  const uint32_t first = task_id * len;
  const float* lhs = (const float*)arg->src0_addr;
  const float* rhs = (const float*)arg->src1_addr;
  uint32_t*    out = (uint32_t*)arg->dst_addr;

  for (uint32_t k = 0; k != len; ++k) {
    const uint32_t idx = first + k;
    const float sum = lhs[idx] + rhs[idx];
    out[idx] = (uint32_t)sum;
  }
}
// Signed-int-to-float test: dst[k] = (float)(src0[k] + src1[k]),
// with the sum held in an int32_t.
void kernel_itof(int task_id, kernel_arg_t* arg) {
  const uint32_t len   = arg->task_size;
  const uint32_t first = task_id * len;
  const int32_t* lhs = (const int32_t*)arg->src0_addr;
  const int32_t* rhs = (const int32_t*)arg->src1_addr;
  float*         out = (float*)arg->dst_addr;

  for (uint32_t k = 0; k != len; ++k) {
    const uint32_t idx = first + k;
    const int32_t sum = lhs[idx] + rhs[idx];
    out[idx] = (float)sum;
  }
}
// Unsigned-int-to-float test: the int32 inputs are added, the sum is stored
// in a uint32_t, and that unsigned value is converted to float.
void kernel_utof(int task_id, kernel_arg_t* arg) {
  const uint32_t len   = arg->task_size;
  const uint32_t first = task_id * len;
  const int32_t* lhs = (const int32_t*)arg->src0_addr;
  const int32_t* rhs = (const int32_t*)arg->src1_addr;
  float*         out = (float*)arg->dst_addr;

  for (uint32_t k = 0; k != len; ++k) {
    const uint32_t idx = first + k;
    const uint32_t sum = lhs[idx] + rhs[idx];
    out[idx] = (float)sum;
  }
}
// Kernel dispatch table. main() indexes it with kernel_arg_t::testid, so the
// position of each entry is its test id; do not reorder.
static const PFN_Kernel sc_tests[] = {
kernel_iadd,
kernel_imul,
kernel_idiv,
kernel_idiv_mul,
kernel_fadd,
kernel_fsub,
kernel_fmul,
kernel_fmadd,
kernel_fmsub,
kernel_fnmadd,
kernel_fnmsub,
kernel_fnmadd_madd,
kernel_fdiv,
kernel_fdiv2,
kernel_fsqrt,
kernel_ftoi,
kernel_ftou,
kernel_itof,
kernel_utof,
};
void main() {
kernel_arg_t* arg = (kernel_arg_t*)KERNEL_ARG_DEV_MEM_ADDR;
vx_spawn_tasks(arg->num_tasks, (vx_spawn_tasks_cb)sc_tests[arg->testid], arg);
}

View File

@@ -0,0 +1,396 @@
#include <stdint.h>
#include <math.h>
#include <vx_intrinsics.h>
#include <vx_spawn.h>
#include "common.h"
typedef void (*PFN_Kernel)(int task_id, kernel_arg_t* __UNIFORM__ arg);
// Single-precision square root issued directly as the fsqrt.s instruction.
// x serves as both the input operand (%1) and the output operand (%0).
inline float __ieee754_sqrtf (float x) {
asm ("fsqrt.s %0, %1" : "=f" (x) : "f" (x));
return x;
}
// Integer add test: dst[k] = src0[k] + src1[k] over this task's slice,
// i.e. the task_size elements starting at task_id * task_size.
void kernel_iadd(int task_id, kernel_arg_t* __UNIFORM__ arg) {
  const uint32_t len   = arg->task_size;
  const uint32_t first = task_id * len;
  const auto* lhs = reinterpret_cast<const int32_t*>(arg->src0_addr);
  const auto* rhs = reinterpret_cast<const int32_t*>(arg->src1_addr);
  auto*       out = reinterpret_cast<int32_t*>(arg->dst_addr);

  for (uint32_t k = 0; k != len; ++k) {
    const uint32_t idx = first + k;
    out[idx] = lhs[idx] + rhs[idx];
  }
}
// Integer multiply test: dst[k] = src0[k] * src1[k] over this task's slice.
void kernel_imul(int task_id, kernel_arg_t* __UNIFORM__ arg) {
  const uint32_t len   = arg->task_size;
  const uint32_t first = task_id * len;
  const auto* lhs = reinterpret_cast<const int32_t*>(arg->src0_addr);
  const auto* rhs = reinterpret_cast<const int32_t*>(arg->src1_addr);
  auto*       out = reinterpret_cast<int32_t*>(arg->dst_addr);

  for (uint32_t k = 0; k != len; ++k) {
    const uint32_t idx = first + k;
    out[idx] = lhs[idx] * rhs[idx];
  }
}
// Integer divide test: dst[k] = src0[k] / src1[k] over this task's slice.
void kernel_idiv(int task_id, kernel_arg_t* __UNIFORM__ arg) {
  const uint32_t len   = arg->task_size;
  const uint32_t first = task_id * len;
  const auto* lhs = reinterpret_cast<const int32_t*>(arg->src0_addr);
  const auto* rhs = reinterpret_cast<const int32_t*>(arg->src1_addr);
  auto*       out = reinterpret_cast<int32_t*>(arg->dst_addr);

  for (uint32_t k = 0; k != len; ++k) {
    const uint32_t idx = first + k;
    out[idx] = lhs[idx] / rhs[idx];
  }
}
// Combined divide/multiply test: dst[k] = (a / b) + (a * b) with
// a = src0[k], b = src1[k], over this task's slice.
void kernel_idiv_mul(int task_id, kernel_arg_t* __UNIFORM__ arg) {
  const uint32_t len   = arg->task_size;
  const uint32_t first = task_id * len;
  const auto* lhs = reinterpret_cast<const int32_t*>(arg->src0_addr);
  const auto* rhs = reinterpret_cast<const int32_t*>(arg->src1_addr);
  auto*       out = reinterpret_cast<int32_t*>(arg->dst_addr);

  for (uint32_t k = 0; k != len; ++k) {
    const uint32_t idx = first + k;
    const int32_t x = lhs[idx];
    const int32_t y = rhs[idx];
    out[idx] = (x / y) + (x * y);
  }
}
// Float add test: dst[k] = src0[k] + src1[k] over this task's slice.
void kernel_fadd(int task_id, kernel_arg_t* __UNIFORM__ arg) {
  const uint32_t len   = arg->task_size;
  const uint32_t first = task_id * len;
  const auto* lhs = reinterpret_cast<const float*>(arg->src0_addr);
  const auto* rhs = reinterpret_cast<const float*>(arg->src1_addr);
  auto*       out = reinterpret_cast<float*>(arg->dst_addr);

  for (uint32_t k = 0; k != len; ++k) {
    const uint32_t idx = first + k;
    out[idx] = lhs[idx] + rhs[idx];
  }
}
// Float subtract test: dst[k] = src0[k] - src1[k] over this task's slice.
void kernel_fsub(int task_id, kernel_arg_t* __UNIFORM__ arg) {
  const uint32_t len   = arg->task_size;
  const uint32_t first = task_id * len;
  const auto* lhs = reinterpret_cast<const float*>(arg->src0_addr);
  const auto* rhs = reinterpret_cast<const float*>(arg->src1_addr);
  auto*       out = reinterpret_cast<float*>(arg->dst_addr);

  for (uint32_t k = 0; k != len; ++k) {
    const uint32_t idx = first + k;
    out[idx] = lhs[idx] - rhs[idx];
  }
}
// Float multiply test: dst[k] = src0[k] * src1[k] over this task's slice.
void kernel_fmul(int task_id, kernel_arg_t* __UNIFORM__ arg) {
  const uint32_t len   = arg->task_size;
  const uint32_t first = task_id * len;
  const auto* lhs = reinterpret_cast<const float*>(arg->src0_addr);
  const auto* rhs = reinterpret_cast<const float*>(arg->src1_addr);
  auto*       out = reinterpret_cast<float*>(arg->dst_addr);

  for (uint32_t k = 0; k != len; ++k) {
    const uint32_t idx = first + k;
    out[idx] = lhs[idx] * rhs[idx];
  }
}
// Multiply-add test: dst[k] = a * b + b with a = src0[k], b = src1[k].
void kernel_fmadd(int task_id, kernel_arg_t* __UNIFORM__ arg) {
  const uint32_t len   = arg->task_size;
  const uint32_t first = task_id * len;
  const auto* lhs = reinterpret_cast<const float*>(arg->src0_addr);
  const auto* rhs = reinterpret_cast<const float*>(arg->src1_addr);
  auto*       out = reinterpret_cast<float*>(arg->dst_addr);

  for (uint32_t k = 0; k != len; ++k) {
    const uint32_t idx = first + k;
    const float x = lhs[idx];
    const float y = rhs[idx];
    out[idx] = x * y + y;
  }
}
// Multiply-subtract test: dst[k] = a * b - b with a = src0[k], b = src1[k].
void kernel_fmsub(int task_id, kernel_arg_t* __UNIFORM__ arg) {
  const uint32_t len   = arg->task_size;
  const uint32_t first = task_id * len;
  const auto* lhs = reinterpret_cast<const float*>(arg->src0_addr);
  const auto* rhs = reinterpret_cast<const float*>(arg->src1_addr);
  auto*       out = reinterpret_cast<float*>(arg->dst_addr);

  for (uint32_t k = 0; k != len; ++k) {
    const uint32_t idx = first + k;
    const float x = lhs[idx];
    const float y = rhs[idx];
    out[idx] = x * y - y;
  }
}
// Negated multiply-add test: dst[k] = -a * b - b with a = src0[k], b = src1[k].
void kernel_fnmadd(int task_id, kernel_arg_t* __UNIFORM__ arg) {
  const uint32_t len   = arg->task_size;
  const uint32_t first = task_id * len;
  const auto* lhs = reinterpret_cast<const float*>(arg->src0_addr);
  const auto* rhs = reinterpret_cast<const float*>(arg->src1_addr);
  auto*       out = reinterpret_cast<float*>(arg->dst_addr);

  for (uint32_t k = 0; k != len; ++k) {
    const uint32_t idx = first + k;
    const float x = lhs[idx];
    const float y = rhs[idx];
    out[idx] = -x * y - y;
  }
}
// Negated multiply-subtract test: dst[k] = -a * b + b with a = src0[k], b = src1[k].
void kernel_fnmsub(int task_id, kernel_arg_t* __UNIFORM__ arg) {
  const uint32_t len   = arg->task_size;
  const uint32_t first = task_id * len;
  const auto* lhs = reinterpret_cast<const float*>(arg->src0_addr);
  const auto* rhs = reinterpret_cast<const float*>(arg->src1_addr);
  auto*       out = reinterpret_cast<float*>(arg->dst_addr);

  for (uint32_t k = 0; k != len; ++k) {
    const uint32_t idx = first + k;
    const float x = lhs[idx];
    const float y = rhs[idx];
    out[idx] = -x * y + y;
  }
}
// Combined test: dst[k] = (-a * b - b) + (a * b + b) with a = src0[k], b = src1[k].
// The two halves are kept as separate statements, as in the reference form.
void kernel_fnmadd_madd(int task_id, kernel_arg_t* __UNIFORM__ arg) {
  const uint32_t len   = arg->task_size;
  const uint32_t first = task_id * len;
  const auto* lhs = reinterpret_cast<const float*>(arg->src0_addr);
  const auto* rhs = reinterpret_cast<const float*>(arg->src1_addr);
  auto*       out = reinterpret_cast<float*>(arg->dst_addr);

  for (uint32_t k = 0; k != len; ++k) {
    const uint32_t idx = first + k;
    const float x = lhs[idx];
    const float y = rhs[idx];
    const float neg_part = -x * y - y;
    const float pos_part = x * y + y;
    const float total = neg_part + pos_part;
    out[idx] = total;
  }
}
// Float divide test: dst[k] = src0[k] / src1[k] over this task's slice.
void kernel_fdiv(int task_id, kernel_arg_t* __UNIFORM__ arg) {
  const uint32_t len   = arg->task_size;
  const uint32_t first = task_id * len;
  const auto* lhs = reinterpret_cast<const float*>(arg->src0_addr);
  const auto* rhs = reinterpret_cast<const float*>(arg->src1_addr);
  auto*       out = reinterpret_cast<float*>(arg->dst_addr);

  for (uint32_t k = 0; k != len; ++k) {
    const uint32_t idx = first + k;
    out[idx] = lhs[idx] / rhs[idx];
  }
}
// Double divide test: dst[k] = (a / b) + (b / a) with a = src0[k], b = src1[k].
void kernel_fdiv2(int task_id, kernel_arg_t* __UNIFORM__ arg) {
  const uint32_t len   = arg->task_size;
  const uint32_t first = task_id * len;
  const auto* lhs = reinterpret_cast<const float*>(arg->src0_addr);
  const auto* rhs = reinterpret_cast<const float*>(arg->src1_addr);
  auto*       out = reinterpret_cast<float*>(arg->dst_addr);

  for (uint32_t k = 0; k != len; ++k) {
    const uint32_t idx = first + k;
    const float x = lhs[idx];
    const float y = rhs[idx];
    const float fwd = x / y;
    const float rev = y / x;
    const float total = fwd + rev;
    out[idx] = total;
  }
}
// Square-root test: dst[k] = sqrt(src0[k] * src1[k]) using __ieee754_sqrtf.
void kernel_fsqrt(int task_id, kernel_arg_t* __UNIFORM__ arg) {
  const uint32_t len   = arg->task_size;
  const uint32_t first = task_id * len;
  const auto* lhs = reinterpret_cast<const float*>(arg->src0_addr);
  const auto* rhs = reinterpret_cast<const float*>(arg->src1_addr);
  auto*       out = reinterpret_cast<float*>(arg->dst_addr);

  for (uint32_t k = 0; k != len; ++k) {
    const uint32_t idx = first + k;
    out[idx] = __ieee754_sqrtf(lhs[idx] * rhs[idx]);
  }
}
// Float-to-signed-int test: dst[k] = (int32_t)(src0[k] + src1[k]).
void kernel_ftoi(int task_id, kernel_arg_t* __UNIFORM__ arg) {
  const uint32_t len   = arg->task_size;
  const uint32_t first = task_id * len;
  const auto* lhs = reinterpret_cast<const float*>(arg->src0_addr);
  const auto* rhs = reinterpret_cast<const float*>(arg->src1_addr);
  auto*       out = reinterpret_cast<int32_t*>(arg->dst_addr);

  for (uint32_t k = 0; k != len; ++k) {
    const uint32_t idx = first + k;
    const float sum = lhs[idx] + rhs[idx];
    out[idx] = static_cast<int32_t>(sum);
  }
}
// Float-to-unsigned-int test: dst[k] = (uint32_t)(src0[k] + src1[k]).
void kernel_ftou(int task_id, kernel_arg_t* __UNIFORM__ arg) {
  const uint32_t len   = arg->task_size;
  const uint32_t first = task_id * len;
  const auto* lhs = reinterpret_cast<const float*>(arg->src0_addr);
  const auto* rhs = reinterpret_cast<const float*>(arg->src1_addr);
  auto*       out = reinterpret_cast<uint32_t*>(arg->dst_addr);

  for (uint32_t k = 0; k != len; ++k) {
    const uint32_t idx = first + k;
    const float sum = lhs[idx] + rhs[idx];
    out[idx] = static_cast<uint32_t>(sum);
  }
}
// Signed-int-to-float test: dst[k] = (float)(src0[k] + src1[k]),
// with the sum held in an int32_t.
void kernel_itof(int task_id, kernel_arg_t* __UNIFORM__ arg) {
  const uint32_t len   = arg->task_size;
  const uint32_t first = task_id * len;
  const auto* lhs = reinterpret_cast<const int32_t*>(arg->src0_addr);
  const auto* rhs = reinterpret_cast<const int32_t*>(arg->src1_addr);
  auto*       out = reinterpret_cast<float*>(arg->dst_addr);

  for (uint32_t k = 0; k != len; ++k) {
    const uint32_t idx = first + k;
    const int32_t sum = lhs[idx] + rhs[idx];
    out[idx] = static_cast<float>(sum);
  }
}
// Unsigned-int-to-float test: dst[k] = (float)(uint32_t)(src0[k] + src1[k])
// over this task's slice (task_size elements starting at task_id * task_size).
//
// The operands are read as uint32_t on purpose. With int32_t operands,
// `auto c = a + b` deduces a signed type and `(float)c` becomes a signed
// conversion, which makes this kernel a duplicate of kernel_itof instead of
// exercising the unsigned conversion. Unsigned addition also wraps instead of
// overflowing.
void kernel_utof(int task_id, kernel_arg_t* __UNIFORM__ arg) {
  auto count    = arg->task_size;
  auto src0_ptr = (uint32_t*)arg->src0_addr;
  auto src1_ptr = (uint32_t*)arg->src1_addr;
  auto dst_ptr  = (float*)arg->dst_addr;
  auto offset   = task_id * count;

  for (uint32_t i = 0; i < count; ++i) {
    uint32_t a = src0_ptr[offset+i];
    uint32_t b = src1_ptr[offset+i];
    uint32_t c = a + b;      // must stay unsigned for the conversion below
    float d = (float)c;
    dst_ptr[offset+i] = d;
  }
}
// Local barrier test. Each warp runs a loop whose trip count depends on its
// warp id, issues a fence, waits on barrier 0 for num_warps warps, and then
// writes one value to dst[task_id]. Only src0 and dst are used; src1 is ignored.
void kernel_bar(int task_id, kernel_arg_t* __UNIFORM__ arg) {
auto num_warps = vx_num_warps();
auto num_threads = vx_num_threads();
auto cid = vx_core_id();
auto wid = vx_warp_id();
auto tid = vx_thread_id();
auto src0_ptr = (uint32_t*)arg->src0_addr;
auto dst_ptr = (uint32_t*)arg->dst_addr;
// per warp delay
// (the loop body runs wid + 1 times, so the work differs between warps)
uint32_t barrier_stall = 0;
for (int i = 0; i <= wid; ++i) {
barrier_stall += src0_ptr[0] * src0_ptr[i];
}
// memory fence
vx_fence();
// local barrier
vx_barrier(0, num_warps);
// update destination
// The source element is taken from the mirrored warp slot
// (num_warps - 1 - wid) of this core, same thread index.
auto src_idx = (cid * num_warps + (num_warps - 1 - wid)) * num_threads + tid;
dst_ptr[task_id] = src0_ptr[src_idx] + barrier_stall;
}
// Global barrier test. Each thread runs a nested loop whose trip count depends
// on its core id and warp id, issues a fence, waits on barrier 0x80000000 with
// a count of num_cores, and then writes one value to dst[task_id].
// Only src0 and dst are used; src1 is ignored.
void kernel_gbar(int task_id, kernel_arg_t* __UNIFORM__ arg) {
auto num_cores = vx_num_cores();
auto num_warps = vx_num_warps();
auto num_threads = vx_num_threads();
auto cid = vx_core_id();
auto wid = vx_warp_id();
auto tid = vx_thread_id();
auto src0_ptr = (uint32_t*)arg->src0_addr;
auto dst_ptr = (uint32_t*)arg->dst_addr;
// per core delay
// (the inner statement runs (cid + 1) * (wid + 1) times)
uint32_t barrier_stall = 0;
for (int i = 0; i <= cid; ++i) {
for (int j = 0; j <= wid; ++j) {
barrier_stall += src0_ptr[0] * src0_ptr[i + j];
}
}
// memory fence
vx_fence();
// global barrier
// NOTE(review): presumably the top bit of the id marks the barrier as global;
// confirm against vx_intrinsics.h.
vx_barrier(0x80000000, num_cores);
// update destination
// The source element is taken from the mirrored core slot (num_cores - 1 - cid)
// and mirrored warp slot (num_warps - 1 - wid), same thread index.
auto src_idx = ((num_cores - 1 - cid) * num_warps + (num_warps - 1 - wid)) * num_threads + tid;
dst_ptr[task_id] = src0_ptr[src_idx] + barrier_stall;
}
// Kernel dispatch table. main() indexes it with kernel_arg_t::testid, so the
// position of each entry is its test id; append new kernels at the end and do
// not reorder.
static const PFN_Kernel sc_tests[] = {
kernel_iadd,
kernel_imul,
kernel_idiv,
kernel_idiv_mul,
kernel_fadd,
kernel_fsub,
kernel_fmul,
kernel_fmadd,
kernel_fmsub,
kernel_fnmadd,
kernel_fnmsub,
kernel_fnmadd_madd,
kernel_fdiv,
kernel_fdiv2,
kernel_fsqrt,
kernel_ftoi,
kernel_ftou,
kernel_itof,
kernel_utof,
kernel_bar,
kernel_gbar
};
int main() {
auto arg = (kernel_arg_t*)KERNEL_ARG_DEV_MEM_ADDR;
vx_spawn_tasks(arg->num_tasks, (vx_spawn_tasks_cb)sc_tests[arg->testid], arg);
return 0;
}

View File

@@ -1,109 +1,49 @@
#include <iostream>
#include <vector>
#include <unordered_set>
#include <unistd.h>
#include <string.h>
#include <vector>
#include <vortex.h>
#include <VX_config.h>
#include "testcases.h"
#include "common.h"
// Evaluate a runtime call once. On a non-zero status, print the failing
// expression and its status, call cleanup(), and terminate with exit(-1).
#define RT_CHECK(_expr)                                         \
  do {                                                          \
    const int _ret = _expr;                                     \
    if (_ret != 0) {                                            \
      printf("Error: '%s' returned %d!\n", #_expr, (int)_ret);  \
      cleanup();                                                \
      exit(-1);                                                 \
    }                                                           \
  } while (false)
///////////////////////////////////////////////////////////////////////////////
class TestMngr {
public:
TestMngr() {
this->add_test("iadd", new Test_IADD());
this->add_test("imul", new Test_IMUL());
this->add_test("idiv", new Test_IDIV());
this->add_test("idiv-mul", new Test_IDIV_MUL());
#ifdef EXT_F_ENABLE
this->add_test("fadd", new Test_FADD());
this->add_test("fsub", new Test_FSUB());
this->add_test("fmul", new Test_FMUL());
this->add_test("fmadd", new Test_FMADD());
this->add_test("fmsub", new Test_FMSUB());
this->add_test("fnmadd", new Test_FNMADD());
this->add_test("fnmsub", new Test_FNMSUB());
this->add_test("fnmadd-madd", new Test_FNMADD_MADD());
this->add_test("fdiv", new Test_FDIV());
this->add_test("fdiv2", new Test_FDIV2());
this->add_test("fsqrt", new Test_FSQRT());
this->add_test("ftoi", new Test_FTOI());
this->add_test("ftou", new Test_FTOU());
this->add_test("itof", new Test_ITOF());
this->add_test("utof", new Test_UTOF());
#endif
}
~TestMngr() {
for (size_t i = 0; i < _tests.size(); ++i) {
delete _tests[i];
}
}
const std::string& get_name(int testid) const {
return _names.at(testid);
}
ITestCase* get_test(int testid) const {
return _tests.at(testid);
}
void add_test(const char* name, ITestCase* test) {
_names.push_back(name);
_tests.push_back(test);
}
size_t size() const {
return _tests.size();
}
private:
std::vector<std::string> _names;
std::vector<ITestCase*> _tests;
};
///////////////////////////////////////////////////////////////////////////////
// Program-wide state shared by parse_args(), cleanup() and main().
TestMngr testMngr;
TestSuite* testSuite = nullptr;
// Kernel image uploaded to the device.
const char* kernel_file = "kernel.bin";
// Per-task element count (set from the -n option).
int count = 0;
int count = 0;
// Test ids collected from the -t (included) and -x (excluded) options.
std::unordered_set<int> included;
std::unordered_set<int> excluded;
// Inclusive range of test ids to run.
int testid_s = 0;
int testid_e = (testMngr.size() - 1);
int testid_e = 0;
bool stop_on_error = true;
// Device handle plus the buffers used to stage arguments, inputs and results.
vx_device_h device = nullptr;
vx_buffer_h arg_buf = nullptr;
vx_buffer_h src1_buf = nullptr;
vx_buffer_h src2_buf = nullptr;
vx_buffer_h dst_buf = nullptr;
kernel_arg_t kernel_arg;
vx_device_h device = nullptr;
std::vector<uint8_t> arg_buf;
std::vector<uint8_t> src1_buf;
std::vector<uint8_t> src2_buf;
std::vector<uint8_t> dst_buf;
// Argument block copied to the device before each test.
kernel_arg_t kernel_arg = {};
// Prints the program banner and the command-line usage summary to stdout.
static void show_usage() {
  std::cout << "Vortex Test." << std::endl
            << "Usage: [-t:testid] [-s:testid] [-e:testid] [-k: kernel] [-n words] [-c] [-h: help]" << std::endl
            << "Usage: [-t<testid>: selected test] [-s<testid>: start test] [-e<testid>: end test] [-x<testid>: excluded tests]" << std::endl
            << " [-k<kernel>] [-n<words>] [-c] [-h: help]" << std::endl;
}
static void parse_args(int argc, char **argv) {
int c;
while ((c = getopt(argc, argv, "n:t:s:e:k:ch?")) != -1) {
while ((c = getopt(argc, argv, "n:t:x:s:e:k:ch?")) != -1) {
switch (c) {
case 'n':
count = atoi(optarg);
break;
case 't':
testid_s = atoi(optarg);
testid_e = atoi(optarg);
included.insert(atoi(optarg));
break;
case 'x':
excluded.insert(atoi(optarg));
break;
case 's':
testid_s = atoi(optarg);
@@ -130,17 +70,8 @@ static void parse_args(int argc, char **argv) {
}
void cleanup() {
if (arg_buf) {
vx_buf_free(arg_buf);
}
if (src1_buf) {
vx_buf_free(src1_buf);
}
if (src2_buf) {
vx_buf_free(src2_buf);
}
if (dst_buf) {
vx_buf_free(dst_buf);
if (testSuite) {
delete testSuite;
}
if (device) {
vx_mem_free(device, kernel_arg.src0_addr);
@@ -152,7 +83,6 @@ void cleanup() {
int main(int argc, char *argv[]) {
int exitcode = 0;
size_t value;
// parse command arguments
parse_args(argc, argv);
@@ -171,12 +101,12 @@ int main(int argc, char *argv[]) {
std::cout << "open device connection" << std::endl;
RT_CHECK(vx_dev_open(&device));
uint64_t max_cores, max_warps, max_threads;
RT_CHECK(vx_dev_caps(device, VX_CAPS_MAX_CORES, &max_cores));
RT_CHECK(vx_dev_caps(device, VX_CAPS_MAX_WARPS, &max_warps));
RT_CHECK(vx_dev_caps(device, VX_CAPS_MAX_THREADS, &max_threads));
uint64_t num_cores, num_warps, num_threads;
RT_CHECK(vx_dev_caps(device, VX_CAPS_NUM_CORES, &num_cores));
RT_CHECK(vx_dev_caps(device, VX_CAPS_NUM_WARPS, &num_warps));
RT_CHECK(vx_dev_caps(device, VX_CAPS_NUM_THREADS, &num_threads));
int num_tasks = max_cores * max_warps * max_threads;
int num_tasks = num_cores * num_warps * num_threads;
int num_points = count * num_tasks;
size_t buf_size = num_points * sizeof(uint32_t);
@@ -188,59 +118,69 @@ int main(int argc, char *argv[]) {
RT_CHECK(vx_upload_kernel_file(device, kernel_file));
// allocate device memory
std::cout << "allocate device memory" << std::endl;
RT_CHECK(vx_mem_alloc(device, buf_size, &value));
kernel_arg.src0_addr = value;
RT_CHECK(vx_mem_alloc(device, buf_size, &value));
kernel_arg.src1_addr = value;
RT_CHECK(vx_mem_alloc(device, buf_size, &value));
kernel_arg.dst_addr = value;
std::cout << "allocate device memory" << std::endl;
RT_CHECK(vx_mem_alloc(device, buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.src0_addr));
RT_CHECK(vx_mem_alloc(device, buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.src1_addr));
RT_CHECK(vx_mem_alloc(device, buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.dst_addr));
kernel_arg.num_tasks = num_tasks;
kernel_arg.task_size = count;
std::cout << "dev_src0=" << std::hex << kernel_arg.src0_addr << std::dec << std::endl;
std::cout << "dev_src1=" << std::hex << kernel_arg.src1_addr << std::dec << std::endl;
std::cout << "dev_dst=" << std::hex << kernel_arg.dst_addr << std::dec << std::endl;
std::cout << "dev_src0=0x" << std::hex << kernel_arg.src0_addr << std::dec << std::endl;
std::cout << "dev_src1=0x" << std::hex << kernel_arg.src1_addr << std::dec << std::endl;
std::cout << "dev_dst=0x" << std::hex << kernel_arg.dst_addr << std::dec << std::endl;
// allocate shared memory
std::cout << "allocate shared memory" << std::endl;
RT_CHECK(vx_buf_alloc(device, sizeof(kernel_arg_t), &arg_buf));
RT_CHECK(vx_buf_alloc(device, buf_size, &src1_buf));
RT_CHECK(vx_buf_alloc(device, buf_size, &src2_buf));
RT_CHECK(vx_buf_alloc(device, buf_size, &dst_buf));
// allocate staging buffer
std::cout << "allocate staging buffer" << std::endl;
arg_buf.resize(sizeof(kernel_arg_t));
src1_buf.resize(buf_size);
src2_buf.resize(buf_size);
dst_buf.resize(buf_size);
// allocate test suite
testSuite = new TestSuite(device);
if (testid_e == 0) {
testid_e = (testSuite->size() - 1);
}
// execute tests
for (int t = testid_s; t <= testid_e; ++t) {
auto name = testMngr.get_name(t);
auto test = testMngr.get_test(t);
if (!included.empty()) {
if (included.count(t) == 0)
continue;
}
if (!excluded.empty()) {
if (excluded.count(t) != 0)
continue;
}
auto test = testSuite->get_test(t);
auto name = test->name();
std::cout << "Test" << t << ": " << name << std::endl;
// upload kernel argument
std::cout << "upload kernel argument" << std::endl;
kernel_arg.testid = t;
memcpy((void*)vx_host_ptr(arg_buf), &kernel_arg, sizeof(kernel_arg_t));
RT_CHECK(vx_copy_to_dev(arg_buf, KERNEL_ARG_DEV_MEM_ADDR, sizeof(kernel_arg_t), 0));
memcpy(arg_buf.data(), &kernel_arg, sizeof(kernel_arg_t));
RT_CHECK(vx_copy_to_dev(device, KERNEL_ARG_DEV_MEM_ADDR, arg_buf.data(), sizeof(kernel_arg_t)));
// get test arguments
std::cout << "get test arguments" << std::endl;
test->setup(num_points, (void*)vx_host_ptr(src1_buf), (void*)vx_host_ptr(src2_buf));
RT_CHECK(test->setup(num_points, (void*)src1_buf.data(), (void*)src2_buf.data()));
// upload source buffer0
std::cout << "upload source buffer0" << std::endl;
RT_CHECK(vx_copy_to_dev(src1_buf, kernel_arg.src0_addr, buf_size, 0));
RT_CHECK(vx_copy_to_dev(device, kernel_arg.src0_addr, src1_buf.data(), buf_size));
// upload source buffer1
std::cout << "upload source buffer1" << std::endl;
RT_CHECK(vx_copy_to_dev(src2_buf, kernel_arg.src1_addr, buf_size, 0));
std::cout << "upload source buffer1" << std::endl;
RT_CHECK(vx_copy_to_dev(device, kernel_arg.src1_addr, src2_buf.data(), buf_size));
// clear destination buffer
std::cout << "clear destination buffer" << std::endl;
for (int i = 0; i < num_points; ++i) {
((uint32_t*)vx_host_ptr(dst_buf))[i] = 0xdeadbeef;
((uint32_t*)dst_buf.data())[i] = 0xdeadbeef;
}
RT_CHECK(vx_copy_to_dev(dst_buf, kernel_arg.dst_addr, buf_size, 0));
RT_CHECK(vx_copy_to_dev(device, kernel_arg.dst_addr, dst_buf.data(), buf_size));
// start device
std::cout << "start device" << std::endl;
@@ -248,18 +188,15 @@ int main(int argc, char *argv[]) {
// wait for completion
std::cout << "wait for completion" << std::endl;
RT_CHECK(vx_ready_wait(device, MAX_TIMEOUT));
RT_CHECK(vx_ready_wait(device, VX_MAX_TIMEOUT));
// download destination buffer
std::cout << "download destination buffer" << std::endl;
RT_CHECK(vx_copy_from_dev(dst_buf, kernel_arg.dst_addr, buf_size, 0));
RT_CHECK(vx_copy_from_dev(device, dst_buf.data(), kernel_arg.dst_addr, buf_size));
// verify destination
std::cout << "verify test result" << std::endl;
int errors = test->verify(num_points,
(void*)vx_host_ptr(dst_buf),
(void*)vx_host_ptr(src1_buf),
(void*)vx_host_ptr(src2_buf));
int errors = test->verify(num_points, dst_buf.data(), src1_buf.data(), src2_buf.data());
if (errors != 0) {
std::cout << "found " << std::dec << errors << " errors!" << std::endl;
std::cout << "Test" << t << "-" << name << " FAILED!" << std::endl << std::flush;

View File

@@ -3,6 +3,19 @@
#include <iostream>
#include <math.h>
#include <limits>
#include <assert.h>
void cleanup();
// Evaluate a runtime call once. On a non-zero status, print the failing
// expression and its status, call cleanup(), and terminate with exit(-1).
#define RT_CHECK(_expr)                                         \
  do {                                                          \
    const int _ret = _expr;                                     \
    if (_ret != 0) {                                            \
      printf("Error: '%s' returned %d!\n", #_expr, (int)_ret);  \
      cleanup();                                                \
      exit(-1);                                                 \
    }                                                           \
  } while (false)
union Float_t {
float f;
@@ -47,33 +60,72 @@ inline bool almost_equal(float a, float b) {
return almost_equal_ulp(a, b);
}
class ITestCase;
// Collection of test cases bound to one device handle. Only the declaration
// is visible here; the member functions are defined elsewhere.
class TestSuite {
public:
// Binds the suite to the given device handle.
TestSuite(vx_device_h device);
~TestSuite();
// Returns the test registered under testid.
ITestCase* get_test(int testid) const;
// NOTE(review): ownership of `test` is not visible from this declaration;
// presumably the suite releases it in ~TestSuite. Confirm in the definition.
void add_test(ITestCase* test);
// Number of registered tests.
size_t size() const;
// Device handle the suite was created with.
vx_device_h device() const;
private:
std::vector<ITestCase*> _tests;
vx_device_h device_;
};
// Abstract base for one test case: setup() fills the two source buffers and
// verify() checks the destination buffer against them.
class ITestCase {
public:
ITestCase() {}
// Stores the owning suite and the test's display name (the pointer is kept
// as-is, not copied).
ITestCase(TestSuite* suite, const char* name)
: suite_(suite)
, name_(name)
{}
virtual ~ITestCase() {}
virtual void setup(int n, void* src1, void* src2) = 0;
virtual int verify(int n, void* dst, const void* src1, const void* src2) = 0;
// Suite this test was created for.
TestSuite* suite() const {
return suite_;
}
// Display name passed to the constructor.
const char* name() const {
return name_;
}
// Fills src1/src2 with n input elements; the int result is passed to RT_CHECK
// by the caller, so 0 means success.
virtual int setup(uint32_t n, void* src1, void* src2) = 0;
// Compares n results in dst against the inputs and returns the error count.
virtual int verify(uint32_t n, void* dst, const void* src1, const void* src2) = 0;
protected:
TestSuite* suite_;
const char* const name_;
};
class Test_IADD : public ITestCase {
public:
Test_IADD(TestSuite* suite) : ITestCase(suite, "iadd") {}
void setup(int n, void* src1, void* src2) override {
int setup(uint32_t n, void* src1, void* src2) override {
auto a = (int32_t*)src1;
auto b = (int32_t*)src2;
for (int i = 0; i < n; ++i) {
for (uint32_t i = 0; i < n; ++i) {
a[i] = n/2 - i;
b[i] = n/2 + i;
}
return 0;
}
int verify(int n, void* dst, const void* src1, const void* src2) override {
int verify(uint32_t n, void* dst, const void* src1, const void* src2) override {
int errors = 0;
auto a = (int32_t*)src1;
auto b = (int32_t*)src2;
auto c = (int32_t*)dst;
for (int i = 0; i < n; ++i) {
for (uint32_t i = 0; i < n; ++i) {
auto ref = a[i] + b[i];
if (c[i] != ref) {
std::cout << "error at result #" << i << ": expected=" << ref << ", actual=" << c[i] << ", a=" << a[i] << ", b=" << b[i] << std::endl;
@@ -86,22 +138,24 @@ public:
class Test_IMUL : public ITestCase {
public:
Test_IMUL(TestSuite* suite) : ITestCase(suite, "imul") {}
void setup(int n, void* src1, void* src2) override {
int setup(uint32_t n, void* src1, void* src2) override {
auto a = (int32_t*)src1;
auto b = (int32_t*)src2;
for (int i = 0; i < n; ++i) {
for (uint32_t i = 0; i < n; ++i) {
a[i] = n/2 - i;
b[i] = n/2 + i;
}
return 0;
}
int verify(int n, void* dst, const void* src1, const void* src2) override {
int verify(uint32_t n, void* dst, const void* src1, const void* src2) override {
int errors = 0;
auto a = (int32_t*)src1;
auto b = (int32_t*)src2;
auto c = (int32_t*)dst;
for (int i = 0; i < n; ++i) {
for (uint32_t i = 0; i < n; ++i) {
auto ref = a[i] * b[i];
if (c[i] != ref) {
std::cout << "error at result #" << i << ": expected=" << ref << ", actual=" << c[i] << ", a=" << a[i] << ", b=" << b[i] << std::endl;
@@ -114,22 +168,24 @@ public:
class Test_IDIV : public ITestCase {
public:
Test_IDIV(TestSuite* suite) : ITestCase(suite, "idiv") {}
void setup(int n, void* src1, void* src2) override {
int setup(uint32_t n, void* src1, void* src2) override {
auto a = (int32_t*)src1;
auto b = (int32_t*)src2;
for (int i = 0; i < n; ++i) {
for (uint32_t i = 0; i < n; ++i) {
a[i] = n/2 - i;
b[i] = n/2 + i;
}
return 0;
}
int verify(int n, void* dst, const void* src1, const void* src2) override {
int verify(uint32_t n, void* dst, const void* src1, const void* src2) override {
int errors = 0;
auto a = (int32_t*)src1;
auto b = (int32_t*)src2;
auto c = (int32_t*)dst;
for (int i = 0; i < n; ++i) {
for (uint32_t i = 0; i < n; ++i) {
auto ref = a[i] / b[i];
if (c[i] != ref) {
std::cout << "error at result #" << i << ": expected=" << ref << ", actual=" << c[i] << ", a=" << a[i] << ", b=" << b[i] << std::endl;
@@ -142,22 +198,24 @@ public:
class Test_IDIV_MUL : public ITestCase {
public:
Test_IDIV_MUL(TestSuite* suite) : ITestCase(suite, "idiv-mul") {}
void setup(int n, void* src1, void* src2) override {
int setup(uint32_t n, void* src1, void* src2) override {
auto a = (int32_t*)src1;
auto b = (int32_t*)src2;
for (int i = 0; i < n; ++i) {
for (uint32_t i = 0; i < n; ++i) {
a[i] = n/2 - i;
b[i] = n/2 + i;
}
return 0;
}
int verify(int n, void* dst, const void* src1, const void* src2) override {
int verify(uint32_t n, void* dst, const void* src1, const void* src2) override {
int errors = 0;
auto a = (int32_t*)src1;
auto b = (int32_t*)src2;
auto c = (int32_t*)dst;
for (int i = 0; i < n; ++i) {
for (uint32_t i = 0; i < n; ++i) {
auto x = a[i] / b[i];
auto y = a[i] * b[i];
auto ref = x + y;
@@ -172,22 +230,24 @@ public:
class Test_FADD : public ITestCase {
public:
Test_FADD(TestSuite* suite) : ITestCase(suite, "fadd") {}
void setup(int n, void* src1, void* src2) override {
int setup(uint32_t n, void* src1, void* src2) override {
auto a = (float*)src1;
auto b = (float*)src2;
for (int i = 0; i < n; ++i) {
for (uint32_t i = 0; i < n; ++i) {
a[i] = fround((n - i) * (1.0f/n));
b[i] = fround((n + i) * (1.0f/n));
}
return 0;
}
int verify(int n, void* dst, const void* src1, const void* src2) override {
int verify(uint32_t n, void* dst, const void* src1, const void* src2) override {
int errors = 0;
auto a = (float*)src1;
auto b = (float*)src2;
auto c = (float*)dst;
for (int i = 0; i < n; ++i) {
for (uint32_t i = 0; i < n; ++i) {
auto ref = a[i] + b[i];
if (!almost_equal(c[i], ref)) {
std::cout << "error at result #" << i << ": expected=" << ref << ", actual=" << c[i] << ", a=" << a[i] << ", b=" << b[i] << std::endl;
@@ -200,22 +260,24 @@ public:
class Test_FSUB : public ITestCase {
public:
Test_FSUB(TestSuite* suite) : ITestCase(suite, "fsub") {}
void setup(int n, void* src1, void* src2) override {
int setup(uint32_t n, void* src1, void* src2) override {
auto a = (float*)src1;
auto b = (float*)src2;
for (int i = 0; i < n; ++i) {
for (uint32_t i = 0; i < n; ++i) {
a[i] = fround((n - i) * (1.0f/n));
b[i] = fround((n + i) * (1.0f/n));
}
return 0;
}
int verify(int n, void* dst, const void* src1, const void* src2) override {
int verify(uint32_t n, void* dst, const void* src1, const void* src2) override {
int errors = 0;
auto a = (float*)src1;
auto b = (float*)src2;
auto c = (float*)dst;
for (int i = 0; i < n; ++i) {
for (uint32_t i = 0; i < n; ++i) {
auto ref = a[i] - b[i];
if (!almost_equal(c[i], ref)) {
std::cout << "error at result #" << i << ": expected=" << ref << ", actual=" << c[i] << ", a=" << a[i] << ", b=" << b[i] << std::endl;
@@ -228,22 +290,24 @@ public:
class Test_FMUL : public ITestCase {
public:
Test_FMUL(TestSuite* suite) : ITestCase(suite, "fmul") {}
void setup(int n, void* src1, void* src2) override {
int setup(uint32_t n, void* src1, void* src2) override {
auto a = (float*)src1;
auto b = (float*)src2;
for (int i = 0; i < n; ++i) {
for (uint32_t i = 0; i < n; ++i) {
a[i] = fround((n - i) * (1.0f/n));
b[i] = fround((n + i) * (1.0f/n));
}
return 0;
}
int verify(int n, void* dst, const void* src1, const void* src2) override {
int verify(uint32_t n, void* dst, const void* src1, const void* src2) override {
int errors = 0;
auto a = (float*)src1;
auto b = (float*)src2;
auto c = (float*)dst;
for (int i = 0; i < n; ++i) {
for (uint32_t i = 0; i < n; ++i) {
auto ref = a[i] * b[i];
if (!almost_equal(c[i], ref)) {
std::cout << "error at result #" << i << ": expected=" << ref << ", actual=" << c[i] << ", a=" << a[i] << ", b=" << b[i] << std::endl;
@@ -256,22 +320,24 @@ public:
class Test_FMADD : public ITestCase {
public:
Test_FMADD(TestSuite* suite) : ITestCase(suite, "fmadd") {}
void setup(int n, void* src1, void* src2) override {
int setup(uint32_t n, void* src1, void* src2) override {
auto a = (float*)src1;
auto b = (float*)src2;
for (int i = 0; i < n; ++i) {
for (uint32_t i = 0; i < n; ++i) {
a[i] = fround((n - i) * (1.0f/n));
b[i] = fround((n + i) * (1.0f/n));
}
return 0;
}
int verify(int n, void* dst, const void* src1, const void* src2) override {
int verify(uint32_t n, void* dst, const void* src1, const void* src2) override {
int errors = 0;
auto a = (float*)src1;
auto b = (float*)src2;
auto c = (float*)dst;
for (int i = 0; i < n; ++i) {
for (uint32_t i = 0; i < n; ++i) {
auto ref = a[i] * b[i] + b[i];
if (!almost_equal(c[i], ref)) {
std::cout << "error at result #" << i << ": expected=" << ref << ", actual=" << c[i] << ", a=" << a[i] << ", b=" << b[i] << std::endl;
@@ -284,22 +350,24 @@ public:
class Test_FMSUB : public ITestCase {
public:
Test_FMSUB(TestSuite* suite) : ITestCase(suite, "fmsub") {}
void setup(int n, void* src1, void* src2) override {
int setup(uint32_t n, void* src1, void* src2) override {
auto a = (float*)src1;
auto b = (float*)src2;
for (int i = 0; i < n; ++i) {
for (uint32_t i = 0; i < n; ++i) {
a[i] = fround((n - i) * (1.0f/n));
b[i] = fround((n + i) * (1.0f/n));
}
return 0;
}
int verify(int n, void* dst, const void* src1, const void* src2) override {
int verify(uint32_t n, void* dst, const void* src1, const void* src2) override {
int errors = 0;
auto a = (float*)src1;
auto b = (float*)src2;
auto c = (float*)dst;
for (int i = 0; i < n; ++i) {
for (uint32_t i = 0; i < n; ++i) {
auto ref = a[i] * b[i] - b[i];
if (!almost_equal(c[i], ref)) {
std::cout << "error at result #" << i << ": expected=" << ref << ", actual=" << c[i] << ", a=" << a[i] << ", b=" << b[i] << std::endl;
@@ -312,22 +380,24 @@ public:
class Test_FNMADD : public ITestCase {
public:
Test_FNMADD(TestSuite* suite) : ITestCase(suite, "fnmadd") {}
void setup(int n, void* src1, void* src2) override {
int setup(uint32_t n, void* src1, void* src2) override {
auto a = (float*)src1;
auto b = (float*)src2;
for (int i = 0; i < n; ++i) {
for (uint32_t i = 0; i < n; ++i) {
a[i] = fround((n - i) * (1.0f/n));
b[i] = fround((n + i) * (1.0f/n));
}
return 0;
}
int verify(int n, void* dst, const void* src1, const void* src2) override {
int verify(uint32_t n, void* dst, const void* src1, const void* src2) override {
int errors = 0;
auto a = (float*)src1;
auto b = (float*)src2;
auto c = (float*)dst;
for (int i = 0; i < n; ++i) {
for (uint32_t i = 0; i < n; ++i) {
auto ref = -a[i] * b[i] - b[i];
if (!almost_equal(c[i], ref)) {
std::cout << "error at result #" << i << ": expected=" << ref << ", actual=" << c[i] << ", a=" << a[i] << ", b=" << b[i] << std::endl;
@@ -340,22 +410,24 @@ public:
class Test_FNMSUB : public ITestCase {
public:
Test_FNMSUB(TestSuite* suite) : ITestCase(suite, "fnmsub") {}
void setup(int n, void* src1, void* src2) override {
int setup(uint32_t n, void* src1, void* src2) override {
auto a = (float*)src1;
auto b = (float*)src2;
for (int i = 0; i < n; ++i) {
for (uint32_t i = 0; i < n; ++i) {
a[i] = fround((n - i) * (1.0f/n));
b[i] = fround((n + i) * (1.0f/n));
}
return 0;
}
int verify(int n, void* dst, const void* src1, const void* src2) override {
int verify(uint32_t n, void* dst, const void* src1, const void* src2) override {
int errors = 0;
auto a = (float*)src1;
auto b = (float*)src2;
auto c = (float*)dst;
for (int i = 0; i < n; ++i) {
for (uint32_t i = 0; i < n; ++i) {
auto ref = -a[i] * b[i] + b[i];
if (!almost_equal(c[i], ref)) {
std::cout << "error at result #" << i << ": expected=" << ref << ", actual=" << c[i] << ", a=" << a[i] << ", b=" << b[i] << std::endl;
@@ -368,22 +440,24 @@ public:
class Test_FNMADD_MADD : public ITestCase {
public:
Test_FNMADD_MADD(TestSuite* suite) : ITestCase(suite, "fnmadd-madd") {}
void setup(int n, void* src1, void* src2) override {
int setup(uint32_t n, void* src1, void* src2) override {
auto a = (float*)src1;
auto b = (float*)src2;
for (int i = 0; i < n; ++i) {
for (uint32_t i = 0; i < n; ++i) {
a[i] = fround((n - i) * (1.0f/n));
b[i] = fround((n + i) * (1.0f/n));
}
return 0;
}
int verify(int n, void* dst, const void* src1, const void* src2) override {
int verify(uint32_t n, void* dst, const void* src1, const void* src2) override {
int errors = 0;
auto a = (float*)src1;
auto b = (float*)src2;
auto c = (float*)dst;
for (int i = 0; i < n; ++i) {
for (uint32_t i = 0; i < n; ++i) {
auto x = -a[i] * b[i] - b[i];
auto y = a[i] * b[i] + b[i];
auto ref = x + y;
@@ -398,22 +472,24 @@ public:
class Test_FDIV : public ITestCase {
public:
Test_FDIV(TestSuite* suite) : ITestCase(suite, "fdiv") {}
void setup(int n, void* src1, void* src2) override {
int setup(uint32_t n, void* src1, void* src2) override {
auto a = (float*)src1;
auto b = (float*)src2;
for (int i = 0; i < n; ++i) {
for (uint32_t i = 0; i < n; ++i) {
a[i] = fround((n - i) * (1.0f/n));
b[i] = fround((n + i) * (1.0f/n));
}
return 0;
}
int verify(int n, void* dst, const void* src1, const void* src2) override {
int verify(uint32_t n, void* dst, const void* src1, const void* src2) override {
int errors = 0;
auto a = (float*)src1;
auto b = (float*)src2;
auto c = (float*)dst;
for (int i = 0; i < n; ++i) {
for (uint32_t i = 0; i < n; ++i) {
auto ref = a[i] / b[i];
if (!almost_equal(c[i], ref)) {
std::cout << "error at result #" << i << ": expected=" << ref << ", actual=" << c[i] << ", a=" << a[i] << ", b=" << b[i] << std::endl;
@@ -426,22 +502,24 @@ public:
class Test_FDIV2 : public ITestCase {
public:
Test_FDIV2(TestSuite* suite) : ITestCase(suite, "fdiv2") {}
void setup(int n, void* src1, void* src2) override {
int setup(uint32_t n, void* src1, void* src2) override {
auto a = (float*)src1;
auto b = (float*)src2;
for (int i = 0; i < n; ++i) {
for (uint32_t i = 0; i < n; ++i) {
a[i] = fround((n - i) * (1.0f/n));
b[i] = fround((n + i) * (1.0f/n));
}
return 0;
}
int verify(int n, void* dst, const void* src1, const void* src2) override {
int verify(uint32_t n, void* dst, const void* src1, const void* src2) override {
int errors = 0;
auto a = (float*)src1;
auto b = (float*)src2;
auto c = (float*)dst;
for (int i = 0; i < n; ++i) {
for (uint32_t i = 0; i < n; ++i) {
auto x = a[i] / b[i];
auto y = b[i] / a[i];
auto ref = x + y;
@@ -456,23 +534,25 @@ public:
class Test_FSQRT : public ITestCase {
public:
Test_FSQRT(TestSuite* suite) : ITestCase(suite, "fsqrt") {}
void setup(int n, void* src1, void* src2) override {
int setup(uint32_t n, void* src1, void* src2) override {
auto a = (float*)src1;
auto b = (float*)src2;
for (int i = 0; i < n; ++i) {
for (uint32_t i = 0; i < n; ++i) {
float q = 1.0f + (i % 64);
a[i] = q;
b[i] = q;
}
return 0;
}
int verify(int n, void* dst, const void* src1, const void* src2) override {
int verify(uint32_t n, void* dst, const void* src1, const void* src2) override {
int errors = 0;
auto a = (float*)src1;
auto b = (float*)src2;
auto c = (float*)dst;
for (int i = 0; i < n; ++i) {
for (uint32_t i = 0; i < n; ++i) {
auto ref = sqrt(a[i] * b[i]);
if (!almost_equal(c[i], ref)) {
std::cout << "error at result #" << i << ": expected=" << ref << ", actual=" << c[i] << ", a=" << a[i] << ", b=" << b[i] << std::endl;
@@ -485,22 +565,25 @@ public:
class Test_FTOI : public ITestCase {
public:
Test_FTOI(TestSuite* suite) : ITestCase(suite, "ftoi") {}
void setup(int n, void* src1, void* src2) override {
int setup(uint32_t n, void* src1, void* src2) override {
auto a = (float*)src1;
auto b = (float*)src2;
for (int i = 0; i < n; ++i) {
a[i] = fround((n/2 - i) + (float(i)/n));
b[i] = fround((n/2 - i) + (float(i)/n));
for (uint32_t i = 0; i < n; ++i) {
float q = fround(float(n/2) - i + (float(i) / n));
a[i] = q;
b[i] = q;
}
return 0;
}
int verify(int n, void* dst, const void* src1, const void* src2) override {
int verify(uint32_t n, void* dst, const void* src1, const void* src2) override {
int errors = 0;
auto a = (float*)src1;
auto b = (float*)src2;
auto c = (int32_t*)dst;
for (int i = 0; i < n; ++i) {
for (uint32_t i = 0; i < n; ++i) {
auto x = a[i] + b[i];
auto ref = (int32_t)x;
if (c[i] != ref) {
@@ -514,22 +597,25 @@ public:
class Test_FTOU : public ITestCase {
public:
Test_FTOU(TestSuite* suite) : ITestCase(suite, "ftou") {}
void setup(int n, void* src1, void* src2) override {
int setup(uint32_t n, void* src1, void* src2) override {
auto a = (float*)src1;
auto b = (float*)src2;
for (int i = 0; i < n; ++i) {
a[i] = fround(i + (float(i)/n));
b[i] = fround(i + (float(i)/n));
for (uint32_t i = 0; i < n; ++i) {
float q = fround(i + (float(i) / n));
a[i] = q;
b[i] = q;
}
return 0;
}
int verify(int n, void* dst, const void* src1, const void* src2) override {
int verify(uint32_t n, void* dst, const void* src1, const void* src2) override {
int errors = 0;
auto a = (float*)src1;
auto b = (float*)src2;
auto c = (uint32_t*)dst;
for (int i = 0; i < n; ++i) {
for (uint32_t i = 0; i < n; ++i) {
auto x = a[i] + b[i];
auto ref = (uint32_t)x;
if (c[i] != ref) {
@@ -543,22 +629,24 @@ public:
class Test_ITOF : public ITestCase {
public:
Test_ITOF(TestSuite* suite) : ITestCase(suite, "itof") {}
void setup(int n, void* src1, void* src2) override {
int setup(uint32_t n, void* src1, void* src2) override {
auto a = (int32_t*)src1;
auto b = (int32_t*)src2;
for (int i = 0; i < n; ++i) {
for (uint32_t i = 0; i < n; ++i) {
a[i] = n/2 - i;
b[i] = n/2 - i;
}
return 0;
}
int verify(int n, void* dst, const void* src1, const void* src2) override {
int verify(uint32_t n, void* dst, const void* src1, const void* src2) override {
int errors = 0;
auto a = (int32_t*)src1;
auto b = (int32_t*)src2;
auto c = (float*)dst;
for (int i = 0; i < n; ++i) {
for (uint32_t i = 0; i < n; ++i) {
auto x = a[i] + b[i];
auto ref = (float)x;
if (!almost_equal(c[i], ref)) {
@@ -572,22 +660,24 @@ public:
class Test_UTOF : public ITestCase {
public:
Test_UTOF(TestSuite* suite) : ITestCase(suite, "utof") {}
void setup(int n, void* src1, void* src2) override {
int setup(uint32_t n, void* src1, void* src2) override {
auto a = (uint32_t*)src1;
auto b = (uint32_t*)src2;
for (int i = 0; i < n; ++i) {
for (uint32_t i = 0; i < n; ++i) {
a[i] = i;
b[i] = i;
}
return 0;
}
int verify(int n, void* dst, const void* src1, const void* src2) override {
int verify(uint32_t n, void* dst, const void* src1, const void* src2) override {
int errors = 0;
auto a = (uint32_t*)src1;
auto b = (uint32_t*)src2;
auto c = (float*)dst;
for (int i = 0; i < n; ++i) {
for (uint32_t i = 0; i < n; ++i) {
auto x = a[i] + b[i];
auto ref = (float)x;
if (!almost_equal(c[i], ref)) {
@@ -597,4 +687,135 @@ public:
}
return errors;
}
};
};
// Test case registered under the name "bar".
// verify() expects dst[i] to equal the source element found at the same core
// and thread position but in the mirrored warp slot (num_warps_ - 1 - wid).
class Test_BAR : public ITestCase {
public:
  Test_BAR(TestSuite* suite) : ITestCase(suite, "bar") {}

  // Reads the warp and thread counts from the suite's device and fills src1
  // with 0..n-1. Returns -1 when the device reports a single warp, 0 otherwise.
  // A failing vx_dev_caps call is handled by RT_CHECK.
  int setup(uint32_t n, void* src1, void* /*src2*/) override {
    RT_CHECK(vx_dev_caps(suite_->device(), VX_CAPS_NUM_WARPS, &num_warps_));
    if (num_warps_ == 1) {
      std::cout << "Error: multiple warps configuration required!" << std::endl;
      return -1;
    }
    RT_CHECK(vx_dev_caps(suite_->device(), VX_CAPS_NUM_THREADS, &num_threads_));
    auto a = static_cast<uint32_t*>(src1);
    for (uint32_t i = 0; i < n; ++i) {
      a[i] = i;
    }
    return 0;
  }

  // Returns the number of elements of dst that differ from the expected
  // source element. Uses the counts captured by setup().
  // NOTE(review): src_idx is not checked against n; presumably n is a multiple
  // of num_warps_ * num_threads_ — confirm against the caller.
  int verify(uint32_t n, void* dst, const void* src1, const void* /*src2*/) override {
    int errors = 0;
    // Both buffers are only read here, so keep them const-qualified.
    auto a = static_cast<const uint32_t*>(src1);
    auto c = static_cast<const uint32_t*>(dst);
    for (uint32_t i = 0; i < n; ++i) {
      auto tid = i % num_threads_;
      auto wid = (i / num_threads_) % num_warps_;
      auto cid = i / (num_warps_ * num_threads_);
      auto src_idx = (cid * num_warps_ + (num_warps_ - 1 - wid)) * num_threads_ + tid;
      uint32_t ref = a[src_idx];
      if (c[i] != ref) {
        // std::hex is sticky: switch back to decimal so the index of the next
        // error (and any later output) is not printed in hexadecimal.
        std::cout << "error at result #" << i << ": expected=" << std::hex << ref << ", actual=" << c[i] << std::dec << std::endl;
        ++errors;
      }
    }
    return errors;
  }

  // Device capabilities captured by setup(); zero until setup() has run.
  uint64_t num_warps_ = 0;
  uint64_t num_threads_ = 0;
};
// Test case registered under the name "gbar".
// verify() expects dst[i] to equal the source element found at the same thread
// position but in the mirrored core slot (num_cores_ - 1 - cid) and mirrored
// warp slot (num_warps_ - 1 - wid).
class Test_GBAR : public ITestCase {
public:
  Test_GBAR(TestSuite* suite) : ITestCase(suite, "gbar") {}

  // Reads the core, warp and thread counts from the suite's device and fills
  // src1 with 0..n-1. Returns -1 when the device reports a single core,
  // 0 otherwise. A failing vx_dev_caps call is handled by RT_CHECK.
  int setup(uint32_t n, void* src1, void* /*src2*/) override {
    RT_CHECK(vx_dev_caps(suite_->device(), VX_CAPS_NUM_CORES, &num_cores_));
    if (num_cores_ == 1) {
      std::cout << "Error: multiple cores configuration required!" << std::endl;
      return -1;
    }
    RT_CHECK(vx_dev_caps(suite_->device(), VX_CAPS_NUM_WARPS, &num_warps_));
    RT_CHECK(vx_dev_caps(suite_->device(), VX_CAPS_NUM_THREADS, &num_threads_));
    auto a = static_cast<uint32_t*>(src1);
    for (uint32_t i = 0; i < n; ++i) {
      a[i] = i;
    }
    return 0;
  }

  // Returns the number of elements of dst that differ from the expected
  // source element. Uses the counts captured by setup().
  // NOTE(review): src_idx is not checked against n, and (num_cores_ - 1 - cid)
  // wraps around if cid >= num_cores_; presumably n equals
  // num_cores_ * num_warps_ * num_threads_ — confirm against the caller.
  int verify(uint32_t n, void* dst, const void* src1, const void* /*src2*/) override {
    int errors = 0;
    // Both buffers are only read here, so keep them const-qualified.
    auto a = static_cast<const uint32_t*>(src1);
    auto c = static_cast<const uint32_t*>(dst);
    for (uint32_t i = 0; i < n; ++i) {
      auto tid = i % num_threads_;
      auto wid = (i / num_threads_) % num_warps_;
      auto cid = i / (num_warps_ * num_threads_);
      auto src_idx = ((num_cores_ - 1 - cid) * num_warps_ + (num_warps_ - 1 - wid)) * num_threads_ + tid;
      uint32_t ref = a[src_idx];
      if (c[i] != ref) {
        // std::hex is sticky: switch back to decimal so the index of the next
        // error (and any later output) is not printed in hexadecimal.
        std::cout << "error at result #" << i << ": expected=" << std::hex << ref << ", actual=" << c[i] << std::dec << std::endl;
        ++errors;
      }
    }
    return errors;
  }

  // Device capabilities captured by setup(); zero until setup() has run.
  uint64_t num_cores_ = 0;
  uint64_t num_warps_ = 0;
  uint64_t num_threads_ = 0;
};
///////////////////////////////////////////////////////////////////////////////
// Stores the device handle, then registers the built-in test cases.
// Registration order determines the index each test is reachable at through
// get_test(), so the table below keeps the established order.
TestSuite::TestSuite(vx_device_h device)
  : device_(device) {
  ITestCase* const builtin_tests[] = {
    new Test_IADD(this),
    new Test_IMUL(this),
    new Test_IDIV(this),
    new Test_IDIV_MUL(this),
    new Test_FADD(this),
    new Test_FSUB(this),
    new Test_FMUL(this),
    new Test_FMADD(this),
    new Test_FMSUB(this),
    new Test_FNMADD(this),
    new Test_FNMSUB(this),
    new Test_FNMADD_MADD(this),
    new Test_FDIV(this),
    new Test_FDIV2(this),
    new Test_FSQRT(this),
    new Test_FTOI(this),
    new Test_FTOU(this),
    new Test_ITOF(this),
    new Test_UTOF(this),
    new Test_BAR(this),
    new Test_GBAR(this),
  };
  for (ITestCase* entry : builtin_tests) {
    this->add_test(entry);
  }
}
// Releases every test case that was registered with the suite.
TestSuite::~TestSuite() {
  for (ITestCase* entry : _tests) {
    delete entry;
  }
}
// Returns the test case registered at position testid.
// The lookup goes through vector::at(), which throws std::out_of_range for an
// index outside the registered range.
ITestCase* TestSuite::get_test(int testid) const {
  const std::vector<ITestCase*>::size_type slot = testid;
  ITestCase* const found = _tests.at(slot);
  return found;
}
// Appends the given test case after all previously registered ones.
void TestSuite::add_test(ITestCase* test) {
  _tests.insert(_tests.end(), test);
}
size_t TestSuite::size() const {
return _tests.size();
}
// Device handle that was supplied to the constructor.
vx_device_h TestSuite::device() const {
  return this->device_;
}