+ Microarchitecture optimizations + 64-bit support + Xilinx FPGA support + LLVM-16 support + Refactoring and quality control fixes minor update minor update minor update minor update minor update minor update cleanup cleanup cache bindings and memory perf refactory minor update minor update hw unit tests fixes minor update minor update minor update minor update minor update minor udpate minor update minor update minor update minor update minor update minor update minor update minor updates minor updates minor update minor update minor update minor update minor update minor update minor updates minor updates minor updates minor updates minor update minor update
397 lines
10 KiB
C++
397 lines
10 KiB
C++
#include <stdint.h>
|
|
#include <math.h>
|
|
#include <vx_intrinsics.h>
|
|
#include <vx_spawn.h>
|
|
#include "common.h"
|
|
|
|
// Signature shared by every test kernel in this file: a task index plus a
// pointer to the kernel argument block. Entries of the dispatch table
// sc_tests have this type.
typedef void (*PFN_Kernel)(int task_id, kernel_arg_t* __UNIFORM__ arg);
|
|
|
|
// Single-precision square root implemented directly with the RISC-V
// `fsqrt.s` instruction via inline assembly.
// NOTE(review): the name matches libm's internal sqrtf entry point, so this
// presumably replaces the library routine -- confirm against the link setup.
inline float __ieee754_sqrtf (float x) {
  float root;
  // One floating-point register in, one floating-point register out.
  asm ("fsqrt.s %0, %1" : "=f" (root) : "f" (x));
  return root;
}
|
|
|
|
void kernel_iadd(int task_id, kernel_arg_t* __UNIFORM__ arg) {
|
|
auto count = arg->task_size;
|
|
auto src0_ptr = (int32_t*)arg->src0_addr;
|
|
auto src1_ptr = (int32_t*)arg->src1_addr;
|
|
auto dst_ptr = (int32_t*)arg->dst_addr;
|
|
auto offset = task_id * count;
|
|
|
|
for (uint32_t i = 0; i < count; ++i) {
|
|
int32_t a = src0_ptr[offset+i];
|
|
int32_t b = src1_ptr[offset+i];
|
|
int32_t c = a + b;
|
|
dst_ptr[offset+i] = c;
|
|
}
|
|
}
|
|
|
|
void kernel_imul(int task_id, kernel_arg_t* __UNIFORM__ arg) {
|
|
auto count = arg->task_size;
|
|
auto src0_ptr = (int32_t*)arg->src0_addr;
|
|
auto src1_ptr = (int32_t*)arg->src1_addr;
|
|
auto dst_ptr = (int32_t*)arg->dst_addr;
|
|
auto offset = task_id * count;
|
|
|
|
for (uint32_t i = 0; i < count; ++i) {
|
|
auto a = src0_ptr[offset+i];
|
|
auto b = src1_ptr[offset+i];
|
|
auto c = a * b;
|
|
dst_ptr[offset+i] = c;
|
|
}
|
|
}
|
|
|
|
void kernel_idiv(int task_id, kernel_arg_t* __UNIFORM__ arg) {
|
|
auto count = arg->task_size;
|
|
auto src0_ptr = (int32_t*)arg->src0_addr;
|
|
auto src1_ptr = (int32_t*)arg->src1_addr;
|
|
auto dst_ptr = (int32_t*)arg->dst_addr;
|
|
auto offset = task_id * count;
|
|
|
|
for (uint32_t i = 0; i < count; ++i) {
|
|
auto a = src0_ptr[offset+i];
|
|
auto b = src1_ptr[offset+i];
|
|
auto c = a / b;
|
|
dst_ptr[offset+i] = c;
|
|
}
|
|
}
|
|
|
|
void kernel_idiv_mul(int task_id, kernel_arg_t* __UNIFORM__ arg) {
|
|
auto count = arg->task_size;
|
|
auto src0_ptr = (int32_t*)arg->src0_addr;
|
|
auto src1_ptr = (int32_t*)arg->src1_addr;
|
|
auto dst_ptr = (int32_t*)arg->dst_addr;
|
|
auto offset = task_id * count;
|
|
|
|
for (uint32_t i = 0; i < count; ++i) {
|
|
auto a = src0_ptr[offset+i];
|
|
auto b = src1_ptr[offset+i];
|
|
auto c = a / b;
|
|
auto d = a * b;
|
|
auto e = c + d;
|
|
dst_ptr[offset+i] = e;
|
|
}
|
|
}
|
|
|
|
void kernel_fadd(int task_id, kernel_arg_t* __UNIFORM__ arg) {
|
|
auto count = arg->task_size;
|
|
auto src0_ptr = (float*)arg->src0_addr;
|
|
auto src1_ptr = (float*)arg->src1_addr;
|
|
auto dst_ptr = (float*)arg->dst_addr;
|
|
auto offset = task_id * count;
|
|
|
|
for (uint32_t i = 0; i < count; ++i) {
|
|
float a = src0_ptr[offset+i];
|
|
float b = src1_ptr[offset+i];
|
|
float c = a + b;
|
|
dst_ptr[offset+i] = c;
|
|
}
|
|
}
|
|
|
|
void kernel_fsub(int task_id, kernel_arg_t* __UNIFORM__ arg) {
|
|
auto count = arg->task_size;
|
|
auto src0_ptr = (float*)arg->src0_addr;
|
|
auto src1_ptr = (float*)arg->src1_addr;
|
|
auto dst_ptr = (float*)arg->dst_addr;
|
|
auto offset = task_id * count;
|
|
|
|
for (uint32_t i = 0; i < count; ++i) {
|
|
auto a = src0_ptr[offset+i];
|
|
auto b = src1_ptr[offset+i];
|
|
auto c = a - b;
|
|
dst_ptr[offset+i] = c;
|
|
}
|
|
}
|
|
|
|
void kernel_fmul(int task_id, kernel_arg_t* __UNIFORM__ arg) {
|
|
auto count = arg->task_size;
|
|
auto src0_ptr = (float*)arg->src0_addr;
|
|
auto src1_ptr = (float*)arg->src1_addr;
|
|
auto dst_ptr = (float*)arg->dst_addr;
|
|
auto offset = task_id * count;
|
|
|
|
for (uint32_t i = 0; i < count; ++i) {
|
|
auto a = src0_ptr[offset+i];
|
|
auto b = src1_ptr[offset+i];
|
|
auto c = a * b;
|
|
dst_ptr[offset+i] = c;
|
|
}
|
|
}
|
|
|
|
void kernel_fmadd(int task_id, kernel_arg_t* __UNIFORM__ arg) {
|
|
auto count = arg->task_size;
|
|
auto src0_ptr = (float*)arg->src0_addr;
|
|
auto src1_ptr = (float*)arg->src1_addr;
|
|
auto dst_ptr = (float*)arg->dst_addr;
|
|
auto offset = task_id * count;
|
|
|
|
for (uint32_t i = 0; i < count; ++i) {
|
|
auto a = src0_ptr[offset+i];
|
|
auto b = src1_ptr[offset+i];
|
|
auto c = a * b + b;
|
|
dst_ptr[offset+i] = c;
|
|
}
|
|
}
|
|
|
|
void kernel_fmsub(int task_id, kernel_arg_t* __UNIFORM__ arg) {
|
|
auto count = arg->task_size;
|
|
auto src0_ptr = (float*)arg->src0_addr;
|
|
auto src1_ptr = (float*)arg->src1_addr;
|
|
auto dst_ptr = (float*)arg->dst_addr;
|
|
auto offset = task_id * count;
|
|
|
|
for (uint32_t i = 0; i < count; ++i) {
|
|
auto a = src0_ptr[offset+i];
|
|
auto b = src1_ptr[offset+i];
|
|
auto c = a * b - b;
|
|
dst_ptr[offset+i] = c;
|
|
}
|
|
}
|
|
|
|
void kernel_fnmadd(int task_id, kernel_arg_t* __UNIFORM__ arg) {
|
|
auto count = arg->task_size;
|
|
auto src0_ptr = (float*)arg->src0_addr;
|
|
auto src1_ptr = (float*)arg->src1_addr;
|
|
auto dst_ptr = (float*)arg->dst_addr;
|
|
auto offset = task_id * count;
|
|
|
|
for (uint32_t i = 0; i < count; ++i) {
|
|
auto a = src0_ptr[offset+i];
|
|
auto b = src1_ptr[offset+i];
|
|
auto c =-a * b - b;
|
|
dst_ptr[offset+i] = c;
|
|
}
|
|
}
|
|
|
|
void kernel_fnmsub(int task_id, kernel_arg_t* __UNIFORM__ arg) {
|
|
auto count = arg->task_size;
|
|
auto src0_ptr = (float*)arg->src0_addr;
|
|
auto src1_ptr = (float*)arg->src1_addr;
|
|
auto dst_ptr = (float*)arg->dst_addr;
|
|
auto offset = task_id * count;
|
|
|
|
for (uint32_t i = 0; i < count; ++i) {
|
|
auto a = src0_ptr[offset+i];
|
|
auto b = src1_ptr[offset+i];
|
|
auto c =-a * b + b;
|
|
dst_ptr[offset+i] = c;
|
|
}
|
|
}
|
|
|
|
void kernel_fnmadd_madd(int task_id, kernel_arg_t* __UNIFORM__ arg) {
|
|
auto count = arg->task_size;
|
|
auto src0_ptr = (float*)arg->src0_addr;
|
|
auto src1_ptr = (float*)arg->src1_addr;
|
|
auto dst_ptr = (float*)arg->dst_addr;
|
|
auto offset = task_id * count;
|
|
|
|
for (uint32_t i = 0; i < count; ++i) {
|
|
auto a = src0_ptr[offset+i];
|
|
auto b = src1_ptr[offset+i];
|
|
auto c =-a * b - b;
|
|
auto d = a * b + b;
|
|
auto e = c + d;
|
|
dst_ptr[offset+i] = e;
|
|
}
|
|
}
|
|
|
|
void kernel_fdiv(int task_id, kernel_arg_t* __UNIFORM__ arg) {
|
|
auto count = arg->task_size;
|
|
auto src0_ptr = (float*)arg->src0_addr;
|
|
auto src1_ptr = (float*)arg->src1_addr;
|
|
auto dst_ptr = (float*)arg->dst_addr;
|
|
auto offset = task_id * count;
|
|
|
|
for (uint32_t i = 0; i < count; ++i) {
|
|
auto a = src0_ptr[offset+i];
|
|
auto b = src1_ptr[offset+i];
|
|
auto c = a / b;
|
|
dst_ptr[offset+i] = c;
|
|
}
|
|
}
|
|
|
|
void kernel_fdiv2(int task_id, kernel_arg_t* __UNIFORM__ arg) {
|
|
auto count = arg->task_size;
|
|
auto src0_ptr = (float*)arg->src0_addr;
|
|
auto src1_ptr = (float*)arg->src1_addr;
|
|
auto dst_ptr = (float*)arg->dst_addr;
|
|
auto offset = task_id * count;
|
|
|
|
for (uint32_t i = 0; i < count; ++i) {
|
|
auto a = src0_ptr[offset+i];
|
|
auto b = src1_ptr[offset+i];
|
|
auto c = a / b;
|
|
auto d = b / a;
|
|
auto e = c + d;
|
|
dst_ptr[offset+i] = e;
|
|
}
|
|
}
|
|
|
|
void kernel_fsqrt(int task_id, kernel_arg_t* __UNIFORM__ arg) {
|
|
auto count = arg->task_size;
|
|
auto src0_ptr = (float*)arg->src0_addr;
|
|
auto src1_ptr = (float*)arg->src1_addr;
|
|
auto dst_ptr = (float*)arg->dst_addr;
|
|
auto offset = task_id * count;
|
|
|
|
for (uint32_t i = 0; i < count; ++i) {
|
|
auto a = src0_ptr[offset+i];
|
|
auto b = src1_ptr[offset+i];
|
|
auto c = __ieee754_sqrtf(a * b);
|
|
dst_ptr[offset+i] = c;
|
|
}
|
|
}
|
|
|
|
void kernel_ftoi(int task_id, kernel_arg_t* __UNIFORM__ arg) {
|
|
auto count = arg->task_size;
|
|
auto src0_ptr = (float*)arg->src0_addr;
|
|
auto src1_ptr = (float*)arg->src1_addr;
|
|
auto dst_ptr = (int32_t*)arg->dst_addr;
|
|
auto offset = task_id * count;
|
|
|
|
for (uint32_t i = 0; i < count; ++i) {
|
|
auto a = src0_ptr[offset+i];
|
|
auto b = src1_ptr[offset+i];
|
|
auto c = a + b;
|
|
auto d = (int32_t)c;
|
|
dst_ptr[offset+i] = d;
|
|
}
|
|
}
|
|
|
|
void kernel_ftou(int task_id, kernel_arg_t* __UNIFORM__ arg) {
|
|
auto count = arg->task_size;
|
|
auto src0_ptr = (float*)arg->src0_addr;
|
|
auto src1_ptr = (float*)arg->src1_addr;
|
|
auto dst_ptr = (uint32_t*)arg->dst_addr;
|
|
auto offset = task_id * count;
|
|
|
|
for (uint32_t i = 0; i < count; ++i) {
|
|
auto a = src0_ptr[offset+i];
|
|
auto b = src1_ptr[offset+i];
|
|
auto c = a + b;
|
|
auto d = (uint32_t)c;
|
|
dst_ptr[offset+i] = d;
|
|
}
|
|
}
|
|
|
|
void kernel_itof(int task_id, kernel_arg_t* __UNIFORM__ arg) {
|
|
auto count = arg->task_size;
|
|
auto src0_ptr = (int32_t*)arg->src0_addr;
|
|
auto src1_ptr = (int32_t*)arg->src1_addr;
|
|
auto dst_ptr = (float*)arg->dst_addr;
|
|
auto offset = task_id * count;
|
|
|
|
for (uint32_t i = 0; i < count; ++i) {
|
|
auto a = src0_ptr[offset+i];
|
|
auto b = src1_ptr[offset+i];
|
|
auto c = a + b;
|
|
auto d = (float)c;
|
|
dst_ptr[offset+i] = d;
|
|
}
|
|
}
|
|
|
|
void kernel_utof(int task_id, kernel_arg_t* __UNIFORM__ arg) {
|
|
auto count = arg->task_size;
|
|
auto src0_ptr = (int32_t*)arg->src0_addr;
|
|
auto src1_ptr = (int32_t*)arg->src1_addr;
|
|
auto dst_ptr = (float*)arg->dst_addr;
|
|
auto offset = task_id * count;
|
|
|
|
for (uint32_t i = 0; i < count; ++i) {
|
|
auto a = src0_ptr[offset+i];
|
|
auto b = src1_ptr[offset+i];
|
|
auto c = a + b;
|
|
auto d = (float)c;
|
|
dst_ptr[offset+i] = d;
|
|
}
|
|
}
|
|
|
|
void kernel_bar(int task_id, kernel_arg_t* __UNIFORM__ arg) {
|
|
auto num_warps = vx_num_warps();
|
|
auto num_threads = vx_num_threads();
|
|
|
|
auto cid = vx_core_id();
|
|
auto wid = vx_warp_id();
|
|
auto tid = vx_thread_id();
|
|
|
|
auto src0_ptr = (uint32_t*)arg->src0_addr;
|
|
auto dst_ptr = (uint32_t*)arg->dst_addr;
|
|
|
|
// per warp delay
|
|
uint32_t barrier_stall = 0;
|
|
for (int i = 0; i <= wid; ++i) {
|
|
barrier_stall += src0_ptr[0] * src0_ptr[i];
|
|
}
|
|
|
|
// memory fence
|
|
vx_fence();
|
|
|
|
// local barrier
|
|
vx_barrier(0, num_warps);
|
|
|
|
// update destination
|
|
auto src_idx = (cid * num_warps + (num_warps - 1 - wid)) * num_threads + tid;
|
|
dst_ptr[task_id] = src0_ptr[src_idx] + barrier_stall;
|
|
}
|
|
|
|
void kernel_gbar(int task_id, kernel_arg_t* __UNIFORM__ arg) {
|
|
auto num_cores = vx_num_cores();
|
|
auto num_warps = vx_num_warps();
|
|
auto num_threads = vx_num_threads();
|
|
|
|
auto cid = vx_core_id();
|
|
auto wid = vx_warp_id();
|
|
auto tid = vx_thread_id();
|
|
|
|
auto src0_ptr = (uint32_t*)arg->src0_addr;
|
|
auto dst_ptr = (uint32_t*)arg->dst_addr;
|
|
|
|
// per core delay
|
|
uint32_t barrier_stall = 0;
|
|
for (int i = 0; i <= cid; ++i) {
|
|
for (int j = 0; j <= wid; ++j) {
|
|
barrier_stall += src0_ptr[0] * src0_ptr[i + j];
|
|
}
|
|
}
|
|
|
|
// memory fence
|
|
vx_fence();
|
|
|
|
// global barrier
|
|
vx_barrier(0x80000000, num_cores);
|
|
|
|
// update destination
|
|
auto src_idx = ((num_cores - 1 - cid) * num_warps + (num_warps - 1 - wid)) * num_threads + tid;
|
|
dst_ptr[task_id] = src0_ptr[src_idx] + barrier_stall;
|
|
}
|
|
|
|
// Dispatch table indexed by kernel_arg_t::testid (see main). The order of
// entries defines the test ids and must not be changed.
static const PFN_Kernel sc_tests[] = {
  kernel_iadd,         //  0
  kernel_imul,         //  1
  kernel_idiv,         //  2
  kernel_idiv_mul,     //  3
  kernel_fadd,         //  4
  kernel_fsub,         //  5
  kernel_fmul,         //  6
  kernel_fmadd,        //  7
  kernel_fmsub,        //  8
  kernel_fnmadd,       //  9
  kernel_fnmsub,       // 10
  kernel_fnmadd_madd,  // 11
  kernel_fdiv,         // 12
  kernel_fdiv2,        // 13
  kernel_fsqrt,        // 14
  kernel_ftoi,         // 15
  kernel_ftou,         // 16
  kernel_itof,         // 17
  kernel_utof,         // 18
  kernel_bar,          // 19
  kernel_gbar          // 20
};
|
|
|
|
int main() {
|
|
auto arg = (kernel_arg_t*)KERNEL_ARG_DEV_MEM_ADDR;
|
|
vx_spawn_tasks(arg->num_tasks, (vx_spawn_tasks_cb)sc_tests[arg->testid], arg);
|
|
return 0;
|
|
}
|