Files
kernels/kernels/wu_arch_cases/case03_dual_fetch_issue/kernel.cpp

60 lines
1.4 KiB
C++

#include "common_wu_min.h"
/**
 * Entry point handed to vx_spawn_scalar() by wu_main().
 *
 * Delays by an amount derived from the caller's warp id, marks the caller as
 * seen using WU_CASE_SCALAR_BASE, then stops the warp.
 */
extern "C" void scalar_worker() {
  // The delay argument is the warp id, so each warp passes a different value.
  const auto wid = wu_wid();
  wu_short_delay(wid);

  wu_mark_seen(WU_CASE_SCALAR_BASE);

  wu_stop_warp();
}
/**
 * Entry point handed to vx_spawn_tensor() by wu_main(), written entirely in
 * inline assembly.
 *
 * Sequence: read the warp id from the VX_CSR_WARP_ID CSR, run a countdown
 * loop starting at WU_CASE_SHORT_SPIN, store (WU_CASE_TENSOR_BASE | warp id)
 * as a 32-bit word at g_seen + 4 * warp id, issue one CUSTOM0 instruction,
 * and finally loop forever. The function therefore never returns.
 *
 * The function is `naked`, so the compiler emits no prologue/epilogue; the
 * asm overwrites x5, x6 and x7 without listing them as clobbers.
 *
 * NOTE(review): GCC documents only basic asm as supported inside naked
 * functions; this is extended asm (immediate-only operands) - confirm the
 * project toolchain accepts it reliably.
 */
extern "C" void __attribute__((naked, noinline, used)) tensor_worker() {
asm volatile(
// x5 = warp id.
"csrr x5, %[csr_wid]\n\t"
// Countdown delay: x6 starts at the spin constant and is decremented to zero.
// NOTE(review): assumes WU_CASE_SHORT_SPIN is nonzero; starting from 0 the
// counter would wrap around before reaching zero - confirm.
"li x6, %[spin]\n\t"
"1:\n\t"
"addi x6, x6, -1\n\t"
"bnez x6, 1b\n\t"
// x7 = g_seen + (warp id << 2), i.e. the address of this warp's 4-byte slot.
"slli x6, x5, 2\n\t"
"la x7, g_seen\n\t"
"add x7, x7, x6\n\t"
// Store the marker value (tensor base OR-ed with the warp id) into that slot.
"li x6, %[tensor_base]\n\t"
"or x6, x6, x5\n\t"
"sw x6, 0(x7)\n\t"
// R-type instruction in the CUSTOM0 opcode with funct3 = 0, funct7 = 0 and
// rd/rs1/rs2 all x0. Presumably the Vortex thread-mask control op with a zero
// mask, which would halt this warp - TODO confirm against the ISA extension.
".insn r %[custom0], 0, 0, x0, x0, x0\n\t"
// Spin in place if execution continues past the instruction above.
"2: j 2b\n\t"
// No output operands.
:
// All inputs are compile-time immediates substituted into the template.
: [csr_wid] "i"(VX_CSR_WARP_ID),
[custom0] "i"(RISCV_CUSTOM0),
[spin] "i"(WU_CASE_SHORT_SPIN),
[tensor_base] "i"(WU_CASE_TENSOR_BASE)
// The "memory" clobber covers the store to g_seen.
: "memory");
}
extern "C" int wu_main() {
if (!wu_is_leader()) {
return 0;
}
wu_case_reset();
const uint32_t scalar_mask = wu_scalar_mask_without_warp0();
if (scalar_mask != 0) {
vx_spawn_scalar(scalar_mask, scalar_worker);
}
vx_spawn_tensor(vx_tensor_warp_mask(), tensor_worker);
wu_short_delay(0);
wu_mark_seen(WU_CASE_SCALAR_BASE);
if (wu_wait_seen_range(0, NUM_SCALAR_WARPS, WU_CASE_SCALAR_BASE) != 0) {
wu_case_fail(0x31u);
return 1;
}
if (wu_wait_seen_range(NUM_SCALAR_WARPS, NUM_WARPS, WU_CASE_TENSOR_BASE) != 0) {
wu_case_fail(0x32u);
return 1;
}
wu_case_pass();
return 0;
}