Update Wu architecture kernel implementations and runtime library

2026-05-26 12:59:35 +00:00
parent e7229dae27
commit 9f4be1b8f7
8 changed files with 142 additions and 31 deletions
--- a/lib/src/vx_spawn.c
+++ b/lib/src/vx_spawn.c
@@ -76,7 +76,7 @@ static void __attribute__ ((noinline)) spawn_tasks_all_stub() {

 static void __attribute__ ((noinline)) spawn_tasks_contiguous_all_stub() {
  int NT  = vx_num_threads();
-  int NW  = vx_num_warps();
+  int NW  = NUM_SCALAR_WARPS;
  int cid = vx_core_id();
  int wid = vx_warp_id();
  int tid = vx_thread_id();
@@ -96,7 +96,7 @@ static void __attribute__ ((noinline)) spawn_tasks_contiguous_all_stub() {

 static void __attribute__ ((noinline)) spawn_tasks_cluster_all_stub() {
  int NT  = vx_num_threads();
-  int NW  = vx_num_warps();
+  int NW  = NUM_SCALAR_WARPS;
  int cid = vx_core_id();
  int wid = vx_warp_id();
  int tid = vx_thread_id();
@@ -187,7 +187,7 @@ static void __attribute__ ((noinline)) spawn_tasks_all_cb() {
 void vx_spawn_tasks_cluster(int num_tasks, vx_spawn_tasks_cb callback, void *arg) {
  // device specs
  const int NC = vx_num_cores();
-  const int NW = vx_num_warps();
+  const int NW = NUM_SCALAR_WARPS;
  const int NT = vx_num_threads();
  // NOTE: assumes divisible
  const int num_cluster = NC / CORES_PER_CLUSTER;
@@ -243,7 +243,7 @@ void vx_spawn_tasks_cluster(int num_tasks, vx_spawn_tasks_cb callback, void *arg
  const int num_full_waves = num_warps_this_core / NW;
  const int rem_full_warps_in_last_wave = num_warps_this_core % NW;

-  const const int offset = cluster_id * num_tasks_this_cluster;
+  const int offset = cluster_id * num_tasks_this_cluster;
  wspawn_tasks_args_t wspawn_args = {callback, arg, offset, num_full_waves,
                                     rem_full_warps_in_last_wave};
  g_wspawn_args[core_id] = &wspawn_args;
@@ -289,7 +289,7 @@ void vx_spawn_tasks_cluster(int num_tasks, vx_spawn_tasks_cb callback, void *arg
 void vx_spawn_tasks_contiguous(int num_tasks, vx_spawn_tasks_cb callback , void * arg) {
 	// device specs
  int NC = vx_num_cores();
-  int NW = vx_num_warps();
+  int NW = NUM_SCALAR_WARPS;
  int NT = vx_num_threads();

  // current core id
@@ -361,7 +361,7 @@ void vx_spawn_tasks_contiguous(int num_tasks, vx_spawn_tasks_cb callback , void
 void vx_spawn_tasks(int num_tasks, vx_spawn_tasks_cb callback , void * arg) {
 	// device specs
  int NC = vx_num_cores();
-  int NW = vx_num_warps();
+  int NW = NUM_SCALAR_WARPS;
  int NT = vx_num_threads();

  // current core id
@@ -515,7 +515,7 @@ void vx_spawn_kernel(context_t * ctx, vx_spawn_kernel_cb callback, void * arg) {
  
  // device specs
  int NC = vx_num_cores();
-  int NW = vx_num_warps();
+  int NW = NUM_SCALAR_WARPS;
  int NT = vx_num_threads();

  // current core id
--- a/lib/src/vx_start.S
+++ b/lib/src/vx_start.S
@@ -22,9 +22,9 @@
 _start:  

  # initialize per-thread registers
-  csrr  t0, VX_CSR_NUM_WARPS  # get num warps
+  li    t0, ((1 << NUM_SCALAR_WARPS) - 1)  # scalar warp mask: low NUM_SCALAR_WARPS bits set
  la    t1, init_regs_all
-  .insn r RISCV_CUSTOM0, 1, 0, x0, t0, t1  # wspawn t0, t1
+  .insn r RISCV_CUSTOM0, 6, 0, x0, t0, t1  # wspawn_mask t0, t1
  li    t0, -1
  .insn r RISCV_CUSTOM0, 0, 0, x0, t0, x0  # tmc t0
  jal   init_regs
@@ -35,9 +35,9 @@ _start:
  jal vx_wspawn_wait

  # initialize TLS for all warps
-  csrr  t0, VX_CSR_NUM_WARPS  # get num warps
+  li    t0, ((1 << NUM_SCALAR_WARPS) - 1)  # scalar warp mask: low NUM_SCALAR_WARPS bits set
  la    t1, init_tls_all
-  .insn r RISCV_CUSTOM0, 1, 0, x0, t0, t1  # wspawn t0, t1
+  .insn r RISCV_CUSTOM0, 6, 0, x0, t0, t1  # wspawn_mask t0, t1
  li    t0, -1
  .insn r RISCV_CUSTOM0, 0, 0, x0, t0, x0  # tmc t0
  call  __init_tls
@@ -150,4 +150,3 @@ vx_wspawn_wait:
 	.weak __dso_handle
 __dso_handle:
 	.long	0
-