From 510a834db529f20bac2fdea09f7a92372462402e Mon Sep 17 00:00:00 2001
From: Hansung Kim <hansung_kim@berkeley.edu>
Date: Tue, 12 Mar 2024 15:34:42 -0700
Subject: [PATCH] sgemm_wg: Implement software barrier for inter-core
 synchronization

---
 tests/regression/sgemm_wg/kernel.cpp | 45 +++++++++++++++++++++++-----
 tests/regression/sgemm_wg/main.cpp   |  6 ++--
 2 files changed, 41 insertions(+), 10 deletions(-)

diff --git a/tests/regression/sgemm_wg/kernel.cpp b/tests/regression/sgemm_wg/kernel.cpp
index 7f33a90a..a65e1e5f 100644
--- a/tests/regression/sgemm_wg/kernel.cpp
+++ b/tests/regression/sgemm_wg/kernel.cpp
@@ -1,5 +1,6 @@
 #include <stdint.h>
 #include <vx_intrinsics.h>
+#include <vx_print.h>
 #include <vx_spawn.h>
 #include "common.h"
 
@@ -8,8 +9,35 @@
 #define BK 2
 // #define TM (BM/BK)
 // #define TN (BN/BK)
-#define TM 4
-#define TN 4
+#define TM 2
+#define TN 2
+
+#define DEV_BARRIER_MMIO_BASE_ADDR 0xff003f00UL
+#define CORES_PER_CLUSTER 4
+
+void threadblock_barrier(unsigned int barrier_id, unsigned int count) {
+    vx_barrier(barrier_id, count);  // TODO confirm: presumably syncs `count` warps inside this core
+    vx_fence();
+    // Builds with more than one core per cluster also run the MMIO handshake below.
+#if CORES_PER_CLUSTER != 1
+    if (vx_thread_id() == 0) {  // restrict the MMIO handshake to thread id 0
+      volatile uint32_t *mmio = (volatile uint32_t *)(DEV_BARRIER_MMIO_BASE_ADDR);
+      int core_id = vx_core_id();
+      const uint32_t barrier_stride = CORES_PER_CLUSTER;
+      const uint32_t barrier_offset = barrier_stride * barrier_id;
+      // +1 skips word 0 of this barrier's slot, which is the read-only status reg polled below
+      mmio[barrier_offset + 1 + core_id] = 1;
+      vx_printf("========== barrier written! barrier_id=%u, count=%u\n", barrier_id, count);
+      // NOTE(review): stride == CORES_PER_CLUSTER, so for core_id == CORES_PER_CLUSTER - 1 the flag index equals the next barrier_id's word 0 -- confirm the MMIO layout
+      // spin until the all-synced read-only mmio reg for this barrier_id reads
+      // non-zero, i.e. (per the hardware contract) every core in the cluster arrived
+      while (mmio[barrier_offset] == 0);
+      // NOTE(review): clearing the flag may drop the status reg while other cores still poll it -- confirm the hardware latches it
+      // reset per-core flag back to zero for the next barrier
+      mmio[barrier_offset + 1 + core_id] = 0;
+    }
+#endif
+}
 
 void thread_block_gemm(kernel_arg_t *__UNIFORM__ arg,
                               const uint32_t tid_in_threadblock,
@@ -73,8 +101,7 @@ void thread_block_gemm(kernel_arg_t *__UNIFORM__ arg,
           B[global_b_offset];
     }
 
-    vx_barrier(threadblock_id_in_core, threadblock_dim_y);
-    vx_fence();
+    threadblock_barrier(threadblock_id_in_core, threadblock_dim_y);
 
     for (uint32_t local_k = 0; local_k < BK; local_k++) {
 #pragma GCC unroll TM
@@ -103,8 +130,7 @@ void thread_block_gemm(kernel_arg_t *__UNIFORM__ arg,
       }
     }
 
-    vx_barrier(threadblock_id_in_core, threadblock_dim_y);
-    vx_fence();
+    threadblock_barrier(threadblock_id_in_core, threadblock_dim_y);
   }
 
 #pragma GCC unroll TM
@@ -123,7 +149,7 @@ void kernel_body(int task_id, kernel_arg_t* __UNIFORM__ arg) {
   // @perf: All threads are running these compute whose result is mostly same
   // across the threadblock
 
-  const uint32_t threads_per_threadblock = ((BM * BN) / (TM * TN));
+  const uint32_t threads_per_threadblock = (BM * BN) / (TM * TN);
   const uint32_t threadblocks_per_core =
       vx_num_threads() * vx_num_warps() / threads_per_threadblock;
   const uint32_t threadblock_dim_x = vx_num_threads();
@@ -138,6 +164,11 @@ void kernel_body(int task_id, kernel_arg_t* __UNIFORM__ arg) {
   const int threadblock_id_x = threadblock_id % dim_n_in_blocks;
   const int threadblock_id_y = threadblock_id / dim_n_in_blocks;
 
+  // initialize barrier MMIO
+  volatile uint32_t *barrier_mmio = (volatile uint32_t *)(DEV_BARRIER_MMIO_BASE_ADDR);
+  *barrier_mmio = 0;
+  vx_fence();
+
   float *sharedmem_per_threadblock =
       (float *)DEV_SMEM_START_ADDR +
       (2 * BM * BK) * threadblock_id_in_core;
diff --git a/tests/regression/sgemm_wg/main.cpp b/tests/regression/sgemm_wg/main.cpp
index c86f7aaf..229463ef 100644
--- a/tests/regression/sgemm_wg/main.cpp
+++ b/tests/regression/sgemm_wg/main.cpp
@@ -147,9 +147,9 @@ int main(int argc, char *argv[]) {
   RT_CHECK(vx_dev_open(&device));
 
   // FIXME: hardcoded
-  uint32_t dim_m = 64;
-  uint32_t dim_n = 64;
-  uint32_t dim_k = 64;
+  uint32_t dim_m = 32;
+  uint32_t dim_n = 32;
+  uint32_t dim_k = 32;
 
   generate_source_matrix(dim_m, dim_n, dim_k);
   generate_reference_matmul(dim_m, dim_n, dim_k);