From 2036d37840e2ec72f49dff3a3e06b593dbd3d610 Mon Sep 17 00:00:00 2001
From: Hansung Kim <hansung_kim@berkeley.edu>
Date: Wed, 13 Mar 2024 21:32:57 -0700
Subject: [PATCH] sgemm_wg: Prevent run-ahead using ternary flags; reduce mem
 accesses

---
 tests/regression/sgemm_wg/kernel.cpp | 61 ++++++++++++++++++++--------
 1 file changed, 45 insertions(+), 16 deletions(-)

diff --git a/tests/regression/sgemm_wg/kernel.cpp b/tests/regression/sgemm_wg/kernel.cpp
index a65e1e5f..44299934 100644
--- a/tests/regression/sgemm_wg/kernel.cpp
+++ b/tests/regression/sgemm_wg/kernel.cpp
@@ -13,29 +13,61 @@
 #define TN 2
 
 #define DEV_BARRIER_MMIO_BASE_ADDR 0xff003f00UL
-#define CORES_PER_CLUSTER 4
+#define CORES_PER_CLUSTER 2
+#define BARRIER_STRIDE 4
 
-void threadblock_barrier(unsigned int barrier_id, unsigned int count) {
+void threadblock_barrier(unsigned int tid_in_threadblock, unsigned int barrier_id, unsigned int count) {
     vx_barrier(barrier_id, count);
     vx_fence();
 
-#if CORES_PER_CLUSTER != 1
-    if (vx_thread_id() == 0) {
+    // vx_printf("========== barrier! barrier_id=%u, count=%u\n", barrier_id, count);
+
+#if CORES_PER_CLUSTER != 0
+    // The cross-core handshake below does not work unless the memory-mapped
+    // barrier registers are implemented in hardware, hence the #if guard.
+
+    if (tid_in_threadblock == 0) {
       volatile uint32_t *mmio = (volatile uint32_t *)(DEV_BARRIER_MMIO_BASE_ADDR);
       int core_id = vx_core_id();
-      const uint32_t barrier_stride = CORES_PER_CLUSTER;
+      // FIXME: the word stride between consecutive barriers' MMIO slots is hardcoded
+      const uint32_t barrier_stride = BARRIER_STRIDE;
       const uint32_t barrier_offset = barrier_stride * barrier_id;
-      // 1 : 0x00 is reserved for mmio read reg
+
+      // wait until this core's per-core flag reads 0, i.e. the barrier is initialized
+      while (mmio[barrier_offset + 1 + core_id] != 0);
+
+      // flag = 1: signal that intra-core synchronization is done
       mmio[barrier_offset + 1 + core_id] = 1;
-      vx_printf("========== barrier written! barrier_id=%u, count=%u\n", barrier_id, count);
 
       // wait for other cores in the cluster to finish by waiting on the
       // all-synced read-only mmio reg
       while (mmio[barrier_offset] == 0);
 
-      // reset per-core flag back to zero for the next barrier
-      mmio[barrier_offset + 1 + core_id] = 0;
+      // flag = 2: signal that this core has passed the barrier. If the flag
+      // were reset to 0 right away instead, another core still waiting on the
+      // barrier might never see the all-synced mmio reg as 1.
+      mmio[barrier_offset + 1 + core_id] = 2;
+
+      // // DISABLED software reset: the last core passing the barrier would reset all
+      // // per-core flags to 0. NOTE(review): nothing visible here resets them; presumably HW does - confirm
+      // bool all_passed = true;
+      // for (int i = 0; i < CORES_PER_CLUSTER; i++) {
+      //   // if (i == core_id) continue;
+      //   // NOTE: this requires coherent access of store-to-load to the same
+      //   // address
+      //   if (mmio[barrier_offset + 1 + i] != 2) {
+      //     all_passed = false;
+      //     break;
+      //   }
+      // }
+      // if (all_passed) {
+      //   for (int i = 0; i < CORES_PER_CLUSTER; i++) {
+      //     mmio[barrier_offset + 1 + i] = 0;
+      //   }
+      // }
     }
+    // second barrier after the handshake; presumably keeps the other threads from running ahead of thread 0 - confirm
+    vx_barrier(barrier_id, count);
 #endif
 }
 
@@ -101,7 +133,8 @@ void thread_block_gemm(kernel_arg_t *__UNIFORM__ arg,
           B[global_b_offset];
     }
 
-    threadblock_barrier(threadblock_id_in_core, threadblock_dim_y);
+    threadblock_barrier(tid_in_threadblock, threadblock_id_in_core,
+                        threadblock_dim_y);
 
     for (uint32_t local_k = 0; local_k < BK; local_k++) {
 #pragma GCC unroll TM
@@ -130,7 +163,8 @@ void thread_block_gemm(kernel_arg_t *__UNIFORM__ arg,
       }
     }
 
-    threadblock_barrier(threadblock_id_in_core, threadblock_dim_y);
+    threadblock_barrier(tid_in_threadblock, threadblock_id_in_core,
+                        threadblock_dim_y);
   }
 
 #pragma GCC unroll TM
@@ -164,11 +198,6 @@ void kernel_body(int task_id, kernel_arg_t* __UNIFORM__ arg) {
   const int threadblock_id_x = threadblock_id % dim_n_in_blocks;
   const int threadblock_id_y = threadblock_id / dim_n_in_blocks;
 
-  // initialize barrier MMIO
-  volatile uint32_t *barrier_mmio = (volatile uint32_t *)(DEV_BARRIER_MMIO_BASE_ADDR);
-  *barrier_mmio = 0;
-  vx_fence();
-
   float *sharedmem_per_threadblock =
       (float *)DEV_SMEM_START_ADDR +
       (2 * BM * BK) * threadblock_id_in_core;