From 510a834db529f20bac2fdea09f7a92372462402e Mon Sep 17 00:00:00 2001
From: Hansung Kim
Date: Tue, 12 Mar 2024 15:34:42 -0700
Subject: [PATCH] sgemm_wg: Implement software barrier for inter-core synchronization

---
 tests/regression/sgemm_wg/kernel.cpp | 45 +++++++++++++++++++++++-----
 tests/regression/sgemm_wg/main.cpp   |  6 ++--
 2 files changed, 41 insertions(+), 10 deletions(-)

diff --git a/tests/regression/sgemm_wg/kernel.cpp b/tests/regression/sgemm_wg/kernel.cpp
index 7f33a90a..a65e1e5f 100644
--- a/tests/regression/sgemm_wg/kernel.cpp
+++ b/tests/regression/sgemm_wg/kernel.cpp
@@ -1,5 +1,6 @@
 #include
 #include
+#include // NOTE(review): this and the neighbouring #include lines carry no header name in this copy of the patch; restore them before applying
 #include
 #include "common.h"
 
@@ -8,8 +9,35 @@
 #define BK 2
 // #define TM (BM/BK)
 // #define TN (BN/BK)
-#define TM 4
-#define TN 4
+#define TM 2 // was 4; with TN, divides BM * BN to give threads_per_threadblock in kernel_body()
+#define TN 2 // was 4
+
+#define DEV_BARRIER_MMIO_BASE_ADDR 0xff003f00UL // base of the barrier MMIO words used by threadblock_barrier() and kernel_body()
+#define CORES_PER_CLUSTER 4 // also the per-barrier word stride below; when 1, the MMIO part of threadblock_barrier() is compiled out
+
+void threadblock_barrier(unsigned int barrier_id, unsigned int count) { // vx_barrier() + vx_fence(), then (if CORES_PER_CLUSTER != 1) thread 0 raises a per-core flag, polls a status word, and clears the flag
+  vx_barrier(barrier_id, count);
+  vx_fence();
+
+#if CORES_PER_CLUSTER != 1
+  if (vx_thread_id() == 0) { // only the thread whose vx_thread_id() is 0 touches the MMIO words
+    volatile uint32_t *mmio = (volatile uint32_t *)(DEV_BARRIER_MMIO_BASE_ADDR);
+    int core_id = vx_core_id(); // NOTE(review): used as a word index below; presumably a cluster-local id in [0, CORES_PER_CLUSTER) - confirm it is not a global core id
+    const uint32_t barrier_stride = CORES_PER_CLUSTER; // NOTE(review): each barrier uses index offset+0 plus offset+1+core_id; if core_id can reach CORES_PER_CLUSTER-1 the last flag lands on the next barrier's index 0 - confirm against the register map
+    const uint32_t barrier_offset = barrier_stride * barrier_id; // word index (not byte offset) of this barrier's first register
+    // 1 : 0x00 is reserved for mmio read reg
+    mmio[barrier_offset + 1 + core_id] = 1; // raise this core's flag for barrier_id
+    vx_printf("========== barrier written! barrier_id=%u, count=%u\n", barrier_id, count); // NOTE(review): debug print runs on every barrier call made by thread 0
+
+    // wait for other cores in the cluster to finish by waiting on the
+    // all-synced read-only mmio reg
+    while (mmio[barrier_offset] == 0); // busy-wait until the word at barrier_offset reads non-zero
+
+    // reset per-core flag back to zero for the next barrier
+    mmio[barrier_offset + 1 + core_id] = 0; // NOTE(review): if the status word drops to 0 once any flag is cleared, a core still polling above could spin forever - confirm the hardware latches the synced state
+  }
+#endif
+}
 
 void thread_block_gemm(kernel_arg_t *__UNIFORM__ arg,
                        const uint32_t tid_in_threadblock,
@@ -73,8 +101,7 @@ void thread_block_gemm(kernel_arg_t *__UNIFORM__ arg,
           B[global_b_offset];
     }
 
-    vx_barrier(threadblock_id_in_core, threadblock_dim_y);
-    vx_fence();
+    threadblock_barrier(threadblock_id_in_core, threadblock_dim_y); // replaces the vx_barrier() + vx_fence() pair; same arguments as the removed vx_barrier() call
 
     for (uint32_t local_k = 0; local_k < BK; local_k++) {
 #pragma GCC unroll TM
@@ -103,8 +130,7 @@ void thread_block_gemm(kernel_arg_t *__UNIFORM__ arg,
       }
     }
 
-    vx_barrier(threadblock_id_in_core, threadblock_dim_y);
-    vx_fence();
+    threadblock_barrier(threadblock_id_in_core, threadblock_dim_y); // replaces the vx_barrier() + vx_fence() pair; same arguments as the removed vx_barrier() call
   }
 
 #pragma GCC unroll TM
@@ -123,7 +149,7 @@ void kernel_body(int task_id, kernel_arg_t* __UNIFORM__ arg) {
 
   // @perf: All threads are running these compute whose result is mostly same
   // across the threadblock
-  const uint32_t threads_per_threadblock = ((BM * BN) / (TM * TN));
+  const uint32_t threads_per_threadblock = (BM * BN) / (TM * TN); // same expression as before; only the redundant outer parentheses were dropped
   const uint32_t threadblocks_per_core =
       vx_num_threads() * vx_num_warps() / threads_per_threadblock;
   const uint32_t threadblock_dim_x = vx_num_threads();
@@ -138,6 +164,11 @@ void kernel_body(int task_id, kernel_arg_t* __UNIFORM__ arg) {
   const int threadblock_id_x = threadblock_id % dim_n_in_blocks;
   const int threadblock_id_y = threadblock_id / dim_n_in_blocks;
 
+  // initialize barrier MMIO
+  volatile uint32_t *barrier_mmio = (volatile uint32_t *)(DEV_BARRIER_MMIO_BASE_ADDR);
+  *barrier_mmio = 0; // NOTE(review): stores to index 0 only, which threadblock_barrier() describes as a read-only status word, and is not guarded by thread id - confirm this store is intended
+  vx_fence(); // fence issued right after the MMIO store, before the shared-memory pointer is computed
+
   float *sharedmem_per_threadblock =
       (float *)DEV_SMEM_START_ADDR + (2 * BM * BK) * threadblock_id_in_core;
 
diff --git a/tests/regression/sgemm_wg/main.cpp b/tests/regression/sgemm_wg/main.cpp
index c86f7aaf..229463ef 100644
--- a/tests/regression/sgemm_wg/main.cpp
+++ b/tests/regression/sgemm_wg/main.cpp
@@ -147,9 +147,9 @@ int main(int argc, char *argv[]) {
   RT_CHECK(vx_dev_open(&device));
 
   // FIXME: hardcoded
-  uint32_t dim_m = 64;
-  uint32_t dim_n = 64;
-  uint32_t dim_k = 64;
+  uint32_t dim_m = 32; // was 64; passed to generate_source_matrix() and generate_reference_matmul() below
+  uint32_t dim_n = 32; // was 64
+  uint32_t dim_k = 32; // was 64
 
   generate_source_matrix(dim_m, dim_n, dim_k);
   generate_reference_matmul(dim_m, dim_n, dim_k);