From ce4f3a24e3fd8a8ec0d1dd469198c61e38fd6015 Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Wed, 12 Jun 2024 21:01:37 -0700 Subject: [PATCH] sgemm_tcore: Replace hardcoded NUM_LANES with NUM_THREADS --- tests/regression/sgemm_tcore/kernel.cpp | 6 +++--- .../sgemm_tcore/kernel.warpspecial.cpp | 6 +++--- .../sgemm_tcore/kernel.warpspecial_dma.cpp | 16 +++++++--------- tests/regression/sgemm_tcore/util.hpp | 12 +++++------- 4 files changed, 18 insertions(+), 22 deletions(-) diff --git a/tests/regression/sgemm_tcore/kernel.cpp b/tests/regression/sgemm_tcore/kernel.cpp index 0294a8b6..31586ee7 100644 --- a/tests/regression/sgemm_tcore/kernel.cpp +++ b/tests/regression/sgemm_tcore/kernel.cpp @@ -7,7 +7,7 @@ #include "include/gemmini.h" #include "gemmini_mmio.h" -#define GEMMINI_DMA 1 +#define GEMMINI_DMA 0 #if SMEM_SIZE == 0x4000 #define SMEM_ADDR_Q0 ((float * const) 0xff000000) #define SMEM_ADDR_Q1 ((float * const) 0xff001000) @@ -273,10 +273,10 @@ inline void thread_block_gemm(kernel_arg_t *__UNIFORM__ arg, // no double-buffering const uint32_t threads_per_warpgroup = threads_per_threadblock; - const uint32_t warp_id_in_warpgroup = tid_in_threadblock / NUM_LANES; + const uint32_t warp_id_in_warpgroup = tid_in_threadblock / NUM_THREADS; const uint32_t warp_row = warp_id_in_warpgroup / (BN / WN); const uint32_t warp_col = warp_id_in_warpgroup % (BN / WN); - const uint32_t tid_in_warp = tid_in_threadblock % NUM_LANES; + const uint32_t tid_in_warp = tid_in_threadblock % NUM_THREADS; volatile float *local_a = sharedmem_per_threadblock; constexpr size_t local_a_elems = (BM * BK); diff --git a/tests/regression/sgemm_tcore/kernel.warpspecial.cpp b/tests/regression/sgemm_tcore/kernel.warpspecial.cpp index 5cb8c7fb..4b8cd759 100644 --- a/tests/regression/sgemm_tcore/kernel.warpspecial.cpp +++ b/tests/regression/sgemm_tcore/kernel.warpspecial.cpp @@ -9,7 +9,7 @@ #define DOUBLE_BUFFER 1 #undef ELEM_PER_THREAD -#define ELEM_PER_THREAD (WMITER * WNITER * ((TCM * TCN) / NUM_LANES) / (DOUBLE_BUFFER ? 2 : 1)) +#define ELEM_PER_THREAD (WMITER * WNITER * ((TCM * TCN) / NUM_THREADS) / (DOUBLE_BUFFER ? 2 : 1)) // FIXME: NUM_THREADS and NUM_WARPS hardcoded #if ((BM * BN / ELEM_PER_THREAD) > (CORES_PER_CLUSTER * 8 * 8)) @@ -291,11 +291,11 @@ inline void thread_block_gemm(kernel_arg_t *__UNIFORM__ arg, const uint32_t threads_per_warpgroup = threads_per_threadblock / (DOUBLE_BUFFER ? 2 : 1); const uint32_t warpgroup_id = tid_in_threadblock / threads_per_warpgroup; const uint32_t tid_in_warpgroup = tid_in_threadblock % threads_per_warpgroup; // FIXME - const uint32_t warp_in_warpgroup = tid_in_warpgroup / NUM_LANES; + const uint32_t warp_in_warpgroup = tid_in_warpgroup / NUM_THREADS; // FIXME: warp_row / BN should be warp-specialized? const uint32_t warp_row = warp_in_warpgroup / (BN / WN); const uint32_t warp_col = warp_in_warpgroup % (BN / WN); - const uint32_t tid_in_warp = tid_in_threadblock % NUM_LANES; + const uint32_t tid_in_warp = tid_in_threadblock % NUM_THREADS; volatile float *local_a = sharedmem_per_threadblock; // const size_t local_a_elems = threadblock_dim_x * threadblock_dim_y; diff --git a/tests/regression/sgemm_tcore/kernel.warpspecial_dma.cpp b/tests/regression/sgemm_tcore/kernel.warpspecial_dma.cpp index a114aebe..00f02bfa 100644 --- a/tests/regression/sgemm_tcore/kernel.warpspecial_dma.cpp +++ b/tests/regression/sgemm_tcore/kernel.warpspecial_dma.cpp @@ -8,8 +8,6 @@ #include "include/gemmini.h" #include "gemmini_mmio.h" -#define NUM_LANES 8 - #define SMEM_ADDR_Q0 ((float * const) 0xff000000) #define SMEM_ADDR_Q1 ((float * const) 0xff001000) #define SMEM_ADDR_Q2 ((float * const) 0xff002000) @@ -52,7 +50,7 @@ #define TCK 8 #define WMITER (WM / TCM) #define WNITER (WN / TCN) -#define ELEM_PER_THREAD (WMITER * WNITER * ((TCM * TCN) / NUM_LANES) / (DOUBLE_BUFFER ? 2 : 1)) +#define ELEM_PER_THREAD (WMITER * WNITER * ((TCM * TCN) / NUM_THREADS) / (DOUBLE_BUFFER ? 2 : 1)) // FIXME: NUM_THREADS and NUM_WARPS hardcoded #if ((BM * BN / ELEM_PER_THREAD) > (CORES_PER_CLUSTER * 8 * 8)) @@ -101,9 +99,9 @@ inline constexpr void map_operand_8lanes(const int tid, int &row, int &col) { } inline constexpr void map_operand(const int tid, int &row, int &col) { - if constexpr (NUM_LANES == 32) { + if constexpr (NUM_THREADS == 32) { map_operand_32lanes(tid, row, col); - } else if constexpr (NUM_LANES == 8) { + } else if constexpr (NUM_THREADS == 8) { map_operand_8lanes(tid, row, col); } else { // FIXME: not allowed @@ -137,9 +135,9 @@ inline constexpr void map_c_8lanes(const int tid, int &row, int &col) { } inline constexpr void map_c(const int tid, int &row, int &col) { - if constexpr (NUM_LANES == 32) { + if constexpr (NUM_THREADS == 32) { map_c_32lanes(tid, row, col); - } else if constexpr (NUM_LANES == 8) { + } else if constexpr (NUM_THREADS == 8) { map_c_8lanes(tid, row, col); } else { // FIXME: not allowed @@ -571,12 +569,12 @@ inline void thread_block_gemm(kernel_arg_t *__UNIFORM__ arg, const uint32_t threads_per_warpgroup = threads_per_threadblock / 1; const uint32_t warpgroup_id = tid_in_threadblock / threads_per_warpgroup; const uint32_t tid_in_warpgroup = tid_in_threadblock % threads_per_warpgroup; // FIXME - const uint32_t warp_in_warpgroup = tid_in_warpgroup / NUM_LANES; + const uint32_t warp_in_warpgroup = tid_in_warpgroup / NUM_THREADS; // FIXME: warp_row / BN should be warp-specialized? const uint32_t warp_row = warp_in_warpgroup / (BN / WN); const uint32_t warp_col = warp_in_warpgroup % (BN / WN); - const uint32_t tid_in_warp = tid_in_threadblock % NUM_LANES; + const uint32_t tid_in_warp = tid_in_threadblock % NUM_THREADS; volatile float *local_a = sharedmem_per_threadblock; // const size_t local_a_elems = threadblock_dim_x * threadblock_dim_y; diff --git a/tests/regression/sgemm_tcore/util.hpp b/tests/regression/sgemm_tcore/util.hpp index 5d54cd4b..f9452ad1 100644 --- a/tests/regression/sgemm_tcore/util.hpp +++ b/tests/regression/sgemm_tcore/util.hpp @@ -6,8 +6,6 @@ #include "include/gemmini.h" #include "gemmini_mmio.h" -#define NUM_LANES 8 - // Constraints on parameters: // * Memory: // (BM + BN) * BK * sizeof(float) <= sharedmem size. @@ -30,7 +28,7 @@ #define TCK 8 #define WMITER (WM / TCM) #define WNITER (WN / TCN) -#define ELEM_PER_THREAD (WMITER * WNITER * (TCM * TCN) / NUM_LANES) +#define ELEM_PER_THREAD (WMITER * WNITER * (TCM * TCN) / NUM_THREADS) // number of loop around the inner 0..TCK..BK loop to simulate perfect-DRAM // scenario @@ -91,9 +89,9 @@ inline constexpr void map_operand_8lanes(const int tid, int &row, int &col) { } inline constexpr void map_operand(const int tid, int &row, int &col) { - if constexpr (NUM_LANES == 32) { + if constexpr (NUM_THREADS == 32) { map_operand_32lanes(tid, row, col); - } else if constexpr (NUM_LANES == 8) { + } else if constexpr (NUM_THREADS == 8) { map_operand_8lanes(tid, row, col); } else { // FIXME: not allowed @@ -127,9 +125,9 @@ inline constexpr void map_c_8lanes(const int tid, int &row, int &col) { } inline constexpr void map_c(const int tid, int &row, int &col) { - if constexpr (NUM_LANES == 32) { + if constexpr (NUM_THREADS == 32) { map_c_32lanes(tid, row, col); - } else if constexpr (NUM_LANES == 8) { + } else if constexpr (NUM_THREADS == 8) { map_c_8lanes(tid, row, col); } else { // FIXME: not allowed