From e55c8b480e2bb3655e8d191b91328aa4ec1ce88a Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Mon, 28 Oct 2024 12:47:49 -0700 Subject: [PATCH] sgemm_impl: Comment out GEMMINI_DMA code in single_tile The gemmini fence and the barrier that follows it are already done in the higher-level thread_block_gemm function, and flash also has explicit DMA sync code. Also, having this sequence executed twice sometimes triggers vx_bar movement into a branch, which we really want to avoid. --- tests/regression/sgemm_tcore/sgemm_impl.hpp | 23 +++++++++++---------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/tests/regression/sgemm_tcore/sgemm_impl.hpp b/tests/regression/sgemm_tcore/sgemm_impl.hpp index 626b2f52..41cc5a17 100644 --- a/tests/regression/sgemm_tcore/sgemm_impl.hpp +++ b/tests/regression/sgemm_tcore/sgemm_impl.hpp @@ -870,18 +870,18 @@ __attribute__((always_inline)) inline void thread_block_gemm_single_tile( } } - if constexpr (GEMMINI_DMA) { - // Call gemmini fence at the end of the loop to overlap dma & wmma. - // Usually, by this time, dma has finished the copy so that this - // becomes a no-op. - if (tid_in_threadblock == 0) { - gemmini_fence(); - } + // if constexpr (GEMMINI_DMA) { + // // Call gemmini fence at the end of the loop to overlap dma & wmma. + // // Usually, by this time, dma has finished the copy so that this + // // becomes a no-op. + // if (tid_in_threadblock == 0) { + // gemmini_fence(); + // } - // reconverge after mmio - threadblock_barrier(threadblock_id_in_cluster, - warps_per_threadblock_per_core); - } + // // reconverge after mmio + // threadblock_barrier(threadblock_id_in_cluster, + // warps_per_threadblock_per_core); + // } if constexpr (write_to_mem) { // need to protect smem reads in the earlier step from writes in below, @@ -1135,6 +1135,7 @@ inline void thread_block_gemm(const T *A, const T *B, float *C, local_a_consume = local_a; local_b_consume = local_b; } + asm volatile("dbuf_sel_end_%=:" ::); constexpr MemLayout layout_a =