sgemm_impl: Comment out GEMMINI_DMA code in single_tile

The DMA fence and barrier are already performed in the higher-level
thread_block_gemm function, and flash also has its own explicit DMA sync
code.  In addition, executing this sequence twice sometimes triggers
vx_bar being moved into a branch, which we really want to avoid.
This commit is contained in:
Hansung Kim
2024-10-28 12:47:49 -07:00
parent 36eb50060f
commit e55c8b480e

View File

@@ -870,18 +870,18 @@ __attribute__((always_inline)) inline void thread_block_gemm_single_tile(
} }
} }
if constexpr (GEMMINI_DMA) { // if constexpr (GEMMINI_DMA) {
// Call gemmini fence at the end of the loop to overlap dma & wmma. // // Call gemmini fence at the end of the loop to overlap dma & wmma.
// Usually, by this time, dma has finished the copy so that this // // Usually, by this time, dma has finished the copy so that this
// becomes a no-op. // // becomes a no-op.
if (tid_in_threadblock == 0) { // if (tid_in_threadblock == 0) {
gemmini_fence(); // gemmini_fence();
} // }
// reconverge after mmio // // reconverge after mmio
threadblock_barrier(threadblock_id_in_cluster, // threadblock_barrier(threadblock_id_in_cluster,
warps_per_threadblock_per_core); // warps_per_threadblock_per_core);
} // }
if constexpr (write_to_mem) { if constexpr (write_to_mem) {
// need to protect smem reads in the earlier step from writes in below, // need to protect smem reads in the earlier step from writes in below,
@@ -1135,6 +1135,7 @@ inline void thread_block_gemm(const T *A, const T *B, float *C,
local_a_consume = local_a; local_a_consume = local_a;
local_b_consume = local_b; local_b_consume = local_b;
} }
asm volatile("dbuf_sel_end_%=:" ::); asm volatile("dbuf_sel_end_%=:" ::);
constexpr MemLayout layout_a = constexpr MemLayout layout_a =