sgemm_impl: Comment out GEMMINI_DMA code in single_tile

This is already done in the higher-level thread_block_gemm function, and flash also has explicit DMA sync code. Also, executing it twice sometimes causes vx_bar to be moved into a branch, which we really want to avoid.
2024-10-28 12:47:49 -07:00
parent 36eb50060f
commit e55c8b480e
1 changed files with 12 additions and 11 deletions
--- a/tests/regression/sgemm_tcore/sgemm_impl.hpp
+++ b/tests/regression/sgemm_tcore/sgemm_impl.hpp
@@ -870,18 +870,18 @@ __attribute__((always_inline)) inline void thread_block_gemm_single_tile(
    }
  }

-  if constexpr (GEMMINI_DMA) {
-    // Call gemmini fence at the end of the loop to overlap dma & wmma.
-    // Usually, by this time, dma has finished the copy so that this
-    // becomes a no-op.
-    if (tid_in_threadblock == 0) {
-      gemmini_fence();
-    }
+  // if constexpr (GEMMINI_DMA) {
+  //   // Call gemmini fence at the end of the loop to overlap dma & wmma.
+  //   // Usually, by this time, dma has finished the copy so that this
+  //   // becomes a no-op.
+  //   if (tid_in_threadblock == 0) {
+  //     gemmini_fence();
+  //   }

-    // reconverge after mmio
-    threadblock_barrier(threadblock_id_in_cluster,
-                        warps_per_threadblock_per_core);
-  }
+  //   // reconverge after mmio
+  //   threadblock_barrier(threadblock_id_in_cluster,
+  //                       warps_per_threadblock_per_core);
+  // }

  if constexpr (write_to_mem) {
    // need to protect smem reads in the earlier step from writes in below,
@@ -1135,6 +1135,7 @@ inline void thread_block_gemm(const T *A, const T *B, float *C,
          local_a_consume = local_a;
          local_b_consume = local_b;
        }
+
        asm volatile("dbuf_sel_end_%=:" ::);

        constexpr MemLayout layout_a =