flash: Disable perf loop multiplier

2024-11-10 22:44:02 -08:00
parent 4448f31fdc
commit 7d7cb5f60a
2 changed files with 2 additions and 2 deletions
--- a/tests/regression/flash_attention/kernel.cpp
+++ b/tests/regression/flash_attention/kernel.cpp
@@ -355,7 +355,7 @@ void kernel_body(int task_id, kernel_arg_t *__UNIFORM__ arg) {

  // "inner loop" along the columns of K^T
  const uint32_t k_tiles = (dim_seqlen / B_COL);
-  for (uint32_t tile_k = 0; tile_k < (4 /* for perf measurement */ * k_tiles);
+  for (uint32_t tile_k = 0; tile_k < (1 /* for perf measurement */ * k_tiles);
       tile_k++) {
    // float *smem_P_produce = (tile_k % 2) ? smem_P0 : smem_P1;
    // float *smem_P_consume = (tile_k % 2) ? smem_P1 : smem_P0;
--- a/tests/regression/flash_attention/kernel.gemmini.cpp
+++ b/tests/regression/flash_attention/kernel.gemmini.cpp
@@ -347,7 +347,7 @@ void kernel_body(int task_id, kernel_arg_t *__UNIFORM__ arg) {
  // "inner loop" along the columns of K^T
  const uint32_t k_tiles = (dim_seqlen / B_COL);
  for (uint32_t tile_k = 0;
-       tile_k < (4 /*for perf measurement*/ *
+       tile_k < (1 /*for perf measurement*/ *
                 // virgo kernel is fully pipelined around (2 GEMMs | softmax);
                 // requires two loop iterations to finish one tile compute
                 (2 * k_tiles)) +