diff --git a/tests/regression/sgemm_gemmini/kernel.cpp b/tests/regression/sgemm_gemmini/kernel.cpp index 221af0a5..e1a33df6 100644 --- a/tests/regression/sgemm_gemmini/kernel.cpp +++ b/tests/regression/sgemm_gemmini/kernel.cpp @@ -32,6 +32,7 @@ #define REMATERIALIZE #define DBUF //#define CISC +#define POWER //#define DEBUG_PRINT //#define DETAILED_PERF @@ -504,6 +505,11 @@ void thread_block_matmul_gemmini(kernel_arg_t *__UNIFORM__ arg, if (threadblock_id == NUM_CLUSTERS - 1) { threadblock_barrier(/*barrier_id=*/0, /*count=*/NUM_WARPS); rd_cycles_force(marker9); + #ifdef POWER + if (HW_TID() == 0) { + PRINTF("\nstart %d end %d\n", marker0, marker9); + } + #else if (HW_TID() == 0) { PRINTF("\ncomplete\n"); PRINTF("total cycles: %d\n", marker9 - marker0); @@ -541,7 +547,9 @@ void thread_block_matmul_gemmini(kernel_arg_t *__UNIFORM__ arg, PRINTF("\n"); } } + #endif } + threadblock_barrier(/*barrier_id=*/0, /*count=*/NUM_WARPS); vx_tmc(0); } diff --git a/tests/regression/sgemm_gemmini_dma/kernel.cpp b/tests/regression/sgemm_gemmini_dma/kernel.cpp index 128c49aa..02c99077 100644 --- a/tests/regression/sgemm_gemmini_dma/kernel.cpp +++ b/tests/regression/sgemm_gemmini_dma/kernel.cpp @@ -41,6 +41,7 @@ #define PRINTF(...) sprintf(PRINT_BUF, __VA_ARGS__) // #define PRINTF(...) vx_printf(__VA_ARGS__) #define SWISH(beta, x) ((x) / (1 + exp(-(beta) * (x)))) +#define POWER inline void threadblock_barrier(unsigned int barrier_id, unsigned int count) { vx_fence(); @@ -139,16 +140,21 @@ void thread_block_matmul_gemmini(kernel_arg_t *__UNIFORM__ arg, threadblock_barrier(/*barrier_id=*/0, /*count=*/NUM_WARPS); rd_cycles_force(marker1); if (HW_TID() == 0) { - PRINTF("\ncomplete\n"); - PRINTF("total cycles: %d\n", marker1 - marker0); - for (int i = 0; i < dim_m; i += 8) { - for (int j = 0; j < dim_n; j += 8) { - PRINTF("%d %d ", (int) (C[i * dim_n + j]), (int) (C[i * dim_n + j + 4])); + #ifdef POWER + PRINTF("\nstart %d end %d\n", marker0, marker1); + #else + PRINTF("\ncomplete\n"); + PRINTF("total cycles: %d\n", marker1 - marker0); + for (int i = 0; i < dim_m; i += 8) { + for (int j = 0; j < dim_n; j += 8) { + PRINTF("%d %d ", (int) (C[i * dim_n + j]), (int) (C[i * dim_n + j + 4])); + } + PRINTF("\n"); } - PRINTF("\n"); - } + #endif } } + threadblock_barrier(/*barrier_id=*/0, /*count=*/NUM_WARPS); vx_tmc(0); }