diff --git a/tests/regression/sgemm_tcore/kernel.cpp b/tests/regression/sgemm_tcore/kernel.cpp index 869de1ce..fbb34f8b 100644 --- a/tests/regression/sgemm_tcore/kernel.cpp +++ b/tests/regression/sgemm_tcore/kernel.cpp @@ -84,10 +84,10 @@ inline void global_dmem_load(const uint32_t dim_n, const uint32_t dim_k, // this is equivalent to threadblock_dim_y (assuming threadblock_dim_x == // BK) constexpr uint32_t row_stride_a = threads_in_threadblock / BK_adjusted; - const float *global_a = reinterpret_cast(A) + + const float *global_a = reinterpret_cast(A) + dim_k_adjusted * global_a_row + (k_adjusted + local_a_col); - volatile float *local_a_tmp = reinterpret_cast(local_a) + + volatile float *local_a_tmp = reinterpret_cast(local_a) + BK_adjusted * local_a_row + local_a_col; #pragma GCC unroll 1