diff --git a/tests/regression/sgemm_tcore/kernel.cpp b/tests/regression/sgemm_tcore/kernel.cpp index 42443de7..2354a3e0 100644 --- a/tests/regression/sgemm_tcore/kernel.cpp +++ b/tests/regression/sgemm_tcore/kernel.cpp @@ -547,7 +547,7 @@ void thread_block_gemm(kernel_arg_t *__UNIFORM__ arg, const uint32_t threadblock_dim_y, /*const uint32_t threadblock_id_x, const uint32_t threadblock_id_y,*/ - const uint32_t threadblock_id_in_cluster, + // const uint32_t threadblock_id_in_cluster, float *sharedmem_per_threadblock) { const float *A = (const float *)arg->addr_a; const float *B = (const float *)arg->addr_b; @@ -602,7 +602,7 @@ void thread_block_gemm(kernel_arg_t *__UNIFORM__ arg, global_dmem_load(dim_n, dim_k, 0 /*k*/, A, B, local_a, local_b, tid_in_warpgroup, block_n, block_m); - threadblock_barrier(threadblock_id_in_cluster, threadblock_dim_y); + threadblock_barrier(0/*threadblock_id_in_cluster*/, threadblock_dim_y); } // NOTE: this *should* be signed integer to trigger arithmetic @@ -633,11 +633,11 @@ void thread_block_gemm(kernel_arg_t *__UNIFORM__ arg, local_a_produce, local_b_produce, tid_in_warpgroup, block_n, block_m); - threadblock_barrier(threadblock_id_in_cluster, threadblock_dim_y); + threadblock_barrier(0/*threadblock_id_in_cluster*/, threadblock_dim_y); } // sync with final consumer stage in the k-loop - threadblock_barrier(threadblock_id_in_cluster, threadblock_dim_y); + threadblock_barrier(0/*threadblock_id_in_cluster*/, threadblock_dim_y); } } } else { @@ -650,7 +650,7 @@ void thread_block_gemm(kernel_arg_t *__UNIFORM__ arg, initialize_C(1); // sync with initial producer stage in the k-loop - threadblock_barrier(threadblock_id_in_cluster, threadblock_dim_y); + threadblock_barrier(0/*threadblock_id_in_cluster*/, threadblock_dim_y); // NOTE: this *should* be signed integer to trigger arithmetic // right-shift @@ -718,7 +718,7 @@ void thread_block_gemm(kernel_arg_t *__UNIFORM__ arg, } } - threadblock_barrier(threadblock_id_in_cluster, threadblock_dim_y); + threadblock_barrier(0/*threadblock_id_in_cluster*/, threadblock_dim_y); #else @@ -819,7 +819,7 @@ void kernel_body(int task_id, kernel_arg_t *__UNIFORM__ arg) { const int warp_id = vx_warp_id(); thread_block_gemm(arg, tid_in_threadblock, threads_per_threadblock, threadblock_dim_x, threadblock_dim_y, /*threadblock_id_x, - threadblock_id_y,*/ threadblock_id_in_cluster, + threadblock_id_y,*/ /*threadblock_id_in_cluster, */ sharedmem_per_threadblock); }