flash: Fix grid size to hw cluster size

Verified fast config, minus the barrier stall at the end.
This commit is contained in:
Hansung Kim
2024-09-09 15:43:31 -07:00
parent 829af5d429
commit d31c8ffd7d
2 changed files with 3 additions and 7 deletions

View File

@@ -538,6 +538,7 @@ wmma_store(const int thread_in_warp, const int warp_col, const int warp_row,
/*
 * Synchronization helper: issues a compiler-level memory barrier, then
 * vx_fence(), then vx_barrier(barrier_id, count), in that order.
 *
 * barrier_id: forwarded unchanged as the first argument of vx_barrier().
 * count:      forwarded unchanged as the second argument of vx_barrier().
 *             NOTE(review): presumably the number of participants that must
 *             arrive before the barrier releases -- confirm against the
 *             vx_barrier() definition.
 *
 * The function is marked convergent; NOTE(review): presumably so the compiler
 * does not move the call across divergent control flow -- confirm with the
 * toolchain's attribute documentation.
 */
__attribute__((convergent)) inline void
threadblock_barrier(const uint32_t barrier_id, const uint32_t count) {
/* Empty asm with a "memory" clobber: emits no instruction, but stops the
 * compiler from reordering or caching memory accesses across this point. */
asm volatile("" ::: "memory");
/* NOTE(review): presumably a hardware memory fence that makes earlier
 * memory operations visible before the barrier below; vx_fence() is defined
 * elsewhere -- confirm. */
vx_fence();
/* Must remain after the fence above; arguments are passed straight through. */
vx_barrier(barrier_id, count);
}