flash: Optimize smem alloc for tcore for 8 banks

Divide into first half & last half for warpgroup 0 & 1, and
allocate Q/K and P/V in different banks for parallel access.
This commit is contained in:
Hansung Kim
2024-09-19 21:31:11 -07:00
parent d0ef06cec1
commit 221d5f75c2
2 changed files with 79 additions and 63 deletions

View File

@@ -11,8 +11,8 @@
// Compile-time configuration switches for this kernel.
// NOTE(review): this span is a rendered diff hunk whose +/- markers were lost,
// so WARP_SPECIALIZED and TENSOR_CORE each appear twice below. Presumably the
// `false` pair is the removed side and the `true` pair is the added side of the
// commit -- confirm against the actual file before relying on either value.

// Defined with no value; presumably tested with #ifdef elsewhere (not visible here).
#define ROW_REMAINDER_LOGIC
// NOTE(review): the meaning of "3 sets" is not established by the visible code.
constexpr uint32_t ROWMAX_SETS = 3;
// Presumably the pre-change values (removed side of the hunk) -- TODO confirm.
constexpr bool WARP_SPECIALIZED = false;
constexpr bool TENSOR_CORE = false;
// Presumably the post-change values (added side of the hunk) -- TODO confirm.
constexpr bool WARP_SPECIALIZED = true;
constexpr bool TENSOR_CORE = true;
// temporary safety stop for wrong configs
// Compilation fails unless NUM_CORES (declared outside this view) equals 4.
static_assert(NUM_CORES == 4);