sgemm_tcore: Blocksize 64; Fix kernel launch on larger dim

& fix addrgen assembly too large offset error
This commit is contained in:
Hansung Kim
2024-06-11 22:27:12 -07:00
parent 03d1df8f53
commit 32e31c51a4
3 changed files with 80 additions and 57 deletions

View File

@@ -20,9 +20,9 @@
// (BM*BN) / (TM*TN) == threadblock size >= NT * CORES_PER_CLUSTER
// * Combining BM * BK >= (BM*BN) / (TM*TN) == threadblock yields
// BM <= BK*TM*TN
#define BM 32
#define BN 32
#define BK 32
#define BM 64
#define BN 64
#define BK 64
#define WM 16
#define WN 8
#define TCM 8
@@ -42,29 +42,12 @@
// For correctness, at most one of the two should be 1. To model the case where
// the entire A matrix is already stored transposed in GMEM ("TN" kernel), set
// both to 0.
#define TRANSPOSE_AT_PRODUCE 0
#define TRANSPOSE_AT_PRODUCE 1
#define TRANSPOSE_AT_CONSUME 0
// GMEM_COALESCED_A selects which A-matrix access is bank conflict-free:
// 1: GMEM loads of A matrix
// 0: SMEM stores of A matrix
#define GMEM_COALESCED_A 1
#define GEMMINI_DMA 0
// The region addresses below are hard-coded for SMEM_SIZE == 0x4000 (the
// "16K" in the error message); refuse to build for any other size.
#if SMEM_SIZE != 0x4000
#error Currently only supports 16K spad
#endif
// Base pointers (as float *) of four equally-sized regions Q0..Q3, spaced
// 0x1000 apart starting at 0xff000000. 4 * 0x1000 == 0x4000, which matches
// the SMEM_SIZE check above.
#define SMEM_ADDR_Q0 ((float * const) 0xff000000)
#define SMEM_ADDR_Q1 ((float * const) 0xff001000)
#define SMEM_ADDR_Q2 ((float * const) 0xff002000)
#define SMEM_ADDR_Q3 ((float * const) 0xff003000)
// Q0..Q3 offsets in a second address space, spaced 0x80 apart from 0x0.
// NOTE(review): presumably the same four regions expressed in scratchpad
// row units rather than bytes -- confirm against the scratchpad/DMA config
// before changing either set of constants.
#define SPAD_ADDR_Q0 0x0
#define SPAD_ADDR_Q1 0x80
#define SPAD_ADDR_Q2 0x100
#define SPAD_ADDR_Q3 0x180
// Compile-time guard: fail the build when BM * BN / ELEM_PER_THREAD exceeds
// CORES_PER_CLUSTER * 8 * 8.
// NOTE(review): the left side is presumably the number of threads one
// BM x BN tile needs, and the literal 8 * 8 presumably stands in for
// NUM_THREADS * NUM_WARPS per core (see FIXME) -- confirm both.
// FIXME: NUM_THREADS and NUM_WARPS hardcoded
#if ((BM * BN / ELEM_PER_THREAD) > (CORES_PER_CLUSTER * 8 * 8))
#error "threadblock size too big for cluster"
#endif
inline constexpr void map_operand_32lanes(const int tid, int &row, int &col) {
const int tg = tid / 4;