Set BM==BN==64, update doc
This commit is contained in:
@@ -19,7 +19,7 @@
|
|||||||
// * Combining BM * BK >= (BM*BN) / (TM*TN) == threadblock yields
|
// * Combining BM * BK >= (BM*BN) / (TM*TN) == threadblock yields
|
||||||
// BN <= BK*TM*TN
|
// BN <= BK*TM*TN
|
||||||
#define BM 64
|
#define BM 64
|
||||||
#define BN 32
|
#define BN 64
|
||||||
#define BK 64
|
#define BK 64
|
||||||
#define WM 16
|
#define WM 16
|
||||||
#define WN 8
|
#define WN 8
|
||||||
@@ -33,13 +33,15 @@
|
|||||||
// Number of iterations wrapped around the inner 0..TCK..BK loop to simulate a perfect-DRAM
|
// Number of iterations wrapped around the inner 0..TCK..BK loop to simulate a perfect-DRAM
|
||||||
// scenario
|
// scenario
|
||||||
#define BK_LOOP 1
|
#define BK_LOOP 1
|
||||||
// whether to transpose smem A tile at GMEM->SMEM (produce), or SMEM->RF
|
// Whether to transpose smem A tile at GMEM->SMEM (produce), or SMEM->RF
|
||||||
// (consume). This is because the tensor core expects the A tile to be stored
|
// (consume). This is because the tensor core expects the A tile to be stored
|
||||||
// in column-major order in SMEM.
|
// in column-major order in SMEM, whereas it is stored row-major in GMEM.
|
||||||
//
|
//
|
||||||
// For correctness, exactly one of the two should be 1. To model the case where
|
// For correctness, exactly one of the two should be 1. To model the case where
|
||||||
// the entire A matrix is already stored transposed in GMEM ("TN" kernel), set
|
// the A matrix is already stored transposed in GMEM ("TN" kernel), set
|
||||||
// both to 0.
|
// both to 0.
|
||||||
|
//
|
||||||
|
// For reference, PRODUCE 1 CONSUME 0 generates the performant NN kernel.
|
||||||
#define TRANSPOSE_AT_PRODUCE 1
|
#define TRANSPOSE_AT_PRODUCE 1
|
||||||
#define TRANSPOSE_AT_CONSUME 0
|
#define TRANSPOSE_AT_CONSUME 0
|
||||||
// GMEM_COALESCED sets bank conflict-free accesses for
|
// GMEM_COALESCED sets bank conflict-free accesses for
|
||||||
|
|||||||
Reference in New Issue
Block a user