flash: Revert to gemmini config, remove DEBUG and unnecessary checks
This commit is contained in:
@@ -11,8 +11,8 @@
|
|||||||
#define ROW_REMAINDER_LOGIC
|
#define ROW_REMAINDER_LOGIC
|
||||||
|
|
||||||
constexpr uint32_t ROWMAX_SETS = 3;
|
constexpr uint32_t ROWMAX_SETS = 3;
|
||||||
constexpr bool WARP_SPECIALIZED = true;
|
constexpr bool WARP_SPECIALIZED = false;
|
||||||
constexpr bool TENSOR_CORE = true;
|
constexpr bool TENSOR_CORE = false;
|
||||||
|
|
||||||
// temporary safety stop for wrong configs
|
// temporary safety stop for wrong configs
|
||||||
static_assert(NUM_CORES == 4);
|
static_assert(NUM_CORES == 4);
|
||||||
|
|||||||
@@ -10,7 +10,7 @@
|
|||||||
|
|
||||||
#define FENCE_GEMM_II
|
#define FENCE_GEMM_II
|
||||||
|
|
||||||
constexpr bool DEBUG = true;
|
constexpr bool DEBUG = false;
|
||||||
|
|
||||||
static_assert(GEMMINI_DMA && !WARP_SPECIALIZED,
|
static_assert(GEMMINI_DMA && !WARP_SPECIALIZED,
|
||||||
"GEMMINI_DMA should be set and WARP_SPECIALIZED unset");
|
"GEMMINI_DMA should be set and WARP_SPECIALIZED unset");
|
||||||
@@ -192,9 +192,6 @@ void kernel_body(int task_id, kernel_arg_t *__UNIFORM__ arg) {
|
|||||||
|
|
||||||
threadblock_barrier(global_barrier_id, warps_per_threadblock_per_core);
|
threadblock_barrier(global_barrier_id, warps_per_threadblock_per_core);
|
||||||
|
|
||||||
static_assert(!GEMMINI_DMA || Q_IS_K_MAJOR,
|
|
||||||
"DMA code assumes Q matrix is stored K-major");
|
|
||||||
|
|
||||||
// skip everything except DMA in the loop FSM
|
// skip everything except DMA in the loop FSM
|
||||||
constexpr uint32_t skips =
|
constexpr uint32_t skips =
|
||||||
loop_matmul_skips(/*skip_lda=*/0, /*skip_ldb=*/0, /*skip_ldd=*/1,
|
loop_matmul_skips(/*skip_lda=*/0, /*skip_ldb=*/0, /*skip_ldd=*/1,
|
||||||
@@ -339,8 +336,7 @@ void kernel_body(int task_id, kernel_arg_t *__UNIFORM__ arg) {
|
|||||||
// "inner loop" along the columns of K^T
|
// "inner loop" along the columns of K^T
|
||||||
const uint32_t k_tiles = (dim_seqlen / B_COL);
|
const uint32_t k_tiles = (dim_seqlen / B_COL);
|
||||||
for (uint32_t tile_k = 0;
|
for (uint32_t tile_k = 0;
|
||||||
tile_k <
|
tile_k < (4 /*for perf measurement*/ * k_tiles) + 2 /*pipeline latency*/;
|
||||||
(4 /*FIXME: for perf measurement*/ * k_tiles) + 2 /*pipeline latency*/;
|
|
||||||
tile_k++) {
|
tile_k++) {
|
||||||
if constexpr (DEBUG || true) {
|
if constexpr (DEBUG || true) {
|
||||||
threadblock_barrier(global_barrier_id, warps_per_threadblock_per_core);
|
threadblock_barrier(global_barrier_id, warps_per_threadblock_per_core);
|
||||||
|
|||||||
Reference in New Issue
Block a user