sgemm_impl: Add DMA_FAST option; fix dbuf offset for dma
This commit is contained in:
@@ -90,13 +90,22 @@ void kernel_body(int task_id, kernel_arg_t *__UNIFORM__ arg) {
|
||||
thread_block_gemm<float_type, threads_per_threadblock,
|
||||
/*write_to_gmem=*/true,
|
||||
/*smem_a_offset=*/0,
|
||||
/*smem_a_dbuf_offset=*/0,
|
||||
#ifdef GEMMINI_DMA
|
||||
/*smem_a_dbuf_offset=*/1 * 128 * 128 * sizeof(float_type),
|
||||
/*smem_b_offset=*/2 * 128 * 128 * sizeof(float_type),
|
||||
/*smem_b_dbuf_offset=*/3 * 128 * 128 * sizeof(float_type)
|
||||
// FIXME: above offsets are hardcoded to agree with CISC
|
||||
// spadQuartile
|
||||
#else
|
||||
/*smem_a_dbuf_offset=*/1 * BM * BK * sizeof(float_type),
|
||||
/*smem_b_offset=*/2 * BM * BK * sizeof(float_type),
|
||||
/*smem_b_dbuf_offset=*/2 * BM * BK * sizeof(float_type)>(
|
||||
(const float_type *)arg->addr_a, (const float_type *)arg->addr_b,
|
||||
(float *)arg->addr_c, arg->dim_m, arg->dim_n, arg->dim_k,
|
||||
tid_in_threadblock, threadblocks_per_cluster, threadblock_id_in_cluster,
|
||||
sharedmem_per_threadblock);
|
||||
/*smem_b_dbuf_offset=*/(2 * BM * BK + BK * BN) * sizeof(float_type)
|
||||
#endif
|
||||
>((const float_type *)arg->addr_a,
|
||||
(const float_type *)arg->addr_b, (float *)arg->addr_c,
|
||||
arg->dim_m, arg->dim_n, arg->dim_k, tid_in_threadblock,
|
||||
threadblocks_per_cluster, threadblock_id_in_cluster,
|
||||
sharedmem_per_threadblock);
|
||||
|
||||
float *gmem_tmp_d0 = reinterpret_cast<float *>(0xd0000000UL);
|
||||
float *gmem_tmp_d1 = reinterpret_cast<float *>(0xd1000000UL);
|
||||
|
||||
Reference in New Issue
Block a user