sgemm_tcore: Improve address generation (agen) for !transpose_as smem load
This commit is contained in:
@@ -173,18 +173,28 @@ inline void vx_wmma_load_a(volatile float *smem_A, const int local_k,
|
|||||||
constexpr int smem_AS_cols = BM;
|
constexpr int smem_AS_cols = BM;
|
||||||
|
|
||||||
if constexpr (!TRANSPOSE_AS) {
|
if constexpr (!TRANSPOSE_AS) {
|
||||||
int A_offset = (WM * warp_row + TCM * wm_iter + row) * smem_A_cols;
|
// int A_offset = (WM * warp_row + TCM * wm_iter + row) * smem_A_cols;
|
||||||
|
|
||||||
// @perf: bank conflicts
|
// @perf: bank conflicts
|
||||||
// f0-f7 hold a single row of A (8 consecutive floats starting at local_k)
|
// f0-f7 hold a single row of A (8 consecutive floats starting at local_k)
|
||||||
asm volatile("flw f0, %0" ::"m"(smem_A[A_offset + (local_k + 0)]));
|
volatile float *smem_addr;
|
||||||
asm volatile("flw f1, %0" ::"m"(smem_A[A_offset + (local_k + 1)]));
|
smem_addr = &smem_A[(WM * warp_row + TCM * wm_iter + row) * smem_A_cols + local_k];
|
||||||
asm volatile("flw f2, %0" ::"m"(smem_A[A_offset + (local_k + 2)]));
|
asm volatile("flw f0, %0(%1)" ::"i"(0 * sizeof(float)), "r"(smem_addr));
|
||||||
asm volatile("flw f3, %0" ::"m"(smem_A[A_offset + (local_k + 3)]));
|
asm volatile("flw f1, %0(%1)" ::"i"(1 * sizeof(float)), "r"(smem_addr));
|
||||||
asm volatile("flw f4, %0" ::"m"(smem_A[A_offset + (local_k + 4)]));
|
asm volatile("flw f2, %0(%1)" ::"i"(2 * sizeof(float)), "r"(smem_addr));
|
||||||
asm volatile("flw f5, %0" ::"m"(smem_A[A_offset + (local_k + 5)]));
|
asm volatile("flw f3, %0(%1)" ::"i"(3 * sizeof(float)), "r"(smem_addr));
|
||||||
asm volatile("flw f6, %0" ::"m"(smem_A[A_offset + (local_k + 6)]));
|
asm volatile("flw f4, %0(%1)" ::"i"(4 * sizeof(float)), "r"(smem_addr));
|
||||||
asm volatile("flw f7, %0" ::"m"(smem_A[A_offset + (local_k + 7)]));
|
asm volatile("flw f5, %0(%1)" ::"i"(5 * sizeof(float)), "r"(smem_addr));
|
||||||
|
asm volatile("flw f6, %0(%1)" ::"i"(6 * sizeof(float)), "r"(smem_addr));
|
||||||
|
asm volatile("flw f7, %0(%1)" ::"i"(7 * sizeof(float)), "r"(smem_addr));
|
||||||
|
// asm volatile("flw f0, %0" ::"m"(smem_A[A_offset + (local_k + 0)]));
|
||||||
|
// asm volatile("flw f1, %0" ::"m"(smem_A[A_offset + (local_k + 1)]));
|
||||||
|
// asm volatile("flw f2, %0" ::"m"(smem_A[A_offset + (local_k + 2)]));
|
||||||
|
// asm volatile("flw f3, %0" ::"m"(smem_A[A_offset + (local_k + 3)]));
|
||||||
|
// asm volatile("flw f4, %0" ::"m"(smem_A[A_offset + (local_k + 4)]));
|
||||||
|
// asm volatile("flw f5, %0" ::"m"(smem_A[A_offset + (local_k + 5)]));
|
||||||
|
// asm volatile("flw f6, %0" ::"m"(smem_A[A_offset + (local_k + 6)]));
|
||||||
|
// asm volatile("flw f7, %0" ::"m"(smem_A[A_offset + (local_k + 7)]));
|
||||||
} else {
|
} else {
|
||||||
// transposed A
|
// transposed A
|
||||||
// f8-f15 stores a single row of A
|
// f8-f15 stores a single row of A
|
||||||
@@ -610,7 +620,10 @@ inline void thread_block_gemm(kernel_arg_t *__UNIFORM__ arg,
|
|||||||
k_index++;
|
k_index++;
|
||||||
|
|
||||||
// producer code: GMEM->SMEM memory movement
|
// producer code: GMEM->SMEM memory movement
|
||||||
// ----------------------------------------------------------------------
|
// ---------------------------------------------------------------------
|
||||||
|
//
|
||||||
|
// this is either done using DMA or SIMT cores depending on GEMMINI_DMA
|
||||||
|
|
||||||
#if (GEMMINI_DMA == 1)
|
#if (GEMMINI_DMA == 1)
|
||||||
if (tid_in_threadblock == 0) {
|
if (tid_in_threadblock == 0) {
|
||||||
// configure dma gmem address to load from
|
// configure dma gmem address to load from
|
||||||
|
|||||||
Reference in New Issue
Block a user