"""Generate fp16 operands and reference products for the sgemm_gemmini_dma test.

Run as a script.  By default the reference products are only printed; pass
``--save`` to additionally write the operand and reference ``.bin`` files
into the current working directory.
"""
import argparse

import numpy as np

# Edge length of the square operand matrices that are generated.
OPERAND_SIZE = 512
# Edge lengths for which a reference product is computed.
REFERENCE_SIZES = (128, 256, 512)


def generate_fp16_matrix(size, rng=None):
    """Return a ``size`` x ``size`` float16 matrix of random values.

    Samples are drawn uniformly from [0, 1) and then rounded to float16.

    Args:
        size: Number of rows and of columns.
        rng: Optional ``numpy.random.Generator``.  Pass a seeded generator to
            get reproducible operands.  When omitted, the global NumPy random
            state is used.

    Returns:
        A ``numpy.ndarray`` of shape ``(size, size)`` and dtype ``float16``.
    """
    if rng is None:
        samples = np.random.rand(size, size)
    else:
        samples = rng.random((size, size))
    return samples.astype(np.float16)


def save_matrix_to_bin(file_name, matrix):
    """Write ``matrix`` to ``file_name`` as raw element bytes.

    The file holds only the element data (no shape or dtype header), so a
    reader has to know the element type and dimensions in advance.
    """
    matrix.tofile(file_name)


def truncated_matrix_multiplication(matrix_a, matrix_b, size):
    """Multiply the leading ``size * size`` elements of two matrices.

    Each operand is flattened in row-major order, its first ``size * size``
    elements are kept and reinterpreted as a ``size`` x ``size`` matrix.
    Note that this is a prefix of the flat buffer, NOT the top-left
    sub-block of the original matrix.

    Args:
        matrix_a: Left operand (``numpy.ndarray``).
        matrix_b: Right operand (``numpy.ndarray``).
        size: Edge length of the truncated square operands.

    Returns:
        The ``size`` x ``size`` product, converted to float16.

    Raises:
        ValueError: If an operand holds fewer than ``size * size`` elements.
    """
    wanted = size * size
    # ravel() avoids the unconditional copy that flatten() would make.
    flat_a = matrix_a.ravel()
    flat_b = matrix_b.ravel()
    if flat_a.size < wanted or flat_b.size < wanted:
        raise ValueError(
            f"operands need at least {wanted} elements for size={size}, "
            f"got {flat_a.size} and {flat_b.size}"
        )
    lhs = flat_a[:wanted].reshape(size, size)
    rhs = flat_b[:wanted].reshape(size, size)
    # NOTE(review): the accumulation precision used for float16 inputs is
    # whatever np.matmul chooses -- confirm it matches the hardware under test.
    return np.matmul(lhs, rhs).astype(np.float16)


def main(argv=None):
    """Generate the operands, then print (and optionally save) the references.

    Args:
        argv: Command-line arguments without the program name; ``None`` means
            ``sys.argv[1:]``.
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument(
        "--save",
        action="store_true",
        help="write input.a.bin, input.b.bin and ref<N>.bin files",
    )
    parser.add_argument(
        "--seed",
        type=int,
        default=None,
        help="seed for reproducible operands (default: unseeded)",
    )
    args = parser.parse_args(argv)

    rng = None if args.seed is None else np.random.default_rng(args.seed)
    matrix_a = generate_fp16_matrix(OPERAND_SIZE, rng)
    matrix_b = generate_fp16_matrix(OPERAND_SIZE, rng)

    if args.save:
        save_matrix_to_bin("input.a.bin", matrix_a)
        save_matrix_to_bin("input.b.bin", matrix_b)

    for s in REFERENCE_SIZES:
        ref_matrix = truncated_matrix_multiplication(matrix_a, matrix_b, s)
        print(ref_matrix)
        if args.save:
            save_matrix_to_bin(f"ref{s}.bin", ref_matrix)

    # Only claim that files exist when they were actually written.
    if args.save:
        print("All files generated successfully.")
    else:
        print("Reference matrices computed; no files written (use --save).")


if __name__ == "__main__":
    main()
vx_printf(__VA_ARGS__) #define SWISH(beta, x) ((x) / (1 + exp(-(beta) * (x)))) -#define POWER +// #define POWER + +typedef uint16_t smem_elem_t; +// typedef float smem_elem_t; inline void threadblock_barrier(unsigned int barrier_id, unsigned int count) { vx_fence(); @@ -53,9 +56,9 @@ void thread_block_matmul_gemmini(kernel_arg_t *__UNIFORM__ arg, const uint32_t threadblock_id, const uint32_t tid_in_threadblock) { asm volatile ("matmul_start_%=:" :: ); - const float * const A = (const float * const) arg->addr_a; - const float * const B = (const float * const) arg->addr_b; - float * const C = (float * const) arg->addr_c; + const smem_elem_t * const A = (const smem_elem_t * const) arg->addr_a; + const smem_elem_t * const B = (const smem_elem_t * const) arg->addr_b; + smem_elem_t * const C = (smem_elem_t * const) arg->addr_c; if (HW_TID() == 0) { gemmini_extended_config_ex(WEIGHT_STATIONARY, 0, 0, 1, 0, 0); @@ -80,11 +83,13 @@ void thread_block_matmul_gemmini(kernel_arg_t *__UNIFORM__ arg, const uint32_t num_tile_rows_per_tb = num_tiles_m / NUM_CLUSTERS; + constexpr scale_t MVIN_SCALE_IDENTITY_HEX = 0x3c00; + if (HW_TID() == 0) { - gemmini_extended3_config_ld(dim_k * sizeof(elem_t), MVIN_SCALE_IDENTITY, false, 0); - gemmini_extended3_config_ld(dim_n * sizeof(elem_t), MVIN_SCALE_IDENTITY, false, 1); + gemmini_extended3_config_ld(dim_k * sizeof(elem_t), MVIN_SCALE_IDENTITY_HEX, false, 0); + gemmini_extended3_config_ld(dim_n * sizeof(elem_t), MVIN_SCALE_IDENTITY_HEX, false, 1); // gemmini_extended3_config_ld(repeating_bias ? 
0 : (stride_D * sizeof_D), D_scale_factor, low_D, 2); - gemmini_extended_config_st(dim_n * sizeof(elem_t), 0, MVIN_SCALE_IDENTITY); + gemmini_extended_config_st(dim_n * sizeof(elem_t), 0, MVIN_SCALE_IDENTITY_HEX); // gemmini_extended_config_st(stride_C * sizeof_C, act & 3, scale); } @@ -130,7 +135,7 @@ void thread_block_matmul_gemmini(kernel_arg_t *__UNIFORM__ arg, // // move out to dram // if (HW_TID() == 0) { - float * const dram_c_tile_start = C + tile_i * TILE_M * dim_n + tile_j * TILE_N; + smem_elem_t * const dram_c_tile_start = C + tile_i * TILE_M * dim_n + tile_j * TILE_N; ROCC_INSTRUCTION_RS1_RS2(XCUSTOM_ACC, 0, BOUND_INST, k_LOOP_WS_CONFIG_BOUNDS) ROCC_INSTRUCTION_RS1_RS2(XCUSTOM_ACC, 0, (uint64_t) dram_c_tile_start, k_LOOP_WS_CONFIG_ADDRS_DC) ROCC_INSTRUCTION_RS1_RS2(XCUSTOM_ACC, 0, dim_n, k_LOOP_WS_CONFIG_STRIDES_DC) @@ -150,7 +155,8 @@ void thread_block_matmul_gemmini(kernel_arg_t *__UNIFORM__ arg, PRINTF("total cycles: %d\n", marker1 - marker0); for (int i = 0; i < dim_m; i += 8) { for (int j = 0; j < dim_n; j += 8) { - PRINTF("%d %d ", (int) (C[i * dim_n + j]), (int) (C[i * dim_n + j + 4])); + // PRINTF("%d %d ", (int) (C[i * dim_n + j]), (int) (C[i * dim_n + j + 4])); + PRINTF("%04x %04x ", (int) (C[i * dim_n + j]), (int) (C[i * dim_n + j + 4])); } PRINTF("\n"); } @@ -181,4 +187,4 @@ int main() { vx_spawn_tasks_contiguous(grid_size, (vx_spawn_tasks_cb)kernel_body, arg); #endif return 0; -} \ No newline at end of file +}