diff --git a/tests/regression/common.mk b/tests/regression/common.mk
index 8f4c4db1..81df3139 100644
--- a/tests/regression/common.mk
+++ b/tests/regression/common.mk
@@ -22,6 +22,7 @@ RISCV_SYSROOT ?= $(RISCV_TOOLCHAIN_PATH)/$(RISCV_PREFIX)
 
 VORTEX_RT_PATH ?= $(realpath ../../../runtime)
 VORTEX_KN_PATH ?= $(realpath ../../../kernel)
+GEMMINI_SW_PATH ?= $(realpath ../../../third_party/gemmini-rocc-tests)
 
 FPGA_BIN_DIR ?= $(VORTEX_RT_PATH)/opae
 
@@ -49,7 +50,7 @@ VX_CP = $(LLVM_VORTEX)/bin/llvm-objcopy
 
 VX_CFLAGS += -v -O3 -std=c++17
 VX_CFLAGS += -mcmodel=medany -fno-rtti -fno-exceptions -nostartfiles -fdata-sections -ffunction-sections
-VX_CFLAGS += -I$(VORTEX_KN_PATH)/include -I$(VORTEX_KN_PATH)/../hw
+VX_CFLAGS += -I$(VORTEX_KN_PATH)/include -I$(VORTEX_KN_PATH)/../hw -I$(GEMMINI_SW_PATH)
 VX_CFLAGS += -DNDEBUG -DLLVM_VORTEX
 
 VX_LDFLAGS += -Wl,-Bstatic,--gc-sections,-T,$(VORTEX_KN_PATH)/linker/vx_link$(XLEN).ld,--defsym=STARTUP_ADDR=$(STARTUP_ADDR) $(VORTEX_KN_PATH)/libvortexrt.a
diff --git a/tests/regression/sgemm_gemmini/.gitignore b/tests/regression/sgemm_gemmini/.gitignore
new file mode 100644
index 00000000..7c35ba59
--- /dev/null
+++ b/tests/regression/sgemm_gemmini/.gitignore
@@ -0,0 +1,5 @@
+*.bin
+*.dump
+*.elf
+sgemm_gemmini
+.depend
diff --git a/tests/regression/sgemm_gemmini/Makefile b/tests/regression/sgemm_gemmini/Makefile
new file mode 100644
index 00000000..a36f6d21
--- /dev/null
+++ b/tests/regression/sgemm_gemmini/Makefile
@@ -0,0 +1,9 @@
+PROJECT = sgemm_gemmini
+
+SRCS = main.cpp common.h
+
+VX_SRCS = kernel.cpp
+
+OPTS ?= -n16
+
+include ../common.mk
diff --git a/tests/regression/sgemm_gemmini/common.h b/tests/regression/sgemm_gemmini/common.h
new file mode 100644
index 00000000..74941562
--- /dev/null
+++ b/tests/regression/sgemm_gemmini/common.h
@@ -0,0 +1,18 @@
+#ifndef _COMMON_H_
+#define _COMMON_H_
+
+#include <stdint.h>
+
+#define KERNEL_ARG_DEV_MEM_ADDR 0x7fff0000
+#define DEV_SMEM_START_ADDR 0xff000000
+
+typedef struct {
+  uint32_t dim_m;   // rows of A and C
+  uint32_t dim_n;   // columns of B and C
+  uint32_t dim_k;   // columns of A / rows of B
+  uint64_t addr_a;  // device address of the A matrix (read as float by the kernel)
+  uint64_t addr_b;  // device address of the B matrix (read as float by the kernel)
+  uint64_t addr_c;  // device address of the C matrix (written as float by the kernel)
+} kernel_arg_t;
+
+#endif
diff --git a/tests/regression/sgemm_gemmini/kernel.cpp b/tests/regression/sgemm_gemmini/kernel.cpp
new file mode 100644
index 00000000..34c72d00
--- /dev/null
+++ b/tests/regression/sgemm_gemmini/kernel.cpp
@@ -0,0 +1,279 @@
+// NOTE(review): the four system header names below were lost in extraction and are reconstructed - confirm.
+#include <stdint.h>
+#include <vx_intrinsics.h>
+#include <vx_print.h>
+#include <vx_spawn.h>
+#include "common.h"
+#include "include/gemmini.h"
+#include "gemmini_mmio.h"
+
+#define MATRIX_M 64 // TODO: remove hardcode
+#define MATRIX_N 64
+#define MATRIX_K 64
+#define TILE_M 32 // tile size = SMEM size / 2 (double buffering) / 4 (A, B, C, Psum)
+#define TILE_N 32
+#define TILE_K 32
+#define TILE_MN 1024
+#define TILE_MK 1024
+#define TILE_NK 1024
+
+#define NUM_CLUSTERS 1
+#define TB_M (MATRIX_M / NUM_CLUSTERS)
+#define TB_N MATRIX_N
+#define TB_SIZE (TB_M * TB_N)
+#define NUM_TILE_ROWS_PER_TB (TB_M / TILE_M)
+#define THREAD_ELEMS 8 // elements per thread in a tile
+#define THREAD_STRIDE 8 // threads per core
+
+#define SMEM_ADDR_0K ((float *) 0xff000000)
+#define SMEM_ADDR_4K ((float *) 0xff001000)
+#define SMEM_ADDR_8K ((float *) 0xff002000)
+#define SMEM_ADDR_12K ((float *) 0xff003000)
+
+#define SPAD_ADDR_0K 0x0
+#define SPAD_ADDR_4K 0x80
+#define SPAD_ADDR_8K 0x100
+#define SPAD_ADDR_12K 0x180
+
+// #define DEBUG_PRINT
+#define rd_cycles(x) asm volatile ("csrr %0, mcycle" : "=r" (x))
+
+// Issues vx_fence() followed by vx_barrier(barrier_id, count).
+// tid_in_threadblock is currently unused.
+void threadblock_barrier(unsigned int tid_in_threadblock, unsigned int barrier_id, unsigned int count) {
+  vx_fence();
+  vx_barrier(barrier_id, count);
+}
+
+// Tiled matmul for one threadblock: for each output tile, every thread copies its share of the
+// A and B tiles from the addresses in `arg` into shared memory, thread 0 issues the Gemmini
+// matmul, and the per-k partial tiles are accumulated at SMEM_ADDR_8K before being written to C.
+// Ends by calling vx_tmc(0); the last threadblock also prints the cycle counters.
+void thread_block_matmul_gemmini(kernel_arg_t *__UNIFORM__ arg,
+                                 const uint32_t threadblock_id,
+                                 const uint32_t tid_in_threadblock) {
+  const float * const A = (const float * const) arg->addr_a;
+  const float * const B = (const float * const) arg->addr_b;
+  float * const C = (float * const) arg->addr_c;
+
+  const uint32_t dim_m = arg->dim_m;
+  const uint32_t dim_n = arg->dim_n;
+  const uint32_t dim_k = arg->dim_k;
+  const uint32_t num_tiles_n = dim_n / TILE_N;
+  const uint32_t num_tiles_k = dim_k / TILE_K;
+  // TODO: make this into constexpr by subbing architectural params with macros
+  const uint32_t num_threads_in_cluster = vx_num_threads() * vx_num_warps() * CORES_PER_CLUSTER;
+  const uint32_t hw_tid = tid_in_threadblock % num_threads_in_cluster;
+  const uint32_t a_elems_per_thread = TILE_MK / num_threads_in_cluster;
+  const uint32_t b_elems_per_thread = TILE_NK / num_threads_in_cluster;
+  const uint32_t c_elems_per_thread = TILE_MN / num_threads_in_cluster;
+  const uint32_t thread_load_offset = hw_tid;
+  const uint32_t thread_load_stride = num_threads_in_cluster;
+
+  // Zero-initialised so the cycle report below never reads an indeterminate value if a tile loop runs zero times.
+  uint32_t marker0 = 0, marker1 = 0, marker2 = 0, marker3 = 0, marker4 = 0;
+  uint32_t marker5 = 0, marker6 = 0, marker7 = 0, marker8 = 0, marker9 = 0;
+
+  if (hw_tid == 0) {
+    gemmini_config_ld(0);
+    gemmini_extended_config_ex(WEIGHT_STATIONARY, 0, 0, 1, 0, 0);
+    gemmini_config_st(0);
+    sprintf(PRINT_BUF, "start\n");
+  }
+
+
+  // TODO: check for tb id
+  rd_cycles(marker0);
+
+  for (int tile_i = NUM_TILE_ROWS_PER_TB * threadblock_id;
+       tile_i < NUM_TILE_ROWS_PER_TB * (threadblock_id + 1);
+       tile_i += 1) {
+    for (int tile_j = 0; tile_j < num_tiles_n; tile_j += 1) {
+      float * const smem_c_tile_start = SMEM_ADDR_4K;
+      float * const dram_c_tile_start = C + tile_i * TILE_M * dim_n + tile_j * TILE_N;
+
+      for (int tile_k = 0; tile_k < num_tiles_k; tile_k += 1) {
+        // TODO: double buffer
+        const float * const dram_a_tile_start = A + tile_i * TILE_M * dim_k + tile_k * TILE_K;
+        const float * const dram_b_tile_start = B + tile_k * TILE_K * dim_n + tile_j * TILE_N;
+        float * const smem_a_tile_start = SMEM_ADDR_0K;
+        float * const smem_b_tile_start = SMEM_ADDR_12K;
+
+        rd_cycles(marker1);
+
+        // preload A matrix
+#pragma GCC unroll 8 // TODO: macro computed
+        for (int thread_i = 0; thread_i < a_elems_per_thread; thread_i++) {
+          uint32_t elem_offset = thread_load_offset + thread_load_stride * thread_i;
+          smem_a_tile_start[SMEM_MAT_OFFSET(elem_offset / TILE_K, elem_offset % TILE_K, TILE_K)] = \
+              dram_a_tile_start[elem_offset / TILE_K * dim_k + elem_offset % TILE_K];
+        }
+
+#ifdef DEBUG_PRINT
+        if (hw_tid == 0) {
+          sprintf(PRINT_BUF, "\nA %d %d\n", tile_i, tile_k);
+          for (int i = 0; i < TILE_M; i += 8) {
+            for (int j = 0; j < TILE_K; j += 8) {
+              uint32_t mat_offset = SMEM_MAT_OFFSET(i, j, TILE_K);
+              sprintf(PRINT_BUF, "%x %x ",
+                (int) (smem_a_tile_start[mat_offset]),
+                (int) (smem_a_tile_start[mat_offset + 4])
+              );
+            }
+            sprintf(PRINT_BUF, "\n");
+          }
+        }
+#endif
+
+        // preload B matrix
+#pragma GCC unroll 8
+        for (int thread_i = 0; thread_i < b_elems_per_thread; thread_i++) {
+          uint32_t elem_offset = thread_load_offset + thread_load_stride * thread_i;
+          smem_b_tile_start[SMEM_MAT_OFFSET(elem_offset / TILE_N, elem_offset % TILE_N, TILE_N)] = \
+              dram_b_tile_start[elem_offset / TILE_N * dim_n + elem_offset % TILE_N];
+        }
+
+#ifdef DEBUG_PRINT
+        if (hw_tid == 0) {
+          sprintf(PRINT_BUF, "\nB %d %d\n", tile_k, tile_j);
+          for (int i = 0; i < TILE_K; i += 8) {
+            for (int j = 0; j < TILE_N; j += 8) {
+              uint32_t mat_offset = SMEM_MAT_OFFSET(i, j, TILE_N);
+              sprintf(PRINT_BUF, "%x %x ",
+                (int) (smem_b_tile_start[mat_offset]),
+                (int) (smem_b_tile_start[mat_offset + 4])
+              );
+            }
+            sprintf(PRINT_BUF, "\n");
+          }
+        }
+#endif
+        rd_cycles(marker2);
+
+        // cluster wide barrier to wait for A and B loads to complete
+        threadblock_barrier(0, /*barrier_id=*/threadblock_id, /*count=*/num_threads_in_cluster);
+        rd_cycles(marker3);
+        if (hw_tid == 0) {
+          sp_tiled_matmul_full_spad_ws(SPAD_ADDR_0K, SPAD_ADDR_12K, /*spad_D=*/0, SPAD_ADDR_4K,
+              /*I=*/TILE_M / DIM, /*J=*/TILE_N / DIM, /*K=*/TILE_K / DIM, /*pad_I=*/0, /*pad_J=*/0, /*pad_K=*/0,
+              /*a_transpose=*/0, /*b_transpose=*/0, /*full_C=*/0, /*low_D=*/0,
+              /*no_bias=*/1, /*repeating_bias=*/0, /*act=*/NO_ACTIVATION);
+          gemmini_fence();
+        }
+        rd_cycles(marker4);
+        threadblock_barrier(0, /*barrier_id=*/threadblock_id, /*count=*/num_threads_in_cluster);
+        rd_cycles(marker5);
+
+        // accumulate C matrix
+        if (tile_k == 0) {
+#pragma GCC unroll 8
+          for (int thread_i = 0; thread_i < c_elems_per_thread; thread_i++) {
+            uint32_t elem_offset = thread_load_offset + thread_load_stride * thread_i;
+            *(SMEM_ADDR_8K + elem_offset) = smem_c_tile_start[elem_offset];
+          }
+        } else {
+#pragma GCC unroll 8
+          for (int thread_i = 0; thread_i < c_elems_per_thread; thread_i++) {
+            uint32_t elem_offset = thread_load_offset + thread_load_stride * thread_i;
+            *(SMEM_ADDR_8K + elem_offset) += smem_c_tile_start[elem_offset];
+          }
+        }
+
+        rd_cycles(marker6);
+#ifdef DEBUG_PRINT
+        if (hw_tid == 0) {
+          sprintf(PRINT_BUF, "\nC %d %d %d\n", tile_i, tile_j, tile_k);
+          for (int i = 0; i < TILE_M; i += 8) {
+            for (int j = 0; j < TILE_N; j += 8) {
+              uint32_t mat_offset = SMEM_MAT_OFFSET(i, j, TILE_N);
+              sprintf(PRINT_BUF, "%d %d ",
+                (int) (smem_c_tile_start[mat_offset]),
+                (int) (smem_c_tile_start[mat_offset + 4])
+              );
+            }
+            sprintf(PRINT_BUF, "\n");
+          }
+        }
+#endif
+      }
+
+      rd_cycles(marker7);
+      // move out to dram
+      #pragma GCC unroll 8 // TODO: macro computed
+      for (int thread_i = 0; thread_i < c_elems_per_thread; thread_i++) {
+        uint32_t elem_offset = thread_load_offset + thread_load_stride * thread_i;
+        dram_c_tile_start[elem_offset / TILE_N * dim_n + elem_offset % TILE_N] = \
+            *(SMEM_ADDR_8K + SMEM_MAT_OFFSET(elem_offset / TILE_N, elem_offset % TILE_N, TILE_N));
+      }
+
+      rd_cycles(marker8);
+      /* if (hw_tid == 0) {
+        sprintf(PRINT_BUF, "\nC %d %d\n", tile_i, tile_j);
+        for (int i = 0; i < TILE_M; i += 8) {
+          for (int j = 0; j < TILE_N; j += 8) {
+            uint32_t mat_offset = SMEM_MAT_OFFSET(i, j, TILE_N);
+            sprintf(PRINT_BUF, "%d %d ",
+              (int) (C[(tile_i * TILE_M + i) * dim_n + tile_j * TILE_N + j]),
+              (int) (C[(tile_i * TILE_M + i) * dim_n + tile_j * TILE_N + j + 4])
+            );
+          }
+          sprintf(PRINT_BUF, "\n");
+        }
+      } */
+    }
+  }
+  // last thread block complete
+  if (threadblock_id == NUM_CLUSTERS - 1) {
+    threadblock_barrier(0, /*barrier_id=*/0, /*count=*/num_threads_in_cluster);
+    rd_cycles(marker9);
+    if (hw_tid == 0) {
+      sprintf(PRINT_BUF, "complete\n");
+      sprintf(PRINT_BUF, "total cycles: %d\n", marker9 - marker0);
+      sprintf(PRINT_BUF, "single tile cycles: %d\n", marker6 - marker1);
+      sprintf(PRINT_BUF, "A/B tile load cycles: %d\n", marker2 - marker1);
+      sprintf(PRINT_BUF, "gemmini cycles: %d\n", marker4 - marker3);
+      sprintf(PRINT_BUF, "first barrier: %d\n", marker3 - marker2);
+      sprintf(PRINT_BUF, "second barrier: %d\n", marker5 - marker4);
+      sprintf(PRINT_BUF, "accumulation cycles: %d\n", marker6 - marker5);
+      sprintf(PRINT_BUF, "dram mvout cycles: %d\n", marker8 - marker7);
+    }
+    threadblock_barrier(0, /*barrier_id=*/0, /*count=*/num_threads_in_cluster);
+    if (hw_tid == num_threads_in_cluster - 1) {
+      sprintf(PRINT_BUF, "single tile cycles: %d\n", marker6 - marker1);
+      sprintf(PRINT_BUF, "A/B tile load cycles: %d\n", marker2 - marker1);
+      sprintf(PRINT_BUF, "gemmini cycles: %d\n", marker4 - marker3);
+      sprintf(PRINT_BUF, "first barrier: %d\n", marker3 - marker2);
+      sprintf(PRINT_BUF, "second barrier: %d\n", marker5 - marker4);
+    }
+    vx_tmc_one();
+  }
+  vx_tmc(0);
+}
+
+// Per-task entry point: derives the threadblock id and in-block thread id from task_id using TB_SIZE.
+void kernel_body(int task_id, kernel_arg_t *__UNIFORM__ arg) {
+  // @perf: All threads are running these compute whose result is mostly same
+  // across the threadblock
+
+  const int threadblock_id = task_id / TB_SIZE;
+  const int tid_in_threadblock = task_id % TB_SIZE;
+
+  thread_block_matmul_gemmini(arg, threadblock_id, tid_in_threadblock);
+}
+
+// Reads the kernel arguments from KERNEL_ARG_DEV_MEM_ADDR and spawns grid_size kernel_body tasks.
+int main() {
+  kernel_arg_t *arg = (kernel_arg_t *)KERNEL_ARG_DEV_MEM_ADDR;
+  sprintf(PRINT_BUF, "m=%d, n=%d\n", arg->dim_m, arg->dim_n);
+
+  const uint32_t num_threads_in_cluster = vx_num_threads() * vx_num_warps() * CORES_PER_CLUSTER;
+  const uint32_t grid_size = num_threads_in_cluster * NUM_CLUSTERS;
+#ifdef RADIANCE
+  vx_spawn_tasks_cluster(grid_size, (vx_spawn_tasks_cb)kernel_body, arg);
+#else
+  // NOTE: This kernel assumes contiguous thread scheduling for efficient shared
+  // memory allocation, and therefore does not work with original vx_spawn_tasks
+  vx_spawn_tasks_contiguous(grid_size, (vx_spawn_tasks_cb)kernel_body, arg);
+#endif
+  return 0;
+}
diff --git a/tests/regression/sgemm_gemmini/main.cpp b/tests/regression/sgemm_gemmini/main.cpp
new file mode 100644
index 00000000..54531062
--- /dev/null
+++ b/tests/regression/sgemm_gemmini/main.cpp
@@ -0,0 +1,289 @@
+// NOTE(review): system header names were lost in extraction and are reconstructed - confirm.
+#include <iostream>
+#include <unistd.h>
+#include <string.h>
+#include <vector>
+#include <fstream>
+#include <vortex.h>
+#include <algorithm>
+#include <cmath>
+#include "common.h"
+
+// Evaluates _expr; on a non-zero result prints it, calls cleanup() and exits with -1.
+#define RT_CHECK(_expr)                                         \
+  do {                                                          \
+    int _ret = _expr;                                           \
+    if (0 == _ret)                                              \
+      break;                                                    \
+    printf("Error: '%s' returned %d!\n", #_expr, (int)_ret);    \
+    cleanup();                                                  \
+    exit(-1);                                                   \
+  } while (false)
+
+///////////////////////////////////////////////////////////////////////////////
+
+const char* kernel_file = "kernel.bin";
+uint32_t count = 0;
+
+std::vector<float> src_a_data;
+std::vector<float> src_b_data;
+std::vector<float> ref_data;
+
+vx_device_h device = nullptr;
+std::vector<uint8_t> staging_buf;
+kernel_arg_t kernel_arg = {};
+
+// Prints the command-line usage text to stdout.
+static void show_usage() {
+  std::cout << "Vortex Test." << std::endl;
+  std::cout << "Usage: [-k: kernel] [-n words] [-h: help]" << std::endl;
+}
+
+// Parses -n (into count), -k (into kernel_file) and -h/-? (usage, exit 0); other options exit -1.
+static void parse_args(int argc, char **argv) {
+  int c;
+  while ((c = getopt(argc, argv, "n:k:h?")) != -1) {
+    switch (c) {
+    case 'n':
+      count = atoi(optarg);
+      break;
+    case 'k':
+      kernel_file = optarg;
+      break;
+    case 'h':
+    case '?': {
+      show_usage();
+      exit(0);
+    } break;
+    default:
+      show_usage();
+      exit(-1);
+    }
+  }
+}
+
+// Frees the three device buffers named in kernel_arg and closes the device, if one is open.
+void cleanup() {
+  if (device) {
+    vx_mem_free(device, kernel_arg.addr_a);
+    vx_mem_free(device, kernel_arg.addr_b);
+    vx_mem_free(device, kernel_arg.addr_c);
+    vx_dev_close(device);
+  }
+}
+
+// Fills src_a_data (dim_m x dim_k) and src_b_data (dim_k x dim_n) with their own index and prints each value.
+void generate_source_matrix(uint32_t dim_m, uint32_t dim_n, uint32_t dim_k) {
+  src_a_data.resize(dim_m * dim_k);
+  src_b_data.resize(dim_k * dim_n);
+
+  for (uint32_t i = 0; i < src_a_data.size(); ++i) {
+    src_a_data[i] = static_cast<float>(i);
+    std::cout << "A: " << i << ": value=" << src_a_data[i] << '\n';
+  }
+  for (uint32_t i = 0; i < src_b_data.size(); ++i) {
+    src_b_data[i] = static_cast<float>(i);
+    std::cout << "B: " << i << ": value=" << src_b_data[i] << '\n';
+  }
+}
+
+// Computes the row-major reference product ref_data = src_a_data * src_b_data on the host.
+void generate_reference_matmul(uint32_t dim_m, uint32_t dim_n, uint32_t dim_k) {
+  ref_data.resize(dim_m * dim_n);
+
+  for (uint32_t i = 0; i < dim_m; ++i) {
+    for (uint32_t j = 0; j < dim_n; ++j) {
+      float ref = 0.0f;
+      for (uint32_t k = 0; k < dim_k; ++k) {
+        ref += src_a_data[dim_k * i + k] * src_b_data[dim_n * k + j];
+      }
+      ref_data.at(dim_n * i + j) = ref;
+    }
+  }
+}
+
+// Starts the device, waits for completion, copies buf_size bytes back from kernel_arg.addr_c and
+// compares the first dim_m * dim_n floats against ref_data. Returns 0 on success, 1 on mismatch.
+int run_test(const kernel_arg_t& kernel_arg,
+             uint32_t buf_size,
+             uint32_t dim_m, uint32_t dim_n) {
+  // start device
+  std::cout << "start device" << std::endl;
+  RT_CHECK(vx_start(device));
+
+  // wait for completion
+  std::cout << "wait for completion" << std::endl;
+  RT_CHECK(vx_ready_wait(device, VX_MAX_TIMEOUT));
+
+  // download destination buffer
+  std::cout << "download destination buffer" << std::endl;
+  RT_CHECK(vx_copy_from_dev(device, staging_buf.data(), kernel_arg.addr_c, buf_size));
+
+  // verify result
+  std::cout << "verify result" << std::endl;
+  {
+    int errors = 0;
+    auto buf_ptr = (float*)staging_buf.data();
+    for (uint32_t i = 0; i < dim_m * dim_n; ++i) {
+      float ref = ref_data.at(i);
+      float cur = buf_ptr[i];
+      // Written as !(err <= tol) so that a NaN result is reported instead of silently passing.
+      if (!(std::abs(cur - ref) <= 1e-6 * std::abs(ref))) {
+        std::cout << "error at result #" << std::dec << i
+                  << std::hex << ": actual=" << cur << ", expected=" << ref << std::endl;
+        ++errors;
+      }
+    }
+    if (errors != 0) {
+      std::cout << "Found " << std::dec << errors << " errors!" << std::endl;
+      std::cout << "FAILED!" << std::endl;
+      return 1;
+    }
+  }
+
+  return 0;
+}
+
+// Host driver: builds inputs and reference, uploads kernel, arguments and matrices, then runs and verifies.
+int main(int argc, char *argv[]) {
+  // parse command arguments
+  parse_args(argc, argv);
+
+  if (count == 0) {
+    count = 1;
+  }
+
+  std::srand(50);
+
+  // open device connection
+  std::cout << "open device connection" << std::endl;
+  RT_CHECK(vx_dev_open(&device));
+
+  // FIXME: hardcoded
+  uint32_t dim_m = 64;
+  uint32_t dim_n = 64;
+  uint32_t dim_k = 64;
+
+  generate_source_matrix(dim_m, dim_n, dim_k);
+  generate_reference_matmul(dim_m, dim_n, dim_k);
+
+  uint32_t src_a_buf_size = src_a_data.size() * sizeof(src_a_data[0]);
+  uint32_t src_b_buf_size = src_b_data.size() * sizeof(src_b_data[0]);
+  uint32_t dst_buf_size = ref_data.size() * sizeof(ref_data[0]);
+
+  std::cout << "buffer size: " << dst_buf_size << " bytes" << std::endl;
+
+  // upload program
+  std::cout << "upload program" << std::endl;
+  RT_CHECK(vx_upload_kernel_file(device, kernel_file));
+
+  // allocate device memory
+  std::cout << "allocate device memory" << std::endl;
+  RT_CHECK(vx_mem_alloc(device, src_a_buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.addr_a));
+  RT_CHECK(vx_mem_alloc(device, src_b_buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.addr_b));
+  RT_CHECK(vx_mem_alloc(device, dst_buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.addr_c));
+
+  kernel_arg.dim_m = dim_m;
+  kernel_arg.dim_n = dim_n;
+  kernel_arg.dim_k = dim_k;
+
+  std::cout << "dev_addr_a=0x" << std::hex << kernel_arg.addr_a << std::endl;
+  std::cout << "dev_addr_b=0x" << std::hex << kernel_arg.addr_b << std::endl;
+  std::cout << "dev_addr_c=0x" << std::hex << kernel_arg.addr_c << std::endl;
+
+  // allocate staging buffer
+  {
+    std::cout << "allocate staging buffer" << std::endl;
+    uint32_t staging_buf_size = std::max<uint32_t>(
+        src_a_buf_size,
+        std::max<uint32_t>(
+            src_b_buf_size,
+            std::max<uint32_t>(dst_buf_size, sizeof(kernel_arg_t))));
+    staging_buf.resize(staging_buf_size);
+  }
+
+  // upload kernel argument
+  {
+    std::cout << "upload kernel argument" << std::endl;
+    auto buf_ptr = staging_buf.data();
+    // NOTE(review): these fixed addresses replace the ones returned by vx_mem_alloc above, so
+    // cleanup() later frees addresses that were never allocated - confirm this is intended.
+    kernel_arg.addr_a = (uint64_t) 0x20000;
+    kernel_arg.addr_b = (uint64_t) 0x28000;
+    kernel_arg.addr_c = (uint64_t) 0xc0000000ULL;
+    memcpy(buf_ptr, &kernel_arg, sizeof(kernel_arg_t));
+
+    std::cout << "uploading argument buffer to device, device mem address="
+              << std::hex << KERNEL_ARG_DEV_MEM_ADDR << ", size=" << std::dec
+              << sizeof(kernel_arg_t) << " bytes\n";
+    std::ofstream file("args.bin", std::ios::binary | std::ios::out);
+    if (!file) {
+      std::cerr << "error: failed to open args.bin for writing\n";
+      exit(EXIT_FAILURE);
+    }
+    file.write(reinterpret_cast<const char*>(staging_buf.data()),
+               sizeof(kernel_arg_t));
+    file.close();
+
+    RT_CHECK(vx_copy_to_dev(device, KERNEL_ARG_DEV_MEM_ADDR, staging_buf.data(), sizeof(kernel_arg_t)));
+  }
+
+  // upload source buffer
+  {
+    {
+      auto buf_ptr = staging_buf.data();
+      memcpy(buf_ptr, src_a_data.data(), src_a_data.size() * sizeof(float));
+      RT_CHECK(vx_copy_to_dev(device, kernel_arg.addr_a, staging_buf.data(),
+                              src_a_buf_size));
+
+      std::cout << "uploading source A matrix to device, device mem address="
+                << std::hex << kernel_arg.addr_a << ", size=" << std::dec
+                << src_a_buf_size << " bytes\n";
+      std::ofstream file("input.a.bin", std::ios::binary | std::ios::out);
+      if (!file) {
+        std::cerr << "error: failed to open input.a.bin for writing\n";
+        exit(EXIT_FAILURE);
+      }
+      file.write(reinterpret_cast<const char*>(buf_ptr), src_a_buf_size);
+      file.close();
+    }
+    {
+      auto buf_ptr = staging_buf.data();
+      memcpy(buf_ptr, src_b_data.data(), src_b_data.size() * sizeof(float));
+      RT_CHECK(vx_copy_to_dev(device, kernel_arg.addr_b, staging_buf.data(),
+                              src_b_buf_size));
+
+      std::cout << "uploading source B matrix to device, device mem address="
+                << std::hex << kernel_arg.addr_b << ", size=" << std::dec
+                << src_b_buf_size << " bytes\n";
+      std::ofstream file("input.b.bin", std::ios::binary | std::ios::out);
+      if (!file) {
+        std::cerr << "error: failed to open input.b.bin for writing\n";
+        exit(EXIT_FAILURE);
+      }
+      file.write(reinterpret_cast<const char*>(buf_ptr), src_b_buf_size);
+      file.close();
+    }
+  }
+
+  // clear destination buffer
+  {
+    std::cout << "clear destination buffer" << std::endl;
+    auto buf_ptr = (int32_t*)staging_buf.data();
+    for (uint32_t i = 0; i < ref_data.size(); ++i) {
+      buf_ptr[i] = 0xdeadbeef;
+    }
+    RT_CHECK(vx_copy_to_dev(device, kernel_arg.addr_c, staging_buf.data(), dst_buf_size));
+  }
+
+  // run tests
+  std::cout << "run tests" << std::endl;
+  RT_CHECK(run_test(kernel_arg, dst_buf_size, kernel_arg.dim_m, kernel_arg.dim_n));
+  std::cout << "PASSED!" << std::endl;
+
+  // cleanup
+  std::cout << "cleanup" << std::endl;
+  cleanup();
+
+  return 0;
+}
diff --git a/tests/regression/sgemm_gemmini/sgemm_gemmini b/tests/regression/sgemm_gemmini/sgemm_gemmini
new file mode 100755
index 00000000..67ade61b
Binary files /dev/null and b/tests/regression/sgemm_gemmini/sgemm_gemmini differ