// Review preamble. Everything from the first "diff --git" line onward is patch payload; patch
// tools ignore text that precedes it.
//
// What this patch does
//   1. tests/regression/sgemm_gemmini_dma/kernel.cpp
//      - NUM_THREADS_IN_CLUSTER goes from 128 to 256.
//      - "#define POWER" is commented out.
//      - main() takes num_threads_in_cluster from NUM_THREADS_IN_CLUSTER instead of
//        vx_num_threads() * vx_num_warps() * CORES_PER_CLUSTER.
//      - The file's trailing newline is removed ("\ No newline at end of file").
//   2. New test directory tests/regression/sgemm_gemmini_duo
//      - .gitignore, Makefile (PROJECT = sgemm_gemmini_duo, OPTS ?= -n16), common.h (kernel_arg_t
//        and the KERNEL_ARG_DEV_MEM_ADDR / DEV_SMEM_START_ADDR constants).
//      - kernel.cpp: thread_block_matmul_gemmini() expands the RUN_ON_GEMMINI macro twice, with
//        arguments 0 and 1, then calls vx_tmc(0).
//      - main.cpp: builds 64x64x64 inputs, computes a host reference product, writes args.bin,
//        input.a.bin and input.b.bin, uploads the buffers, starts the device and compares the
//        downloaded C matrix with the reference.
//      - A binary file named sgemm_gemmini_duo.
//
// Fixes made in this revision (each one rewrites text inside an existing "+" line, so the hunk
// line counts in the "@@" headers are unchanged)
//   a. main.cpp: the open-failure messages for input.a.bin and input.b.bin said "args.bin";
//      they now name the file that actually failed to open.
//   b. main.cpp, run_test(): the check "std::abs((cur - ref) / ref) > 1e-6" is false when cur is
//      NaN, so a NaN result was counted as correct. The check is now
//      "cur != ref && !(... <= 1e-6)", which reports NaN and leaves every other case as before.
//   c. .gitignore: the entry "sgemm_wg" is replaced by "sgemm_gemmini_duo", the name the Makefile
//      gives PROJECT and the name of the binary added by this patch.
//
// Open points for the author (not changed here)
//   - NOTE(review): the binary sgemm_gemmini_duo is added by this patch; it looks like a build
//     product. Confirm whether it should be committed at all.
//   - NOTE(review): main.cpp overwrites kernel_arg.addr_a/b/c, which vx_mem_alloc() filled in,
//     with the fixed values 0x20000, 0x28000 and 0xc0000000 before uploading. cleanup() later
//     passes those fixed values to vx_mem_free(). Presumably this matches a fixed memory map of
//     the target; please confirm.
//   - NOTE(review): "#define NUM_THREADS_IN_CLUSTER 256 \" continues onto the following comment
//     line. The macro still expands to 256 because the comment is removed after line splicing,
//     but the continuation looks unintended.
//   - NOTE(review): "-n" is parsed into count, but the matrix dimensions are hard-coded
//     (see the FIXME in main.cpp) and count is not used anywhere else in the visible code.
//   - NOTE(review): in this copy of the patch, text inside angle brackets (include targets,
//     template arguments) is missing and indentation is collapsed; TODO restore both from the
//     original commit before applying.
diff --git a/tests/regression/sgemm_gemmini_dma/kernel.cpp b/tests/regression/sgemm_gemmini_dma/kernel.cpp index 049d1970..8e629d5d 100644 --- a/tests/regression/sgemm_gemmini_dma/kernel.cpp +++ b/tests/regression/sgemm_gemmini_dma/kernel.cpp @@ -33,7 +33,8 @@ // #define BOUND_INST 0x400040004ULL #define NUM_CLUSTERS 1 -#define NUM_THREADS_IN_CLUSTER 128 +#define NUM_THREADS_IN_CLUSTER 256 \ +// (NUM_CORES * NUM_WARPS * NUM_THREADS) #define rd_cycles_force(x) asm volatile ("csrr %0, mcycle" : "=r" (x)) #define rd_cycles(x) rd_cycles_force(x) @@ -41,7 +42,7 @@ #define PRINTF(...) sprintf(PRINT_BUF, __VA_ARGS__) // #define PRINTF(...) vx_printf(__VA_ARGS__) #define SWISH(beta, x) ((x) / (1 + exp(-(beta) * (x)))) -#define POWER +//#define POWER inline void threadblock_barrier(unsigned int barrier_id, unsigned int count) { vx_fence(); @@ -168,7 +169,7 @@ void kernel_body(int task_id, kernel_arg_t *__UNIFORM__ arg) { int main() { kernel_arg_t *arg = (kernel_arg_t *)KERNEL_ARG_DEV_MEM_ADDR; - const uint32_t num_threads_in_cluster = vx_num_threads() * vx_num_warps() * CORES_PER_CLUSTER; + const uint32_t num_threads_in_cluster = NUM_THREADS_IN_CLUSTER; const uint32_t grid_size = num_threads_in_cluster * NUM_CLUSTERS; #ifdef RADIANCE vx_spawn_tasks_cluster(grid_size, (vx_spawn_tasks_cb)kernel_body, arg); @@ -178,4 +179,4 @@ int main() { vx_spawn_tasks_contiguous(grid_size, (vx_spawn_tasks_cb)kernel_body, arg); #endif return 0; -} +} \ No newline at end of file diff --git a/tests/regression/sgemm_gemmini_duo/.gitignore b/tests/regression/sgemm_gemmini_duo/.gitignore new file mode 100644 index 00000000..7c35ba59 --- /dev/null +++ b/tests/regression/sgemm_gemmini_duo/.gitignore @@ -0,0 +1,5 @@ +*.bin +*.dump +*.elf +sgemm_gemmini_duo +.depend diff --git a/tests/regression/sgemm_gemmini_duo/Makefile b/tests/regression/sgemm_gemmini_duo/Makefile new file mode 100644 index 00000000..05737084 --- /dev/null +++ b/tests/regression/sgemm_gemmini_duo/Makefile @@ -0,0 +1,9 @@ +PROJECT = 
sgemm_gemmini_duo + +SRCS = main.cpp common.h + +VX_SRCS = kernel.cpp + +OPTS ?= -n16 + +include ../common.mk diff --git a/tests/regression/sgemm_gemmini_duo/common.h b/tests/regression/sgemm_gemmini_duo/common.h new file mode 100644 index 00000000..5c84f3b7 --- /dev/null +++ b/tests/regression/sgemm_gemmini_duo/common.h @@ -0,0 +1,18 @@ +#ifndef _COMMON_H_ +#define _COMMON_H_ + +#include + +#define KERNEL_ARG_DEV_MEM_ADDR 0x9fff0000 +#define DEV_SMEM_START_ADDR 0xff000000 + +typedef struct { + uint32_t dim_m; + uint32_t dim_n; + uint32_t dim_k; + uint64_t addr_a; + uint64_t addr_b; + uint64_t addr_c; +} kernel_arg_t; + +#endif diff --git a/tests/regression/sgemm_gemmini_duo/kernel.cpp b/tests/regression/sgemm_gemmini_duo/kernel.cpp new file mode 100644 index 00000000..341dd301 --- /dev/null +++ b/tests/regression/sgemm_gemmini_duo/kernel.cpp @@ -0,0 +1,177 @@ +#include +#include +#include +#include +#include "common.h" +#include "include/gemmini.h" +#include "gemmini_mmio.h" + +#define TILE_M 64 +#define TILE_N 64 +#define TILE_K 64 +#define SMEM_ADDR_Q0 ((float * const) 0xff000000) +#define SMEM_ADDR_Q1 ((float * const) 0xff004000) +#define SMEM_ADDR_Q2 ((float * const) 0xff008000) +#define SMEM_ADDR_Q3 ((float * const) 0xff00c000) +#define SPAD_ADDR_Q0 0x0 +#define SPAD_ADDR_Q1 0x200 +#define SPAD_ADDR_Q2 0x400 +#define SPAD_ADDR_Q3 0x600 +#define BOUND_INST 0x800080008ULL + +// #define TILE_M 32 +// #define TILE_N 32 +// #define TILE_K 32 +// #define SMEM_ADDR_Q0 ((float * const) 0xff000000) +// #define SMEM_ADDR_Q1 ((float * const) 0xff001000) +// #define SMEM_ADDR_Q2 ((float * const) 0xff002000) +// #define SMEM_ADDR_Q3 ((float * const) 0xff003000) +// #define SPAD_ADDR_Q0 0x0 +// #define SPAD_ADDR_Q1 0x80 +// #define SPAD_ADDR_Q2 0x100 +// #define SPAD_ADDR_Q3 0x180 +// #define BOUND_INST 0x400040004ULL + +#define NUM_CLUSTERS 1 +#define NUM_THREADS_IN_CLUSTER 256 \ +// (NUM_CORES * NUM_WARPS * NUM_THREADS) + +#define rd_cycles_force(x) asm volatile ("csrr 
%0, mcycle" : "=r" (x)) +#define rd_cycles(x) rd_cycles_force(x) +#define HW_TID() ({uint32_t gtid; asm volatile ("csrr %0, mhartid" : "=r" (gtid)); gtid;}) +#define PRINTF(...) sprintf(PRINT_BUF, __VA_ARGS__) +// #define PRINTF(...) vx_printf(__VA_ARGS__) +#define SWISH(beta, x) ((x) / (1 + exp(-(beta) * (x)))) +//#define POWER + +inline void threadblock_barrier(unsigned int barrier_id, unsigned int count) { + vx_fence(); + vx_barrier(barrier_id, count); +} + +void thread_block_matmul_gemmini(kernel_arg_t *__UNIFORM__ arg, + const uint32_t threadblock_id, + const uint32_t tid_in_threadblock) { + asm volatile ("matmul_start_%=:"::); + const float *const A = (const float *const) arg->addr_a; + const float *const B = (const float *const) arg->addr_b; + float *const C = (float *const) arg->addr_c; + + if (HW_TID() == 0) { + gemmini_extended_config_ex(WEIGHT_STATIONARY, 0, 0, 1, 0, 0); + use_gemmini(1); + gemmini_extended_config_ex(WEIGHT_STATIONARY, 0, 0, 1, 0, 0); + use_gemmini(0); + // gemmini_extended_config_ex(dataflow, act & 3, 0, 1, a_transpose, b_transpose); + PRINTF("start\n"); + } + + vx_fence(); + + uint32_t marker0, marker1; + rd_cycles_force(marker0); + + const uint32_t dim_m = arg->dim_m; + const uint32_t dim_n = arg->dim_n; + const uint32_t dim_k = arg->dim_k; + const uint32_t num_tiles_m = dim_m / TILE_M; + const uint32_t num_tiles_n = dim_n / TILE_N; + const uint32_t num_tiles_k = dim_k / TILE_K; + constexpr uint32_t num_threads_in_cluster = NUM_THREADS_IN_CLUSTER; + + const uint32_t num_tile_rows_per_tb = num_tiles_m / NUM_CLUSTERS; + + #define RUN_ON_GEMMINI(gemmini_i) { \ + use_gemmini(gemmini_i); \ + if (HW_TID() == 0) { \ + gemmini_extended3_config_ld(dim_k * sizeof(elem_t), MVIN_SCALE_IDENTITY, false, 0); \ + gemmini_extended3_config_ld(dim_n * sizeof(elem_t), MVIN_SCALE_IDENTITY, false, 1); \ + gemmini_extended_config_st(dim_n * sizeof(elem_t), 0, MVIN_SCALE_IDENTITY); \ + } \ + for (uint32_t tile_i = num_tile_rows_per_tb * threadblock_id; \ + 
tile_i < num_tile_rows_per_tb * (threadblock_id + 1); \ + tile_i += 1) { \ + for (int tile_j = 0; tile_j < num_tiles_n; tile_j += 1) { \ + if (HW_TID() == 0) { \ + for (int tile_k = 0; tile_k < num_tiles_k; tile_k += 1) { \ + ROCC_INSTRUCTION_RS1_RS2(XCUSTOM_ACC, \ + (uint64_t) (A + tile_i * TILE_M * dim_k + tile_k * TILE_K), \ + (uint64_t) (B + tile_k * TILE_K * dim_n + tile_j * TILE_N), \ + k_LOOP_WS_CONFIG_ADDRS_AB) \ + GEMMINI_CISC_CMD_R((dim_n) << 16 | (dim_k << 8) | GEMMINI_CISC_IMM(8, gemmini_i)); \ + if (tile_k & 1) { \ + GEMMINI_CISC_CMD_I(GEMMINI_CISC_IMM(11, gemmini_i)); \ + } else { \ + GEMMINI_CISC_CMD_I(GEMMINI_CISC_IMM(10, gemmini_i)); \ + } \ + if (tile_k == 0) { \ + gemmini_fence(); \ + GEMMINI_CISC_CMD_I(GEMMINI_CISC_IMM(0, gemmini_i)); \ + } else if (tile_k & 1) { \ + gemmini_fence(); \ + GEMMINI_CISC_CMD_I(GEMMINI_CISC_IMM(2, gemmini_i)); \ + } else { \ + gemmini_fence(); \ + GEMMINI_CISC_CMD_I(GEMMINI_CISC_IMM(1, gemmini_i)); \ + } \ + } \ + gemmini_fence(); \ + gemmini_fence(); \ + gemmini_fence(); \ + gemmini_fence(); \ + GEMMINI_CISC_CMD_I(GEMMINI_CISC_IMM(9, gemmini_i)); \ + gemmini_fence(); \ + } \ + threadblock_barrier(/*barrier_id=*/0, /*count=*/NUM_WARPS); \ + if (HW_TID() == 0) { \ + float *const dram_c_tile_start = C + tile_i * TILE_M * dim_n + tile_j * TILE_N; \ + ROCC_INSTRUCTION_RS1_RS2(XCUSTOM_ACC, 0, BOUND_INST, k_LOOP_WS_CONFIG_BOUNDS) \ + ROCC_INSTRUCTION_RS1_RS2(XCUSTOM_ACC, 0, (uint64_t) dram_c_tile_start, k_LOOP_WS_CONFIG_ADDRS_DC) \ + ROCC_INSTRUCTION_RS1_RS2(XCUSTOM_ACC, 0, dim_n, k_LOOP_WS_CONFIG_STRIDES_DC) \ + ROCC_INSTRUCTION_RS1_RS2(XCUSTOM_ACC, 0, loop_matmul_skips(1, 1, 1, 1, 0), k_LOOP_WS) \ + } \ + } \ + } \ + if (threadblock_id == NUM_CLUSTERS - 1) { \ + threadblock_barrier(/*barrier_id=*/0, /*count=*/NUM_WARPS); \ + rd_cycles_force(marker1); \ + if (HW_TID() == 0) { \ + PRINTF("\ncomplete on core %d\n", gemmini_i); \ + PRINTF("total cycles: %d\n", marker1 - marker0); \ + for (int i = 0; i < 1 /*dim_m*/; i += 8) 
{ /* print one line only for quick test running */ \ + for (int j = 0; j < dim_n; j += 8) { \ + PRINTF("%d %d ", (int) (C[i * dim_n + j]), (int) (C[i * dim_n + j + 4])); \ + } \ + PRINTF("\n"); \ + } \ + } \ + } \ + threadblock_barrier(/*barrier_id=*/0, /*count=*/NUM_WARPS); \ + } + + RUN_ON_GEMMINI(0) + RUN_ON_GEMMINI(1) + vx_tmc(0); +} + +void kernel_body(int task_id, kernel_arg_t *__UNIFORM__ arg) { + const int threadblock_id = task_id / NUM_THREADS_IN_CLUSTER; + const int tid_in_threadblock = task_id % NUM_THREADS_IN_CLUSTER; + + thread_block_matmul_gemmini(arg, threadblock_id, tid_in_threadblock); +} + +int main() { + kernel_arg_t *arg = (kernel_arg_t *)KERNEL_ARG_DEV_MEM_ADDR; + + const uint32_t num_threads_in_cluster = NUM_THREADS_IN_CLUSTER; + const uint32_t grid_size = num_threads_in_cluster * NUM_CLUSTERS; +#ifdef RADIANCE + vx_spawn_tasks_cluster(grid_size, (vx_spawn_tasks_cb)kernel_body, arg); +#else + // NOTE: This kernel assumes contiguous thread scheduling for efficient shared + // memory allocation, and therefore does not work with original vx_spawn_tasks + vx_spawn_tasks_contiguous(grid_size, (vx_spawn_tasks_cb)kernel_body, arg); +#endif + return 0; +} \ No newline at end of file diff --git a/tests/regression/sgemm_gemmini_duo/main.cpp b/tests/regression/sgemm_gemmini_duo/main.cpp new file mode 100644 index 00000000..54531062 --- /dev/null +++ b/tests/regression/sgemm_gemmini_duo/main.cpp @@ -0,0 +1,274 @@ +#include +#include +#include +#include +#include +#include +#include "common.h" + +#define RT_CHECK(_expr) \ + do { \ + int _ret = _expr; \ + if (0 == _ret) \ + break; \ + printf("Error: '%s' returned %d!\n", #_expr, (int)_ret); \ + cleanup(); \ + exit(-1); \ + } while (false) + +/////////////////////////////////////////////////////////////////////////////// + +const char* kernel_file = "kernel.bin"; +uint32_t count = 0; + +std::vector src_a_data; +std::vector src_b_data; +std::vector ref_data; + +vx_device_h device = nullptr; +std::vector 
staging_buf; +kernel_arg_t kernel_arg = {}; + +static void show_usage() { + std::cout << "Vortex Test." << std::endl; + std::cout << "Usage: [-k: kernel] [-n words] [-h: help]" << std::endl; +} + +static void parse_args(int argc, char **argv) { + int c; + while ((c = getopt(argc, argv, "n:k:h?")) != -1) { + switch (c) { + case 'n': + count = atoi(optarg); + break; + case 'k': + kernel_file = optarg; + break; + case 'h': + case '?': { + show_usage(); + exit(0); + } break; + default: + show_usage(); + exit(-1); + } + } +} + +void cleanup() { + if (device) { + vx_mem_free(device, kernel_arg.addr_a); + vx_mem_free(device, kernel_arg.addr_b); + vx_mem_free(device, kernel_arg.addr_c); + vx_dev_close(device); + } +} + +void generate_source_matrix(uint32_t dim_m, uint32_t dim_n, uint32_t dim_k) { + src_a_data.resize(dim_m * dim_k); + src_b_data.resize(dim_k * dim_n); + + for (uint32_t i = 0; i < src_a_data.size(); ++i) { + src_a_data[i] = static_cast(i); + std::cout << "A: " << i << ": value=" << src_a_data[i] << std::endl; + } + for (uint32_t i = 0; i < src_b_data.size(); ++i) { + src_b_data[i] = static_cast(i); + std::cout << "B: " << i << ": value=" << src_b_data[i] << std::endl; + } +} + +void generate_reference_matmul(uint32_t dim_m, uint32_t dim_n, uint32_t dim_k) { + ref_data.resize(dim_m * dim_n); + + for (uint32_t i = 0; i < dim_m; ++i) { + for (uint32_t j = 0; j < dim_n; ++j) { + float ref = 0.0f; + for (uint32_t k = 0; k < dim_k; ++k) { + ref += src_a_data[dim_k * i + k] * src_b_data[dim_n * k + j]; + } + ref_data.at(dim_n * i + j) = ref; + } + } +} + +int run_test(const kernel_arg_t& kernel_arg, + uint32_t buf_size, + uint32_t dim_m, uint32_t dim_n) { + // start device + std::cout << "start device" << std::endl; + RT_CHECK(vx_start(device)); + + // wait for completion + std::cout << "wait for completion" << std::endl; + RT_CHECK(vx_ready_wait(device, VX_MAX_TIMEOUT)); + + // download destination buffer + std::cout << "download destination buffer" << std::endl; 
+ RT_CHECK(vx_copy_from_dev(device, staging_buf.data(), kernel_arg.addr_c, buf_size)); + + // verify result + std::cout << "verify result" << std::endl; + { + int errors = 0; + auto buf_ptr = (float*)staging_buf.data(); + for (uint32_t i = 0; i < dim_m * dim_n; ++i) { + float ref = ref_data.at(i); + float cur = buf_ptr[i]; + if (cur != ref && !(std::abs((cur - ref) / ref) <= 1e-6)) { /* negated <= so a NaN result is reported instead of silently passing */ + std::cout << "error at result #" << std::dec << i + << std::hex << ": actual=" << cur << ", expected=" << ref << std::endl; + ++errors; + } + } + if (errors != 0) { + std::cout << "Found " << std::dec << errors << " errors!" << std::endl; + std::cout << "FAILED!" << std::endl; + return 1; + } + } + + return 0; +} + +int main(int argc, char *argv[]) { + // parse command arguments + parse_args(argc, argv); + + if (count == 0) { + count = 1; + } + + std::srand(50); + + // open device connection + std::cout << "open device connection" << std::endl; + RT_CHECK(vx_dev_open(&device)); + + // FIXME: hardcoded + uint32_t dim_m = 64; + uint32_t dim_n = 64; + uint32_t dim_k = 64; + + generate_source_matrix(dim_m, dim_n, dim_k); + generate_reference_matmul(dim_m, dim_n, dim_k); + + uint32_t src_a_buf_size = src_a_data.size() * sizeof(src_a_data[0]); + uint32_t src_b_buf_size = src_b_data.size() * sizeof(src_b_data[0]); + uint32_t dst_buf_size = ref_data.size() * sizeof(src_a_data[0]); + + std::cout << "buffer size: " << dst_buf_size << " bytes" << std::endl; + + // upload program + std::cout << "upload program" << std::endl; + RT_CHECK(vx_upload_kernel_file(device, kernel_file)); + + // allocate device memory + std::cout << "allocate device memory" << std::endl; + RT_CHECK(vx_mem_alloc(device, src_a_buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.addr_a)); + RT_CHECK(vx_mem_alloc(device, src_b_buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.addr_b)); + RT_CHECK(vx_mem_alloc(device, dst_buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.addr_c)); + + 
kernel_arg.dim_m = dim_m; + kernel_arg.dim_n = dim_n; + kernel_arg.dim_k = dim_k; + + 
std::cout << "dev_addr_a=0x" << std::hex << kernel_arg.addr_a << std::endl; + std::cout << "dev_addr_b=0x" << std::hex << kernel_arg.addr_b << std::endl; + std::cout << "dev_addr_c=0x" << std::hex << kernel_arg.addr_c << std::endl; + + // allocate staging buffer + { + std::cout << "allocate staging buffer" << std::endl; + uint32_t staging_buf_size = std::max( + src_a_buf_size, + std::max( + src_b_buf_size, + std::max(dst_buf_size, sizeof(kernel_arg_t)))); + staging_buf.resize(staging_buf_size); + } + + // upload kernel argument + { + std::cout << "upload kernel argument" << std::endl; + auto buf_ptr = staging_buf.data(); + kernel_arg.addr_a = (uint64_t) 0x20000; + kernel_arg.addr_b = (uint64_t) 0x28000; + kernel_arg.addr_c = (uint64_t) 0xc0000000ULL; + memcpy(buf_ptr, &kernel_arg, sizeof(kernel_arg_t)); + + std::cout << "uploading argument buffer to device, device mem address=" + << std::hex << KERNEL_ARG_DEV_MEM_ADDR << ", size=" << std::dec + << sizeof(kernel_arg_t) << " bytes\n"; + std::ofstream file("args.bin", std::ios::binary | std::ios::out); + if (!file) { + std::cerr << "error: failed to open args.bin for writing\n"; + exit(EXIT_FAILURE); + } + file.write(reinterpret_cast(staging_buf.data()), + sizeof(kernel_arg_t)); + file.close(); + + RT_CHECK(vx_copy_to_dev(device, KERNEL_ARG_DEV_MEM_ADDR, staging_buf.data(), sizeof(kernel_arg_t))); + } + + // upload source buffer + { + { + auto buf_ptr = staging_buf.data(); + memcpy(buf_ptr, src_a_data.data(), src_a_data.size() * sizeof(float)); + RT_CHECK(vx_copy_to_dev(device, kernel_arg.addr_a, staging_buf.data(), + src_a_buf_size)); + + std::cout << "uploading source A matrix to device, device mem address=" + << std::hex << kernel_arg.addr_a << ", size=" << std::dec + << src_a_buf_size << " bytes\n"; + std::ofstream file("input.a.bin", std::ios::binary | std::ios::out); + if (!file) { + std::cerr << "error: failed to open input.a.bin for writing\n"; + exit(EXIT_FAILURE); + } + file.write(reinterpret_cast(buf_ptr), 
src_a_buf_size); + file.close(); + } + { + auto buf_ptr = staging_buf.data(); + memcpy(buf_ptr, src_b_data.data(), src_b_data.size() * sizeof(float)); + RT_CHECK(vx_copy_to_dev(device, kernel_arg.addr_b, staging_buf.data(), + src_b_buf_size)); + + std::cout << "uploading source B matrix to device, device mem address=" + << std::hex << kernel_arg.addr_b << ", size=" << std::dec + << src_b_buf_size << " bytes\n"; + std::ofstream file("input.b.bin", std::ios::binary | std::ios::out); + if (!file) { + std::cerr << "error: failed to open input.b.bin for writing\n"; + exit(EXIT_FAILURE); + } + file.write(reinterpret_cast(buf_ptr), src_b_buf_size); + file.close(); + } + } + + // clear destination buffer + { + std::cout << "clear destination buffer" << std::endl; + auto buf_ptr = (int32_t*)staging_buf.data(); + for (uint32_t i = 0; i < ref_data.size(); ++i) { + buf_ptr[i] = 0xdeadbeef; + } + RT_CHECK(vx_copy_to_dev(device, kernel_arg.addr_c, staging_buf.data(), dst_buf_size)); + } + + // run tests + std::cout << "run tests" << std::endl; + RT_CHECK(run_test(kernel_arg, dst_buf_size, kernel_arg.dim_m, kernel_arg.dim_n)); + std::cout << "PASSED!" << std::endl; + + // cleanup + std::cout << "cleanup" << std::endl; + cleanup(); + + return 0; +} diff --git a/tests/regression/sgemm_gemmini_duo/sgemm_gemmini_duo b/tests/regression/sgemm_gemmini_duo/sgemm_gemmini_duo new file mode 100755 index 00000000..2204a038 Binary files /dev/null and b/tests/regression/sgemm_gemmini_duo/sgemm_gemmini_duo differ