From be7d87c82ddede45d06901b7ede3fd46b04bd3cd Mon Sep 17 00:00:00 2001 From: Sungwoong Ha Date: Thu, 22 Feb 2024 16:31:42 -0800 Subject: [PATCH 01/36] temp --- ci/toolchain_env.sh | 4 ++++ hw/rtl/core/VX_core.sv | 31 +++++++++++++++++++++++++++++-- 2 files changed, 33 insertions(+), 2 deletions(-) diff --git a/ci/toolchain_env.sh b/ci/toolchain_env.sh index 440a899e..3d4e2d41 100644 --- a/ci/toolchain_env.sh +++ b/ci/toolchain_env.sh @@ -24,3 +24,7 @@ export PATH=$SV2V_PATH/bin:$PATH export YOSYS_PATH=$TOOLDIR/yosys export PATH=$YOSYS_PATH/bin:$PATH + +export LLVM_VORTEX=$TOOLDIR/llvm-vortex +export POCL_CC_PATH=$TOOLDIR/pocl/compiler +export POCL_RT_PATH=$TOOLDIR/pocl/runtime \ No newline at end of file diff --git a/hw/rtl/core/VX_core.sv b/hw/rtl/core/VX_core.sv index dde085a8..e5e57d99 100644 --- a/hw/rtl/core/VX_core.sv +++ b/hw/rtl/core/VX_core.sv @@ -45,7 +45,7 @@ module VX_core import VX_gpu_pkg::*; #( output wire [`NUM_REGS-1:0][`XLEN-1:0] sim_wb_value, // Status - output wire busy + output wire busy //stays 1 when busy, 0 when done (termination) detect the negative edge ); VX_schedule_if schedule_if(); VX_fetch_if fetch_if(); @@ -258,7 +258,7 @@ module VX_core import VX_gpu_pkg::*; #( `endif -`ifdef PERF_ENABLE +`ifdef PERF_ENABLE // expose these perf counter to console using $display, %time; flag: --perf=0? 
wire [`CLOG2(DCACHE_NUM_REQS+1)-1:0] perf_dcache_rd_req_per_cycle; wire [`CLOG2(DCACHE_NUM_REQS+1)-1:0] perf_dcache_wr_req_per_cycle; @@ -333,6 +333,33 @@ module VX_core import VX_gpu_pkg::*; #( assign pipeline_perf_if.ifetch_latency = perf_icache_lat; assign pipeline_perf_if.load_latency = perf_dcache_lat; + + always @(negedge busy) begin + if (!reset) begin + $display("time : %t", $time); + $display("perf_dcache_rd_req_per_cycle: %h", perf_dcache_rd_req_per_cycle); + $display("perf_dcache_wr_req_per_cycle: %h", perf_dcache_wr_req_per_cycle); + $display("perf_dcache_rsp_per_cycle: %h", perf_dcache_rsp_per_cycle); + $display("perf_icache_pending_read_cycle: %h", perf_icache_pending_read_cycle); + $display("perf_dcache_pending_read_cycle: %h", perf_dcache_pending_read_cycle); + $display("perf_icache_pending_reads: %h", perf_icache_pending_reads); + $display("perf_dcache_pending_reads: %h", perf_dcache_pending_reads); + $display("perf_ifetches: %h", perf_ifetches); + $display("perf_loads: %h", perf_loads); + $display("perf_stores: %h", perf_stores); + $display("perf_icache_req_fire: %b", perf_icache_req_fire); + $display("perf_icache_rsp_fire: %b", perf_icache_rsp_fire); + $display("perf_dcache_rd_req_fire: %b", perf_dcache_rd_req_fire); + $display("perf_dcache_rd_req_fire_r: %b", perf_dcache_rd_req_fire_r); + $display("perf_dcache_wr_req_fire: %b", perf_dcache_wr_req_fire); + $display("perf_dcache_wr_req_fire_r: %b", perf_dcache_wr_req_fire_r); + $display("perf_dcache_rsp_fire: %b", perf_dcache_rsp_fire); + $display("scheduler idle: %d", pipeline_perf_if.sched_idles[31:0]); + $display("Instruction: %d",commit_csr_if.instret[31:0]); + $display("Cycle: %d",sched_csr_if.cycles); + end + end + `endif endmodule From f1e7407d3a689d943316af08c5eff28f345a7e8f Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Tue, 27 Feb 2024 15:44:04 -0800 Subject: [PATCH 02/36] sgemm_wg: Run multiple threadblock per core --- tests/regression/sgemm_wg/kernel.cpp | 34 
+++++++++++++++++++++------- tests/regression/sgemm_wg/main.cpp | 4 ++-- 2 files changed, 28 insertions(+), 10 deletions(-) diff --git a/tests/regression/sgemm_wg/kernel.cpp b/tests/regression/sgemm_wg/kernel.cpp index ec207821..4609b9e6 100644 --- a/tests/regression/sgemm_wg/kernel.cpp +++ b/tests/regression/sgemm_wg/kernel.cpp @@ -9,7 +9,9 @@ inline void thread_block_gemm(kernel_arg_t *__UNIFORM__ arg, const uint32_t threadblock_dim_x, const uint32_t threadblock_dim_y, const uint32_t threadblock_id_x, - const uint32_t threadblock_id_y) { + const uint32_t threadblock_id_y, + const uint32_t threadblock_id_in_core, + float *sharedmem_per_threadblock) { const float *global_a = (const float *)arg->addr_a; const float *global_b = (const float *)arg->addr_b; float *global_c = (float *)arg->addr_c; @@ -29,19 +31,24 @@ inline void thread_block_gemm(kernel_arg_t *__UNIFORM__ arg, float reg_c = 0.0f; for (uint32_t k = 0; k < dim_k; k += threadblock_dim_x) { - float *local_a = (float *)DEV_SMEM_START_ADDR; + float *local_a = sharedmem_per_threadblock; size_t local_a_elems = threadblock_dim_x * threadblock_dim_y; - float *local_b = (float *)DEV_SMEM_START_ADDR + local_a_elems; + float *local_b = sharedmem_per_threadblock + local_a_elems; uint32_t offset_global_a = dim_k * global_row + (k + local_col); uint32_t offset_global_b = dim_n * (local_row + k) + global_col; + // FIXME: threadblocks size must be BM*BN, not BM*BK or BN*BK. This means + // there is a mismatch between the number of elements in the A/B tile and + // the C tile. This is handled by each thread computing multiple result + // elements. 
+ // // local_a: threadblock_dim_y rows, threadblock_dim_x cols // local_b: threadblock_dim_x rows, threadblock_dim_y cols // threadblock_dim_x == block_k, threadblock_dim_y == block_m == block_n local_a[threadblock_dim_x * local_row + local_col] = global_a[offset_global_a]; local_b[threadblock_dim_y * local_col + local_row] = global_b[offset_global_b]; - vx_barrier(0, threadblock_dim_y); + vx_barrier(threadblock_id_in_core, threadblock_dim_y); vx_fence(); for (uint32_t local_k = 0; local_k < threadblock_dim_x; local_k++) { @@ -49,7 +56,7 @@ inline void thread_block_gemm(kernel_arg_t *__UNIFORM__ arg, local_b[threadblock_dim_y * local_col + local_k]; } - vx_barrier(0, threadblock_dim_y); + vx_barrier(threadblock_id_in_core, threadblock_dim_y); vx_fence(); } @@ -57,14 +64,19 @@ inline void thread_block_gemm(kernel_arg_t *__UNIFORM__ arg, } void kernel_body(int task_id, kernel_arg_t* __UNIFORM__ arg) { + // @perf: All threads are running these compute whose result is mostly same + // across the threadblock + const uint32_t dim_n = arg->dim_n; int tid_x = task_id % dim_n; int tid_y = task_id / dim_n; + const uint32_t threadblocks_per_core = 2; const uint32_t threadblock_dim_x = vx_num_threads(); - const uint32_t threadblock_dim_y = vx_num_warps(); + const uint32_t threadblock_dim_y = vx_num_warps() / threadblocks_per_core; const uint32_t threads_per_threadblock = threadblock_dim_x * threadblock_dim_y; const int threadblock_id = task_id / threads_per_threadblock; + const int threadblock_id_in_core = threadblock_id % threadblocks_per_core; const uint32_t dim_n_in_blocks = dim_n / threadblock_dim_x; const int threadblock_id_x = threadblock_id % dim_n_in_blocks; @@ -72,8 +84,14 @@ void kernel_body(int task_id, kernel_arg_t* __UNIFORM__ arg) { const int tid_in_threadblock_x = vx_thread_id(); const int tid_in_threadblock_y = vx_warp_id() % threadblock_dim_y; - thread_block_gemm(arg, tid_in_threadblock_x, tid_in_threadblock_y, threadblock_dim_x, - threadblock_dim_y, 
threadblock_id_x, threadblock_id_y); + + float *sharedmem_per_threadblock = + (float *)DEV_SMEM_START_ADDR + + (2 * threadblock_dim_x * threadblock_dim_y) * threadblock_id_in_core; + thread_block_gemm(arg, tid_in_threadblock_x, tid_in_threadblock_y, + threadblock_dim_x, threadblock_dim_y, threadblock_id_x, + threadblock_id_y, threadblock_id_in_core, + sharedmem_per_threadblock); } int main() { diff --git a/tests/regression/sgemm_wg/main.cpp b/tests/regression/sgemm_wg/main.cpp index c6252991..229463ef 100644 --- a/tests/regression/sgemm_wg/main.cpp +++ b/tests/regression/sgemm_wg/main.cpp @@ -147,8 +147,8 @@ int main(int argc, char *argv[]) { RT_CHECK(vx_dev_open(&device)); // FIXME: hardcoded - uint32_t dim_m = 16; - uint32_t dim_n = 16; + uint32_t dim_m = 32; + uint32_t dim_n = 32; uint32_t dim_k = 32; generate_source_matrix(dim_m, dim_n, dim_k); From 2b1b5fe5377b3eb756d4e8e2a076ab65f55d832d Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Tue, 27 Feb 2024 15:45:22 -0800 Subject: [PATCH 03/36] convolution: Fix write_operand_file after upstream merge --- tests/opencl/convolution/main.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/opencl/convolution/main.cc b/tests/opencl/convolution/main.cc index 5c62b56e..dded468f 100644 --- a/tests/opencl/convolution/main.cc +++ b/tests/opencl/convolution/main.cc @@ -238,9 +238,9 @@ int main (int argc, char **argv) { } // NOTE(hansung): Dump operand buffer to a file - if (write_operand_file("matmul.input.a.bin", h_a.data(), nbytes) != 0) + if (write_operand_file("convolution.input.input.bin", h_i.data(), i_nbytes) != 0) return EXIT_FAILURE; - if (write_operand_file("matmul.input.b.bin", h_b.data(), nbytes) != 0) + if (write_operand_file("convolution.input.weights.bin", h_w.data(), w_nbytes) != 0) return EXIT_FAILURE; // Creating command queue From a2ea27b2b522bcd3e45e18d8d67a201fb71aa204 Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Tue, 27 Feb 2024 15:46:02 -0800 Subject: [PATCH 04/36] 
vx_spawn: Add spawn_tasks_contiguous_all_stub Spawns tasks in a way that the threads in a warp see contiguous thread_id, unlike the original variant where each thread was allocated a range of thread_id that spans the number of batches. E.g. in a 4-thread config, instead of mapping IDs (0,2,4,6)->(1,3,5,7), map (0,1,2,3)->(4,5,6,7). TODO remaining logic not implemented. --- kernel/src/vx_spawn.c | 26 ++++++++++++++++++++++++-- 1 file changed, 24 insertions(+), 2 deletions(-) diff --git a/kernel/src/vx_spawn.c b/kernel/src/vx_spawn.c index fd8258e1..eb0bdb90 100644 --- a/kernel/src/vx_spawn.c +++ b/kernel/src/vx_spawn.c @@ -74,6 +74,27 @@ static void __attribute__ ((noinline)) spawn_tasks_all_stub() { } } +static void __attribute__ ((noinline)) spawn_tasks_contiguous_all_stub() { + int NT = vx_num_threads(); + int NW = vx_num_warps(); + int cid = vx_core_id(); + int wid = vx_warp_id(); + int tid = vx_thread_id(); + + wspawn_tasks_args_t* p_wspawn_args = (wspawn_tasks_args_t*)g_wspawn_args[cid]; + + // FIXME: handle RW + int waves = p_wspawn_args->NWs; + int offset = p_wspawn_args->offset + (NT * wid + tid); + + vx_spawn_tasks_cb callback = p_wspawn_args->callback; + void* arg = p_wspawn_args->arg; + for (int wave_id = 0; wave_id < waves; ++wave_id) { + int task_id = offset + (wave_id * NT * NW); + callback(task_id, arg); + } +} + static void __attribute__ ((noinline)) spawn_tasks_rem_stub() { int cid = vx_core_id(); int tid = vx_thread_id(); @@ -88,7 +109,8 @@ static void __attribute__ ((noinline)) spawn_tasks_all_cb() { vx_tmc(-1); // call stub routine - spawn_tasks_all_stub(); + // spawn_tasks_all_stub(); + spawn_tasks_contiguous_all_stub(); // disable warp vx_tmc_zero(); @@ -141,7 +163,7 @@ void vx_spawn_tasks(int num_tasks, vx_spawn_tasks_cb callback , void * arg) { vx_tmc(-1); // call stub routine - spawn_tasks_all_stub(); + spawn_tasks_contiguous_all_stub(); // back to single-threaded vx_tmc_one(); From 27646bb507645169fbade129dbe4b055528b8a3a Mon Sep 17
00:00:00 2001 From: Hansung Kim Date: Tue, 27 Feb 2024 22:06:01 -0800 Subject: [PATCH 05/36] sgemm_wg: Implement multiple C per thread with sliding A/B blocks --- tests/regression/sgemm_wg/kernel.cpp | 103 ++++++++++++++++----------- 1 file changed, 63 insertions(+), 40 deletions(-) diff --git a/tests/regression/sgemm_wg/kernel.cpp b/tests/regression/sgemm_wg/kernel.cpp index 4609b9e6..58d54b36 100644 --- a/tests/regression/sgemm_wg/kernel.cpp +++ b/tests/regression/sgemm_wg/kernel.cpp @@ -3,7 +3,10 @@ #include #include "common.h" -inline void thread_block_gemm(kernel_arg_t *__UNIFORM__ arg, +#define MAX_TM 4 + +void thread_block_gemm(kernel_arg_t *__UNIFORM__ arg, + const uint32_t tid_in_threadblock, const uint32_t tid_in_threadblock_x, const uint32_t tid_in_threadblock_y, const uint32_t threadblock_dim_x, @@ -12,83 +15,103 @@ inline void thread_block_gemm(kernel_arg_t *__UNIFORM__ arg, const uint32_t threadblock_id_y, const uint32_t threadblock_id_in_core, float *sharedmem_per_threadblock) { - const float *global_a = (const float *)arg->addr_a; - const float *global_b = (const float *)arg->addr_b; - float *global_c = (float *)arg->addr_c; + const float *A = (const float *)arg->addr_a; + const float *B = (const float *)arg->addr_b; + float *C = (float *)arg->addr_c; - // assumes NT == NW == matrix_dim const uint32_t dim_m = arg->dim_m; const uint32_t dim_n = arg->dim_n; const uint32_t dim_k = arg->dim_k; - // FIXME: assumes local block size is square shape - const uint32_t local_row = tid_in_threadblock_y; - const uint32_t local_col = tid_in_threadblock_x; - const uint32_t global_row = threadblock_id_y * threadblock_dim_y + local_row; - const uint32_t global_col = threadblock_id_x * threadblock_dim_x + local_col; + // FIXME: Output block size is assumed to be square, i.e. 
BM == BN + // const uint32_t BM = threadblock_dim_y; + // const uint32_t BN = threadblock_dim_y; + // const uint32_t BK = threadblock_dim_x; + constexpr uint32_t BM = 8; + constexpr uint32_t BN = 8; + constexpr uint32_t BK = 4; + constexpr uint32_t TM = 2; + + const uint32_t local_a_row = tid_in_threadblock / BK; + const uint32_t local_a_col = tid_in_threadblock % BK; + const uint32_t local_b_row = tid_in_threadblock / BN; + const uint32_t local_b_col = tid_in_threadblock % BN; + const uint32_t global_a_row = BM * threadblock_id_y + local_a_row; + const uint32_t global_b_col = BN * threadblock_id_x + local_b_col; + + A += dim_k * BM * threadblock_id_y; + B += BN * threadblock_id_x; + C += dim_n * BM * threadblock_id_y + BN * threadblock_id_x; // each thread generates one output element - float reg_c = 0.0f; + float reg_c[MAX_TM] = { 0.0f }; - for (uint32_t k = 0; k < dim_k; k += threadblock_dim_x) { + for (uint32_t k = 0; k < dim_k; k += BK) { float *local_a = sharedmem_per_threadblock; size_t local_a_elems = threadblock_dim_x * threadblock_dim_y; float *local_b = sharedmem_per_threadblock + local_a_elems; - uint32_t offset_global_a = dim_k * global_row + (k + local_col); - uint32_t offset_global_b = dim_n * (local_row + k) + global_col; - // FIXME: threadblocks size must be BM*BN, not BM*BK or BN*BK. This means - // there is a mismatch between the number of elements in the A/B tile and - // the C tile. This is handled by each thread computing multiple result - // elements. - // - // local_a: threadblock_dim_y rows, threadblock_dim_x cols - // local_b: threadblock_dim_x rows, threadblock_dim_y cols - // threadblock_dim_x == block_k, threadblock_dim_y == block_m == block_n - local_a[threadblock_dim_x * local_row + local_col] = global_a[offset_global_a]; - local_b[threadblock_dim_y * local_col + local_row] = global_b[offset_global_b]; + // NOTE: local_b is transposed to column-major to facilitate better memory + // access. 
+ local_a[BK * local_a_row + local_a_col] = A[dim_k * local_a_row + local_a_col]; + local_b[BN * local_b_row + local_b_col] = B[dim_n * local_b_row + local_b_col]; + + // Advance A and B block + A += BK; + B += dim_n * BK; vx_barrier(threadblock_id_in_core, threadblock_dim_y); vx_fence(); - for (uint32_t local_k = 0; local_k < threadblock_dim_x; local_k++) { - reg_c += local_a[threadblock_dim_x * local_row + local_k] * - local_b[threadblock_dim_y * local_col + local_k]; + for (uint32_t local_k = 0; local_k < BK; local_k++) { + // Compute multiple result elements (TM) per thread + const float local_b_tmp = local_b[BN * local_k + local_b_col]; +#pragma GCC unroll 1 + for (uint32_t result_idx = 0; result_idx < TM; result_idx++) { + reg_c[result_idx] += + local_a[BK * (TM * local_b_row + result_idx) + local_k] * + local_b_tmp; + } } vx_barrier(threadblock_id_in_core, threadblock_dim_y); vx_fence(); } - global_c[dim_n * global_row + global_col] = reg_c; +#pragma GCC unroll 1 + for (uint32_t result_idx = 0; result_idx < TM; result_idx++) { + C[dim_n * (TM * local_b_row + result_idx) + local_b_col] = reg_c[result_idx]; + } } void kernel_body(int task_id, kernel_arg_t* __UNIFORM__ arg) { // @perf: All threads are running these compute whose result is mostly same // across the threadblock - const uint32_t dim_n = arg->dim_n; - int tid_x = task_id % dim_n; - int tid_y = task_id / dim_n; - - const uint32_t threadblocks_per_core = 2; + const uint32_t threadblocks_per_core = 1; const uint32_t threadblock_dim_x = vx_num_threads(); const uint32_t threadblock_dim_y = vx_num_warps() / threadblocks_per_core; const uint32_t threads_per_threadblock = threadblock_dim_x * threadblock_dim_y; const int threadblock_id = task_id / threads_per_threadblock; const int threadblock_id_in_core = threadblock_id % threadblocks_per_core; - const uint32_t dim_n_in_blocks = dim_n / threadblock_dim_x; - const int threadblock_id_x = threadblock_id % dim_n_in_blocks; - const int threadblock_id_y = 
threadblock_id / dim_n_in_blocks; - + const int tid_in_threadblock = task_id % threads_per_threadblock; const int tid_in_threadblock_x = vx_thread_id(); const int tid_in_threadblock_y = vx_warp_id() % threadblock_dim_y; + const uint32_t dim_m = arg->dim_m; + const uint32_t dim_n = arg->dim_n; + const uint32_t BN = 8; + const uint32_t dim_n_in_blocks = dim_n / BN; + const int threadblock_id_x = threadblock_id % dim_n_in_blocks; + const int threadblock_id_y = threadblock_id / dim_n_in_blocks; + // const int threadblock_id_x = dim_n / threadblock_dim_x; + // const int threadblock_id_y = dim_m / threadblock_dim_y / 1; + float *sharedmem_per_threadblock = (float *)DEV_SMEM_START_ADDR + - (2 * threadblock_dim_x * threadblock_dim_y) * threadblock_id_in_core; - thread_block_gemm(arg, tid_in_threadblock_x, tid_in_threadblock_y, + (2 * threads_per_threadblock) * threadblock_id_in_core; + thread_block_gemm(arg, tid_in_threadblock, tid_in_threadblock_x, tid_in_threadblock_y, threadblock_dim_x, threadblock_dim_y, threadblock_id_x, threadblock_id_y, threadblock_id_in_core, sharedmem_per_threadblock); @@ -96,7 +119,7 @@ void kernel_body(int task_id, kernel_arg_t* __UNIFORM__ arg) { int main() { kernel_arg_t *arg = (kernel_arg_t *)KERNEL_ARG_DEV_MEM_ADDR; - const uint32_t grid_size = arg->dim_m * arg->dim_n; + const uint32_t grid_size = arg->dim_m * arg->dim_n / 2; vx_spawn_tasks(grid_size, (vx_spawn_tasks_cb)kernel_body, arg); return 0; } From 46f242e520ac0d7f1b174c7f98986871001d6f5a Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Tue, 27 Feb 2024 22:23:25 -0800 Subject: [PATCH 06/36] sgemm_wg: Constantify BM/BN/BK/TM, computationally set gridsize and TB/core --- tests/regression/sgemm_wg/kernel.cpp | 51 ++++++++++++---------------- 1 file changed, 21 insertions(+), 30 deletions(-) diff --git a/tests/regression/sgemm_wg/kernel.cpp b/tests/regression/sgemm_wg/kernel.cpp index 58d54b36..69ef9f14 100644 --- a/tests/regression/sgemm_wg/kernel.cpp +++ 
b/tests/regression/sgemm_wg/kernel.cpp @@ -3,12 +3,13 @@ #include #include "common.h" -#define MAX_TM 4 +#define BM 8 +#define BN BM +#define BK 8 +#define TM (BM/BK) void thread_block_gemm(kernel_arg_t *__UNIFORM__ arg, const uint32_t tid_in_threadblock, - const uint32_t tid_in_threadblock_x, - const uint32_t tid_in_threadblock_y, const uint32_t threadblock_dim_x, const uint32_t threadblock_dim_y, const uint32_t threadblock_id_x, @@ -19,6 +20,7 @@ void thread_block_gemm(kernel_arg_t *__UNIFORM__ arg, const float *B = (const float *)arg->addr_b; float *C = (float *)arg->addr_c; + // assumes NT == NW == matrix_dim const uint32_t dim_m = arg->dim_m; const uint32_t dim_n = arg->dim_n; const uint32_t dim_k = arg->dim_k; @@ -27,10 +29,9 @@ void thread_block_gemm(kernel_arg_t *__UNIFORM__ arg, // const uint32_t BM = threadblock_dim_y; // const uint32_t BN = threadblock_dim_y; // const uint32_t BK = threadblock_dim_x; - constexpr uint32_t BM = 8; - constexpr uint32_t BN = 8; - constexpr uint32_t BK = 4; - constexpr uint32_t TM = 2; + // constexpr uint32_t BM = 8; + // constexpr uint32_t BN = 8; + // constexpr uint32_t BK = 2; const uint32_t local_a_row = tid_in_threadblock / BK; const uint32_t local_a_col = tid_in_threadblock % BK; @@ -39,26 +40,21 @@ void thread_block_gemm(kernel_arg_t *__UNIFORM__ arg, const uint32_t global_a_row = BM * threadblock_id_y + local_a_row; const uint32_t global_b_col = BN * threadblock_id_x + local_b_col; - A += dim_k * BM * threadblock_id_y; - B += BN * threadblock_id_x; - C += dim_n * BM * threadblock_id_y + BN * threadblock_id_x; - // each thread generates one output element - float reg_c[MAX_TM] = { 0.0f }; + float reg_c[TM] = { 0.0f }; for (uint32_t k = 0; k < dim_k; k += BK) { float *local_a = sharedmem_per_threadblock; size_t local_a_elems = threadblock_dim_x * threadblock_dim_y; float *local_b = sharedmem_per_threadblock + local_a_elems; + uint32_t global_a_offset = dim_k * global_a_row + (k + local_a_col); + uint32_t global_b_offset 
= dim_n * (k + local_b_row) + global_b_col; + // NOTE: local_b is transposed to column-major to facilitate better memory // access. - local_a[BK * local_a_row + local_a_col] = A[dim_k * local_a_row + local_a_col]; - local_b[BN * local_b_row + local_b_col] = B[dim_n * local_b_row + local_b_col]; - - // Advance A and B block - A += BK; - B += dim_n * BK; + local_a[BK * local_a_row + local_a_col] = A[global_a_offset]; + local_b[BN * local_b_row + local_b_col] = B[global_b_offset]; vx_barrier(threadblock_id_in_core, threadblock_dim_y); vx_fence(); @@ -66,7 +62,7 @@ void thread_block_gemm(kernel_arg_t *__UNIFORM__ arg, for (uint32_t local_k = 0; local_k < BK; local_k++) { // Compute multiple result elements (TM) per thread const float local_b_tmp = local_b[BN * local_k + local_b_col]; -#pragma GCC unroll 1 +#pragma GCC unroll 4 for (uint32_t result_idx = 0; result_idx < TM; result_idx++) { reg_c[result_idx] += local_a[BK * (TM * local_b_row + result_idx) + local_k] * @@ -78,9 +74,10 @@ void thread_block_gemm(kernel_arg_t *__UNIFORM__ arg, vx_fence(); } -#pragma GCC unroll 1 +#pragma GCC unroll 4 for (uint32_t result_idx = 0; result_idx < TM; result_idx++) { - C[dim_n * (TM * local_b_row + result_idx) + local_b_col] = reg_c[result_idx]; + C[dim_n * (BM * threadblock_id_y + TM * local_b_row + result_idx) + + global_b_col] = reg_c[result_idx]; } } @@ -88,30 +85,24 @@ void kernel_body(int task_id, kernel_arg_t* __UNIFORM__ arg) { // @perf: All threads are running these compute whose result is mostly same // across the threadblock - const uint32_t threadblocks_per_core = 1; + const uint32_t threadblocks_per_core = vx_num_threads() * vx_num_warps() / (BM*BK); const uint32_t threadblock_dim_x = vx_num_threads(); const uint32_t threadblock_dim_y = vx_num_warps() / threadblocks_per_core; const uint32_t threads_per_threadblock = threadblock_dim_x * threadblock_dim_y; const int threadblock_id = task_id / threads_per_threadblock; const int threadblock_id_in_core = threadblock_id % 
threadblocks_per_core; - const int tid_in_threadblock = task_id % threads_per_threadblock; - const int tid_in_threadblock_x = vx_thread_id(); - const int tid_in_threadblock_y = vx_warp_id() % threadblock_dim_y; const uint32_t dim_m = arg->dim_m; const uint32_t dim_n = arg->dim_n; - const uint32_t BN = 8; const uint32_t dim_n_in_blocks = dim_n / BN; const int threadblock_id_x = threadblock_id % dim_n_in_blocks; const int threadblock_id_y = threadblock_id / dim_n_in_blocks; - // const int threadblock_id_x = dim_n / threadblock_dim_x; - // const int threadblock_id_y = dim_m / threadblock_dim_y / 1; float *sharedmem_per_threadblock = (float *)DEV_SMEM_START_ADDR + (2 * threads_per_threadblock) * threadblock_id_in_core; - thread_block_gemm(arg, tid_in_threadblock, tid_in_threadblock_x, tid_in_threadblock_y, + thread_block_gemm(arg, tid_in_threadblock, threadblock_dim_x, threadblock_dim_y, threadblock_id_x, threadblock_id_y, threadblock_id_in_core, sharedmem_per_threadblock); @@ -119,7 +110,7 @@ void kernel_body(int task_id, kernel_arg_t* __UNIFORM__ arg) { int main() { kernel_arg_t *arg = (kernel_arg_t *)KERNEL_ARG_DEV_MEM_ADDR; - const uint32_t grid_size = arg->dim_m * arg->dim_n / 2; + const uint32_t grid_size = arg->dim_m * arg->dim_n / TM; vx_spawn_tasks(grid_size, (vx_spawn_tasks_cb)kernel_body, arg); return 0; } From a06b2dd20ea702f4e3824cb519a0081d46312cfc Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Wed, 28 Feb 2024 21:17:42 -0800 Subject: [PATCH 07/36] sgemm_wg: Cleanup & proper unroll --- tests/regression/sgemm_wg/kernel.cpp | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/tests/regression/sgemm_wg/kernel.cpp b/tests/regression/sgemm_wg/kernel.cpp index 69ef9f14..9b767d35 100644 --- a/tests/regression/sgemm_wg/kernel.cpp +++ b/tests/regression/sgemm_wg/kernel.cpp @@ -40,30 +40,30 @@ void thread_block_gemm(kernel_arg_t *__UNIFORM__ arg, const uint32_t global_a_row = BM * threadblock_id_y + local_a_row; const uint32_t 
global_b_col = BN * threadblock_id_x + local_b_col; - // each thread generates one output element + // each thread generates TM output element float reg_c[TM] = { 0.0f }; - for (uint32_t k = 0; k < dim_k; k += BK) { - float *local_a = sharedmem_per_threadblock; - size_t local_a_elems = threadblock_dim_x * threadblock_dim_y; - float *local_b = sharedmem_per_threadblock + local_a_elems; + volatile float *local_a = sharedmem_per_threadblock; + const size_t local_a_elems = threadblock_dim_x * threadblock_dim_y; + volatile float *local_b = sharedmem_per_threadblock + local_a_elems; + for (uint32_t k = 0; k < dim_k; k += BK) { uint32_t global_a_offset = dim_k * global_a_row + (k + local_a_col); uint32_t global_b_offset = dim_n * (k + local_b_row) + global_b_col; - // NOTE: local_b is transposed to column-major to facilitate better memory - // access. local_a[BK * local_a_row + local_a_col] = A[global_a_offset]; local_b[BN * local_b_row + local_b_col] = B[global_b_offset]; vx_barrier(threadblock_id_in_core, threadblock_dim_y); vx_fence(); +#pragma GCC unroll TM for (uint32_t local_k = 0; local_k < BK; local_k++) { // Compute multiple result elements (TM) per thread const float local_b_tmp = local_b[BN * local_k + local_b_col]; -#pragma GCC unroll 4 +#pragma GCC unroll TM for (uint32_t result_idx = 0; result_idx < TM; result_idx++) { + // NOTE use of local_b_row reg_c[result_idx] += local_a[BK * (TM * local_b_row + result_idx) + local_k] * local_b_tmp; @@ -74,8 +74,9 @@ void thread_block_gemm(kernel_arg_t *__UNIFORM__ arg, vx_fence(); } -#pragma GCC unroll 4 +#pragma GCC unroll TM for (uint32_t result_idx = 0; result_idx < TM; result_idx++) { + // NOTE use of local_b_row and global_b_col here C[dim_n * (BM * threadblock_id_y + TM * local_b_row + result_idx) + global_b_col] = reg_c[result_idx]; } From 6f4dfe5a0e4fa8b8530986b0d1921d8cdfd29068 Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Thu, 29 Feb 2024 14:40:54 -0800 Subject: [PATCH 08/36] sgemm_wg: Implement 2D 
threadtiling --- tests/regression/sgemm_wg/kernel.cpp | 88 ++++++++++++++++++++-------- tests/regression/sgemm_wg/main.cpp | 6 +- 2 files changed, 66 insertions(+), 28 deletions(-) diff --git a/tests/regression/sgemm_wg/kernel.cpp b/tests/regression/sgemm_wg/kernel.cpp index 9b767d35..7f33a90a 100644 --- a/tests/regression/sgemm_wg/kernel.cpp +++ b/tests/regression/sgemm_wg/kernel.cpp @@ -5,8 +5,11 @@ #define BM 8 #define BN BM -#define BK 8 -#define TM (BM/BK) +#define BK 2 +// #define TM (BM/BK) +// #define TN (BN/BK) +#define TM 4 +#define TN 4 void thread_block_gemm(kernel_arg_t *__UNIFORM__ arg, const uint32_t tid_in_threadblock, @@ -40,33 +43,63 @@ void thread_block_gemm(kernel_arg_t *__UNIFORM__ arg, const uint32_t global_a_row = BM * threadblock_id_y + local_a_row; const uint32_t global_b_col = BN * threadblock_id_x + local_b_col; + const uint32_t local_c_row = tid_in_threadblock / (BN / TN); + const uint32_t local_c_col = tid_in_threadblock % (BN / TN); + // each thread generates TM output element - float reg_c[TM] = { 0.0f }; + float reg_c[TM * TN] = { 0.0f }; + float reg_a[TM] = { 0.0f }; + float reg_b[TN] = { 0.0f }; volatile float *local_a = sharedmem_per_threadblock; - const size_t local_a_elems = threadblock_dim_x * threadblock_dim_y; + // const size_t local_a_elems = threadblock_dim_x * threadblock_dim_y; + const size_t local_a_elems = (BM * BK); volatile float *local_b = sharedmem_per_threadblock + local_a_elems; - for (uint32_t k = 0; k < dim_k; k += BK) { - uint32_t global_a_offset = dim_k * global_a_row + (k + local_a_col); - uint32_t global_b_offset = dim_n * (k + local_b_row) + global_b_col; + constexpr uint32_t stride_a = (BM * BN) / BK / (TM * TN); + constexpr uint32_t stride_b = (BM * BN) / BN / (TM * TN); - local_a[BK * local_a_row + local_a_col] = A[global_a_offset]; - local_b[BN * local_b_row + local_b_col] = B[global_b_offset]; + for (uint32_t k = 0; k < dim_k; k += BK) { + for (uint32_t load_offset = 0; load_offset < BM; load_offset += 
stride_a) { + const uint32_t global_a_offset = + dim_k * (global_a_row + load_offset) + (k + local_a_col); + local_a[BK * (local_a_row + load_offset) + local_a_col] = + A[global_a_offset]; + } + for (uint32_t load_offset = 0; load_offset < BK; load_offset += stride_b) { + const uint32_t global_b_offset = + dim_n * (k + local_b_row + load_offset) + global_b_col; + local_b[BN * (local_b_row + load_offset) + local_b_col] = + B[global_b_offset]; + } vx_barrier(threadblock_id_in_core, threadblock_dim_y); vx_fence(); -#pragma GCC unroll TM for (uint32_t local_k = 0; local_k < BK; local_k++) { - // Compute multiple result elements (TM) per thread - const float local_b_tmp = local_b[BN * local_k + local_b_col]; #pragma GCC unroll TM - for (uint32_t result_idx = 0; result_idx < TM; result_idx++) { - // NOTE use of local_b_row - reg_c[result_idx] += - local_a[BK * (TM * local_b_row + result_idx) + local_k] * - local_b_tmp; + for (uint32_t res_idx_m = 0; res_idx_m < TM; res_idx_m++) { + reg_a[res_idx_m] = + local_a[BK * (TM * local_c_row + res_idx_m) + local_k]; + } +#pragma GCC unroll TN + for (uint32_t res_idx_n = 0; res_idx_n < TN; res_idx_n++) { + reg_b[res_idx_n] = + local_b[BN * local_k + (TN * local_c_col + res_idx_n)]; + } + + // Compute multiple result elements (TM) per thread +#pragma GCC unroll TM + for (uint32_t res_idx_m = 0; res_idx_m < TM; res_idx_m++) { +#pragma GCC unroll TN + for (uint32_t res_idx_n = 0; res_idx_n < TN; res_idx_n++) { + // NOTE use of local_b_row + reg_c[TN * res_idx_m + res_idx_n] += + reg_a[res_idx_m] * reg_b[res_idx_n]; + // reg_c[TN * res_idx_m + res_idx_n] += + // local_a[BK * (TM * local_c_row + res_idx_m) + local_k] * + // local_b[BN * local_k + (TN * local_c_col + res_idx_n)]; + } } } @@ -75,10 +108,14 @@ void thread_block_gemm(kernel_arg_t *__UNIFORM__ arg, } #pragma GCC unroll TM - for (uint32_t result_idx = 0; result_idx < TM; result_idx++) { - // NOTE use of local_b_row and global_b_col here - C[dim_n * (BM * threadblock_id_y + 
TM * local_b_row + result_idx) + - global_b_col] = reg_c[result_idx]; + for (uint32_t res_idx_m = 0; res_idx_m < TM; res_idx_m++) { +#pragma GCC unroll TN + for (uint32_t res_idx_n = 0; res_idx_n < TN; res_idx_n++) { + // NOTE use of local_b_row and global_b_col here + C[dim_n * (BM * threadblock_id_y + TM * local_c_row + res_idx_m) + + (BN * threadblock_id_x + TN * local_c_col + res_idx_n)] = + reg_c[TN * res_idx_m + res_idx_n]; + } } } @@ -86,10 +123,11 @@ void kernel_body(int task_id, kernel_arg_t* __UNIFORM__ arg) { // @perf: All threads are running these compute whose result is mostly same // across the threadblock - const uint32_t threadblocks_per_core = vx_num_threads() * vx_num_warps() / (BM*BK); + const uint32_t threads_per_threadblock = ((BM * BN) / (TM * TN)); + const uint32_t threadblocks_per_core = + vx_num_threads() * vx_num_warps() / threads_per_threadblock; const uint32_t threadblock_dim_x = vx_num_threads(); const uint32_t threadblock_dim_y = vx_num_warps() / threadblocks_per_core; - const uint32_t threads_per_threadblock = threadblock_dim_x * threadblock_dim_y; const int threadblock_id = task_id / threads_per_threadblock; const int threadblock_id_in_core = threadblock_id % threadblocks_per_core; const int tid_in_threadblock = task_id % threads_per_threadblock; @@ -102,7 +140,7 @@ void kernel_body(int task_id, kernel_arg_t* __UNIFORM__ arg) { float *sharedmem_per_threadblock = (float *)DEV_SMEM_START_ADDR + - (2 * threads_per_threadblock) * threadblock_id_in_core; + (2 * BM * BK) * threadblock_id_in_core; thread_block_gemm(arg, tid_in_threadblock, threadblock_dim_x, threadblock_dim_y, threadblock_id_x, threadblock_id_y, threadblock_id_in_core, @@ -111,7 +149,7 @@ void kernel_body(int task_id, kernel_arg_t* __UNIFORM__ arg) { int main() { kernel_arg_t *arg = (kernel_arg_t *)KERNEL_ARG_DEV_MEM_ADDR; - const uint32_t grid_size = arg->dim_m * arg->dim_n / TM; + const uint32_t grid_size = arg->dim_m * arg->dim_n / (TM * TN); vx_spawn_tasks(grid_size, 
(vx_spawn_tasks_cb)kernel_body, arg); return 0; } diff --git a/tests/regression/sgemm_wg/main.cpp b/tests/regression/sgemm_wg/main.cpp index 229463ef..c86f7aaf 100644 --- a/tests/regression/sgemm_wg/main.cpp +++ b/tests/regression/sgemm_wg/main.cpp @@ -147,9 +147,9 @@ int main(int argc, char *argv[]) { RT_CHECK(vx_dev_open(&device)); // FIXME: hardcoded - uint32_t dim_m = 32; - uint32_t dim_n = 32; - uint32_t dim_k = 32; + uint32_t dim_m = 64; + uint32_t dim_n = 64; + uint32_t dim_k = 64; generate_source_matrix(dim_m, dim_n, dim_k); generate_reference_matmul(dim_m, dim_n, dim_k); From a9709edae238a3cea3020f96c4ea006b79be4fd6 Mon Sep 17 00:00:00 2001 From: Sungwoong Ha Date: Fri, 1 Mar 2024 21:05:52 -0800 Subject: [PATCH 09/36] first pass --- hw/rtl/core/VX_core.sv | 40 ++++++++++++++++++++++++++-------------- 1 file changed, 26 insertions(+), 14 deletions(-) diff --git a/hw/rtl/core/VX_core.sv b/hw/rtl/core/VX_core.sv index e5e57d99..e239ea4b 100644 --- a/hw/rtl/core/VX_core.sv +++ b/hw/rtl/core/VX_core.sv @@ -331,22 +331,25 @@ module VX_core import VX_gpu_pkg::*; #( assign pipeline_perf_if.stores = perf_stores; assign pipeline_perf_if.load_latency = perf_dcache_lat; assign pipeline_perf_if.ifetch_latency = perf_icache_lat; - assign pipeline_perf_if.load_latency = perf_dcache_lat; + real instrs = commit_csr_if.instret; + real cycles = sched_csr_if.cycles; + real icache_lat = perf_icache_lat; + real ifetches = perf_ifetches; + real dcache_lat = perf_dcache_lat; + real loads = perf_loads; always @(negedge busy) begin if (!reset) begin + $display("====================CORE : %d===================",CORE_ID); $display("time : %t", $time); - $display("perf_dcache_rd_req_per_cycle: %h", perf_dcache_rd_req_per_cycle); - $display("perf_dcache_wr_req_per_cycle: %h", perf_dcache_wr_req_per_cycle); - $display("perf_dcache_rsp_per_cycle: %h", perf_dcache_rsp_per_cycle); - $display("perf_icache_pending_read_cycle: %h", perf_icache_pending_read_cycle); - 
$display("perf_dcache_pending_read_cycle: %h", perf_dcache_pending_read_cycle); - $display("perf_icache_pending_reads: %h", perf_icache_pending_reads); - $display("perf_dcache_pending_reads: %h", perf_dcache_pending_reads); - $display("perf_ifetches: %h", perf_ifetches); - $display("perf_loads: %h", perf_loads); - $display("perf_stores: %h", perf_stores); + $display("perf_dcache_rd_req_per_cycle: %d", perf_dcache_rd_req_per_cycle); + $display("perf_dcache_wr_req_per_cycle: %d", perf_dcache_wr_req_per_cycle); + $display("perf_dcache_rsp_per_cycle: %d", perf_dcache_rsp_per_cycle); + $display("perf_icache_pending_read_cycle: %d", perf_icache_pending_read_cycle); + $display("perf_dcache_pending_read_cycle: %d", perf_dcache_pending_read_cycle); + $display("perf_icache_pending_reads: %d", perf_icache_pending_reads); + $display("perf_dcache_pending_reads: %d", perf_dcache_pending_reads); $display("perf_icache_req_fire: %b", perf_icache_req_fire); $display("perf_icache_rsp_fire: %b", perf_icache_rsp_fire); $display("perf_dcache_rd_req_fire: %b", perf_dcache_rd_req_fire); @@ -354,9 +357,18 @@ module VX_core import VX_gpu_pkg::*; #( $display("perf_dcache_wr_req_fire: %b", perf_dcache_wr_req_fire); $display("perf_dcache_wr_req_fire_r: %b", perf_dcache_wr_req_fire_r); $display("perf_dcache_rsp_fire: %b", perf_dcache_rsp_fire); - $display("scheduler idle: %d", pipeline_perf_if.sched_idles[31:0]); - $display("Instruction: %d",commit_csr_if.instret[31:0]); - $display("Cycle: %d",sched_csr_if.cycles); + + $display("Instructions: %d, Cycles: %d, IPC: %f", commit_csr_if.instret, sched_csr_if.cycles, instrs/cycles); + $display("scheduler idle: %d", pipeline_perf_if.sched_idles); + $display("scheduler stalls: %d", pipeline_perf_if.sched_stalls); + $display("ibuffer stalls: %d",pipeline_perf_if.ibf_stalls); + $display("issue stalls: %d",pipeline_perf_if.scb_stalls); + $display("sfu stalls: %d",pipeline_perf_if.units_uses[2]); + $display("ifetches: %d", perf_ifetches); + 
$display("ifetch latency: %f Cycles", icache_lat/ifetches); + $display("loads: %d", perf_loads); + $display("load latency: %f Cycles", dcache_lat/loads); + $display("stores: %d", perf_stores); end end From 3c2a266d379d9eda658048248a487da04eac4a0b Mon Sep 17 00:00:00 2001 From: Sungwoong Ha Date: Fri, 1 Mar 2024 21:27:26 -0800 Subject: [PATCH 10/36] second pass --- hw/rtl/core/VX_core.sv | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/hw/rtl/core/VX_core.sv b/hw/rtl/core/VX_core.sv index e239ea4b..453ebb03 100644 --- a/hw/rtl/core/VX_core.sv +++ b/hw/rtl/core/VX_core.sv @@ -337,7 +337,18 @@ module VX_core import VX_gpu_pkg::*; #( real ifetches = perf_ifetches; real dcache_lat = perf_dcache_lat; real loads = perf_loads; + real scheduler_idles = pipeline_perf_if.sched_idles; + real scheduler_stalls = pipeline_perf_if.sched_stalls; + real ibuf_stalls = pipeline_perf_if.ibf_stalls; + real scrb_alu_per_core = pipeline_perf_if.units_uses[`EX_ALU]; + real scrb_fpu_per_core = pipeline_perf_if.units_uses[`EX_FPU]; + real scrb_lsu_per_core = pipeline_perf_if.units_uses[`EX_LSU]; + real scrb_sfu_per_core = pipeline_perf_if.units_uses[`EX_SFU]; + real scrb_tot = scrb_alu_per_core+scrb_fpu_per_core+scrb_lsu_per_core+scrb_sfu_per_core; + real scrb_wctl_per_core = pipeline_perf_if.sfu_uses[`SFU_WCTL]; + real scrb_csrs_per_core = pipeline_perf_if.sfu_uses[`SFU_CSRS]; + real sfu_tot = scrb_wctl_per_core+scrb_csrs_per_core; always @(negedge busy) begin if (!reset) begin @@ -359,11 +370,11 @@ module VX_core import VX_gpu_pkg::*; #( $display("perf_dcache_rsp_fire: %b", perf_dcache_rsp_fire); $display("Instructions: %d, Cycles: %d, IPC: %f", commit_csr_if.instret, sched_csr_if.cycles, instrs/cycles); - $display("scheduler idle: %d", pipeline_perf_if.sched_idles); - $display("scheduler stalls: %d", pipeline_perf_if.sched_stalls); - $display("ibuffer stalls: %d",pipeline_perf_if.ibf_stalls); - $display("issue stalls: 
%d",pipeline_perf_if.scb_stalls); - $display("sfu stalls: %d",pipeline_perf_if.units_uses[2]); + $display("scheduler idle: %d (%f)", pipeline_perf_if.sched_idles, scheduler_idles/cycles); + $display("scheduler stalls: %d (%f)", pipeline_perf_if.sched_stalls, scheduler_stalls/cycles); + $display("ibuffer stalls: %d (%f)",pipeline_perf_if.ibf_stalls, ibuf_stalls/cycles); + $display("issue stalls: %d(alu=%f, fpu=%f, lsu=%f, sfu=%f)",pipeline_perf_if.scb_stalls, scrb_alu_per_core/scrb_tot, scrb_fpu_per_core/scrb_tot, scrb_lsu_per_core/scrb_tot, scrb_sfu_per_core/scrb_tot); + $display("sfu stalls: %d (scrs=%f, wctl=%f)",pipeline_perf_if.units_uses[`EX_SFU], scrb_csrs_per_core/sfu_tot, scrb_wctl_per_core/sfu_tot); $display("ifetches: %d", perf_ifetches); $display("ifetch latency: %f Cycles", icache_lat/ifetches); $display("loads: %d", perf_loads); From fbe872c8314a7a6f1e79ca19c6f6dbeb66f46d90 Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Tue, 12 Mar 2024 15:34:17 -0700 Subject: [PATCH 11/36] sgemm_wg: Add missing makefile dep to common.h --- tests/regression/sgemm_wg/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/regression/sgemm_wg/Makefile b/tests/regression/sgemm_wg/Makefile index f57f6124..289369d2 100644 --- a/tests/regression/sgemm_wg/Makefile +++ b/tests/regression/sgemm_wg/Makefile @@ -1,6 +1,6 @@ PROJECT = sgemm_wg -SRCS = main.cpp +SRCS = main.cpp common.h VX_SRCS = kernel.cpp From 510a834db529f20bac2fdea09f7a92372462402e Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Tue, 12 Mar 2024 15:34:42 -0700 Subject: [PATCH 12/36] sgemm_wg: Implement software barrier for inter-core synchronization --- tests/regression/sgemm_wg/kernel.cpp | 45 +++++++++++++++++++++++----- tests/regression/sgemm_wg/main.cpp | 6 ++-- 2 files changed, 41 insertions(+), 10 deletions(-) diff --git a/tests/regression/sgemm_wg/kernel.cpp b/tests/regression/sgemm_wg/kernel.cpp index 7f33a90a..a65e1e5f 100644 --- a/tests/regression/sgemm_wg/kernel.cpp +++ 
b/tests/regression/sgemm_wg/kernel.cpp @@ -1,5 +1,6 @@ #include #include +#include #include #include "common.h" @@ -8,8 +9,35 @@ #define BK 2 // #define TM (BM/BK) // #define TN (BN/BK) -#define TM 4 -#define TN 4 +#define TM 2 +#define TN 2 + +#define DEV_BARRIER_MMIO_BASE_ADDR 0xff003f00UL +#define CORES_PER_CLUSTER 4 + +void threadblock_barrier(unsigned int barrier_id, unsigned int count) { + vx_barrier(barrier_id, count); + vx_fence(); + +#if CORES_PER_CLUSTER != 1 + if (vx_thread_id() == 0) { + volatile uint32_t *mmio = (volatile uint32_t *)(DEV_BARRIER_MMIO_BASE_ADDR); + int core_id = vx_core_id(); + const uint32_t barrier_stride = CORES_PER_CLUSTER; + const uint32_t barrier_offset = barrier_stride * barrier_id; + // 1 : 0x00 is reserved for mmio read reg + mmio[barrier_offset + 1 + core_id] = 1; + vx_printf("========== barrier written! barrier_id=%u, count=%u\n", barrier_id, count); + + // wait for other cores in the cluster to finish by waiting on the + // all-synced read-only mmio reg + while (mmio[barrier_offset] == 0); + + // reset per-core flag back to zero for the next barrier + mmio[barrier_offset + 1 + core_id] = 0; + } +#endif +} void thread_block_gemm(kernel_arg_t *__UNIFORM__ arg, const uint32_t tid_in_threadblock, @@ -73,8 +101,7 @@ void thread_block_gemm(kernel_arg_t *__UNIFORM__ arg, B[global_b_offset]; } - vx_barrier(threadblock_id_in_core, threadblock_dim_y); - vx_fence(); + threadblock_barrier(threadblock_id_in_core, threadblock_dim_y); for (uint32_t local_k = 0; local_k < BK; local_k++) { #pragma GCC unroll TM @@ -103,8 +130,7 @@ void thread_block_gemm(kernel_arg_t *__UNIFORM__ arg, } } - vx_barrier(threadblock_id_in_core, threadblock_dim_y); - vx_fence(); + threadblock_barrier(threadblock_id_in_core, threadblock_dim_y); } #pragma GCC unroll TM @@ -123,7 +149,7 @@ void kernel_body(int task_id, kernel_arg_t* __UNIFORM__ arg) { // @perf: All threads are running these compute whose result is mostly same // across the threadblock - const 
uint32_t threads_per_threadblock = ((BM * BN) / (TM * TN)); + const uint32_t threads_per_threadblock = (BM * BN) / (TM * TN); const uint32_t threadblocks_per_core = vx_num_threads() * vx_num_warps() / threads_per_threadblock; const uint32_t threadblock_dim_x = vx_num_threads(); @@ -138,6 +164,11 @@ void kernel_body(int task_id, kernel_arg_t* __UNIFORM__ arg) { const int threadblock_id_x = threadblock_id % dim_n_in_blocks; const int threadblock_id_y = threadblock_id / dim_n_in_blocks; + // initialize barrier MMIO + volatile uint32_t *barrier_mmio = (volatile uint32_t *)(DEV_BARRIER_MMIO_BASE_ADDR); + *barrier_mmio = 0; + vx_fence(); + float *sharedmem_per_threadblock = (float *)DEV_SMEM_START_ADDR + (2 * BM * BK) * threadblock_id_in_core; diff --git a/tests/regression/sgemm_wg/main.cpp b/tests/regression/sgemm_wg/main.cpp index c86f7aaf..229463ef 100644 --- a/tests/regression/sgemm_wg/main.cpp +++ b/tests/regression/sgemm_wg/main.cpp @@ -147,9 +147,9 @@ int main(int argc, char *argv[]) { RT_CHECK(vx_dev_open(&device)); // FIXME: hardcoded - uint32_t dim_m = 64; - uint32_t dim_n = 64; - uint32_t dim_k = 64; + uint32_t dim_m = 32; + uint32_t dim_n = 32; + uint32_t dim_k = 32; generate_source_matrix(dim_m, dim_n, dim_k); generate_reference_matmul(dim_m, dim_n, dim_k); From 2036d37840e2ec72f49dff3a3e06b593dbd3d610 Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Wed, 13 Mar 2024 21:32:57 -0700 Subject: [PATCH 13/36] sgemm_wg: Prevent run-ahead using ternary flags; reduce mem accesses --- tests/regression/sgemm_wg/kernel.cpp | 61 ++++++++++++++++++++-------- 1 file changed, 45 insertions(+), 16 deletions(-) diff --git a/tests/regression/sgemm_wg/kernel.cpp b/tests/regression/sgemm_wg/kernel.cpp index a65e1e5f..44299934 100644 --- a/tests/regression/sgemm_wg/kernel.cpp +++ b/tests/regression/sgemm_wg/kernel.cpp @@ -13,29 +13,61 @@ #define TN 2 #define DEV_BARRIER_MMIO_BASE_ADDR 0xff003f00UL -#define CORES_PER_CLUSTER 4 +#define CORES_PER_CLUSTER 2 +#define BARRIER_STRIDE 
4 -void threadblock_barrier(unsigned int barrier_id, unsigned int count) { +void threadblock_barrier(unsigned int tid_in_threadblock, unsigned int barrier_id, unsigned int count) { vx_barrier(barrier_id, count); vx_fence(); -#if CORES_PER_CLUSTER != 1 - if (vx_thread_id() == 0) { + // vx_printf("========== barrier! barrier_id=%u, count=%u\n", barrier_id, count); + +#if CORES_PER_CLUSTER != 0 + // this code doesn't work without the memory-mapped register implemented in + // hardware, hence the #ifdef. + + if (tid_in_threadblock == 0) { volatile uint32_t *mmio = (volatile uint32_t *)(DEV_BARRIER_MMIO_BASE_ADDR); int core_id = vx_core_id(); - const uint32_t barrier_stride = CORES_PER_CLUSTER; + // FIXME: hardcoded + const uint32_t barrier_stride = BARRIER_STRIDE; const uint32_t barrier_offset = barrier_stride * barrier_id; - // 1 : 0x00 is reserved for mmio read reg + + // wait for the barrier to be initialized + while (mmio[barrier_offset + 1 + core_id] != 0); + + // signal internal-core synchronization done mmio[barrier_offset + 1 + core_id] = 1; - vx_printf("========== barrier written! barrier_id=%u, count=%u\n", barrier_id, count); // wait for other cores in the cluster to finish by waiting on the // all-synced read-only mmio reg while (mmio[barrier_offset] == 0); - // reset per-core flag back to zero for the next barrier - mmio[barrier_offset + 1 + core_id] = 0; + // need to signal that this core passed the barrier; otherwise, if we + // reset this to 0 right away, the other core still waiting for the + // barrier might never see the all-sync mmio reg as 1. 
+ mmio[barrier_offset + 1 + core_id] = 2; + + // // if this core is the last one passing the barrier, reset all per-core + // // flags to 0 to get ready for the next barrier + // bool all_passed = true; + // for (int i = 0; i < CORES_PER_CLUSTER; i++) { + // // if (i == core_id) continue; + // // NOTE: this requires coherent access of store-to-load to the same + // // address + // if (mmio[barrier_offset + 1 + i] != 2) { + // all_passed = false; + // break; + // } + // } + // if (all_passed) { + // for (int i = 0; i < CORES_PER_CLUSTER; i++) { + // mmio[barrier_offset + 1 + i] = 0; + // } + // } } + + vx_barrier(barrier_id, count); #endif } @@ -101,7 +133,8 @@ void thread_block_gemm(kernel_arg_t *__UNIFORM__ arg, B[global_b_offset]; } - threadblock_barrier(threadblock_id_in_core, threadblock_dim_y); + threadblock_barrier(tid_in_threadblock, threadblock_id_in_core, + threadblock_dim_y); for (uint32_t local_k = 0; local_k < BK; local_k++) { #pragma GCC unroll TM @@ -130,7 +163,8 @@ void thread_block_gemm(kernel_arg_t *__UNIFORM__ arg, } } - threadblock_barrier(threadblock_id_in_core, threadblock_dim_y); + threadblock_barrier(tid_in_threadblock, threadblock_id_in_core, + threadblock_dim_y); } #pragma GCC unroll TM @@ -164,11 +198,6 @@ void kernel_body(int task_id, kernel_arg_t* __UNIFORM__ arg) { const int threadblock_id_x = threadblock_id % dim_n_in_blocks; const int threadblock_id_y = threadblock_id / dim_n_in_blocks; - // initialize barrier MMIO - volatile uint32_t *barrier_mmio = (volatile uint32_t *)(DEV_BARRIER_MMIO_BASE_ADDR); - *barrier_mmio = 0; - vx_fence(); - float *sharedmem_per_threadblock = (float *)DEV_SMEM_START_ADDR + (2 * BM * BK) * threadblock_id_in_core; From 12ee2a3a0fe2d2456461f912852e9732749c54a4 Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Mon, 18 Mar 2024 16:40:02 -0700 Subject: [PATCH 14/36] Write cluster-aware thread scheduling NOTE: cores per cluster is hardcoded as a constant --- kernel/src/vx_spawn.c | 36 
+++++++++++++++++++++++++++++++++--- 1 file changed, 33 insertions(+), 3 deletions(-) diff --git a/kernel/src/vx_spawn.c b/kernel/src/vx_spawn.c index eb0bdb90..c4c00a06 100644 --- a/kernel/src/vx_spawn.c +++ b/kernel/src/vx_spawn.c @@ -15,6 +15,8 @@ #include #include +#define CORES_PER_CLUSTER 2 + #ifdef __cplusplus extern "C" { #endif @@ -95,6 +97,30 @@ static void __attribute__ ((noinline)) spawn_tasks_contiguous_all_stub() { } } +static void __attribute__ ((noinline)) spawn_tasks_cluster_all_stub() { + int NT = vx_num_threads(); + int NW = vx_num_warps(); + int cid = vx_core_id(); + int wid = vx_warp_id(); + int tid = vx_thread_id(); + + const int core_id_in_cluster = vx_core_id() % CORES_PER_CLUSTER; + const int cluster_wid = CORES_PER_CLUSTER * wid + core_id_in_cluster; + + wspawn_tasks_args_t* p_wspawn_args = (wspawn_tasks_args_t*)g_wspawn_args[cid]; + + // FIXME: handle RW + int waves = p_wspawn_args->NWs; + int offset = p_wspawn_args->offset + (NT * cluster_wid + tid); + + vx_spawn_tasks_cb callback = p_wspawn_args->callback; + void* arg = p_wspawn_args->arg; + for (int wave_id = 0; wave_id < waves; ++wave_id) { + int task_id = offset + (wave_id * NT * NW * CORES_PER_CLUSTER); + callback(task_id, arg); + } +} + static void __attribute__ ((noinline)) spawn_tasks_rem_stub() { int cid = vx_core_id(); int tid = vx_thread_id(); @@ -110,7 +136,7 @@ static void __attribute__ ((noinline)) spawn_tasks_all_cb() { // call stub routine // spawn_tasks_all_stub(); - spawn_tasks_contiguous_all_stub(); + spawn_tasks_cluster_all_stub(); // disable warp vx_tmc_zero(); @@ -151,7 +177,11 @@ void vx_spawn_tasks(int num_tasks, vx_spawn_tasks_cb callback , void * arg) { rW = TW - fW * NW; // remaining warps } - wspawn_tasks_args_t wspawn_args = { callback, arg, core_id * tasks_per_core, fW, rW }; + int cluster_id = core_id / CORES_PER_CLUSTER; + const int tasks_per_cluster = tasks_per_core * CORES_PER_CLUSTER; + const int offset = cluster_id * tasks_per_cluster; + 
wspawn_tasks_args_t wspawn_args = { callback, arg, offset, fW, rW }; + // wspawn_tasks_args_t wspawn_args = { callback, arg, core_id * tasks_per_core, fW, rW }; g_wspawn_args[core_id] = &wspawn_args; if (TW >= 1) { @@ -163,7 +193,7 @@ void vx_spawn_tasks(int num_tasks, vx_spawn_tasks_cb callback , void * arg) { vx_tmc(-1); // call stub routine - spawn_tasks_contiguous_all_stub(); + spawn_tasks_cluster_all_stub(); // back to single-threaded vx_tmc_one(); From f590c4b41744bce9d3340f3c063c8a468c1d329c Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Sun, 24 Mar 2024 01:44:49 -0700 Subject: [PATCH 15/36] Add vx_spawn.h as dependency to kernel/Makefile --- kernel/Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/Makefile b/kernel/Makefile index 07b8c97b..575707f8 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -51,10 +51,10 @@ $(PROJECT).dump: $(PROJECT).a %.S.o: src/%.S $(CC) $(CFLAGS) -c $< -o $@ -%.cpp.o: src/%.cpp +%.cpp.o: src/%.cpp include/vx_spawn.h $(CXX) $(CFLAGS) -c $< -o $@ -%.c.o: src/%.c +%.c.o: src/%.c include/vx_spawn.h $(CC) $(CFLAGS) -c $< -o $@ $(PROJECT).a: $(OBJS) From 8f3474b15167e1fdbc04c00c3da2c2e90fd07972 Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Sun, 24 Mar 2024 01:45:08 -0700 Subject: [PATCH 16/36] Don't clean *.bin --- tests/regression/common.mk | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/regression/common.mk b/tests/regression/common.mk index 6a858edc..d38df853 100644 --- a/tests/regression/common.mk +++ b/tests/regression/common.mk @@ -115,7 +115,7 @@ clean: rm -rf $(PROJECT) *.o .depend clean-all: clean - rm -rf *.elf *.bin *.dump + rm -rf *.elf *.dump ifneq ($(MAKECMDGOALS),clean) -include .depend From 7d177492b2f99f4dd388caf2ca9d9be167f02036 Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Sun, 24 Mar 2024 01:45:30 -0700 Subject: [PATCH 17/36] Move CORES_PER_CLUSTER to vx_spawn.h --- kernel/include/vx_spawn.h | 2 ++ kernel/src/vx_spawn.c | 2 -- 2 files changed, 2 
insertions(+), 2 deletions(-) diff --git a/kernel/include/vx_spawn.h b/kernel/include/vx_spawn.h index 2584b997..321e3f83 100644 --- a/kernel/include/vx_spawn.h +++ b/kernel/include/vx_spawn.h @@ -17,6 +17,8 @@ #include #include +#define CORES_PER_CLUSTER 2 + #ifdef __cplusplus extern "C" { #endif diff --git a/kernel/src/vx_spawn.c b/kernel/src/vx_spawn.c index c4c00a06..c57e55f2 100644 --- a/kernel/src/vx_spawn.c +++ b/kernel/src/vx_spawn.c @@ -15,8 +15,6 @@ #include #include -#define CORES_PER_CLUSTER 2 - #ifdef __cplusplus extern "C" { #endif From ff401bdec0eb4a916d0fc126136620ceb0c0531c Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Sun, 24 Mar 2024 01:47:00 -0700 Subject: [PATCH 18/36] Cleanup tests/.gitignore --- tests/.gitignore | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tests/.gitignore b/tests/.gitignore index a9884992..30ca0fa4 100644 --- a/tests/.gitignore +++ b/tests/.gitignore @@ -1 +1,7 @@ **/*.log +.depend +*.bin +*.dump +*.elf +*.o +*.ll From cc7b34ec5b9dc92200acb80f78c34a689219a52b Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Tue, 26 Mar 2024 10:44:02 -0700 Subject: [PATCH 19/36] vecaddx: Write args.bin and input.bin --- tests/regression/vecaddx/main.cpp | 30 ++++++++++++++++++++++++++++-- 1 file changed, 28 insertions(+), 2 deletions(-) diff --git a/tests/regression/vecaddx/main.cpp b/tests/regression/vecaddx/main.cpp index 117f3470..4f3b77af 100644 --- a/tests/regression/vecaddx/main.cpp +++ b/tests/regression/vecaddx/main.cpp @@ -1,4 +1,5 @@ #include +#include #include #include #include @@ -201,10 +202,19 @@ int main(int argc, char *argv[]) { memcpy(staging_buf.data(), &kernel_arg, sizeof(kernel_arg_t)); RT_CHECK(vx_copy_to_dev(device, KERNEL_ARG_DEV_MEM_ADDR, staging_buf.data(), sizeof(kernel_arg_t))); + std::ofstream file("args.bin", std::ios::binary | std::ios::out); + if (!file) { + std::cerr << "error: failed to open args.bin for writing\n"; + exit(EXIT_FAILURE); + } + file.write(reinterpret_cast(staging_buf.data()), 
sizeof(kernel_arg_t)); + file.close(); + // generate source data source_data.resize(2 * num_points); for (uint32_t i = 0; i < source_data.size(); ++i) { - source_data[i] = Comparator::generate(); + // source_data[i] = Comparator::generate(); + source_data[i] = static_cast(i); } // upload source buffer0 @@ -215,6 +225,14 @@ int main(int argc, char *argv[]) { buf_ptr[i] = source_data[2 * i + 0]; } RT_CHECK(vx_copy_to_dev(device, kernel_arg.src0_addr, staging_buf.data(), buf_size)); + + std::ofstream file("input.a.bin", std::ios::binary | std::ios::out); + if (!file) { + std::cerr << "error: failed to open input.a.bin for writing\n"; + exit(EXIT_FAILURE); + } + file.write(reinterpret_cast(buf_ptr), buf_size); + file.close(); } // upload source buffer1 @@ -225,6 +243,14 @@ int main(int argc, char *argv[]) { buf_ptr[i] = source_data[2 * i + 1]; } RT_CHECK(vx_copy_to_dev(device, kernel_arg.src1_addr, staging_buf.data(), buf_size)); + + std::ofstream file("input.b.bin", std::ios::binary | std::ios::out); + if (!file) { + std::cerr << "error: failed to open input.b.bin for writing\n"; + exit(EXIT_FAILURE); + } + file.write(reinterpret_cast(buf_ptr), buf_size); + file.close(); } // clear destination buffer @@ -243,4 +269,4 @@ int main(int argc, char *argv[]) { std::cout << "PASSED!" 
<< std::endl; return 0; -} \ No newline at end of file +} From 7f00e6c37665662471ecfa54a9a2f8a3e04d2253 Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Tue, 26 Mar 2024 10:44:33 -0700 Subject: [PATCH 20/36] vecaddx: Change arg device address to 7fff0000 --- tests/regression/vecaddx/common.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/regression/vecaddx/common.h b/tests/regression/vecaddx/common.h index 2b8f164a..a7b26936 100644 --- a/tests/regression/vecaddx/common.h +++ b/tests/regression/vecaddx/common.h @@ -1,7 +1,7 @@ #ifndef _COMMON_H_ #define _COMMON_H_ -#define KERNEL_ARG_DEV_MEM_ADDR 0x7ffff000 +#define KERNEL_ARG_DEV_MEM_ADDR 0x7fff0000 #ifndef TYPE #define TYPE float From f050a08d77792c7d4ad2d83a1725776a2c929835 Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Tue, 26 Mar 2024 10:45:14 -0700 Subject: [PATCH 21/36] Write vx_spawn_tasks_cluster This scheduling logic tries to evenly distribute warps across *all* cores, instead of trying to fill up the first cores as much as possible. This scheme is necessary for the intra-cluster cores which are assumed to have equal workloads distributed. 
--- kernel/include/vx_spawn.h | 1 + kernel/src/vx_spawn.c | 81 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 82 insertions(+) diff --git a/kernel/include/vx_spawn.h b/kernel/include/vx_spawn.h index 321e3f83..06a85af7 100644 --- a/kernel/include/vx_spawn.h +++ b/kernel/include/vx_spawn.h @@ -50,6 +50,7 @@ void vx_wspawn_wait(); void vx_spawn_kernel(context_t * ctx, vx_spawn_kernel_cb callback, void * arg); void vx_spawn_tasks(int num_tasks, vx_spawn_tasks_cb callback, void * arg); +void vx_spawn_tasks_cluster(int num_tasks, vx_spawn_tasks_cb callback, void * arg); void vx_serial(vx_serial_cb callback, void * arg); diff --git a/kernel/src/vx_spawn.c b/kernel/src/vx_spawn.c index c57e55f2..04b58253 100644 --- a/kernel/src/vx_spawn.c +++ b/kernel/src/vx_spawn.c @@ -140,6 +140,87 @@ static void __attribute__ ((noinline)) spawn_tasks_all_cb() { vx_tmc_zero(); } +void vx_spawn_tasks_cluster(int num_tasks, vx_spawn_tasks_cb callback, void *arg) { + // device specs + int NC = vx_num_cores(); + int NW = vx_num_warps(); + int NT = vx_num_threads(); + + // current core id + int core_id = vx_core_id(); + if (core_id >= NUM_CORES_MAX) + return; + + // Distribute threads equally across as many cores as possible, even if they + // don't fill up NW*NT in a single core. This makes sure the warps get evenly + // distributed in a single cluster + // + // TODO: Try to contain in a single cluster if possible? + int num_active_cores = (num_tasks > NT) ? 
(num_tasks / NT) : 1; + num_active_cores = MIN(num_active_cores, NC); + if (core_id >= num_active_cores) + return; // terminate extra cores + + int tasks_per_core = num_tasks / num_active_cores; + int tasks_per_core_last = tasks_per_core; + if (core_id == (num_active_cores - 1)) { + int rem = num_tasks % num_active_cores; + tasks_per_core_last += rem; // last core also executes remaining tasks + } + + int num_full_warps = tasks_per_core_last / NT; + int rem_threads_in_last_warp = tasks_per_core_last % NT; + // sequential iterations + int num_full_waves = 1; + int rem_warps_in_last_wave = 0; + if (num_full_warps >= NW) { + // this division will result in the same value for both the last core and + // the rest + num_full_waves = num_full_warps / NW; + rem_warps_in_last_wave = num_full_warps % NW; + } + + int cluster_id = core_id / CORES_PER_CLUSTER; + const int tasks_per_cluster = tasks_per_core * CORES_PER_CLUSTER; + const int offset = cluster_id * tasks_per_cluster; + wspawn_tasks_args_t wspawn_args = {callback, arg, offset, num_full_waves, rem_warps_in_last_wave}; + g_wspawn_args[core_id] = &wspawn_args; + + if (num_full_warps >= 1) { + // execute callback on other warps + int nw = MIN(num_full_warps, NW); + vx_wspawn(nw, spawn_tasks_all_cb); + + // activate all threads + vx_tmc(-1); + + // call stub routine + spawn_tasks_cluster_all_stub(); + + // back to single-threaded + vx_tmc_one(); + + // wait for spawn warps to terminate + vx_wspawn_wait(); + } + + if (rem_threads_in_last_warp != 0) { + // adjust offset + wspawn_args.offset += (tasks_per_core_last - rem_threads_in_last_warp); + + // activate remaining threads + int tmask = (1 << rem_threads_in_last_warp) - 1; + vx_tmc(tmask); + + // call stub routine + // FIXME: unimplemented for cluster! 
+ spawn_tasks_rem_stub(); + + // back to single-threaded + vx_tmc_one(); + } +} + void vx_spawn_tasks(int num_tasks, vx_spawn_tasks_cb callback , void * arg) { // device specs int NC = vx_num_cores(); From 3729a05adccd890a991a1b6984204861de4a9ef2 Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Tue, 26 Mar 2024 16:36:57 -0700 Subject: [PATCH 22/36] vx_spawn.c: Separate cluster-based scheduling code from original --- kernel/include/vx_spawn.h | 2 ++ kernel/src/vx_spawn.c | 23 +++++++++++++++-------- 2 files changed, 17 insertions(+), 8 deletions(-) diff --git a/kernel/include/vx_spawn.h b/kernel/include/vx_spawn.h index 06a85af7..8ebbab09 100644 --- a/kernel/include/vx_spawn.h +++ b/kernel/include/vx_spawn.h @@ -17,7 +17,9 @@ #include #include +#ifndef CORES_PER_CLUSTER #define CORES_PER_CLUSTER 2 +#endif #ifdef __cplusplus extern "C" { diff --git a/kernel/src/vx_spawn.c b/kernel/src/vx_spawn.c index 04b58253..87688e1c 100644 --- a/kernel/src/vx_spawn.c +++ b/kernel/src/vx_spawn.c @@ -128,7 +128,7 @@ static void __attribute__ ((noinline)) spawn_tasks_rem_stub() { (p_wspawn_args->callback)(task_id, p_wspawn_args->arg); } -static void __attribute__ ((noinline)) spawn_tasks_all_cb() { +static void __attribute__ ((noinline)) spawn_tasks_cluster_all_cb() { // activate all threads vx_tmc(-1); @@ -140,6 +140,17 @@ static void __attribute__ ((noinline)) spawn_tasks_all_cb() { vx_tmc_zero(); } +static void __attribute__ ((noinline)) spawn_tasks_all_cb() { + // activate all threads + vx_tmc(-1); + + // call stub routine + spawn_tasks_all_stub(); + + // disable warp + vx_tmc_zero(); +} + void vx_spawn_tasks_cluster(int num_tasks, vx_spawn_tasks_cb callback, void *arg) { // device specs int NC = vx_num_cores(); @@ -189,7 +200,7 @@ void vx_spawn_tasks_cluster(int num_tasks, vx_spawn_tasks_cb callback, void *arg if (num_full_warps >= 1) { // execute callback on other warps int nw = MIN(num_full_warps, NW); - vx_wspawn(nw, spawn_tasks_all_cb); + vx_wspawn(nw, 
spawn_tasks_cluster_all_cb); // activate all threads vx_tmc(-1); @@ -256,11 +267,7 @@ void vx_spawn_tasks(int num_tasks, vx_spawn_tasks_cb callback , void * arg) { rW = TW - fW * NW; // remaining warps } - int cluster_id = core_id / CORES_PER_CLUSTER; - const int tasks_per_cluster = tasks_per_core * CORES_PER_CLUSTER; - const int offset = cluster_id * tasks_per_cluster; - wspawn_tasks_args_t wspawn_args = { callback, arg, offset, fW, rW }; - // wspawn_tasks_args_t wspawn_args = { callback, arg, core_id * tasks_per_core, fW, rW }; + wspawn_tasks_args_t wspawn_args = { callback, arg, core_id * tasks_per_core, fW, rW }; g_wspawn_args[core_id] = &wspawn_args; if (TW >= 1) { @@ -272,7 +279,7 @@ void vx_spawn_tasks(int num_tasks, vx_spawn_tasks_cb callback , void * arg) { vx_tmc(-1); // call stub routine - spawn_tasks_cluster_all_stub(); + spawn_tasks_all_stub(); // back to single-threaded vx_tmc_one(); From 4d2c0084d126b1d14252c716e320b9ede808d295 Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Tue, 26 Mar 2024 16:37:44 -0700 Subject: [PATCH 23/36] common.mk: Compile separate cluster ELF ... using -DRADIANCE, which the kernel C code uses explicitly to switch between vx_spawn_tasks and vx_spawn_tasks_cluster. This is to ease running both simX and Chipyard simulations without mixing up binaries. 
--- tests/regression/common.mk | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/tests/regression/common.mk b/tests/regression/common.mk index d38df853..087561b0 100644 --- a/tests/regression/common.mk +++ b/tests/regression/common.mk @@ -78,17 +78,23 @@ endif endif endif -all: $(PROJECT) kernel.bin kernel.dump +all: $(PROJECT) kernel.bin kernel.dump kernel.radiance.dump kernel.dump: kernel.elf $(VX_DP) -D kernel.elf > kernel.dump -kernel.bin: kernel.elf +kernel.radiance.dump: kernel.radiance.elf + $(VX_DP) -D kernel.radiance.elf > kernel.radiance.dump + +kernel.bin: kernel.elf kernel.radiance.elf $(VX_CP) -O binary kernel.elf kernel.bin kernel.elf: $(VX_SRCS) $(VX_CXX) $(VX_CFLAGS) $(VX_SRCS) $(VX_LDFLAGS) -o kernel.elf +kernel.radiance.elf: $(VX_SRCS) + $(VX_CXX) $(VX_CFLAGS) $(VX_SRCS) $(VX_LDFLAGS) -DRADIANCE -o kernel.radiance.elf + $(PROJECT): $(SRCS) $(CXX) $(CXXFLAGS) $^ $(LDFLAGS) -o $@ From b54580949604e19093610c1d9d5f6dc1697b6703 Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Tue, 26 Mar 2024 16:42:36 -0700 Subject: [PATCH 24/36] vecaddx: Use -DRADIANCE --- tests/regression/vecaddx/kernel.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tests/regression/vecaddx/kernel.cpp b/tests/regression/vecaddx/kernel.cpp index 6ed42164..6e782586 100644 --- a/tests/regression/vecaddx/kernel.cpp +++ b/tests/regression/vecaddx/kernel.cpp @@ -13,6 +13,10 @@ void kernel_body(int task_id, kernel_arg_t* __UNIFORM__ arg) { int main() { kernel_arg_t* arg = (kernel_arg_t*)KERNEL_ARG_DEV_MEM_ADDR; +#ifdef RADIANCE + vx_spawn_tasks_cluster(arg->num_points, (vx_spawn_tasks_cb)kernel_body, arg); +#else vx_spawn_tasks(arg->num_points, (vx_spawn_tasks_cb)kernel_body, arg); +#endif return 0; } From df1f7f242a05d2d8fa21e3cd29994943545a121f Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Tue, 26 Mar 2024 23:51:59 -0700 Subject: [PATCH 25/36] vx_spawn.c: Implement spawn_tasks_cluster_rem_stub --- kernel/src/vx_spawn.c | 29 
++++++++++++++++++++++++----- 1 file changed, 24 insertions(+), 5 deletions(-) diff --git a/kernel/src/vx_spawn.c b/kernel/src/vx_spawn.c index 87688e1c..fb36b0bc 100644 --- a/kernel/src/vx_spawn.c +++ b/kernel/src/vx_spawn.c @@ -102,14 +102,15 @@ static void __attribute__ ((noinline)) spawn_tasks_cluster_all_stub() { int wid = vx_warp_id(); int tid = vx_thread_id(); - const int core_id_in_cluster = vx_core_id() % CORES_PER_CLUSTER; - const int cluster_wid = CORES_PER_CLUSTER * wid + core_id_in_cluster; + const int core_id_in_cluster = cid % CORES_PER_CLUSTER; + // round-robin warp_id allocation across cores in cluster + const int wid_in_cluster = CORES_PER_CLUSTER * wid + core_id_in_cluster; wspawn_tasks_args_t* p_wspawn_args = (wspawn_tasks_args_t*)g_wspawn_args[cid]; // FIXME: handle RW int waves = p_wspawn_args->NWs; - int offset = p_wspawn_args->offset + (NT * cluster_wid + tid); + int offset = p_wspawn_args->offset + (NT * wid_in_cluster + tid); vx_spawn_tasks_cb callback = p_wspawn_args->callback; void* arg = p_wspawn_args->arg; @@ -128,6 +129,25 @@ static void __attribute__ ((noinline)) spawn_tasks_rem_stub() { (p_wspawn_args->callback)(task_id, p_wspawn_args->arg); } +static void __attribute__ ((noinline)) spawn_tasks_cluster_rem_stub() { + int NT = vx_num_threads(); + int cid = vx_core_id(); + int tid = vx_thread_id(); + int wid = vx_warp_id(); + + const int core_id_in_cluster = cid % CORES_PER_CLUSTER; + // round-robin warp_id allocation across cores in cluster + const int wid_in_cluster = CORES_PER_CLUSTER * wid + core_id_in_cluster; + + wspawn_tasks_args_t* p_wspawn_args = (wspawn_tasks_args_t*)g_wspawn_args[cid]; + // FIXME: This assumes that all cores but the last one are working with full + // warps, and only the last core has a partially-filled warp. 
+ int offset = p_wspawn_args->offset + (NT * wid_in_cluster + tid); + + int task_id = offset; + (p_wspawn_args->callback)(task_id, p_wspawn_args->arg); +} + static void __attribute__ ((noinline)) spawn_tasks_cluster_all_cb() { // activate all threads vx_tmc(-1); @@ -224,8 +244,7 @@ void vx_spawn_tasks_cluster(int num_tasks, vx_spawn_tasks_cb callback, void *arg vx_tmc(tmask); // call stub routine - // FIXME: unimplemented for cluster! - spawn_tasks_rem_stub(); + spawn_tasks_cluster_rem_stub(); // back to single-threaded vx_tmc_one(); From 4e834f21035c72e93a16cc0cfbf7b190b16324a1 Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Wed, 27 Mar 2024 15:09:45 -0700 Subject: [PATCH 26/36] vx_spawn.c: Rewrite cluster-based vx_spawn_tasks variant Implements round-robin allocation of warps to cores & maintains contiguous thread ID allocation to neighboring threads. Also handles partially-enabled remainder warp logic. TODO: Hardcodes only 1 cluster in the system. --- kernel/src/vx_spawn.c | 88 ++++++++++++++++++++----------------------- 1 file changed, 40 insertions(+), 48 deletions(-) diff --git a/kernel/src/vx_spawn.c b/kernel/src/vx_spawn.c index fb36b0bc..8e5002f4 100644 --- a/kernel/src/vx_spawn.c +++ b/kernel/src/vx_spawn.c @@ -74,27 +74,6 @@ static void __attribute__ ((noinline)) spawn_tasks_all_stub() { } } -static void __attribute__ ((noinline)) spawn_tasks_contiguous_all_stub() { - int NT = vx_num_threads(); - int NW = vx_num_warps(); - int cid = vx_core_id(); - int wid = vx_warp_id(); - int tid = vx_thread_id(); - - wspawn_tasks_args_t* p_wspawn_args = (wspawn_tasks_args_t*)g_wspawn_args[cid]; - - // FIXME: handle RW - int waves = p_wspawn_args->NWs; - int offset = p_wspawn_args->offset + (NT * wid + tid); - - vx_spawn_tasks_cb callback = p_wspawn_args->callback; - void* arg = p_wspawn_args->arg; - for (int wave_id = 0; wave_id < waves; ++wave_id) { - int task_id = offset + (wave_id * NT * NW); - callback(task_id, arg); - } -} - static void __attribute__ 
((noinline)) spawn_tasks_cluster_all_stub() { int NT = vx_num_threads(); int NW = vx_num_warps(); @@ -109,11 +88,13 @@ static void __attribute__ ((noinline)) spawn_tasks_cluster_all_stub() { wspawn_tasks_args_t* p_wspawn_args = (wspawn_tasks_args_t*)g_wspawn_args[cid]; // FIXME: handle RW - int waves = p_wspawn_args->NWs; + int waves = p_wspawn_args->NWs + (wid < p_wspawn_args->RWs); int offset = p_wspawn_args->offset + (NT * wid_in_cluster + tid); vx_spawn_tasks_cb callback = p_wspawn_args->callback; void* arg = p_wspawn_args->arg; + + // sequential iterations for (int wave_id = 0; wave_id < waves; ++wave_id) { int task_id = offset + (wave_id * NT * NW * CORES_PER_CLUSTER); callback(task_id, arg); @@ -171,6 +152,9 @@ static void __attribute__ ((noinline)) spawn_tasks_all_cb() { vx_tmc_zero(); } +// This function runs in every core, but with only 1 warp and 1 thread enabled. +// The logic in this function figures out how many warps/threads this particular +// core has to enable to fulfill an entire grid of computation. void vx_spawn_tasks_cluster(int num_tasks, vx_spawn_tasks_cb callback, void *arg) { // device specs int NC = vx_num_cores(); @@ -181,45 +165,49 @@ void vx_spawn_tasks_cluster(int num_tasks, vx_spawn_tasks_cb callback, void *arg int core_id = vx_core_id(); if (core_id >= NUM_CORES_MAX) return; + const int cluster_id = core_id / CORES_PER_CLUSTER; + const int core_id_in_cluster = core_id % CORES_PER_CLUSTER; // Distribute threads equally across as many cores as possible, even if they // don't fill up NW*NT in a single core. This makes sure the warps get evenly // distributed in a single cluster // // TODO: Try to contain in a single cluster if possible? - int num_active_cores = (num_tasks > NT) ? 
(num_tasks / NT) : 1; - num_active_cores = MIN(num_active_cores, NC); + const int num_active_cores = (num_tasks + (NT - 1)) / NT; if (core_id >= num_active_cores) return; // terminate extra cores - int tasks_per_core = num_tasks / num_active_cores; - int tasks_per_core_last = tasks_per_core; - if (core_id == (num_active_cores - 1)) { - int rem = num_tasks % num_active_cores; - tasks_per_core_last += rem; // last core also executes remaining tasks + // FIXME: HARDCODES 1 CLUSTER! + const int num_tasks_this_cluster = num_tasks; + const int num_full_warps = num_tasks_this_cluster / NT; + const int rem_threads_in_last_warp = num_tasks_this_cluster % NT; + // const int num_warps = (num_tasks_this_cluster + (NT - 1)) / NT; + + int num_warps_this_core = num_full_warps / CORES_PER_CLUSTER; + const int num_warps_in_last_row = num_full_warps % CORES_PER_CLUSTER; + if (core_id_in_cluster < num_warps_in_last_row) { + num_warps_this_core++; + } + // if 0, last warp is full-threads enabled + int rem_threads_in_last_warp_this_core = 0; + if (rem_threads_in_last_warp != 0) { + if (core_id_in_cluster == num_warps_in_last_row - 1) { + rem_threads_in_last_warp_this_core = rem_threads_in_last_warp; + } } - int num_full_warps = tasks_per_core_last / NT; - int rem_threads_in_last_warp = tasks_per_core_last % NT; // sequential iterations - int num_full_waves = 1; - int rem_warps_in_last_wave = 0; - if (num_full_warps >= NW) { - // this division will result in the same value for both the last core and - // the rest - num_full_waves = num_full_warps / NW; - rem_warps_in_last_wave = num_full_warps % NW; - } + const int num_full_waves = num_warps_this_core / NW; + const int rem_full_warps_in_last_wave = num_warps_this_core % NW; - int cluster_id = core_id / CORES_PER_CLUSTER; - const int tasks_per_cluster = tasks_per_core * CORES_PER_CLUSTER; - const int offset = cluster_id * tasks_per_cluster; - wspawn_tasks_args_t wspawn_args = {callback, arg, offset, num_full_waves, 
rem_warps_in_last_wave}; + const const int offset = cluster_id * num_tasks_this_cluster; + wspawn_tasks_args_t wspawn_args = {callback, arg, offset, num_full_waves, + rem_full_warps_in_last_wave}; g_wspawn_args[core_id] = &wspawn_args; - if (num_full_warps >= 1) { + if (num_warps_this_core > 0) { // execute callback on other warps - int nw = MIN(num_full_warps, NW); + const int nw = MIN(num_warps_this_core, NW); vx_wspawn(nw, spawn_tasks_cluster_all_cb); // activate all threads @@ -235,12 +223,16 @@ void vx_spawn_tasks_cluster(int num_tasks, vx_spawn_tasks_cb callback, void *arg vx_wspawn_wait(); } - if (rem_threads_in_last_warp != 0) { + // TODO: Instead of launching an additional wave just to work on remaining + // threads, handle this in the last wave amongst other full warps. + if (rem_threads_in_last_warp != 0 && core_id_in_cluster == 0) { // adjust offset - wspawn_args.offset += (tasks_per_core_last - rem_threads_in_last_warp); + // FIXME: consider cluster_id here + // FIXME: use rem_threads_in_last_warp_this_core + wspawn_args.offset += (num_tasks_this_cluster - rem_threads_in_last_warp); // activate remaining threads - int tmask = (1 << rem_threads_in_last_warp) - 1; + const int tmask = (1 << rem_threads_in_last_warp) - 1; vx_tmc(tmask); // call stub routine From fa6adceb7e48b3a70454acc0f138ce2b705ea437 Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Wed, 27 Mar 2024 15:15:38 -0700 Subject: [PATCH 27/36] vecaddx: Hardcode args/input device address to match chipyard Don't use mem_alloc/mem_free API --- tests/regression/vecaddx/main.cpp | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/tests/regression/vecaddx/main.cpp b/tests/regression/vecaddx/main.cpp index 4f3b77af..e25ad5b4 100644 --- a/tests/regression/vecaddx/main.cpp +++ b/tests/regression/vecaddx/main.cpp @@ -107,9 +107,9 @@ static void parse_args(int argc, char **argv) { void cleanup() { if (device) { - vx_mem_free(device, kernel_arg.src0_addr); - vx_mem_free(device, 
kernel_arg.src1_addr); - vx_mem_free(device, kernel_arg.dst_addr); + // vx_mem_free(device, kernel_arg.src0_addr); + // vx_mem_free(device, kernel_arg.src1_addr); + // vx_mem_free(device, kernel_arg.dst_addr); vx_dev_close(device); } } @@ -182,9 +182,12 @@ int main(int argc, char *argv[]) { // allocate device memory std::cout << "allocate device memory" << std::endl; - RT_CHECK(vx_mem_alloc(device, buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.src0_addr)); - RT_CHECK(vx_mem_alloc(device, buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.src1_addr)); - RT_CHECK(vx_mem_alloc(device, buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.dst_addr)); + // RT_CHECK(vx_mem_alloc(device, buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.src0_addr)); + // RT_CHECK(vx_mem_alloc(device, buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.src1_addr)); + // RT_CHECK(vx_mem_alloc(device, buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.dst_addr)); + kernel_arg.src0_addr = 0x20000UL; + kernel_arg.src1_addr = 0x28000UL; + kernel_arg.dst_addr = 0xc0000000UL; kernel_arg.num_points = num_points; From 870846f20fbe3d3ab307a32b6bbbd8d009901983 Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Wed, 27 Mar 2024 15:38:52 -0700 Subject: [PATCH 28/36] vx_spawn.c: Create separate vx_spawn_tasks_contiguous --- kernel/include/vx_spawn.h | 1 + kernel/src/vx_spawn.c | 105 +++++++++++++++++++++++++++++++++++++- 2 files changed, 104 insertions(+), 2 deletions(-) diff --git a/kernel/include/vx_spawn.h b/kernel/include/vx_spawn.h index 8ebbab09..84dad2bc 100644 --- a/kernel/include/vx_spawn.h +++ b/kernel/include/vx_spawn.h @@ -53,6 +53,7 @@ void vx_spawn_kernel(context_t * ctx, vx_spawn_kernel_cb callback, void * arg); void vx_spawn_tasks(int num_tasks, vx_spawn_tasks_cb callback, void * arg); void vx_spawn_tasks_cluster(int num_tasks, vx_spawn_tasks_cb callback, void * arg); +void vx_spawn_tasks_contiguous(int num_tasks, vx_spawn_tasks_cb callback , void * arg); void vx_serial(vx_serial_cb callback, void * arg); diff --git a/kernel/src/vx_spawn.c 
b/kernel/src/vx_spawn.c index 8e5002f4..278516a3 100644 --- a/kernel/src/vx_spawn.c +++ b/kernel/src/vx_spawn.c @@ -74,6 +74,26 @@ static void __attribute__ ((noinline)) spawn_tasks_all_stub() { } } +static void __attribute__ ((noinline)) spawn_tasks_contiguous_all_stub() { + int NT = vx_num_threads(); + int NW = vx_num_warps(); + int cid = vx_core_id(); + int wid = vx_warp_id(); + int tid = vx_thread_id(); + + wspawn_tasks_args_t* p_wspawn_args = (wspawn_tasks_args_t*)g_wspawn_args[cid]; + + int waves = p_wspawn_args->NWs + (wid < p_wspawn_args->RWs); + int offset = p_wspawn_args->offset + (NT * wid + tid); + + vx_spawn_tasks_cb callback = p_wspawn_args->callback; + void* arg = p_wspawn_args->arg; + for (int wave_id = 0; wave_id < waves; ++wave_id) { + int task_id = offset + (wave_id * NT * NW); + callback(task_id, arg); + } +} + static void __attribute__ ((noinline)) spawn_tasks_cluster_all_stub() { int NT = vx_num_threads(); int NW = vx_num_warps(); @@ -87,7 +107,6 @@ static void __attribute__ ((noinline)) spawn_tasks_cluster_all_stub() { wspawn_tasks_args_t* p_wspawn_args = (wspawn_tasks_args_t*)g_wspawn_args[cid]; - // FIXME: handle RW int waves = p_wspawn_args->NWs + (wid < p_wspawn_args->RWs); int offset = p_wspawn_args->offset + (NT * wid_in_cluster + tid); @@ -129,12 +148,22 @@ static void __attribute__ ((noinline)) spawn_tasks_cluster_rem_stub() { (p_wspawn_args->callback)(task_id, p_wspawn_args->arg); } +static void __attribute__ ((noinline)) spawn_tasks_contiguous_all_cb() { + // activate all threads + vx_tmc(-1); + + // call stub routine + spawn_tasks_contiguous_all_stub(); + + // disable warp + vx_tmc_zero(); +} + static void __attribute__ ((noinline)) spawn_tasks_cluster_all_cb() { // activate all threads vx_tmc(-1); // call stub routine - // spawn_tasks_all_stub(); spawn_tasks_cluster_all_stub(); // disable warp @@ -243,6 +272,78 @@ void vx_spawn_tasks_cluster(int num_tasks, vx_spawn_tasks_cb callback, void *arg } } +void 
vx_spawn_tasks_contiguous(int num_tasks, vx_spawn_tasks_cb callback , void * arg) { + // device specs + int NC = vx_num_cores(); + int NW = vx_num_warps(); + int NT = vx_num_threads(); + + // current core id + int core_id = vx_core_id(); + if (core_id >= NUM_CORES_MAX) + return; + + // calculate necessary active cores + int WT = NW * NT; + int nC = (num_tasks > WT) ? (num_tasks / WT) : 1; + int nc = MIN(nC, NC); + if (core_id >= nc) + return; // terminate extra cores + + // number of tasks per core + int tasks_per_core = num_tasks / nc; + int tasks_per_core_n1 = tasks_per_core; + if (core_id == (nc-1)) { + int rem = num_tasks - (nc * tasks_per_core); + tasks_per_core_n1 += rem; // last core also executes remaining tasks + } + + // number of tasks per warp + int TW = tasks_per_core_n1 / NT; // occupied warps + int rT = tasks_per_core_n1 - TW * NT; // remaining threads + int fW = 1, rW = 0; + if (TW >= NW) { + fW = TW / NW; // full warps iterations + rW = TW - fW * NW; // remaining warps + } + + wspawn_tasks_args_t wspawn_args = { callback, arg, core_id * tasks_per_core, fW, rW }; + g_wspawn_args[core_id] = &wspawn_args; + + if (TW >= 1) { + // execute callback on other warps + int nw = MIN(TW, NW); + vx_wspawn(nw, spawn_tasks_contiguous_all_cb); + + // activate all threads + vx_tmc(-1); + + // call stub routine + spawn_tasks_contiguous_all_stub(); + + // back to single-threaded + vx_tmc_one(); + + // wait for spawn warps to terminate + vx_wspawn_wait(); + } + + if (rT != 0) { + // adjust offset + wspawn_args.offset += (tasks_per_core_n1 - rT); + + // activate remaining threads + int tmask = (1 << rT) - 1; + vx_tmc(tmask); + + // call stub routine + spawn_tasks_rem_stub(); + + // back to single-threaded + vx_tmc_one(); + } +} + void vx_spawn_tasks(int num_tasks, vx_spawn_tasks_cb callback , void * arg) { // device specs int NC = vx_num_cores(); From 09822764e7f3931edea74198eb86917ccd758086 Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Wed, 27 Mar 2024 22:43:25 
-0700 Subject: [PATCH 29/36] sgemm_wg: Remove software-based barrier implementation Intra-cluster barrier is now implemented in hardware, transparent to the ISA. --- tests/regression/sgemm_wg/kernel.cpp | 54 ---------------------------- 1 file changed, 54 deletions(-) diff --git a/tests/regression/sgemm_wg/kernel.cpp b/tests/regression/sgemm_wg/kernel.cpp index 44299934..78f056fa 100644 --- a/tests/regression/sgemm_wg/kernel.cpp +++ b/tests/regression/sgemm_wg/kernel.cpp @@ -12,63 +12,9 @@ #define TM 2 #define TN 2 -#define DEV_BARRIER_MMIO_BASE_ADDR 0xff003f00UL -#define CORES_PER_CLUSTER 2 -#define BARRIER_STRIDE 4 - void threadblock_barrier(unsigned int tid_in_threadblock, unsigned int barrier_id, unsigned int count) { - vx_barrier(barrier_id, count); vx_fence(); - - // vx_printf("========== barrier! barrier_id=%u, count=%u\n", barrier_id, count); - -#if CORES_PER_CLUSTER != 0 - // this code doesn't work without the memory-mapped register implemented in - // hardware, hence the #ifdef. - - if (tid_in_threadblock == 0) { - volatile uint32_t *mmio = (volatile uint32_t *)(DEV_BARRIER_MMIO_BASE_ADDR); - int core_id = vx_core_id(); - // FIXME: hardcoded - const uint32_t barrier_stride = BARRIER_STRIDE; - const uint32_t barrier_offset = barrier_stride * barrier_id; - - // wait for the barrier to be initialized - while (mmio[barrier_offset + 1 + core_id] != 0); - - // signal internal-core synchronization done - mmio[barrier_offset + 1 + core_id] = 1; - - // wait for other cores in the cluster to finish by waiting on the - // all-synced read-only mmio reg - while (mmio[barrier_offset] == 0); - - // need to signal that this core passed the barrier; otherwise, if we - // reset this to 0 right away, the other core still waiting for the - // barrier might never see the all-sync mmio reg as 1. 
- mmio[barrier_offset + 1 + core_id] = 2; - - // // if this core is the last one passing the barrier, reset all per-core - // // flags to 0 to get ready for the next barrier - // bool all_passed = true; - // for (int i = 0; i < CORES_PER_CLUSTER; i++) { - // // if (i == core_id) continue; - // // NOTE: this requires coherent access of store-to-load to the same - // // address - // if (mmio[barrier_offset + 1 + i] != 2) { - // all_passed = false; - // break; - // } - // } - // if (all_passed) { - // for (int i = 0; i < CORES_PER_CLUSTER; i++) { - // mmio[barrier_offset + 1 + i] = 0; - // } - // } - } - vx_barrier(barrier_id, count); -#endif } void thread_block_gemm(kernel_arg_t *__UNIFORM__ arg, From 9555b790e71b2df7d79ea70ba49ba6d34809b552 Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Wed, 27 Mar 2024 22:45:51 -0700 Subject: [PATCH 30/36] sgemm_wg: ifdef-guard cluster specific code --- tests/regression/sgemm_wg/kernel.cpp | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/tests/regression/sgemm_wg/kernel.cpp b/tests/regression/sgemm_wg/kernel.cpp index 78f056fa..d34861a7 100644 --- a/tests/regression/sgemm_wg/kernel.cpp +++ b/tests/regression/sgemm_wg/kernel.cpp @@ -130,8 +130,13 @@ void kernel_body(int task_id, kernel_arg_t* __UNIFORM__ arg) { // across the threadblock const uint32_t threads_per_threadblock = (BM * BN) / (TM * TN); +#ifdef RADIANCE + const uint32_t threadblocks_per_core = + vx_num_threads() * vx_num_warps() / (threads_per_threadblock / CORES_PER_CLUSTER); +#else const uint32_t threadblocks_per_core = vx_num_threads() * vx_num_warps() / threads_per_threadblock; +#endif const uint32_t threadblock_dim_x = vx_num_threads(); const uint32_t threadblock_dim_y = vx_num_warps() / threadblocks_per_core; const int threadblock_id = task_id / threads_per_threadblock; @@ -156,6 +161,12 @@ void kernel_body(int task_id, kernel_arg_t* __UNIFORM__ arg) { int main() { kernel_arg_t *arg = (kernel_arg_t *)KERNEL_ARG_DEV_MEM_ADDR; const 
uint32_t grid_size = arg->dim_m * arg->dim_n / (TM * TN); - vx_spawn_tasks(grid_size, (vx_spawn_tasks_cb)kernel_body, arg); +#ifdef RADIANCE + vx_spawn_tasks_cluster(grid_size, (vx_spawn_tasks_cb)kernel_body, arg); +#else + // NOTE: This kernel assumes contiguous thread scheduling for threadblock + // allocation, and therefore does not work with original vx_spawn_tasks + vx_spawn_tasks_contiguous(grid_size, (vx_spawn_tasks_cb)kernel_body, arg); +#endif return 0; } From 9673db4e8cae4dd6fc66682761c0f310c0b95f66 Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Thu, 28 Mar 2024 17:35:47 -0700 Subject: [PATCH 31/36] sgemm_wg: Fix possible divide-by-0 --- tests/regression/sgemm_wg/kernel.cpp | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/tests/regression/sgemm_wg/kernel.cpp b/tests/regression/sgemm_wg/kernel.cpp index d34861a7..5fc1b8b8 100644 --- a/tests/regression/sgemm_wg/kernel.cpp +++ b/tests/regression/sgemm_wg/kernel.cpp @@ -132,7 +132,7 @@ void kernel_body(int task_id, kernel_arg_t* __UNIFORM__ arg) { const uint32_t threads_per_threadblock = (BM * BN) / (TM * TN); #ifdef RADIANCE const uint32_t threadblocks_per_core = - vx_num_threads() * vx_num_warps() / (threads_per_threadblock / CORES_PER_CLUSTER); + vx_num_threads() * vx_num_warps() / threads_per_threadblock * CORES_PER_CLUSTER; #else const uint32_t threadblocks_per_core = vx_num_threads() * vx_num_warps() / threads_per_threadblock; @@ -149,13 +149,13 @@ void kernel_body(int task_id, kernel_arg_t* __UNIFORM__ arg) { const int threadblock_id_x = threadblock_id % dim_n_in_blocks; const int threadblock_id_y = threadblock_id / dim_n_in_blocks; + // "static" shared memory allocation. 
This would determine threadblock + // occupancy of a single cluster float *sharedmem_per_threadblock = - (float *)DEV_SMEM_START_ADDR + - (2 * BM * BK) * threadblock_id_in_core; - thread_block_gemm(arg, tid_in_threadblock, - threadblock_dim_x, threadblock_dim_y, threadblock_id_x, - threadblock_id_y, threadblock_id_in_core, - sharedmem_per_threadblock); + (float *)DEV_SMEM_START_ADDR + (2 * BM * BK) * threadblock_id_in_core; + thread_block_gemm(arg, tid_in_threadblock, threadblock_dim_x, + threadblock_dim_y, threadblock_id_x, threadblock_id_y, + threadblock_id_in_core, sharedmem_per_threadblock); } int main() { From a9b0814211b760b2b0299614ead326af1e989c46 Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Thu, 28 Mar 2024 18:17:00 -0700 Subject: [PATCH 32/36] sgemm_wg: Document tiling parameter constraints --- tests/regression/sgemm_wg/kernel.cpp | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/tests/regression/sgemm_wg/kernel.cpp b/tests/regression/sgemm_wg/kernel.cpp index 5fc1b8b8..11612db1 100644 --- a/tests/regression/sgemm_wg/kernel.cpp +++ b/tests/regression/sgemm_wg/kernel.cpp @@ -4,11 +4,20 @@ #include #include "common.h" +// Constraints on parameters: +// * Memory: +// (BM + BN) * BK * sizeof(float) <= sharedmem size. +// BM * BK == BN * BK >= threadblock size >= NT * CORES_PER_CLUSTER +// When larger, the kernel runs a sequential loop to read into sharedmem; +// but smaller case is not handled. +// * Compute: +// ( M* N) / (TM*TN) == grid size >= NC*NW*NT +// (BM*BN) / (TM*TN) == threadblock size >= NT * CORES_PER_CLUSTER +// * Combining BM * BK >= (BM*BN) / (TM*TN) == threadblock yields +// BM <= BK*TM*TN. 
#define BM 8 #define BN BM #define BK 2 -// #define TM (BM/BK) -// #define TN (BN/BK) #define TM 2 #define TN 2 @@ -82,7 +91,9 @@ void thread_block_gemm(kernel_arg_t *__UNIFORM__ arg, threadblock_barrier(tid_in_threadblock, threadblock_id_in_core, threadblock_dim_y); + // Compute single tile*tile matmul for (uint32_t local_k = 0; local_k < BK; local_k++) { + // First, pump data from SMEM->RF #pragma GCC unroll TM for (uint32_t res_idx_m = 0; res_idx_m < TM; res_idx_m++) { reg_a[res_idx_m] = @@ -94,7 +105,7 @@ void thread_block_gemm(kernel_arg_t *__UNIFORM__ arg, local_b[BN * local_k + (TN * local_c_col + res_idx_n)]; } - // Compute multiple result elements (TM) per thread + // Next, compute multiple result elements (TM*TN) by reusing data in RF #pragma GCC unroll TM for (uint32_t res_idx_m = 0; res_idx_m < TM; res_idx_m++) { #pragma GCC unroll TN @@ -113,6 +124,7 @@ void thread_block_gemm(kernel_arg_t *__UNIFORM__ arg, threadblock_dim_y); } + // Store result data from RF to GMEM #pragma GCC unroll TM for (uint32_t res_idx_m = 0; res_idx_m < TM; res_idx_m++) { #pragma GCC unroll TN From e4eec8ab4d2260eafcfbd98d18addb840dcb37c1 Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Thu, 28 Mar 2024 20:16:44 -0700 Subject: [PATCH 33/36] vx_spawn.c: Handle num_clusters > 1 WIP: still assumes num_tasks is divisible by num_cluster --- kernel/src/vx_spawn.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/kernel/src/vx_spawn.c b/kernel/src/vx_spawn.c index 278516a3..9ea45ded 100644 --- a/kernel/src/vx_spawn.c +++ b/kernel/src/vx_spawn.c @@ -148,7 +148,7 @@ static void __attribute__ ((noinline)) spawn_tasks_cluster_rem_stub() { (p_wspawn_args->callback)(task_id, p_wspawn_args->arg); } -static void __attribute__ ((noinline)) spawn_tasks_contiguous_all_cb() { +static void __attribute__ ((noinline)) spawn_tasks_contiguous_all_cb() { // activate all threads vx_tmc(-1); @@ -159,7 +159,7 @@ static void __attribute__ ((noinline)) 
spawn_tasks_contiguous_all_cb() { vx_tmc_zero(); } -static void __attribute__ ((noinline)) spawn_tasks_cluster_all_cb() { +static void __attribute__ ((noinline)) spawn_tasks_cluster_all_cb() { // activate all threads vx_tmc(-1); @@ -186,9 +186,11 @@ static void __attribute__ ((noinline)) spawn_tasks_all_cb() { // core has to enable to fulfill an entire grid of computation. void vx_spawn_tasks_cluster(int num_tasks, vx_spawn_tasks_cb callback, void *arg) { // device specs - int NC = vx_num_cores(); - int NW = vx_num_warps(); - int NT = vx_num_threads(); + const int NC = vx_num_cores(); + const int NW = vx_num_warps(); + const int NT = vx_num_threads(); + // NOTE: assumes divisible + const int num_cluster = NC / CORES_PER_CLUSTER; // current core id int core_id = vx_core_id(); @@ -206,8 +208,8 @@ void vx_spawn_tasks_cluster(int num_tasks, vx_spawn_tasks_cb callback, void *arg if (core_id >= num_active_cores) return; // terminate extra cores - // FIXME: HARDCODES 1 CLUSTER! - const int num_tasks_this_cluster = num_tasks; + // FIXME: assumes num_tasks is divisible by num_cluster + const int num_tasks_this_cluster = num_tasks / num_cluster; const int num_full_warps = num_tasks_this_cluster / NT; const int rem_threads_in_last_warp = num_tasks_this_cluster % NT; // const int num_warps = (num_tasks_this_cluster + (NT - 1)) / NT; From 537b97eb202b248ea3d0e228a62240cce862468f Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Thu, 28 Mar 2024 20:17:26 -0700 Subject: [PATCH 34/36] common.mk: Don't clean all *.elf --- tests/regression/common.mk | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/regression/common.mk b/tests/regression/common.mk index 087561b0..8f4c4db1 100644 --- a/tests/regression/common.mk +++ b/tests/regression/common.mk @@ -121,7 +121,7 @@ clean: rm -rf $(PROJECT) *.o .depend clean-all: clean - rm -rf *.elf *.dump + rm -rf kernel.elf kernel.radiance.elf *.dump ifneq ($(MAKECMDGOALS),clean) -include .depend From 
fa2b6e2ad0d27da4dfae778626a714281c2ad505 Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Fri, 29 Mar 2024 02:48:29 -0700 Subject: [PATCH 35/36] sgemm_wg: Explicitly limit unroll to reduce stack spilling This needs to be done case-by-case for different BK/TM/TN combinations and examining the assembly. --- tests/regression/sgemm_wg/kernel.cpp | 40 +++++++++++++++++----------- 1 file changed, 24 insertions(+), 16 deletions(-) diff --git a/tests/regression/sgemm_wg/kernel.cpp b/tests/regression/sgemm_wg/kernel.cpp index 11612db1..4833154c 100644 --- a/tests/regression/sgemm_wg/kernel.cpp +++ b/tests/regression/sgemm_wg/kernel.cpp @@ -12,14 +12,15 @@ // but smaller case is not handled. // * Compute: // ( M* N) / (TM*TN) == grid size >= NC*NW*NT +// (BM*BN) / (TM*TN) == threadblock size < NT * NW * CORES_PER_CLUSTER // (BM*BN) / (TM*TN) == threadblock size >= NT * CORES_PER_CLUSTER // * Combining BM * BK >= (BM*BN) / (TM*TN) == threadblock yields -// BM <= BK*TM*TN. -#define BM 8 +// BM <= BK*TM*TN +#define BM 16 #define BN BM -#define BK 2 -#define TM 2 -#define TN 2 +#define BK 4 +#define TM 4 +#define TN 4 void threadblock_barrier(unsigned int tid_in_threadblock, unsigned int barrier_id, unsigned int count) { vx_fence(); @@ -32,7 +33,7 @@ void thread_block_gemm(kernel_arg_t *__UNIFORM__ arg, const uint32_t threadblock_dim_y, const uint32_t threadblock_id_x, const uint32_t threadblock_id_y, - const uint32_t threadblock_id_in_core, + const uint32_t threadblock_id_in_cluster, float *sharedmem_per_threadblock) { const float *A = (const float *)arg->addr_a; const float *B = (const float *)arg->addr_b; @@ -75,12 +76,17 @@ void thread_block_gemm(kernel_arg_t *__UNIFORM__ arg, constexpr uint32_t stride_b = (BM * BN) / BN / (TM * TN); for (uint32_t k = 0; k < dim_k; k += BK) { + // Data move from GMEM to SMEM + // + // Make sure global offset values for A and B are contiguous between + // neighboring threads to ensure GMEM coalescing. 
for (uint32_t load_offset = 0; load_offset < BM; load_offset += stride_a) { const uint32_t global_a_offset = dim_k * (global_a_row + load_offset) + (k + local_a_col); local_a[BK * (local_a_row + load_offset) + local_a_col] = A[global_a_offset]; } +// #pragma GCC unroll 1 for (uint32_t load_offset = 0; load_offset < BK; load_offset += stride_b) { const uint32_t global_b_offset = dim_n * (k + local_b_row + load_offset) + global_b_col; @@ -88,10 +94,11 @@ void thread_block_gemm(kernel_arg_t *__UNIFORM__ arg, B[global_b_offset]; } - threadblock_barrier(tid_in_threadblock, threadblock_id_in_core, + threadblock_barrier(tid_in_threadblock, threadblock_id_in_cluster, threadblock_dim_y); // Compute single tile*tile matmul +#pragma GCC unroll 2 for (uint32_t local_k = 0; local_k < BK; local_k++) { // First, pump data from SMEM->RF #pragma GCC unroll TM @@ -120,7 +127,7 @@ void thread_block_gemm(kernel_arg_t *__UNIFORM__ arg, } } - threadblock_barrier(tid_in_threadblock, threadblock_id_in_core, + threadblock_barrier(tid_in_threadblock, threadblock_id_in_cluster, threadblock_dim_y); } @@ -137,14 +144,15 @@ void thread_block_gemm(kernel_arg_t *__UNIFORM__ arg, } } -void kernel_body(int task_id, kernel_arg_t* __UNIFORM__ arg) { +void kernel_body(int task_id, kernel_arg_t *__UNIFORM__ arg) { // @perf: All threads are running these compute whose result is mostly same // across the threadblock const uint32_t threads_per_threadblock = (BM * BN) / (TM * TN); #ifdef RADIANCE - const uint32_t threadblocks_per_core = - vx_num_threads() * vx_num_warps() / threads_per_threadblock * CORES_PER_CLUSTER; + const uint32_t threadblocks_per_core = vx_num_threads() * vx_num_warps() / + threads_per_threadblock * + CORES_PER_CLUSTER; #else const uint32_t threadblocks_per_core = vx_num_threads() * vx_num_warps() / threads_per_threadblock; @@ -152,7 +160,7 @@ void kernel_body(int task_id, kernel_arg_t* __UNIFORM__ arg) { const uint32_t threadblock_dim_x = vx_num_threads(); const uint32_t 
threadblock_dim_y = vx_num_warps() / threadblocks_per_core; const int threadblock_id = task_id / threads_per_threadblock; - const int threadblock_id_in_core = threadblock_id % threadblocks_per_core; + const int threadblock_id_in_cluster = threadblock_id % threadblocks_per_core; const int tid_in_threadblock = task_id % threads_per_threadblock; const uint32_t dim_m = arg->dim_m; @@ -164,10 +172,10 @@ void kernel_body(int task_id, kernel_arg_t* __UNIFORM__ arg) { // "static" shared memory allocation. This would determine threadblock // occupancy of a single cluster float *sharedmem_per_threadblock = - (float *)DEV_SMEM_START_ADDR + (2 * BM * BK) * threadblock_id_in_core; + (float *)DEV_SMEM_START_ADDR + (2 * BM * BK) * threadblock_id_in_cluster; thread_block_gemm(arg, tid_in_threadblock, threadblock_dim_x, threadblock_dim_y, threadblock_id_x, threadblock_id_y, - threadblock_id_in_core, sharedmem_per_threadblock); + threadblock_id_in_cluster, sharedmem_per_threadblock); } int main() { @@ -176,8 +184,8 @@ int main() { #ifdef RADIANCE vx_spawn_tasks_cluster(grid_size, (vx_spawn_tasks_cb)kernel_body, arg); #else - // NOTE: This kernel assumes contiguous thread scheduling for threadblock - // allocation, and therefore does not work with original vx_spawn_tasks + // NOTE: This kernel assumes contiguous thread scheduling for efficient shared + // memory allocation, and therefore does not work with original vx_spawn_tasks vx_spawn_tasks_contiguous(grid_size, (vx_spawn_tasks_cb)kernel_body, arg); #endif return 0; From b0c1f773889936934db2552f66921022bc89d6e1 Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Fri, 29 Mar 2024 12:24:55 -0700 Subject: [PATCH 36/36] vx_start.S: Swizzle stack space Striding stack space for threads by power-of-two risks possibilities of bank conflicts or cache aliasing problems. Add an extra offset of 4 bytes to avoid this. 
--- kernel/src/vx_start.S | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/src/vx_start.S b/kernel/src/vx_start.S index b5065c95..d2a81707 100644 --- a/kernel/src/vx_start.S +++ b/kernel/src/vx_start.S @@ -102,6 +102,8 @@ init_regs: #endif csrr t0, VX_CSR_MHARTID sll t1, t0, STACK_LOG2_SIZE + sll t2, t0, 2 + add t1, t1, t2 sub sp, sp, t1 # set thread pointer register