diff --git a/ci/toolchain_env.sh b/ci/toolchain_env.sh index 440a899e..3d4e2d41 100644 --- a/ci/toolchain_env.sh +++ b/ci/toolchain_env.sh @@ -24,3 +24,7 @@ export PATH=$SV2V_PATH/bin:$PATH export YOSYS_PATH=$TOOLDIR/yosys export PATH=$YOSYS_PATH/bin:$PATH + +export LLVM_VORTEX=$TOOLDIR/llvm-vortex +export POCL_CC_PATH=$TOOLDIR/pocl/compiler +export POCL_RT_PATH=$TOOLDIR/pocl/runtime \ No newline at end of file diff --git a/hw/rtl/core/VX_core.sv b/hw/rtl/core/VX_core.sv index dde085a8..453ebb03 100644 --- a/hw/rtl/core/VX_core.sv +++ b/hw/rtl/core/VX_core.sv @@ -45,7 +45,7 @@ module VX_core import VX_gpu_pkg::*; #( output wire [`NUM_REGS-1:0][`XLEN-1:0] sim_wb_value, // Status - output wire busy + output wire busy //stays 1 when busy, 0 when done (termination) detect the negative edge ); VX_schedule_if schedule_if(); VX_fetch_if fetch_if(); @@ -258,7 +258,7 @@ module VX_core import VX_gpu_pkg::*; #( `endif -`ifdef PERF_ENABLE +`ifdef PERF_ENABLE // expose these perf counter to console using $display, %time; flag: --perf=0? 
wire [`CLOG2(DCACHE_NUM_REQS+1)-1:0] perf_dcache_rd_req_per_cycle; wire [`CLOG2(DCACHE_NUM_REQS+1)-1:0] perf_dcache_wr_req_per_cycle; @@ -331,7 +331,57 @@ module VX_core import VX_gpu_pkg::*; #( assign pipeline_perf_if.stores = perf_stores; assign pipeline_perf_if.load_latency = perf_dcache_lat; assign pipeline_perf_if.ifetch_latency = perf_icache_lat; - assign pipeline_perf_if.load_latency = perf_dcache_lat; + real instrs = commit_csr_if.instret; + real cycles = sched_csr_if.cycles; + real icache_lat = perf_icache_lat; + real ifetches = perf_ifetches; + real dcache_lat = perf_dcache_lat; + real loads = perf_loads; + real scheduler_idles = pipeline_perf_if.sched_idles; + real scheduler_stalls = pipeline_perf_if.sched_stalls; + real ibuf_stalls = pipeline_perf_if.ibf_stalls; + real scrb_alu_per_core = pipeline_perf_if.units_uses[`EX_ALU]; + real scrb_fpu_per_core = pipeline_perf_if.units_uses[`EX_FPU]; + real scrb_lsu_per_core = pipeline_perf_if.units_uses[`EX_LSU]; + real scrb_sfu_per_core = pipeline_perf_if.units_uses[`EX_SFU]; + real scrb_tot = scrb_alu_per_core+scrb_fpu_per_core+scrb_lsu_per_core+scrb_sfu_per_core; + + real scrb_wctl_per_core = pipeline_perf_if.sfu_uses[`SFU_WCTL]; + real scrb_csrs_per_core = pipeline_perf_if.sfu_uses[`SFU_CSRS]; + real sfu_tot = scrb_wctl_per_core+scrb_csrs_per_core; + + always @(negedge busy) begin + if (!reset) begin + $display("====================CORE : %d===================",CORE_ID); + $display("time : %t", $time); + $display("perf_dcache_rd_req_per_cycle: %d", perf_dcache_rd_req_per_cycle); + $display("perf_dcache_wr_req_per_cycle: %d", perf_dcache_wr_req_per_cycle); + $display("perf_dcache_rsp_per_cycle: %d", perf_dcache_rsp_per_cycle); + $display("perf_icache_pending_read_cycle: %d", perf_icache_pending_read_cycle); + $display("perf_dcache_pending_read_cycle: %d", perf_dcache_pending_read_cycle); + $display("perf_icache_pending_reads: %d", perf_icache_pending_reads); + $display("perf_dcache_pending_reads: %d", 
perf_dcache_pending_reads); + $display("perf_icache_req_fire: %b", perf_icache_req_fire); + $display("perf_icache_rsp_fire: %b", perf_icache_rsp_fire); + $display("perf_dcache_rd_req_fire: %b", perf_dcache_rd_req_fire); + $display("perf_dcache_rd_req_fire_r: %b", perf_dcache_rd_req_fire_r); + $display("perf_dcache_wr_req_fire: %b", perf_dcache_wr_req_fire); + $display("perf_dcache_wr_req_fire_r: %b", perf_dcache_wr_req_fire_r); + $display("perf_dcache_rsp_fire: %b", perf_dcache_rsp_fire); + + $display("Instructions: %d, Cycles: %d, IPC: %f", commit_csr_if.instret, sched_csr_if.cycles, instrs/cycles); + $display("scheduler idle: %d (%f)", pipeline_perf_if.sched_idles, scheduler_idles/cycles); + $display("scheduler stalls: %d (%f)", pipeline_perf_if.sched_stalls, scheduler_stalls/cycles); + $display("ibuffer stalls: %d (%f)",pipeline_perf_if.ibf_stalls, ibuf_stalls/cycles); + $display("issue stalls: %d(alu=%f, fpu=%f, lsu=%f, sfu=%f)",pipeline_perf_if.scb_stalls, scrb_alu_per_core/scrb_tot, scrb_fpu_per_core/scrb_tot, scrb_lsu_per_core/scrb_tot, scrb_sfu_per_core/scrb_tot); + $display("sfu stalls: %d (scrs=%f, wctl=%f)",pipeline_perf_if.units_uses[`EX_SFU], scrb_csrs_per_core/sfu_tot, scrb_wctl_per_core/sfu_tot); + $display("ifetches: %d", perf_ifetches); + $display("ifetch latency: %f Cycles", icache_lat/ifetches); + $display("loads: %d", perf_loads); + $display("load latency: %f Cycles", dcache_lat/loads); + $display("stores: %d", perf_stores); + end + end `endif diff --git a/kernel/Makefile b/kernel/Makefile index 07b8c97b..575707f8 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -51,10 +51,10 @@ $(PROJECT).dump: $(PROJECT).a %.S.o: src/%.S $(CC) $(CFLAGS) -c $< -o $@ -%.cpp.o: src/%.cpp +%.cpp.o: src/%.cpp include/vx_spawn.h $(CXX) $(CFLAGS) -c $< -o $@ -%.c.o: src/%.c +%.c.o: src/%.c include/vx_spawn.h $(CC) $(CFLAGS) -c $< -o $@ $(PROJECT).a: $(OBJS) diff --git a/kernel/include/vx_spawn.h b/kernel/include/vx_spawn.h index 2584b997..84dad2bc 100644 --- 
a/kernel/include/vx_spawn.h +++ b/kernel/include/vx_spawn.h @@ -17,6 +17,10 @@ #include #include +#ifndef CORES_PER_CLUSTER +#define CORES_PER_CLUSTER 2 +#endif + #ifdef __cplusplus extern "C" { #endif @@ -48,6 +52,8 @@ void vx_wspawn_wait(); void vx_spawn_kernel(context_t * ctx, vx_spawn_kernel_cb callback, void * arg); void vx_spawn_tasks(int num_tasks, vx_spawn_tasks_cb callback, void * arg); +void vx_spawn_tasks_cluster(int num_tasks, vx_spawn_tasks_cb callback, void * arg); +void vx_spawn_tasks_contiguous(int num_tasks, vx_spawn_tasks_cb callback , void * arg); void vx_serial(vx_serial_cb callback, void * arg); diff --git a/kernel/src/vx_spawn.c b/kernel/src/vx_spawn.c index fd8258e1..9ea45ded 100644 --- a/kernel/src/vx_spawn.c +++ b/kernel/src/vx_spawn.c @@ -74,6 +74,52 @@ static void __attribute__ ((noinline)) spawn_tasks_all_stub() { } } +static void __attribute__ ((noinline)) spawn_tasks_contiguous_all_stub() { + int NT = vx_num_threads(); + int NW = vx_num_warps(); + int cid = vx_core_id(); + int wid = vx_warp_id(); + int tid = vx_thread_id(); + + wspawn_tasks_args_t* p_wspawn_args = (wspawn_tasks_args_t*)g_wspawn_args[cid]; + + int waves = p_wspawn_args->NWs + (wid < p_wspawn_args->RWs); + int offset = p_wspawn_args->offset + (NT * wid + tid); + + vx_spawn_tasks_cb callback = p_wspawn_args->callback; + void* arg = p_wspawn_args->arg; + for (int wave_id = 0; wave_id < waves; ++wave_id) { + int task_id = offset + (wave_id * NT * NW); + callback(task_id, arg); + } +} + +static void __attribute__ ((noinline)) spawn_tasks_cluster_all_stub() { + int NT = vx_num_threads(); + int NW = vx_num_warps(); + int cid = vx_core_id(); + int wid = vx_warp_id(); + int tid = vx_thread_id(); + + const int core_id_in_cluster = cid % CORES_PER_CLUSTER; + // round-robin warp_id allocation across cores in cluster + const int wid_in_cluster = CORES_PER_CLUSTER * wid + core_id_in_cluster; + + wspawn_tasks_args_t* p_wspawn_args = (wspawn_tasks_args_t*)g_wspawn_args[cid]; + + int 
waves = p_wspawn_args->NWs + (wid < p_wspawn_args->RWs); + int offset = p_wspawn_args->offset + (NT * wid_in_cluster + tid); + + vx_spawn_tasks_cb callback = p_wspawn_args->callback; + void* arg = p_wspawn_args->arg; + + // sequential iterations + for (int wave_id = 0; wave_id < waves; ++wave_id) { + int task_id = offset + (wave_id * NT * NW * CORES_PER_CLUSTER); + callback(task_id, arg); + } +} + static void __attribute__ ((noinline)) spawn_tasks_rem_stub() { int cid = vx_core_id(); int tid = vx_thread_id(); @@ -83,6 +129,47 @@ static void __attribute__ ((noinline)) spawn_tasks_rem_stub() { (p_wspawn_args->callback)(task_id, p_wspawn_args->arg); } +static void __attribute__ ((noinline)) spawn_tasks_cluster_rem_stub() { + int NT = vx_num_threads(); + int cid = vx_core_id(); + int tid = vx_thread_id(); + int wid = vx_warp_id(); + + const int core_id_in_cluster = cid % CORES_PER_CLUSTER; + // round-robin warp_id allocation across cores in cluster + const int wid_in_cluster = CORES_PER_CLUSTER * wid + core_id_in_cluster; + + wspawn_tasks_args_t* p_wspawn_args = (wspawn_tasks_args_t*)g_wspawn_args[cid]; + // FIXME: This assumes that all cores but the last one are working with full + // warps, and only the last core has a partially-filled warp. 
+  int offset = p_wspawn_args->offset + (NT * wid_in_cluster + tid);
+
+  int task_id = offset;
+  (p_wspawn_args->callback)(task_id, p_wspawn_args->arg);
+}
+
+static void __attribute__ ((noinline)) spawn_tasks_contiguous_all_cb() {
+  // activate all threads
+  vx_tmc(-1);
+
+  // call stub routine
+  spawn_tasks_contiguous_all_stub();
+
+  // disable warp
+  vx_tmc_zero();
+}
+
+static void __attribute__ ((noinline)) spawn_tasks_cluster_all_cb() {
+  // activate all threads
+  vx_tmc(-1);
+
+  // call stub routine
+  spawn_tasks_cluster_all_stub();
+
+  // disable warp
+  vx_tmc_zero();
+}
+
 static void __attribute__ ((noinline)) spawn_tasks_all_cb() {
   // activate all threads
   vx_tmc(-1);
@@ -94,6 +181,171 @@ static void __attribute__ ((noinline)) spawn_tasks_all_cb() {
   vx_tmc_zero();
 }
 
+// This function runs in every core, but with only 1 warp and 1 thread enabled.
+// The logic in this function figures out how many warps/threads this particular
+// core has to enable to fulfill an entire grid of computation.
+void vx_spawn_tasks_cluster(int num_tasks, vx_spawn_tasks_cb callback, void *arg) {
+  // device specs
+  const int NC = vx_num_cores();
+  const int NW = vx_num_warps();
+  const int NT = vx_num_threads();
+  // NOTE: assumes divisible
+  const int num_cluster = NC / CORES_PER_CLUSTER;
+
+  // current core id
+  int core_id = vx_core_id();
+  if (core_id >= NUM_CORES_MAX)
+    return;
+  const int cluster_id = core_id / CORES_PER_CLUSTER;
+  const int core_id_in_cluster = core_id % CORES_PER_CLUSTER;
+
+  // Tasks are split per cluster first and warps are then dealt round-robin to
+  // the cores of that cluster, so a core left without work just falls through
+  // the launch code below. Do not exit early on a device-wide core count: that
+  // would drop the whole slice of a cluster whose cores have high ids.
+  // TODO: Try to contain in a single cluster if possible?
+  // Without one complete cluster the division below would be by zero.
+  if (num_cluster == 0)
+    return;
+
+  // FIXME: assumes num_tasks is divisible by num_cluster
+  const int num_tasks_this_cluster = num_tasks / num_cluster;
+  const int num_full_warps = num_tasks_this_cluster / NT;
+  const int rem_threads_in_last_warp = num_tasks_this_cluster % NT;
+  // const int num_warps = (num_tasks_this_cluster + (NT - 1)) / NT;
+
+  int num_warps_this_core = num_full_warps / CORES_PER_CLUSTER;
+  const int num_warps_in_last_row = num_full_warps % CORES_PER_CLUSTER;
+  if (core_id_in_cluster < num_warps_in_last_row) {
+    num_warps_this_core++;
+  }
+  // if 0, last warp is full-threads enabled
+  int rem_threads_in_last_warp_this_core = 0;
+  if (rem_threads_in_last_warp != 0) {
+    if (core_id_in_cluster == num_warps_in_last_row - 1) {
+      rem_threads_in_last_warp_this_core = rem_threads_in_last_warp;
+    }
+  }
+
+  // sequential iterations
+  const int num_full_waves = num_warps_this_core / NW;
+  const int rem_full_warps_in_last_wave = num_warps_this_core % NW;
+
+  const int offset = cluster_id * num_tasks_this_cluster;
+  wspawn_tasks_args_t wspawn_args = {callback, arg, offset, num_full_waves,
+                                     rem_full_warps_in_last_wave};
+  g_wspawn_args[core_id] = &wspawn_args;
+
+  if (num_warps_this_core > 0) {
+    // execute callback on other warps
+    const int nw = MIN(num_warps_this_core, NW);
+    vx_wspawn(nw, spawn_tasks_cluster_all_cb);
+
+    // activate all threads
+    vx_tmc(-1);
+
+    // call stub routine
+    spawn_tasks_cluster_all_stub();
+
+    // back to single-threaded
+    vx_tmc_one();
+
+    // wait for spawn warps to terminate
+    vx_wspawn_wait();
+  }
+
+  // TODO: Instead of launching an additional wave just to work on remaining
+  // threads, handle this in the last wave amongst other full warps.
+  if (rem_threads_in_last_warp != 0 && core_id_in_cluster == 0) {
+    // adjust offset
+    // FIXME: consider cluster_id here
+    // FIXME: use rem_threads_in_last_warp_this_core
+    wspawn_args.offset += (num_tasks_this_cluster - rem_threads_in_last_warp);
+
+    // activate remaining threads
+    const int tmask = (1 << rem_threads_in_last_warp) - 1;
+    vx_tmc(tmask);
+
+    // call stub routine
+    spawn_tasks_cluster_rem_stub();
+
+    // back to single-threaded
+    vx_tmc_one();
+  }
+}
+
+void vx_spawn_tasks_contiguous(int num_tasks, vx_spawn_tasks_cb callback , void * arg) {
+  // device specs
+  int NC = vx_num_cores();
+  int NW = vx_num_warps();
+  int NT = vx_num_threads();
+
+  // current core id
+  int core_id = vx_core_id();
+  if (core_id >= NUM_CORES_MAX)
+    return;
+
+  // calculate necessary active cores
+  int WT = NW * NT;
+  int nC = (num_tasks > WT) ? (num_tasks / WT) : 1;
+  int nc = MIN(nC, NC);
+  if (core_id >= nc)
+    return; // terminate extra cores
+
+  // number of tasks per core
+  int tasks_per_core = num_tasks / nc;
+  int tasks_per_core_n1 = tasks_per_core;
+  if (core_id == (nc-1)) {
+    int rem = num_tasks - (nc * tasks_per_core);
+    tasks_per_core_n1 += rem; // last core also executes remaining tasks
+  }
+
+  // number of tasks per warp
+  int TW = tasks_per_core_n1 / NT; // occupied warps
+  int rT = tasks_per_core_n1 - TW * NT; // remaining threads
+  int fW = 1, rW = 0;
+  if (TW >= NW) {
+    fW = TW / NW; // full warps iterations
+    rW = TW - fW * NW; // remaining warps
+  }
+
+  wspawn_tasks_args_t wspawn_args = { callback, arg, core_id * tasks_per_core, fW, rW };
+  g_wspawn_args[core_id] = &wspawn_args;
+
+  if (TW >= 1) {
+    // execute callback on other warps
+    int nw = MIN(TW, NW);
+    vx_wspawn(nw, spawn_tasks_contiguous_all_cb);
+
+    // activate all threads
+    vx_tmc(-1);
+
+    // call stub routine
+    spawn_tasks_contiguous_all_stub();
+
+    // back to single-threaded
+    vx_tmc_one();
+
+    // wait for spawn warps to terminate
+    vx_wspawn_wait();
+  }
+
+  if (rT != 0) {
+    // adjust offset
+
wspawn_args.offset += (tasks_per_core_n1 - rT); + + // activate remaining threads + int tmask = (1 << rT) - 1; + vx_tmc(tmask); + + // call stub routine + spawn_tasks_rem_stub(); + + // back to single-threaded + vx_tmc_one(); + } +} + void vx_spawn_tasks(int num_tasks, vx_spawn_tasks_cb callback , void * arg) { // device specs int NC = vx_num_cores(); diff --git a/kernel/src/vx_start.S b/kernel/src/vx_start.S index b5065c95..d2a81707 100644 --- a/kernel/src/vx_start.S +++ b/kernel/src/vx_start.S @@ -102,6 +102,8 @@ init_regs: #endif csrr t0, VX_CSR_MHARTID sll t1, t0, STACK_LOG2_SIZE + sll t2, t0, 2 + add t1, t1, t2 sub sp, sp, t1 # set thread pointer register diff --git a/tests/.gitignore b/tests/.gitignore index a9884992..30ca0fa4 100644 --- a/tests/.gitignore +++ b/tests/.gitignore @@ -1 +1,7 @@ **/*.log +.depend +*.bin +*.dump +*.elf +*.o +*.ll diff --git a/tests/opencl/convolution/main.cc b/tests/opencl/convolution/main.cc index 5c62b56e..dded468f 100644 --- a/tests/opencl/convolution/main.cc +++ b/tests/opencl/convolution/main.cc @@ -238,9 +238,9 @@ int main (int argc, char **argv) { } // NOTE(hansung): Dump operand buffer to a file - if (write_operand_file("matmul.input.a.bin", h_a.data(), nbytes) != 0) + if (write_operand_file("convolution.input.input.bin", h_i.data(), i_nbytes) != 0) return EXIT_FAILURE; - if (write_operand_file("matmul.input.b.bin", h_b.data(), nbytes) != 0) + if (write_operand_file("convolution.input.weights.bin", h_w.data(), w_nbytes) != 0) return EXIT_FAILURE; // Creating command queue diff --git a/tests/regression/common.mk b/tests/regression/common.mk index 6a858edc..8f4c4db1 100644 --- a/tests/regression/common.mk +++ b/tests/regression/common.mk @@ -78,17 +78,23 @@ endif endif endif -all: $(PROJECT) kernel.bin kernel.dump +all: $(PROJECT) kernel.bin kernel.dump kernel.radiance.dump kernel.dump: kernel.elf $(VX_DP) -D kernel.elf > kernel.dump -kernel.bin: kernel.elf +kernel.radiance.dump: kernel.radiance.elf + $(VX_DP) -D 
kernel.radiance.elf > kernel.radiance.dump + +kernel.bin: kernel.elf kernel.radiance.elf $(VX_CP) -O binary kernel.elf kernel.bin kernel.elf: $(VX_SRCS) $(VX_CXX) $(VX_CFLAGS) $(VX_SRCS) $(VX_LDFLAGS) -o kernel.elf +kernel.radiance.elf: $(VX_SRCS) + $(VX_CXX) $(VX_CFLAGS) $(VX_SRCS) $(VX_LDFLAGS) -DRADIANCE -o kernel.radiance.elf + $(PROJECT): $(SRCS) $(CXX) $(CXXFLAGS) $^ $(LDFLAGS) -o $@ @@ -115,7 +121,7 @@ clean: rm -rf $(PROJECT) *.o .depend clean-all: clean - rm -rf *.elf *.bin *.dump + rm -rf kernel.elf kernel.radiance.elf *.dump ifneq ($(MAKECMDGOALS),clean) -include .depend diff --git a/tests/regression/sgemm_wg/Makefile b/tests/regression/sgemm_wg/Makefile index f57f6124..289369d2 100644 --- a/tests/regression/sgemm_wg/Makefile +++ b/tests/regression/sgemm_wg/Makefile @@ -1,6 +1,6 @@ PROJECT = sgemm_wg -SRCS = main.cpp +SRCS = main.cpp common.h VX_SRCS = kernel.cpp diff --git a/tests/regression/sgemm_wg/kernel.cpp b/tests/regression/sgemm_wg/kernel.cpp index ec207821..4833154c 100644 --- a/tests/regression/sgemm_wg/kernel.cpp +++ b/tests/regression/sgemm_wg/kernel.cpp @@ -1,84 +1,192 @@ #include #include +#include #include #include "common.h" -inline void thread_block_gemm(kernel_arg_t *__UNIFORM__ arg, - const uint32_t tid_in_threadblock_x, - const uint32_t tid_in_threadblock_y, +// Constraints on parameters: +// * Memory: +// (BM + BN) * BK * sizeof(float) <= sharedmem size. +// BM * BK == BN * BK >= threadblock size >= NT * CORES_PER_CLUSTER +// When larger, the kernel runs a sequential loop to read into sharedmem; +// but smaller case is not handled. 
+// * Compute: +// ( M* N) / (TM*TN) == grid size >= NC*NW*NT +// (BM*BN) / (TM*TN) == threadblock size < NT * NW * CORES_PER_CLUSTER +// (BM*BN) / (TM*TN) == threadblock size >= NT * CORES_PER_CLUSTER +// * Combining BM * BK >= (BM*BN) / (TM*TN) == threadblock yields +// BM <= BK*TM*TN +#define BM 16 +#define BN BM +#define BK 4 +#define TM 4 +#define TN 4 + +void threadblock_barrier(unsigned int tid_in_threadblock, unsigned int barrier_id, unsigned int count) { + vx_fence(); + vx_barrier(barrier_id, count); +} + +void thread_block_gemm(kernel_arg_t *__UNIFORM__ arg, + const uint32_t tid_in_threadblock, const uint32_t threadblock_dim_x, const uint32_t threadblock_dim_y, const uint32_t threadblock_id_x, - const uint32_t threadblock_id_y) { - const float *global_a = (const float *)arg->addr_a; - const float *global_b = (const float *)arg->addr_b; - float *global_c = (float *)arg->addr_c; + const uint32_t threadblock_id_y, + const uint32_t threadblock_id_in_cluster, + float *sharedmem_per_threadblock) { + const float *A = (const float *)arg->addr_a; + const float *B = (const float *)arg->addr_b; + float *C = (float *)arg->addr_c; // assumes NT == NW == matrix_dim const uint32_t dim_m = arg->dim_m; const uint32_t dim_n = arg->dim_n; const uint32_t dim_k = arg->dim_k; - // FIXME: assumes local block size is square shape - const uint32_t local_row = tid_in_threadblock_y; - const uint32_t local_col = tid_in_threadblock_x; - const uint32_t global_row = threadblock_id_y * threadblock_dim_y + local_row; - const uint32_t global_col = threadblock_id_x * threadblock_dim_x + local_col; + // FIXME: Output block size is assumed to be square, i.e. 
BM == BN + // const uint32_t BM = threadblock_dim_y; + // const uint32_t BN = threadblock_dim_y; + // const uint32_t BK = threadblock_dim_x; + // constexpr uint32_t BM = 8; + // constexpr uint32_t BN = 8; + // constexpr uint32_t BK = 2; - // each thread generates one output element - float reg_c = 0.0f; + const uint32_t local_a_row = tid_in_threadblock / BK; + const uint32_t local_a_col = tid_in_threadblock % BK; + const uint32_t local_b_row = tid_in_threadblock / BN; + const uint32_t local_b_col = tid_in_threadblock % BN; + const uint32_t global_a_row = BM * threadblock_id_y + local_a_row; + const uint32_t global_b_col = BN * threadblock_id_x + local_b_col; - for (uint32_t k = 0; k < dim_k; k += threadblock_dim_x) { - float *local_a = (float *)DEV_SMEM_START_ADDR; - size_t local_a_elems = threadblock_dim_x * threadblock_dim_y; - float *local_b = (float *)DEV_SMEM_START_ADDR + local_a_elems; + const uint32_t local_c_row = tid_in_threadblock / (BN / TN); + const uint32_t local_c_col = tid_in_threadblock % (BN / TN); - uint32_t offset_global_a = dim_k * global_row + (k + local_col); - uint32_t offset_global_b = dim_n * (local_row + k) + global_col; - // local_a: threadblock_dim_y rows, threadblock_dim_x cols - // local_b: threadblock_dim_x rows, threadblock_dim_y cols - // threadblock_dim_x == block_k, threadblock_dim_y == block_m == block_n - local_a[threadblock_dim_x * local_row + local_col] = global_a[offset_global_a]; - local_b[threadblock_dim_y * local_col + local_row] = global_b[offset_global_b]; + // each thread generates TM output element + float reg_c[TM * TN] = { 0.0f }; + float reg_a[TM] = { 0.0f }; + float reg_b[TN] = { 0.0f }; - vx_barrier(0, threadblock_dim_y); - vx_fence(); + volatile float *local_a = sharedmem_per_threadblock; + // const size_t local_a_elems = threadblock_dim_x * threadblock_dim_y; + const size_t local_a_elems = (BM * BK); + volatile float *local_b = sharedmem_per_threadblock + local_a_elems; - for (uint32_t local_k = 0; local_k < 
threadblock_dim_x; local_k++) { - reg_c += local_a[threadblock_dim_x * local_row + local_k] * - local_b[threadblock_dim_y * local_col + local_k]; + constexpr uint32_t stride_a = (BM * BN) / BK / (TM * TN); + constexpr uint32_t stride_b = (BM * BN) / BN / (TM * TN); + + for (uint32_t k = 0; k < dim_k; k += BK) { + // Data move from GMEM to SMEM + // + // Make sure global offset values for A and B are contiguous between + // neighboring threads to ensure GMEM coalescing. + for (uint32_t load_offset = 0; load_offset < BM; load_offset += stride_a) { + const uint32_t global_a_offset = + dim_k * (global_a_row + load_offset) + (k + local_a_col); + local_a[BK * (local_a_row + load_offset) + local_a_col] = + A[global_a_offset]; + } +// #pragma GCC unroll 1 + for (uint32_t load_offset = 0; load_offset < BK; load_offset += stride_b) { + const uint32_t global_b_offset = + dim_n * (k + local_b_row + load_offset) + global_b_col; + local_b[BN * (local_b_row + load_offset) + local_b_col] = + B[global_b_offset]; } - vx_barrier(0, threadblock_dim_y); - vx_fence(); + threadblock_barrier(tid_in_threadblock, threadblock_id_in_cluster, + threadblock_dim_y); + + // Compute single tile*tile matmul +#pragma GCC unroll 2 + for (uint32_t local_k = 0; local_k < BK; local_k++) { + // First, pump data from SMEM->RF +#pragma GCC unroll TM + for (uint32_t res_idx_m = 0; res_idx_m < TM; res_idx_m++) { + reg_a[res_idx_m] = + local_a[BK * (TM * local_c_row + res_idx_m) + local_k]; + } +#pragma GCC unroll TN + for (uint32_t res_idx_n = 0; res_idx_n < TN; res_idx_n++) { + reg_b[res_idx_n] = + local_b[BN * local_k + (TN * local_c_col + res_idx_n)]; + } + + // Next, compute multiple result elements (TM*TN) by reusing data in RF +#pragma GCC unroll TM + for (uint32_t res_idx_m = 0; res_idx_m < TM; res_idx_m++) { +#pragma GCC unroll TN + for (uint32_t res_idx_n = 0; res_idx_n < TN; res_idx_n++) { + // NOTE use of local_b_row + reg_c[TN * res_idx_m + res_idx_n] += + reg_a[res_idx_m] * reg_b[res_idx_n]; + 
// reg_c[TN * res_idx_m + res_idx_n] += + // local_a[BK * (TM * local_c_row + res_idx_m) + local_k] * + // local_b[BN * local_k + (TN * local_c_col + res_idx_n)]; + } + } + } + + threadblock_barrier(tid_in_threadblock, threadblock_id_in_cluster, + threadblock_dim_y); } - global_c[dim_n * global_row + global_col] = reg_c; + // Store result data from RF to GMEM +#pragma GCC unroll TM + for (uint32_t res_idx_m = 0; res_idx_m < TM; res_idx_m++) { +#pragma GCC unroll TN + for (uint32_t res_idx_n = 0; res_idx_n < TN; res_idx_n++) { + // NOTE use of local_b_row and global_b_col here + C[dim_n * (BM * threadblock_id_y + TM * local_c_row + res_idx_m) + + (BN * threadblock_id_x + TN * local_c_col + res_idx_n)] = + reg_c[TN * res_idx_m + res_idx_n]; + } + } } -void kernel_body(int task_id, kernel_arg_t* __UNIFORM__ arg) { - const uint32_t dim_n = arg->dim_n; - int tid_x = task_id % dim_n; - int tid_y = task_id / dim_n; +void kernel_body(int task_id, kernel_arg_t *__UNIFORM__ arg) { + // @perf: All threads are running these compute whose result is mostly same + // across the threadblock + const uint32_t threads_per_threadblock = (BM * BN) / (TM * TN); +#ifdef RADIANCE + const uint32_t threadblocks_per_core = vx_num_threads() * vx_num_warps() / + threads_per_threadblock * + CORES_PER_CLUSTER; +#else + const uint32_t threadblocks_per_core = + vx_num_threads() * vx_num_warps() / threads_per_threadblock; +#endif const uint32_t threadblock_dim_x = vx_num_threads(); - const uint32_t threadblock_dim_y = vx_num_warps(); - const uint32_t threads_per_threadblock = threadblock_dim_x * threadblock_dim_y; + const uint32_t threadblock_dim_y = vx_num_warps() / threadblocks_per_core; const int threadblock_id = task_id / threads_per_threadblock; + const int threadblock_id_in_cluster = threadblock_id % threadblocks_per_core; + const int tid_in_threadblock = task_id % threads_per_threadblock; - const uint32_t dim_n_in_blocks = dim_n / threadblock_dim_x; + const uint32_t dim_m = arg->dim_m; + 
const uint32_t dim_n = arg->dim_n; + const uint32_t dim_n_in_blocks = dim_n / BN; const int threadblock_id_x = threadblock_id % dim_n_in_blocks; const int threadblock_id_y = threadblock_id / dim_n_in_blocks; - const int tid_in_threadblock_x = vx_thread_id(); - const int tid_in_threadblock_y = vx_warp_id() % threadblock_dim_y; - thread_block_gemm(arg, tid_in_threadblock_x, tid_in_threadblock_y, threadblock_dim_x, - threadblock_dim_y, threadblock_id_x, threadblock_id_y); + // "static" shared memory allocation. This would determine threadblock + // occupancy of a single cluster + float *sharedmem_per_threadblock = + (float *)DEV_SMEM_START_ADDR + (2 * BM * BK) * threadblock_id_in_cluster; + thread_block_gemm(arg, tid_in_threadblock, threadblock_dim_x, + threadblock_dim_y, threadblock_id_x, threadblock_id_y, + threadblock_id_in_cluster, sharedmem_per_threadblock); } int main() { kernel_arg_t *arg = (kernel_arg_t *)KERNEL_ARG_DEV_MEM_ADDR; - const uint32_t grid_size = arg->dim_m * arg->dim_n; - vx_spawn_tasks(grid_size, (vx_spawn_tasks_cb)kernel_body, arg); + const uint32_t grid_size = arg->dim_m * arg->dim_n / (TM * TN); +#ifdef RADIANCE + vx_spawn_tasks_cluster(grid_size, (vx_spawn_tasks_cb)kernel_body, arg); +#else + // NOTE: This kernel assumes contiguous thread scheduling for efficient shared + // memory allocation, and therefore does not work with original vx_spawn_tasks + vx_spawn_tasks_contiguous(grid_size, (vx_spawn_tasks_cb)kernel_body, arg); +#endif return 0; } diff --git a/tests/regression/sgemm_wg/main.cpp b/tests/regression/sgemm_wg/main.cpp index c6252991..229463ef 100644 --- a/tests/regression/sgemm_wg/main.cpp +++ b/tests/regression/sgemm_wg/main.cpp @@ -147,8 +147,8 @@ int main(int argc, char *argv[]) { RT_CHECK(vx_dev_open(&device)); // FIXME: hardcoded - uint32_t dim_m = 16; - uint32_t dim_n = 16; + uint32_t dim_m = 32; + uint32_t dim_n = 32; uint32_t dim_k = 32; generate_source_matrix(dim_m, dim_n, dim_k); diff --git 
a/tests/regression/vecaddx/common.h b/tests/regression/vecaddx/common.h index 2b8f164a..a7b26936 100644 --- a/tests/regression/vecaddx/common.h +++ b/tests/regression/vecaddx/common.h @@ -1,7 +1,7 @@ #ifndef _COMMON_H_ #define _COMMON_H_ -#define KERNEL_ARG_DEV_MEM_ADDR 0x7ffff000 +#define KERNEL_ARG_DEV_MEM_ADDR 0x7fff0000 #ifndef TYPE #define TYPE float diff --git a/tests/regression/vecaddx/kernel.cpp b/tests/regression/vecaddx/kernel.cpp index 6ed42164..6e782586 100644 --- a/tests/regression/vecaddx/kernel.cpp +++ b/tests/regression/vecaddx/kernel.cpp @@ -13,6 +13,10 @@ void kernel_body(int task_id, kernel_arg_t* __UNIFORM__ arg) { int main() { kernel_arg_t* arg = (kernel_arg_t*)KERNEL_ARG_DEV_MEM_ADDR; +#ifdef RADIANCE + vx_spawn_tasks_cluster(arg->num_points, (vx_spawn_tasks_cb)kernel_body, arg); +#else vx_spawn_tasks(arg->num_points, (vx_spawn_tasks_cb)kernel_body, arg); +#endif return 0; } diff --git a/tests/regression/vecaddx/main.cpp b/tests/regression/vecaddx/main.cpp index 117f3470..e25ad5b4 100644 --- a/tests/regression/vecaddx/main.cpp +++ b/tests/regression/vecaddx/main.cpp @@ -1,4 +1,5 @@ #include +#include #include #include #include @@ -106,9 +107,9 @@ static void parse_args(int argc, char **argv) { void cleanup() { if (device) { - vx_mem_free(device, kernel_arg.src0_addr); - vx_mem_free(device, kernel_arg.src1_addr); - vx_mem_free(device, kernel_arg.dst_addr); + // vx_mem_free(device, kernel_arg.src0_addr); + // vx_mem_free(device, kernel_arg.src1_addr); + // vx_mem_free(device, kernel_arg.dst_addr); vx_dev_close(device); } } @@ -181,9 +182,12 @@ int main(int argc, char *argv[]) { // allocate device memory std::cout << "allocate device memory" << std::endl; - RT_CHECK(vx_mem_alloc(device, buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.src0_addr)); - RT_CHECK(vx_mem_alloc(device, buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.src1_addr)); - RT_CHECK(vx_mem_alloc(device, buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.dst_addr)); + // RT_CHECK(vx_mem_alloc(device, 
buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.src0_addr)); + // RT_CHECK(vx_mem_alloc(device, buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.src1_addr)); + // RT_CHECK(vx_mem_alloc(device, buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.dst_addr)); + kernel_arg.src0_addr = 0x20000UL; + kernel_arg.src1_addr = 0x28000UL; + kernel_arg.dst_addr = 0xc0000000UL; kernel_arg.num_points = num_points; @@ -201,10 +205,19 @@ int main(int argc, char *argv[]) { memcpy(staging_buf.data(), &kernel_arg, sizeof(kernel_arg_t)); RT_CHECK(vx_copy_to_dev(device, KERNEL_ARG_DEV_MEM_ADDR, staging_buf.data(), sizeof(kernel_arg_t))); + std::ofstream file("args.bin", std::ios::binary | std::ios::out); + if (!file) { + std::cerr << "error: failed to open args.bin for writing\n"; + exit(EXIT_FAILURE); + } + file.write(reinterpret_cast(staging_buf.data()), sizeof(kernel_arg_t)); + file.close(); + // generate source data source_data.resize(2 * num_points); for (uint32_t i = 0; i < source_data.size(); ++i) { - source_data[i] = Comparator::generate(); + // source_data[i] = Comparator::generate(); + source_data[i] = static_cast(i); } // upload source buffer0 @@ -215,6 +228,14 @@ int main(int argc, char *argv[]) { buf_ptr[i] = source_data[2 * i + 0]; } RT_CHECK(vx_copy_to_dev(device, kernel_arg.src0_addr, staging_buf.data(), buf_size)); + + std::ofstream file("input.a.bin", std::ios::binary | std::ios::out); + if (!file) { + std::cerr << "error: failed to open input.a.bin for writing\n"; + exit(EXIT_FAILURE); + } + file.write(reinterpret_cast(buf_ptr), buf_size); + file.close(); } // upload source buffer1 @@ -225,6 +246,14 @@ int main(int argc, char *argv[]) { buf_ptr[i] = source_data[2 * i + 1]; } RT_CHECK(vx_copy_to_dev(device, kernel_arg.src1_addr, staging_buf.data(), buf_size)); + + std::ofstream file("input.b.bin", std::ios::binary | std::ios::out); + if (!file) { + std::cerr << "error: failed to open input.b.bin for writing\n"; + exit(EXIT_FAILURE); + } + file.write(reinterpret_cast(buf_ptr), buf_size); + 
file.close(); } // clear destination buffer @@ -243,4 +272,4 @@ int main(int argc, char *argv[]) { std::cout << "PASSED!" << std::endl; return 0; -} \ No newline at end of file +}