From 9f9ec109604ad6d21c366015d74538cd318c987a Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Wed, 8 May 2024 11:26:09 -0700 Subject: [PATCH 01/31] tensor: Enable scaling NUM_THREADS by octets todo: lane-to-octet mapping is arbitrary atm --- hw/rtl/core/VX_tensor_core.sv | 38 +++++++++++---------- hw/rtl/core/VX_tensor_ucode_8lanes.vh | 49 +++++++++++++++++++++++++++ 2 files changed, 70 insertions(+), 17 deletions(-) create mode 100644 hw/rtl/core/VX_tensor_ucode_8lanes.vh diff --git a/hw/rtl/core/VX_tensor_core.sv b/hw/rtl/core/VX_tensor_core.sv index 9971d619..71ed8538 100644 --- a/hw/rtl/core/VX_tensor_core.sv +++ b/hw/rtl/core/VX_tensor_core.sv @@ -10,8 +10,6 @@ module VX_tensor_core #( VX_dispatch_if.slave dispatch_if [`ISSUE_WIDTH], VX_commit_if.master commit_if [`ISSUE_WIDTH] ); - `STATIC_ASSERT(`NUM_THREADS == 32, ("tensor core requires # of threads in a warp to be 32 (try running w/ CONFIGS=\"-DNUM_THREADS=32\")")); - for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin VX_tensor_core_warp #( .ISW(i) @@ -35,29 +33,35 @@ module VX_tensor_core_warp import VX_gpu_pkg::*; #( VX_dispatch_if.slave dispatch_if, VX_commit_if.master commit_if ); + localparam NUM_OCTETS = (`NUM_THREADS / 8); + // offset in the lane numbers that get mapped to the two threadgroups in an + // octet. E.g. two tgs map lane 0-3 and lane 16-19 -> 16 + // FIXME: not sure this is the right logic. 
just filling in what works + localparam LANE_OFFSET_THREADGROUP = (4 * NUM_OCTETS); + wire [1:0] step = 2'(dispatch_if.data.op_type); - logic [3:0] octet_results_valid; - logic [3:0] octet_results_ready; - logic [3:0] octet_operands_ready; + logic [NUM_OCTETS-1:0] octet_results_valid; + logic [NUM_OCTETS-1:0] octet_results_ready; + logic [NUM_OCTETS-1:0] octet_operands_ready; logic [`NUM_THREADS-1:0][`XLEN-1:0] wb_data_0; logic [`NUM_THREADS-1:0][`XLEN-1:0] wb_data_1; assign dispatch_if.ready = &octet_operands_ready; `ifdef EXT_T_ENABLE - for (genvar i = 0; i < 4/*octets*/; ++i) begin + for (genvar i = 0; i < NUM_OCTETS; ++i) begin `else for (genvar i = 0; i < 0; ++i) begin `endif // lane-to-octet mapping; see figure 13 of the paper wire [7:0][31:0] octet_A = { - dispatch_if.data.rs1_data[16+4*i +: 4], dispatch_if.data.rs1_data[4*i +: 4] + dispatch_if.data.rs1_data[LANE_OFFSET_THREADGROUP + 4*i +: 4], dispatch_if.data.rs1_data[4*i +: 4] }; wire [7:0][31:0] octet_B = { - dispatch_if.data.rs2_data[16+4*i +: 4], dispatch_if.data.rs2_data[4*i +: 4] + dispatch_if.data.rs2_data[LANE_OFFSET_THREADGROUP + 4*i +: 4], dispatch_if.data.rs2_data[4*i +: 4] }; wire [7:0][31:0] octet_C = { - dispatch_if.data.rs3_data[16+4*i +: 4], dispatch_if.data.rs3_data[4*i +: 4] + dispatch_if.data.rs3_data[LANE_OFFSET_THREADGROUP + 4*i +: 4], dispatch_if.data.rs3_data[4*i +: 4] }; logic [3:0][3:0][31:0] octet_D; @@ -100,15 +104,15 @@ module VX_tensor_core_warp import VX_gpu_pkg::*; #( assign wb_data_1[4*i+2] = octet_D[0][3]; assign wb_data_1[4*i+3] = octet_D[1][3]; - assign wb_data_0[4*i+16+0] = octet_D[2][0]; - assign wb_data_0[4*i+16+1] = octet_D[3][0]; - assign wb_data_0[4*i+16+2] = octet_D[2][2]; - assign wb_data_0[4*i+16+3] = octet_D[3][2]; + assign wb_data_0[4*i+LANE_OFFSET_THREADGROUP+0] = octet_D[2][0]; + assign wb_data_0[4*i+LANE_OFFSET_THREADGROUP+1] = octet_D[3][0]; + assign wb_data_0[4*i+LANE_OFFSET_THREADGROUP+2] = octet_D[2][2]; + assign wb_data_0[4*i+LANE_OFFSET_THREADGROUP+3] 
= octet_D[3][2]; - assign wb_data_1[4*i+16+0] = octet_D[2][1]; - assign wb_data_1[4*i+16+1] = octet_D[3][1]; - assign wb_data_1[4*i+16+2] = octet_D[2][3]; - assign wb_data_1[4*i+16+3] = octet_D[3][3]; + assign wb_data_1[4*i+LANE_OFFSET_THREADGROUP+0] = octet_D[2][1]; + assign wb_data_1[4*i+LANE_OFFSET_THREADGROUP+1] = octet_D[3][1]; + assign wb_data_1[4*i+LANE_OFFSET_THREADGROUP+2] = octet_D[2][3]; + assign wb_data_1[4*i+LANE_OFFSET_THREADGROUP+3] = octet_D[3][3]; end /* commit_if.data_t parts that we need to keep around: diff --git a/hw/rtl/core/VX_tensor_ucode_8lanes.vh b/hw/rtl/core/VX_tensor_ucode_8lanes.vh new file mode 100644 index 00000000..41ec857e --- /dev/null +++ b/hw/rtl/core/VX_tensor_ucode_8lanes.vh @@ -0,0 +1,49 @@ +// uop metadata (sequencing, next state), execution metadata (EX_TYPE, OP_TYPE, OP_MOD), wb, use pc, use imm, pc, imm, rd, rs1, rs2, rs3 +HMMA_SET0_STEP0_0: begin + uop = {NEXT, HMMA_SET0_STEP0_1, `EX_BITS'(`EX_TENSOR), `INST_OP_BITS'(0), `INST_MOD_BITS'(0), 1'b1, 1'b0, 1'b0, 32'b0, 32'b0, `FREG(16), `FREG(0), `FREG(8), `FREG(16)}; +end +HMMA_SET0_STEP0_1: begin + uop = {NEXT, HMMA_SET0_STEP1_0, `EX_BITS'(`EX_TENSOR), `INST_OP_BITS'(0), `INST_MOD_BITS'(1), 1'b1, 1'b0, 1'b0, 32'b0, 32'b0, `FREG(17), `FREG(1), `FREG(9), `FREG(17)}; +end +HMMA_SET0_STEP1_0: begin + uop = {NEXT, HMMA_SET0_STEP1_1, `EX_BITS'(`EX_TENSOR), `INST_OP_BITS'(1), `INST_MOD_BITS'(0), 1'b1, 1'b0, 1'b0, 32'b0, 32'b0, `FREG(18), `FREG(0), `FREG(8), `FREG(18)}; +end +HMMA_SET0_STEP1_1: begin + uop = {NEXT, HMMA_SET0_STEP2_0, `EX_BITS'(`EX_TENSOR), `INST_OP_BITS'(1), `INST_MOD_BITS'(1), 1'b1, 1'b0, 1'b0, 32'b0, 32'b0, `FREG(19), `FREG(1), `FREG(9), `FREG(19)}; +end +HMMA_SET0_STEP2_0: begin + uop = {NEXT, HMMA_SET0_STEP2_1, `EX_BITS'(`EX_TENSOR), `INST_OP_BITS'(2), `INST_MOD_BITS'(0), 1'b1, 1'b0, 1'b0, 32'b0, 32'b0, `FREG(20), `FREG(0), `FREG(8), `FREG(20)}; +end +HMMA_SET0_STEP2_1: begin + uop = {NEXT, HMMA_SET0_STEP3_0, `EX_BITS'(`EX_TENSOR), `INST_OP_BITS'(2), 
`INST_MOD_BITS'(1), 1'b1, 1'b0, 1'b0, 32'b0, 32'b0, `FREG(21), `FREG(1), `FREG(9), `FREG(21)}; +end +HMMA_SET0_STEP3_0: begin + uop = {NEXT, HMMA_SET0_STEP3_1, `EX_BITS'(`EX_TENSOR), `INST_OP_BITS'(3), `INST_MOD_BITS'(0), 1'b1, 1'b0, 1'b0, 32'b0, 32'b0, `FREG(22), `FREG(0), `FREG(8), `FREG(22)}; +end +HMMA_SET0_STEP3_1: begin + uop = {NEXT, HMMA_SET1_STEP0_0, `EX_BITS'(`EX_TENSOR), `INST_OP_BITS'(3), `INST_MOD_BITS'(1), 1'b1, 1'b0, 1'b0, 32'b0, 32'b0, `FREG(23), `FREG(1), `FREG(9), `FREG(23)}; +end +HMMA_SET1_STEP0_0: begin + uop = {NEXT, HMMA_SET1_STEP0_1, `EX_BITS'(`EX_TENSOR), `INST_OP_BITS'(0), `INST_MOD_BITS'(0), 1'b1, 1'b0, 1'b0, 32'b0, 32'b0, `FREG(16), `FREG(2), `FREG(10), `FREG(16)}; +end +HMMA_SET1_STEP0_1: begin + uop = {NEXT, HMMA_SET1_STEP1_0, `EX_BITS'(`EX_TENSOR), `INST_OP_BITS'(0), `INST_MOD_BITS'(1), 1'b1, 1'b0, 1'b0, 32'b0, 32'b0, `FREG(17), `FREG(3), `FREG(11), `FREG(17)}; +end +HMMA_SET1_STEP1_0: begin + uop = {NEXT, HMMA_SET1_STEP1_1, `EX_BITS'(`EX_TENSOR), `INST_OP_BITS'(1), `INST_MOD_BITS'(0), 1'b1, 1'b0, 1'b0, 32'b0, 32'b0, `FREG(18), `FREG(2), `FREG(10), `FREG(18)}; +end +HMMA_SET1_STEP1_1: begin + uop = {NEXT, HMMA_SET1_STEP2_0, `EX_BITS'(`EX_TENSOR), `INST_OP_BITS'(1), `INST_MOD_BITS'(1), 1'b1, 1'b0, 1'b0, 32'b0, 32'b0, `FREG(19), `FREG(3), `FREG(11), `FREG(19)}; +end +HMMA_SET1_STEP2_0: begin + uop = {NEXT, HMMA_SET1_STEP2_1, `EX_BITS'(`EX_TENSOR), `INST_OP_BITS'(2), `INST_MOD_BITS'(0), 1'b1, 1'b0, 1'b0, 32'b0, 32'b0, `FREG(20), `FREG(2), `FREG(10), `FREG(20)}; +end +HMMA_SET1_STEP2_1: begin + uop = {NEXT, HMMA_SET1_STEP3_0, `EX_BITS'(`EX_TENSOR), `INST_OP_BITS'(2), `INST_MOD_BITS'(1), 1'b1, 1'b0, 1'b0, 32'b0, 32'b0, `FREG(21), `FREG(3), `FREG(11), `FREG(21)}; +end +HMMA_SET1_STEP3_0: begin + uop = {NEXT, HMMA_SET1_STEP3_1, `EX_BITS'(`EX_TENSOR), `INST_OP_BITS'(3), `INST_MOD_BITS'(0), 1'b1, 1'b0, 1'b0, 32'b0, 32'b0, `FREG(22), `FREG(2), `FREG(10), `FREG(22)}; +end +HMMA_SET1_STEP3_1: begin + uop = {FINISH, HMMA_SET0_STEP0_0, 
`EX_BITS'(`EX_TENSOR), `INST_OP_BITS'(3), `INST_MOD_BITS'(1), 1'b1, 1'b0, 1'b0, 32'b1, 32'b1, `FREG(23), `FREG(3), `FREG(11), `FREG(23)}; +end From 1a1094b2bb6b6b986ca4add4deb731e1cd6e5a1c Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Wed, 15 May 2024 15:34:26 -0700 Subject: [PATCH 02/31] tensor: Add dispatch unit to narrow to BLOCK_SIZE=1 --- hw/rtl/core/VX_tensor_core.sv | 94 +++++++++++++++++++++++++---------- 1 file changed, 69 insertions(+), 25 deletions(-) diff --git a/hw/rtl/core/VX_tensor_core.sv b/hw/rtl/core/VX_tensor_core.sv index 71ed8538..14d8175b 100644 --- a/hw/rtl/core/VX_tensor_core.sv +++ b/hw/rtl/core/VX_tensor_core.sv @@ -1,7 +1,7 @@ `ifdef EXT_T_ENABLE `include "VX_fpu_define.vh" -module VX_tensor_core #( +module VX_tensor_core import VX_gpu_pkg::*; #( ) ( input clk, @@ -10,15 +10,54 @@ module VX_tensor_core #( VX_dispatch_if.slave dispatch_if [`ISSUE_WIDTH], VX_commit_if.master commit_if [`ISSUE_WIDTH] ); - for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin + localparam BLOCK_SIZE = 1; + localparam NUM_LANES = `NUM_THREADS; + // localparam PARTIAL_BW = (BLOCK_SIZE != `ISSUE_WIDTH) || (NUM_LANES != `NUM_THREADS); + localparam PARTIAL_BW = 1; + + VX_execute_if #( + .NUM_LANES (NUM_LANES) + ) execute_if[BLOCK_SIZE](); + + `RESET_RELAY (dispatch_reset, reset); + + VX_dispatch_unit #( + .BLOCK_SIZE (BLOCK_SIZE), + .NUM_LANES (NUM_LANES), + .OUT_REG (PARTIAL_BW ? 1 : 0) + ) dispatch_unit ( + .clk (clk), + .reset (dispatch_reset), + .dispatch_if(dispatch_if), + .execute_if (execute_if) + ); + + VX_commit_if #( + .NUM_LANES (NUM_LANES) + ) commit_block_if[BLOCK_SIZE](); + + `RESET_RELAY (commit_reset, reset); + + VX_gather_unit #( + .BLOCK_SIZE (BLOCK_SIZE), + .NUM_LANES (NUM_LANES), + .OUT_REG (PARTIAL_BW ? 3 : 0) // FIXME: why 3? 
+ ) gather_unit ( + .clk (clk), + .reset (commit_reset), + .commit_in_if (commit_block_if), + .commit_out_if (commit_if) + ); + + for (genvar block_idx = 0; block_idx < BLOCK_SIZE; ++block_idx) begin VX_tensor_core_warp #( - .ISW(i) + .ISW(1) // FIXME: not block_idx ) tensor_core ( .clk(clk), .reset(reset), - .dispatch_if(dispatch_if[i]), - .commit_if(commit_if[i]) + .execute_if(execute_if[block_idx]), + .commit_if(commit_block_if[block_idx]) ); end @@ -30,7 +69,7 @@ module VX_tensor_core_warp import VX_gpu_pkg::*; #( input clk, input reset, - VX_dispatch_if.slave dispatch_if, + VX_execute_if.slave execute_if, VX_commit_if.master commit_if ); localparam NUM_OCTETS = (`NUM_THREADS / 8); @@ -39,14 +78,15 @@ module VX_tensor_core_warp import VX_gpu_pkg::*; #( // FIXME: not sure this is the right logic. just filling in what works localparam LANE_OFFSET_THREADGROUP = (4 * NUM_OCTETS); - wire [1:0] step = 2'(dispatch_if.data.op_type); + wire [1:0] step = 2'(execute_if.data.op_type); logic [NUM_OCTETS-1:0] octet_results_valid; logic [NUM_OCTETS-1:0] octet_results_ready; logic [NUM_OCTETS-1:0] octet_operands_ready; + // FIXME: should be NUM_LANES? 
logic [`NUM_THREADS-1:0][`XLEN-1:0] wb_data_0; logic [`NUM_THREADS-1:0][`XLEN-1:0] wb_data_1; - assign dispatch_if.ready = &octet_operands_ready; + assign execute_if.ready = &octet_operands_ready; `ifdef EXT_T_ENABLE for (genvar i = 0; i < NUM_OCTETS; ++i) begin @@ -55,13 +95,13 @@ module VX_tensor_core_warp import VX_gpu_pkg::*; #( `endif // lane-to-octet mapping; see figure 13 of the paper wire [7:0][31:0] octet_A = { - dispatch_if.data.rs1_data[LANE_OFFSET_THREADGROUP + 4*i +: 4], dispatch_if.data.rs1_data[4*i +: 4] + execute_if.data.rs1_data[LANE_OFFSET_THREADGROUP + 4*i +: 4], execute_if.data.rs1_data[4*i +: 4] }; wire [7:0][31:0] octet_B = { - dispatch_if.data.rs2_data[LANE_OFFSET_THREADGROUP + 4*i +: 4], dispatch_if.data.rs2_data[4*i +: 4] + execute_if.data.rs2_data[LANE_OFFSET_THREADGROUP + 4*i +: 4], execute_if.data.rs2_data[4*i +: 4] }; wire [7:0][31:0] octet_C = { - dispatch_if.data.rs3_data[LANE_OFFSET_THREADGROUP + 4*i +: 4], dispatch_if.data.rs3_data[4*i +: 4] + execute_if.data.rs3_data[LANE_OFFSET_THREADGROUP + 4*i +: 4], execute_if.data.rs3_data[4*i +: 4] }; logic [3:0][3:0][31:0] octet_D; @@ -77,7 +117,7 @@ module VX_tensor_core_warp import VX_gpu_pkg::*; #( .A_in(octet_A), .B_in(octet_B), .C_in(octet_C), - .operands_valid(dispatch_if.valid), + .operands_valid(execute_if.valid), .operands_ready(octet_operands_ready[i]), .step(step), @@ -126,18 +166,18 @@ module VX_tensor_core_warp import VX_gpu_pkg::*; #( localparam DATAW = `UUID_WIDTH + `NW_WIDTH + `NUM_THREADS + `XLEN + 1 + `NR_BITS; - wire dispatch_if_fire = dispatch_if.valid && dispatch_if.ready; + wire execute_if_fire = execute_if.valid && execute_if.ready; wire commit_if_fire = commit_if.valid && commit_if.ready; - wire [DATAW-1:0] dispatch_if_data_enq = { - dispatch_if.data.uuid, - wis_to_wid(dispatch_if.data.wis, ISW), - dispatch_if.data.tmask, - dispatch_if.data.PC, - dispatch_if.data.wb, - dispatch_if.data.rd + wire [DATAW-1:0] execute_if_data_enq = { + execute_if.data.uuid, + 
execute_if.data.wid, + execute_if.data.tmask, + execute_if.data.PC, + execute_if.data.wb, + execute_if.data.rd }; - wire [DATAW-1:0] dispatch_if_data_deq; + wire [DATAW-1:0] execute_if_data_deq; // this is probably a little oversized VX_fifo_queue #( @@ -146,10 +186,10 @@ module VX_tensor_core_warp import VX_gpu_pkg::*; #( ) pending_uops ( .clk(clk), .reset(reset), - .push(dispatch_if_fire), + .push(execute_if_fire), .pop(commit_if_fire), - .data_in(dispatch_if_data_enq), - .data_out(dispatch_if_data_deq), + .data_in(execute_if_data_enq), + .data_out(execute_if_data_deq), `UNUSED_PIN(empty), `UNUSED_PIN(alm_empty), `UNUSED_PIN(full), // should be impossible to overflow @@ -163,7 +203,7 @@ module VX_tensor_core_warp import VX_gpu_pkg::*; #( localparam COMMIT_DATAW = `UUID_WIDTH + `NW_WIDTH + `NUM_THREADS + `XLEN + 1 + `NR_BITS + (`NUM_THREADS * `XLEN) + 1 + 1 + 1; wire [COMMIT_DATAW-1:0] commit_if_data = { - dispatch_if_data_deq, /* uuid ~ rd */ + execute_if_data_deq, /* uuid ~ rd */ subcommit == 1'b0 ? wb_data_0 : wb_data_1, /* data */ 1'b0, /* pid */ 1'b1, /* sop */ @@ -227,6 +267,10 @@ module VX_tensor_octet #( // note that not all lanes participate at every step case (step) 2'b00: begin + // Two A_in segments correspond to two 2x2 subtiles of A read + // by two threadgroups: [0:2,0:2] and [4:6,0:2] in Step 0 of + // Figure 10(b). B_in OTOH is shared by two threadgroups. + // Note k-dimension is shrunk from 4 to 2. 
A_half = { A_in[5:4], A_in[1:0] }; B_half = B_in[3:0]; end From 89e7d65926db97f5f4d8a422b39f30ed0609c9e2 Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Thu, 16 May 2024 12:49:15 -0700 Subject: [PATCH 03/31] tensor: Add ready signal to enforce 1 warp occupancy Currently disabled as the timing behavior is already ~accurate --- hw/dpi/float_dpi.cpp | 4 ++-- hw/rtl/VX_config.vh | 2 +- hw/rtl/core/VX_tensor_core.sv | 14 ++++++++++++++ hw/rtl/fpu/VX_tensor_dpu.sv | 13 +++++++++++-- 4 files changed, 28 insertions(+), 5 deletions(-) diff --git a/hw/dpi/float_dpi.cpp b/hw/dpi/float_dpi.cpp index 29ca22df..6a810555 100644 --- a/hw/dpi/float_dpi.cpp +++ b/hw/dpi/float_dpi.cpp @@ -347,7 +347,7 @@ void dpi_fmax(bool enable, int dst_fmt, int64_t a, int64_t b, int64_t* result, s // A is M * K, B is K * M, C is M * M, D is M * M #define M 4 -#define K 2 +#define K 2 // FIXME: 4x4x1 / cycle / octet! // all row major float c_A_tile[M][K]; @@ -551,7 +551,7 @@ void dpi_print_results(int wid, int octet, const svBitVecVal* A_tile, const svBi } steps[wid] += 1; - if (steps[wid] % 64 == 0) { + if (steps[wid] % 32 == 0) { steps[wid] = 0; std::cout << "warp " << wid << " finished wmma\n"; std::cout << "A tile" << "\n"; diff --git a/hw/rtl/VX_config.vh b/hw/rtl/VX_config.vh index 65d56e8a..5ef71794 100644 --- a/hw/rtl/VX_config.vh +++ b/hw/rtl/VX_config.vh @@ -391,7 +391,7 @@ // Tensor Core Latency `ifndef LATENCY_HMMA -`define LATENCY_HMMA 8 +`define LATENCY_HMMA 2 `endif // Icache Configurable Knobs ////////////////////////////////////////////////// diff --git a/hw/rtl/core/VX_tensor_core.sv b/hw/rtl/core/VX_tensor_core.sv index 14d8175b..185218fc 100644 --- a/hw/rtl/core/VX_tensor_core.sv +++ b/hw/rtl/core/VX_tensor_core.sv @@ -32,6 +32,10 @@ module VX_tensor_core import VX_gpu_pkg::*; #( .execute_if (execute_if) ); + // FIXME: when multiple warps are running, step0_0 from multiple warps can + // get interleaved before the first warp advances to step0_1, fucking + // everything up + 
VX_commit_if #( .NUM_LANES (NUM_LANES) ) commit_block_if[BLOCK_SIZE](); @@ -175,6 +179,7 @@ module VX_tensor_core_warp import VX_gpu_pkg::*; #( execute_if.data.PC, execute_if.data.wb, execute_if.data.rd + // pid/sop/eop set later }; wire [DATAW-1:0] execute_if_data_deq; @@ -320,8 +325,16 @@ module VX_tensor_octet #( end end + wire hmma_ready; wire stall = result_valid && ~result_ready; + // backpressure from commit assign operands_ready = ~stall; + // TODO: Below line is to only allow 1 warp to occupy the octet at a time; + // currently, dpu is fully-pipelined and allows concurrency between + // multiple warps. This seems to be not a problem though given that the + // RF operand read takes >=2 cycles, which should be the end-to-end + // latency of the DPU anyways + // assign operands_ready = hmma_ready && ~stall; // A is 4x2 fp32 matrix wire [3:0][1:0][31:0] A_tile = { @@ -359,6 +372,7 @@ module VX_tensor_octet #( .stall(stall), .valid_in(do_hmma), + .ready_in(hmma_ready), .A_tile(A_tile), .B_tile(B_tile), .C_tile(C_tile), diff --git a/hw/rtl/fpu/VX_tensor_dpu.sv b/hw/rtl/fpu/VX_tensor_dpu.sv index cfc5f507..63d35ae7 100644 --- a/hw/rtl/fpu/VX_tensor_dpu.sv +++ b/hw/rtl/fpu/VX_tensor_dpu.sv @@ -11,6 +11,7 @@ module VX_tensor_dpu #( input stall, input valid_in, + output ready_in, input [3:0][1:0][31:0] A_tile, input [1:0][3:0][31:0] B_tile, input [3:0][3:0][31:0] C_tile, @@ -24,12 +25,20 @@ module VX_tensor_dpu #( dpi_hmma(valid_in, A_tile, B_tile, C_tile, result_hmma); end + logic ready_reg; always @(posedge clk) begin - if (~reset && valid_in) begin + if (reset) begin + ready_reg <= '1; + end else if (valid_in) begin + ready_reg <= '0; dpi_print_results(int'(ISW), int'(OCTET), A_tile, B_tile, C_tile, result_hmma); + end else if (valid_out) begin + ready_reg <= '1; end end - + + // ready as soon as valid_out + assign ready_in = ready_reg || valid_out; VX_shift_register #( .DATAW (1 + $bits(D_tile)), From 317695a8d0b8ca69ad30d6b38236a0d6bfa7f90f Mon Sep 17 00:00:00 
2001 From: Hansung Kim Date: Thu, 16 May 2024 15:32:46 -0700 Subject: [PATCH 04/31] Add perf counters on LSU resp valid tmasks --- hw/rtl/core/VX_lsu_unit.sv | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/hw/rtl/core/VX_lsu_unit.sv b/hw/rtl/core/VX_lsu_unit.sv index b4fd6ee1..63f1d4c6 100644 --- a/hw/rtl/core/VX_lsu_unit.sv +++ b/hw/rtl/core/VX_lsu_unit.sv @@ -596,6 +596,31 @@ module VX_lsu_unit import VX_gpu_pkg::*; #( .commit_out_if (commit_if) ); +`ifdef PERF_ENABLE + wire [`CLOG2(NUM_LANES+1)-1:0] perf_rsp_tmask_valids_per_cycle; + wire [`CLOG2(NUM_LANES+1)-1:0] perf_rsp_tmask_total_per_cycle; + reg [`PERF_CTR_BITS-1:0] perf_rsp_tmask_valids; + reg [`PERF_CTR_BITS-1:0] perf_rsp_tmask_total; + reg [`PERF_CTR_BITS-1:0] perf_rsp_fires; + + `POP_COUNT(perf_rsp_tmask_valids_per_cycle, rsp_tmask); + assign perf_rsp_tmask_total_per_cycle = NUM_LANES; + + always @(posedge clk) begin + if (reset) begin + perf_rsp_tmask_valids <= '0; + perf_rsp_tmask_total <= '0; + perf_rsp_fires <= '0; + end else begin + if (mem_rsp_fire) begin + perf_rsp_tmask_valids <= perf_rsp_tmask_valids + perf_rsp_tmask_valids_per_cycle; + perf_rsp_tmask_total <= perf_rsp_tmask_total + perf_rsp_tmask_total_per_cycle; + perf_rsp_fires <= perf_rsp_fires + 1'b1; + end + end + end +`endif + `ifdef DBG_SCOPE_LSU if (CORE_ID == 0) begin `ifdef SCOPE From 5034d8d14b548fd42bf983666ac1ad0d1d00c091 Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Thu, 16 May 2024 20:07:30 -0700 Subject: [PATCH 05/31] tensor: Add buffer to hide 2cyc commit latency Since operand and commit throughput are the same (2 cycles), it is unnecessary to stall the dpu during the multi-cycle commit. This enables the dpu to operate at full throughput of 1 operand every 2 cycles. 
--- hw/rtl/VX_config.vh | 2 +- hw/rtl/core/VX_tensor_core.sv | 26 +++++++++++++++++++++++--- hw/rtl/fpu/VX_tensor_dpu.sv | 1 + 3 files changed, 25 insertions(+), 4 deletions(-) diff --git a/hw/rtl/VX_config.vh b/hw/rtl/VX_config.vh index 5ef71794..65d56e8a 100644 --- a/hw/rtl/VX_config.vh +++ b/hw/rtl/VX_config.vh @@ -391,7 +391,7 @@ // Tensor Core Latency `ifndef LATENCY_HMMA -`define LATENCY_HMMA 2 +`define LATENCY_HMMA 8 `endif // Icache Configurable Knobs ////////////////////////////////////////////////// diff --git a/hw/rtl/core/VX_tensor_core.sv b/hw/rtl/core/VX_tensor_core.sv index 185218fc..29bfb98c 100644 --- a/hw/rtl/core/VX_tensor_core.sv +++ b/hw/rtl/core/VX_tensor_core.sv @@ -326,8 +326,10 @@ module VX_tensor_octet #( end wire hmma_ready; - wire stall = result_valid && ~result_ready; + wire outbuf_ready_in; + // wire stall = result_valid && ~result_ready; // backpressure from commit + wire stall = ~outbuf_ready_in; assign operands_ready = ~stall; // TODO: Below line is to only allow 1 warp to occupy the octet at a time; // currently, dpu is fully-pipelined and allows concurrency between @@ -349,6 +351,7 @@ module VX_tensor_octet #( }; // C is 4x4 fp32 matrix logic [3:0][3:0][31:0] C_tile; + logic [3:0][3:0][31:0] D_tile; always @(*) begin C_tile = { @@ -360,6 +363,7 @@ module VX_tensor_octet #( end wire do_hmma = (substep == 1'b1 && operands_valid && operands_ready); + wire dpu_valid; // this does (m,n,k)=(4,4,2) matmul, modeling compute of a single octet VX_tensor_dpu #( @@ -377,8 +381,24 @@ module VX_tensor_octet #( .B_tile(B_tile), .C_tile(C_tile), - .valid_out(result_valid), - .D_tile(D_out) + .valid_out(dpu_valid), + .D_tile(D_tile) + ); + + // buffer to stage the result tile for 2 cycles until commit/writeback is + // complete + VX_stream_buffer #( + .DATAW ($bits(D_out)), + .OUT_REG (1) // not sure this is necessary + ) output_buffer ( + .clk (clk), + .reset (reset), + .valid_in (dpu_valid), + .ready_in (outbuf_ready_in), + .data_in (D_tile), + 
.data_out (D_out), + .ready_out (result_ready), + .valid_out (result_valid) ); endmodule `endif diff --git a/hw/rtl/fpu/VX_tensor_dpu.sv b/hw/rtl/fpu/VX_tensor_dpu.sv index 63d35ae7..4130fb98 100644 --- a/hw/rtl/fpu/VX_tensor_dpu.sv +++ b/hw/rtl/fpu/VX_tensor_dpu.sv @@ -40,6 +40,7 @@ module VX_tensor_dpu #( // ready as soon as valid_out assign ready_in = ready_reg || valid_out; + // fixed-latency model VX_shift_register #( .DATAW (1 + $bits(D_tile)), .DEPTH (`LATENCY_HMMA), From 45d86b26a2d32f6fdec33ec9a9be5df1f850f057 Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Thu, 16 May 2024 22:15:01 -0700 Subject: [PATCH 06/31] tensor: Add counter for dpu operations --- hw/rtl/core/VX_tensor_core.sv | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/hw/rtl/core/VX_tensor_core.sv b/hw/rtl/core/VX_tensor_core.sv index 29bfb98c..e37f5016 100644 --- a/hw/rtl/core/VX_tensor_core.sv +++ b/hw/rtl/core/VX_tensor_core.sv @@ -400,5 +400,19 @@ module VX_tensor_octet #( .ready_out (result_ready), .valid_out (result_valid) ); + +`ifdef PERF_ENABLE + logic [`PERF_CTR_BITS-1:0] perf_tensor_dpu_total; + + always @(posedge clk) begin + if (reset) begin + perf_tensor_dpu_total <= '0; + end else begin + if (do_hmma) begin + perf_tensor_dpu_total <= perf_tensor_dpu_total + 1'b1; + end + end + end +`endif endmodule `endif From 8775458a8fcab23d45edcbf81cfb2b2ff26aa18e Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Sat, 25 May 2024 19:08:17 -0700 Subject: [PATCH 07/31] Stage half-operands per warp An easy solution to handle multiple concurrent warp operations by staging half-operands in their own per-warp register. This might increase area requirement by quite a bit. 
TODO: Commit is not being handled correctly yet --- hw/rtl/core/VX_tensor_core.sv | 81 +++++++++++++++++++++++------------ 1 file changed, 54 insertions(+), 27 deletions(-) diff --git a/hw/rtl/core/VX_tensor_core.sv b/hw/rtl/core/VX_tensor_core.sv index e37f5016..b6b11754 100644 --- a/hw/rtl/core/VX_tensor_core.sv +++ b/hw/rtl/core/VX_tensor_core.sv @@ -83,6 +83,8 @@ module VX_tensor_core_warp import VX_gpu_pkg::*; #( localparam LANE_OFFSET_THREADGROUP = (4 * NUM_OCTETS); wire [1:0] step = 2'(execute_if.data.op_type); + wire operands_last_in_pair = (execute_if.data.op_mod == `INST_MOD_BITS'(1)); + logic [NUM_OCTETS-1:0] octet_results_valid; logic [NUM_OCTETS-1:0] octet_results_ready; logic [NUM_OCTETS-1:0] octet_operands_ready; @@ -111,6 +113,8 @@ module VX_tensor_core_warp import VX_gpu_pkg::*; #( logic [3:0][3:0][31:0] octet_D; logic result_valid; logic result_ready; + + // op_mod is reused to indicate instruction's id in pair VX_tensor_octet #( .ISW(ISW), .OCTET(i) @@ -122,6 +126,8 @@ module VX_tensor_core_warp import VX_gpu_pkg::*; #( .B_in(octet_B), .C_in(octet_C), .operands_valid(execute_if.valid), + .operands_wid(execute_if.data.wid), + .operands_last_in_pair(operands_last_in_pair), .operands_ready(octet_operands_ready[i]), .step(step), @@ -245,11 +251,14 @@ module VX_tensor_octet #( input clk, input reset, - input [7:0][31:0] A_in, - input [7:0][31:0] B_in, - input [7:0][31:0] C_in, - input operands_valid, // we have to backpressure due to there potentially being contention over commit - output operands_ready, + input [7:0][31:0] A_in, + input [7:0][31:0] B_in, + input [7:0][31:0] C_in, + input operands_valid, + input [`NW_WIDTH-1:0] operands_wid, + input operands_last_in_pair, + // we have to backpressure due to there potentially being contention over commit + output operands_ready, input [1:0] step, @@ -258,9 +267,9 @@ module VX_tensor_octet #( input result_ready ); // 512 bits/octet * 4 octets per warp - logic [3:0][31:0] A_buffer, A_buffer_n; - logic 
[3:0][31:0] B_buffer, B_buffer_n; - logic [7:0][31:0] C_buffer, C_buffer_n; + logic [`NUM_WARPS-1:0][3:0][31:0] A_buffer, A_buffer_n; + logic [`NUM_WARPS-1:0][3:0][31:0] B_buffer, B_buffer_n; + logic [`NUM_WARPS-1:0][7:0][31:0] C_buffer, C_buffer_n; // half the inputs are buffered, half are not (instead coming straight // from operand bus) unlike the real tensor core. @@ -268,6 +277,10 @@ module VX_tensor_octet #( logic [3:0][31:0] A_half; logic [3:0][31:0] B_half; logic [7:0][31:0] C_half; + + logic [`NUM_WARPS-1:0] substeps; + logic [`NUM_WARPS-1:0] substeps_n; + always @(*) begin // note that not all lanes participate at every step case (step) @@ -296,18 +309,29 @@ module VX_tensor_octet #( end logic substep; - wire substep_n = (operands_ready && operands_valid) ? ~substep : substep; + wire operands_fire = operands_ready && operands_valid; + wire substep_n = operands_fire && operands_last_in_pair; always @(*) begin A_buffer_n = A_buffer; B_buffer_n = B_buffer; C_buffer_n = C_buffer; + substeps_n = substeps; - if (substep == 1'b0) begin - A_buffer_n = A_half; - B_buffer_n = B_half; - C_buffer_n = C_half; + if (operands_fire) begin + substeps_n[operands_wid] = ~substeps[operands_wid]; + if (!operands_last_in_pair) begin + A_buffer_n[operands_wid] = A_half; + B_buffer_n[operands_wid] = B_half; + C_buffer_n[operands_wid] = C_half; + end end + + // if (operands_fire && (substep == 1'b0)) begin + // A_buffer_n[operands_wid] = A_half; + // B_buffer_n[operands_wid] = B_half; + // C_buffer_n[operands_wid] = C_half; + // end end always @(posedge clk) begin @@ -315,13 +339,17 @@ module VX_tensor_octet #( A_buffer <= '0; B_buffer <= '0; C_buffer <= '0; + substep <= '0; + substeps <= '0; end else begin A_buffer <= A_buffer_n; B_buffer <= B_buffer_n; C_buffer <= C_buffer_n; + substep <= substep_n; + substeps <= substeps_n; end end @@ -330,39 +358,38 @@ module VX_tensor_octet #( // wire stall = result_valid && ~result_ready; // backpressure from commit wire stall = 
~outbuf_ready_in; - assign operands_ready = ~stall; + // assign operands_ready = ~stall; // TODO: Below line is to only allow 1 warp to occupy the octet at a time; // currently, dpu is fully-pipelined and allows concurrency between // multiple warps. This seems to be not a problem though given that the // RF operand read takes >=2 cycles, which should be the end-to-end // latency of the DPU anyways - // assign operands_ready = hmma_ready && ~stall; + assign operands_ready = hmma_ready && ~stall; // A is 4x2 fp32 matrix wire [3:0][1:0][31:0] A_tile = { - { A_half[3], A_buffer[3] }, - { A_half[2], A_buffer[2] }, - { A_half[1], A_buffer[1] }, - { A_half[0], A_buffer[0] } + { A_half[3], A_buffer[operands_wid][3] }, + { A_half[2], A_buffer[operands_wid][2] }, + { A_half[1], A_buffer[operands_wid][1] }, + { A_half[0], A_buffer[operands_wid][0] } }; // B is 2x4 fp32 matrix wire [1:0][3:0][31:0] B_tile = { - B_half, B_buffer + B_half, B_buffer[operands_wid] }; // C is 4x4 fp32 matrix logic [3:0][3:0][31:0] C_tile; logic [3:0][3:0][31:0] D_tile; always @(*) begin - C_tile = { - C_half[7], C_buffer[7], C_half[5], C_buffer[5], - C_half[6], C_buffer[6], C_half[4], C_buffer[4], - C_half[3], C_buffer[3], C_half[1], C_buffer[1], - C_half[2], C_buffer[2], C_half[0], C_buffer[0] - }; + C_tile[3] = { C_half[7], C_buffer[operands_wid][7], C_half[5], C_buffer[operands_wid][5] }; + C_tile[2] = { C_half[6], C_buffer[operands_wid][6], C_half[4], C_buffer[operands_wid][4] }; + C_tile[1] = { C_half[3], C_buffer[operands_wid][3], C_half[1], C_buffer[operands_wid][1] }; + C_tile[0] = { C_half[2], C_buffer[operands_wid][2], C_half[0], C_buffer[operands_wid][0] }; end - wire do_hmma = (substep == 1'b1 && operands_valid && operands_ready); + // wire do_hmma = operands_fire && (substeps[operands_wid] == 1'b1); + wire do_hmma = operands_fire && operands_last_in_pair; wire dpu_valid; // this does (m,n,k)=(4,4,2) matmul, modeling compute of a single octet From 
5a95eba1f5424629be64dc2c927aae2b02662dec Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Sat, 25 May 2024 19:54:03 -0700 Subject: [PATCH 08/31] tensor: Clear c_*_tile before compute This didn't really cause any problem, but just to be sure. --- hw/dpi/float_dpi.cpp | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/hw/dpi/float_dpi.cpp b/hw/dpi/float_dpi.cpp index 6a810555..570d6bf2 100644 --- a/hw/dpi/float_dpi.cpp +++ b/hw/dpi/float_dpi.cpp @@ -358,6 +358,15 @@ float c_D_tile[M][M]; // code assumes that svBitVecVal is basically a uint32_t static_assert(sizeof(svBitVecVal) == 4); +void clear_float_array(float* c_tile, int rows, int cols) { + for (int i = 0; i < rows; i += 1) { + for (int j = 0; j < cols; j += 1) { + int index = i * cols + j; + c_tile[index] = 0.0f; + } + } +} + void fill_float_array(const svBitVecVal* sv_tile, float* c_tile, int rows, int cols) { for (int i = 0; i < rows; i += 1) { @@ -396,6 +405,11 @@ void dpi_hmma(bool enable, const svBitVecVal* A_tile, const svBitVecVal* B_tile, if (!enable) { return; } + clear_float_array(&c_A_tile[0][0], M, K); + clear_float_array(&c_B_tile[0][0], K, M); + clear_float_array(&c_C_tile[0][0], M, M); + clear_float_array(&c_D_tile[0][0], M, M); + // std::cout << "A: " << std::endl; fill_float_array(A_tile, &c_A_tile[0][0], M, K); // std::cout << "B: " << std::endl; From 864265bda5ee5115d0de15939ea59ba92145295b Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Sat, 25 May 2024 20:04:31 -0700 Subject: [PATCH 09/31] tensor: Fix consecutive commits to write to same warp ... by splitting the pending_uops queue across warps. 
--- hw/rtl/core/VX_tensor_core.sv | 76 ++++++++++++++++++++++------------- hw/rtl/fpu/VX_tensor_dpu.sv | 10 +++-- 2 files changed, 53 insertions(+), 33 deletions(-) diff --git a/hw/rtl/core/VX_tensor_core.sv b/hw/rtl/core/VX_tensor_core.sv index b6b11754..d1ee3b38 100644 --- a/hw/rtl/core/VX_tensor_core.sv +++ b/hw/rtl/core/VX_tensor_core.sv @@ -32,10 +32,6 @@ module VX_tensor_core import VX_gpu_pkg::*; #( .execute_if (execute_if) ); - // FIXME: when multiple warps are running, step0_0 from multiple warps can - // get interleaved before the first warp advances to step0_1, fucking - // everything up - VX_commit_if #( .NUM_LANES (NUM_LANES) ) commit_block_if[BLOCK_SIZE](); @@ -83,7 +79,7 @@ module VX_tensor_core_warp import VX_gpu_pkg::*; #( localparam LANE_OFFSET_THREADGROUP = (4 * NUM_OCTETS); wire [1:0] step = 2'(execute_if.data.op_type); - wire operands_last_in_pair = (execute_if.data.op_mod == `INST_MOD_BITS'(1)); + wire last_in_pair = (execute_if.data.op_mod == `INST_MOD_BITS'(1)); logic [NUM_OCTETS-1:0] octet_results_valid; logic [NUM_OCTETS-1:0] octet_results_ready; @@ -91,6 +87,7 @@ module VX_tensor_core_warp import VX_gpu_pkg::*; #( // FIXME: should be NUM_LANES? 
logic [`NUM_THREADS-1:0][`XLEN-1:0] wb_data_0; logic [`NUM_THREADS-1:0][`XLEN-1:0] wb_data_1; + wire [`NW_WIDTH-1:0] wb_wid; assign execute_if.ready = &octet_operands_ready; @@ -127,12 +124,13 @@ module VX_tensor_core_warp import VX_gpu_pkg::*; #( .C_in(octet_C), .operands_valid(execute_if.valid), .operands_wid(execute_if.data.wid), - .operands_last_in_pair(operands_last_in_pair), + .operands_last_in_pair(last_in_pair), .operands_ready(octet_operands_ready[i]), .step(step), .D_out(octet_D), + .D_wid(wb_wid), .result_valid(result_valid), .result_ready(result_ready) ); @@ -188,33 +186,49 @@ module VX_tensor_core_warp import VX_gpu_pkg::*; #( // pid/sop/eop set later }; - wire [DATAW-1:0] execute_if_data_deq; + wire [`NUM_WARPS-1:0][DATAW-1:0] execute_if_data_deq; - // this is probably a little oversized - VX_fifo_queue #( - .DATAW(DATAW), - .DEPTH(16) - ) pending_uops ( - .clk(clk), - .reset(reset), - .push(execute_if_fire), - .pop(commit_if_fire), - .data_in(execute_if_data_enq), - .data_out(execute_if_data_deq), - `UNUSED_PIN(empty), - `UNUSED_PIN(alm_empty), - `UNUSED_PIN(full), // should be impossible to overflow - `UNUSED_PIN(alm_full), - `UNUSED_PIN(size) - ); + for (genvar i = 0; i < `NUM_WARPS; i++) begin + wire enq = execute_if_fire && (execute_if.data.wid == i); + wire deq = commit_if_fire && (wb_wid == i); + logic full; + // execute_if request queue. + // This has to be separated per-warp, as otherwise requests from + // multiple warps can be enqueued interleaved, which makes it hard to + // ensure two consecutive dequeues are associated to the same warp for + // commit. 
+ VX_fifo_queue #( + .DATAW(DATAW), + .DEPTH(4 /* FIXME: arbitrary */) + ) pending_uops ( + .clk(clk), + .reset(reset), + .push(enq), + .pop(deq), + .data_in(execute_if_data_enq), + .data_out(execute_if_data_deq[i]), + `UNUSED_PIN(empty), + `UNUSED_PIN(alm_empty), + .full(full), // should be impossible to overflow + `UNUSED_PIN(alm_full), + `UNUSED_PIN(size) + ); + + `RUNTIME_ASSERT(!full, ("tensor core uop queue is full!")); + end + + // unlike execute which can be interleaved between warps, commit is + // serialized and completed one-warp-by-warp, therefore we only need to + // keep one subcommit state bit unlike for `substeps` logic subcommit, subcommit_n; + wire all_valid = (& octet_results_valid); assign commit_if.valid = all_valid; localparam COMMIT_DATAW = `UUID_WIDTH + `NW_WIDTH + `NUM_THREADS + `XLEN + 1 + `NR_BITS + (`NUM_THREADS * `XLEN) + 1 + 1 + 1; wire [COMMIT_DATAW-1:0] commit_if_data = { - execute_if_data_deq, /* uuid ~ rd */ + execute_if_data_deq[wb_wid], /* uuid ~ rd */ subcommit == 1'b0 ? 
wb_data_0 : wb_data_1, /* data */ 1'b0, /* pid */ 1'b1, /* sop */ @@ -263,6 +277,7 @@ module VX_tensor_octet #( input [1:0] step, output [3:0][3:0][31:0] D_out, + output [`NW_WIDTH-1:0] D_wid, output result_valid, input result_ready ); @@ -380,6 +395,7 @@ module VX_tensor_octet #( // C is 4x4 fp32 matrix logic [3:0][3:0][31:0] C_tile; logic [3:0][3:0][31:0] D_tile; + logic [`NW_WIDTH-1:0] D_warp_id; always @(*) begin C_tile[3] = { C_half[7], C_buffer[operands_wid][7], C_half[5], C_buffer[operands_wid][5] }; @@ -407,23 +423,25 @@ module VX_tensor_octet #( .A_tile(A_tile), .B_tile(B_tile), .C_tile(C_tile), + .warp_id(operands_wid), .valid_out(dpu_valid), - .D_tile(D_tile) + .D_tile(D_tile), + .D_warp_id(D_warp_id) ); // buffer to stage the result tile for 2 cycles until commit/writeback is // complete VX_stream_buffer #( - .DATAW ($bits(D_out)), + .DATAW ($bits(D_wid) + $bits(D_out)), .OUT_REG (1) // not sure this is necessary ) output_buffer ( .clk (clk), .reset (reset), .valid_in (dpu_valid), .ready_in (outbuf_ready_in), - .data_in (D_tile), - .data_out (D_out), + .data_in ({D_warp_id, D_tile}), + .data_out ({D_wid, D_out}), .ready_out (result_ready), .valid_out (result_valid) ); diff --git a/hw/rtl/fpu/VX_tensor_dpu.sv b/hw/rtl/fpu/VX_tensor_dpu.sv index 4130fb98..1ffbb6d3 100644 --- a/hw/rtl/fpu/VX_tensor_dpu.sv +++ b/hw/rtl/fpu/VX_tensor_dpu.sv @@ -15,9 +15,11 @@ module VX_tensor_dpu #( input [3:0][1:0][31:0] A_tile, input [1:0][3:0][31:0] B_tile, input [3:0][3:0][31:0] C_tile, + input [`NW_WIDTH-1:0] warp_id, output valid_out, - output [3:0][3:0][31:0] D_tile + output [3:0][3:0][31:0] D_tile, + output [`NW_WIDTH-1:0] D_warp_id ); logic [3:0][3:0][31:0] result_hmma; @@ -42,15 +44,15 @@ module VX_tensor_dpu #( // fixed-latency model VX_shift_register #( - .DATAW (1 + $bits(D_tile)), + .DATAW (1 + $bits(warp_id) + $bits(D_tile)), .DEPTH (`LATENCY_HMMA), .RESETW (1) ) shift_reg ( .clk (clk), .reset (reset), .enable (~stall), - .data_in ({valid_in, result_hmma}), - 
.data_out ({valid_out, D_tile}) + .data_in ({valid_in, warp_id, result_hmma}), + .data_out ({valid_out, D_warp_id, D_tile}) ); endmodule `endif From 28f6cd59b5dcfc8885827d33fd2e881a2c33e96e Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Sun, 26 May 2024 21:59:25 -0700 Subject: [PATCH 10/31] tensor: Improve commit efficiency by decoupling dpu with fifo --- hw/rtl/VX_config.vh | 2 +- hw/rtl/core/VX_tensor_core.sv | 42 +++++++++++++++++++++++------------ hw/rtl/fpu/VX_tensor_dpu.sv | 10 ++++----- 3 files changed, 34 insertions(+), 20 deletions(-) diff --git a/hw/rtl/VX_config.vh b/hw/rtl/VX_config.vh index 65d56e8a..5ef71794 100644 --- a/hw/rtl/VX_config.vh +++ b/hw/rtl/VX_config.vh @@ -391,7 +391,7 @@ // Tensor Core Latency `ifndef LATENCY_HMMA -`define LATENCY_HMMA 8 +`define LATENCY_HMMA 2 `endif // Icache Configurable Knobs ////////////////////////////////////////////////// diff --git a/hw/rtl/core/VX_tensor_core.sv b/hw/rtl/core/VX_tensor_core.sv index d1ee3b38..0612ca12 100644 --- a/hw/rtl/core/VX_tensor_core.sv +++ b/hw/rtl/core/VX_tensor_core.sv @@ -189,8 +189,8 @@ module VX_tensor_core_warp import VX_gpu_pkg::*; #( wire [`NUM_WARPS-1:0][DATAW-1:0] execute_if_data_deq; for (genvar i = 0; i < `NUM_WARPS; i++) begin - wire enq = execute_if_fire && (execute_if.data.wid == i); - wire deq = commit_if_fire && (wb_wid == i); + wire enq = execute_if_fire && (execute_if.data.wid == `NW_WIDTH'(i)); + wire deq = commit_if_fire && ( wb_wid == `NW_WIDTH'(i)); logic full; // execute_if request queue. 
@@ -395,7 +395,7 @@ module VX_tensor_octet #( // C is 4x4 fp32 matrix logic [3:0][3:0][31:0] C_tile; logic [3:0][3:0][31:0] D_tile; - logic [`NW_WIDTH-1:0] D_warp_id; + logic [`NW_WIDTH-1:0] D_wid_dpu; always @(*) begin C_tile[3] = { C_half[7], C_buffer[operands_wid][7], C_half[5], C_buffer[operands_wid][5] }; @@ -423,27 +423,41 @@ module VX_tensor_octet #( .A_tile(A_tile), .B_tile(B_tile), .C_tile(C_tile), - .warp_id(operands_wid), + .wid(operands_wid), .valid_out(dpu_valid), .D_tile(D_tile), - .D_warp_id(D_warp_id) + .D_wid(D_wid_dpu) ); + wire outbuf_empty; + wire outbuf_full; + assign outbuf_ready_in = ~outbuf_full; + assign result_valid = ~outbuf_empty; + + wire outbuf_enq = outbuf_ready_in && dpu_valid; + wire outbuf_deq = result_valid && result_ready; + // buffer to stage the result tile for 2 cycles until commit/writeback is - // complete - VX_stream_buffer #( + // complete. This decouples the irregular dpu output traffic from the + // regular, every-2-cycle commit traffic and thereby ensures the commit + // pipeline is used more efficiently. + // TODO: This is probably oversized. 
+ VX_fifo_queue #( .DATAW ($bits(D_wid) + $bits(D_out)), - .OUT_REG (1) // not sure this is necessary + .DEPTH (8 /* FIXME: arbitrary */) ) output_buffer ( - .clk (clk), + .clk (clk), .reset (reset), - .valid_in (dpu_valid), - .ready_in (outbuf_ready_in), - .data_in ({D_warp_id, D_tile}), + .push (outbuf_enq), + .pop (outbuf_deq), + .data_in ({D_wid_dpu, D_tile}), .data_out ({D_wid, D_out}), - .ready_out (result_ready), - .valid_out (result_valid) + .empty (outbuf_empty), + `UNUSED_PIN(alm_empty), + .full (outbuf_full), // should be impossible to overflow + `UNUSED_PIN(alm_full), + `UNUSED_PIN(size) ); `ifdef PERF_ENABLE diff --git a/hw/rtl/fpu/VX_tensor_dpu.sv b/hw/rtl/fpu/VX_tensor_dpu.sv index 1ffbb6d3..7a3ee41d 100644 --- a/hw/rtl/fpu/VX_tensor_dpu.sv +++ b/hw/rtl/fpu/VX_tensor_dpu.sv @@ -15,11 +15,11 @@ module VX_tensor_dpu #( input [3:0][1:0][31:0] A_tile, input [1:0][3:0][31:0] B_tile, input [3:0][3:0][31:0] C_tile, - input [`NW_WIDTH-1:0] warp_id, + input [`NW_WIDTH-1:0] wid, output valid_out, output [3:0][3:0][31:0] D_tile, - output [`NW_WIDTH-1:0] D_warp_id + output [`NW_WIDTH-1:0] D_wid ); logic [3:0][3:0][31:0] result_hmma; @@ -44,15 +44,15 @@ module VX_tensor_dpu #( // fixed-latency model VX_shift_register #( - .DATAW (1 + $bits(warp_id) + $bits(D_tile)), + .DATAW (1 + $bits(wid) + $bits(D_tile)), .DEPTH (`LATENCY_HMMA), .RESETW (1) ) shift_reg ( .clk (clk), .reset (reset), .enable (~stall), - .data_in ({valid_in, warp_id, result_hmma}), - .data_out ({valid_out, D_warp_id, D_tile}) + .data_in ({valid_in, wid, result_hmma}), + .data_out ({valid_out, D_wid, D_tile}) ); endmodule `endif From c03a5b070c4046b5a708c1799ea880249c85d2d5 Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Mon, 27 May 2024 18:24:24 -0700 Subject: [PATCH 11/31] tensor: Issue queue for dpu to improve utilization --- hw/rtl/core/VX_tensor_core.sv | 198 +++++++++++++++++++++++----------- hw/rtl/fpu/VX_tensor_dpu.sv | 2 +- 2 files changed, 138 insertions(+), 62 deletions(-) diff --git 
a/hw/rtl/core/VX_tensor_core.sv b/hw/rtl/core/VX_tensor_core.sv index 0612ca12..5f32f504 100644 --- a/hw/rtl/core/VX_tensor_core.sv +++ b/hw/rtl/core/VX_tensor_core.sv @@ -125,10 +125,9 @@ module VX_tensor_core_warp import VX_gpu_pkg::*; #( .operands_valid(execute_if.valid), .operands_wid(execute_if.data.wid), .operands_last_in_pair(last_in_pair), + .operands_step(step), .operands_ready(octet_operands_ready[i]), - .step(step), - .D_out(octet_D), .D_wid(wb_wid), .result_valid(result_valid), @@ -186,18 +185,38 @@ module VX_tensor_core_warp import VX_gpu_pkg::*; #( // pid/sop/eop set later }; + // wire [DATAW-1:0] execute_if_data_deq; + + // VX_fifo_queue #( + // .DATAW(DATAW), + // .DEPTH(4 /* FIXME: arbitrary */) + // ) pending_uops ( + // .clk(clk), + // .reset(reset), + // .push(execute_if_fire), + // .pop(commit_if_fire), + // .data_in(execute_if_data_enq), + // .data_out(execute_if_data_deq), + // `UNUSED_PIN(empty), + // `UNUSED_PIN(alm_empty), + // `UNUSED_PIN(full), // should be impossible to overflow + // `UNUSED_PIN(alm_full), + // `UNUSED_PIN(size) + // ); + wire [`NUM_WARPS-1:0][DATAW-1:0] execute_if_data_deq; for (genvar i = 0; i < `NUM_WARPS; i++) begin - wire enq = execute_if_fire && (execute_if.data.wid == `NW_WIDTH'(i)); - wire deq = commit_if_fire && ( wb_wid == `NW_WIDTH'(i)); - logic full; - // execute_if request queue. // This has to be separated per-warp, as otherwise requests from // multiple warps can be enqueued interleaved, which makes it hard to - // ensure two consecutive dequeues are associated to the same warp for + // ensure two consecutive dequeues are associated with the same warp for // commit. 
+ + wire enq = execute_if_fire && (execute_if.data.wid == `NW_WIDTH'(i)); + wire deq = commit_if_fire && ( wb_wid == `NW_WIDTH'(i)); + wire full; + VX_fifo_queue #( .DATAW(DATAW), .DEPTH(4 /* FIXME: arbitrary */) @@ -215,7 +234,7 @@ module VX_tensor_core_warp import VX_gpu_pkg::*; #( `UNUSED_PIN(size) ); - `RUNTIME_ASSERT(!full, ("tensor core uop queue is full!")); + `RUNTIME_ASSERT(!(!reset && full), ("tensor core uop queue is full!")); end // unlike execute which can be interleaved between warps, commit is @@ -229,6 +248,7 @@ module VX_tensor_core_warp import VX_gpu_pkg::*; #( localparam COMMIT_DATAW = `UUID_WIDTH + `NW_WIDTH + `NUM_THREADS + `XLEN + 1 + `NR_BITS + (`NUM_THREADS * `XLEN) + 1 + 1 + 1; wire [COMMIT_DATAW-1:0] commit_if_data = { execute_if_data_deq[wb_wid], /* uuid ~ rd */ + // execute_if_data_deq, /* uuid ~ rd */ subcommit == 1'b0 ? wb_data_0 : wb_data_1, /* data */ 1'b0, /* pid */ 1'b1, /* sop */ @@ -271,11 +291,10 @@ module VX_tensor_octet #( input operands_valid, input [`NW_WIDTH-1:0] operands_wid, input operands_last_in_pair, + input [1:0] operands_step, // we have to backpressure due to there potentially being contention over commit output operands_ready, - input [1:0] step, - output [3:0][3:0][31:0] D_out, output [`NW_WIDTH-1:0] D_wid, output result_valid, @@ -292,11 +311,73 @@ module VX_tensor_octet #( logic [3:0][31:0] A_half; logic [3:0][31:0] B_half; logic [7:0][31:0] C_half; + logic [3:0][31:0] A_half_buf; + logic [3:0][31:0] B_half_buf; + logic [7:0][31:0] C_half_buf; + logic [`NUM_WARPS-1:0] substeps; logic [`NUM_WARPS-1:0] substeps_n; - always @(*) begin + wire [7:0][31:0] A_in_buf; + wire [7:0][31:0] B_in_buf; + wire [7:0][31:0] C_in_buf; + wire operands_valid_buf; + wire operands_ready_buf; + wire [`NW_WIDTH-1:0] operands_wid_buf; + wire operands_last_in_pair_buf; + wire [1:0] operands_step_buf; + + wire inbuf_empty; + wire inbuf_full; + wire inbuf_ready_in; + assign inbuf_ready_in = !inbuf_full; + assign operands_ready = 
inbuf_ready_in; + assign operands_valid_buf = !inbuf_empty; + + wire inbuf_enq = operands_ready && operands_valid && operands_last_in_pair; + wire inbuf_deq = operands_valid_buf && operands_ready_buf; + + // the 'issue queue' for the dpu. + // This exists to decouple the input of the dot-product unit from + // execute_if.ready. execute_if can arrive intermittently according to + // the frontend's behavior, and since the dpu can also stall for a fixed + // initiation latency, we need to decouple the two to efficiently feed the + // dpu. + // This only applies to the last instruction in a pair, since the first + // instruction only acts to buffer the operands and can execute + // immediately without backpressure. So we don't enqueue them. + VX_fifo_queue #( + .DATAW ($bits(A_in) + $bits(B_in) + $bits(C_in) + + $bits(operands_wid) + $bits(operands_step) + $bits(operands_last_in_pair)), + .DEPTH (4 /* FIXME: arbitrary */) + ) input_buffer ( + .clk (clk), + .reset (reset), + .push (inbuf_enq), + .pop (inbuf_deq), + .data_in ({A_in, B_in, C_in, operands_wid, operands_step, operands_last_in_pair}), + .data_out ({A_in_buf, B_in_buf, C_in_buf, operands_wid_buf, operands_step_buf, operands_last_in_pair_buf}), + .empty (inbuf_empty), + `UNUSED_PIN(alm_empty), + .full (inbuf_full), + `UNUSED_PIN(alm_full), + `UNUSED_PIN(size) + ); + + typedef struct { + logic [3:0][31:0] A_half; + logic [3:0][31:0] B_half; + logic [7:0][31:0] C_half; + } half_t; + + function half_t get_operand_half( + input logic [1:0] step, + input logic [7:0][31:0] A_in, + input logic [7:0][31:0] B_in, + input logic [7:0][31:0] C_in + ); + half_t half; // note that not all lanes participate at every step case (step) 2'b00: begin @@ -304,28 +385,34 @@ module VX_tensor_octet #( // by two threadgroups: [0:2,0:2] and [4:6,0:2] in Step 0 of // Figure 10(b). B_in OTOH is shared by two threadgroups. // Note k-dimension is shrunk from 4 to 2. 
- A_half = { A_in[5:4], A_in[1:0] }; - B_half = B_in[3:0]; + half.A_half = { A_in[5:4], A_in[1:0] }; + half.B_half = B_in[3:0]; end 2'b01: begin - A_half = { A_in[7:6], A_in[3:2] }; - B_half = B_in[3:0]; + half.A_half = { A_in[7:6], A_in[3:2] }; + half.B_half = B_in[3:0]; end 2'b10: begin - A_half = { A_in[5:4], A_in[1:0] }; - B_half = B_in[7:4]; + half.A_half = { A_in[5:4], A_in[1:0] }; + half.B_half = B_in[7:4]; end 2'b11: begin - A_half = { A_in[7:6], A_in[3:2] }; - B_half = B_in[7:4]; + half.A_half = { A_in[7:6], A_in[3:2] }; + half.B_half = B_in[7:4]; end endcase - C_half = C_in; - end + half.C_half = C_in; + return half; + endfunction - logic substep; - wire operands_fire = operands_ready && operands_valid; - wire substep_n = operands_fire && operands_last_in_pair; + half_t halves; + half_t halves_buf; + assign halves = get_operand_half(operands_step, A_in, B_in, C_in); + assign halves_buf = get_operand_half(operands_step_buf, A_in_buf, B_in_buf, C_in_buf); + + wire do_hmma = operands_ready_buf && operands_valid_buf && operands_last_in_pair_buf; + wire operands_first_in_pair_fire = operands_ready && operands_valid && (!operands_last_in_pair); + // wire operands_first_in_pair_fire = operands_ready && operands_valid; always @(*) begin A_buffer_n = A_buffer; @@ -333,20 +420,15 @@ module VX_tensor_octet #( C_buffer_n = C_buffer; substeps_n = substeps; - if (operands_fire) begin - substeps_n[operands_wid] = ~substeps[operands_wid]; - if (!operands_last_in_pair) begin - A_buffer_n[operands_wid] = A_half; - B_buffer_n[operands_wid] = B_half; - C_buffer_n[operands_wid] = C_half; - end + if (operands_first_in_pair_fire) begin + substeps_n[operands_wid] = 1'b1; // ready for hmma + A_buffer_n[operands_wid] = halves.A_half; + B_buffer_n[operands_wid] = halves.B_half; + C_buffer_n[operands_wid] = halves.C_half; + end + if (do_hmma) begin + substeps_n[operands_wid_buf] = 1'b0; // finished hmma, ready for next operand end - - // if (operands_fire && (substep == 1'b0)) begin 
- // A_buffer_n[operands_wid] = A_half; - // B_buffer_n[operands_wid] = B_half; - // C_buffer_n[operands_wid] = C_half; - // end end always @(posedge clk) begin @@ -354,43 +436,39 @@ module VX_tensor_octet #( A_buffer <= '0; B_buffer <= '0; C_buffer <= '0; - - substep <= '0; substeps <= '0; end else begin A_buffer <= A_buffer_n; B_buffer <= B_buffer_n; C_buffer <= C_buffer_n; - - substep <= substep_n; substeps <= substeps_n; end end - wire hmma_ready; wire outbuf_ready_in; - // wire stall = result_valid && ~result_ready; // backpressure from commit wire stall = ~outbuf_ready_in; + wire hmma_ready; + // assign operands_ready = ~stall; // TODO: Below line is to only allow 1 warp to occupy the octet at a time; // currently, dpu is fully-pipelined and allows concurrency between // multiple warps. This seems to be not a problem though given that the // RF operand read takes >=2 cycles, which should be the end-to-end // latency of the DPU anyways - assign operands_ready = hmma_ready && ~stall; + assign operands_ready_buf = hmma_ready && ~stall; // A is 4x2 fp32 matrix wire [3:0][1:0][31:0] A_tile = { - { A_half[3], A_buffer[operands_wid][3] }, - { A_half[2], A_buffer[operands_wid][2] }, - { A_half[1], A_buffer[operands_wid][1] }, - { A_half[0], A_buffer[operands_wid][0] } + { halves_buf.A_half[3], A_buffer[operands_wid_buf][3] }, + { halves_buf.A_half[2], A_buffer[operands_wid_buf][2] }, + { halves_buf.A_half[1], A_buffer[operands_wid_buf][1] }, + { halves_buf.A_half[0], A_buffer[operands_wid_buf][0] } }; // B is 2x4 fp32 matrix wire [1:0][3:0][31:0] B_tile = { - B_half, B_buffer[operands_wid] + halves_buf.B_half, B_buffer[operands_wid_buf] }; // C is 4x4 fp32 matrix logic [3:0][3:0][31:0] C_tile; @@ -398,14 +476,12 @@ module VX_tensor_octet #( logic [`NW_WIDTH-1:0] D_wid_dpu; always @(*) begin - C_tile[3] = { C_half[7], C_buffer[operands_wid][7], C_half[5], C_buffer[operands_wid][5] }; - C_tile[2] = { C_half[6], C_buffer[operands_wid][6], C_half[4], 
C_buffer[operands_wid][4] }; - C_tile[1] = { C_half[3], C_buffer[operands_wid][3], C_half[1], C_buffer[operands_wid][1] }; - C_tile[0] = { C_half[2], C_buffer[operands_wid][2], C_half[0], C_buffer[operands_wid][0] }; + C_tile[3] = { halves_buf.C_half[7], C_buffer[operands_wid_buf][7], halves_buf.C_half[5], C_buffer[operands_wid_buf][5] }; + C_tile[2] = { halves_buf.C_half[6], C_buffer[operands_wid_buf][6], halves_buf.C_half[4], C_buffer[operands_wid_buf][4] }; + C_tile[1] = { halves_buf.C_half[3], C_buffer[operands_wid_buf][3], halves_buf.C_half[1], C_buffer[operands_wid_buf][1] }; + C_tile[0] = { halves_buf.C_half[2], C_buffer[operands_wid_buf][2], halves_buf.C_half[0], C_buffer[operands_wid_buf][0] }; end - // wire do_hmma = operands_fire && (substeps[operands_wid] == 1'b1); - wire do_hmma = operands_fire && operands_last_in_pair; wire dpu_valid; // this does (m,n,k)=(4,4,2) matmul, modeling compute of a single octet @@ -423,7 +499,7 @@ module VX_tensor_octet #( .A_tile(A_tile), .B_tile(B_tile), .C_tile(C_tile), - .wid(operands_wid), + .wid(operands_wid_buf), .valid_out(dpu_valid), .D_tile(D_tile), @@ -438,14 +514,14 @@ module VX_tensor_octet #( wire outbuf_enq = outbuf_ready_in && dpu_valid; wire outbuf_deq = result_valid && result_ready; - // buffer to stage the result tile for 2 cycles until commit/writeback is - // complete. This decouples the irregular dpu output traffic from the - // regular, every-2-cycle commit traffic and thereby ensures the commit - // pipeline is used more efficiently. + // buffer to stage the result D tile for 2 cycles until commit/writeback + // is complete. This decouples the irregular dpu output traffic from the + // regular, every-2-cycle commit traffic to ensure the commit pipeline is + // used more efficiently. // TODO: This is probably oversized. 
VX_fifo_queue #( .DATAW ($bits(D_wid) + $bits(D_out)), - .DEPTH (8 /* FIXME: arbitrary */) + .DEPTH (4 /* FIXME: arbitrary */) ) output_buffer ( .clk (clk), .reset (reset), diff --git a/hw/rtl/fpu/VX_tensor_dpu.sv b/hw/rtl/fpu/VX_tensor_dpu.sv index 7a3ee41d..7e96a296 100644 --- a/hw/rtl/fpu/VX_tensor_dpu.sv +++ b/hw/rtl/fpu/VX_tensor_dpu.sv @@ -51,7 +51,7 @@ module VX_tensor_dpu #( .clk (clk), .reset (reset), .enable (~stall), - .data_in ({valid_in, wid, result_hmma}), + .data_in ({valid_in && ready_in, wid, result_hmma}), .data_out ({valid_out, D_wid, D_tile}) ); endmodule From e9df173745295d78acabf9613098186f2df5d164 Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Wed, 29 May 2024 13:34:25 -0700 Subject: [PATCH 12/31] tensor: Use chisel-generated dpu module --- hw/rtl/fpu/VX_tensor_dpu.sv | 113 ++++++++++++++++++++++++++++++++++-- 1 file changed, 108 insertions(+), 5 deletions(-) diff --git a/hw/rtl/fpu/VX_tensor_dpu.sv b/hw/rtl/fpu/VX_tensor_dpu.sv index 7e96a296..33529370 100644 --- a/hw/rtl/fpu/VX_tensor_dpu.sv +++ b/hw/rtl/fpu/VX_tensor_dpu.sv @@ -42,17 +42,120 @@ module VX_tensor_dpu #( // ready as soon as valid_out assign ready_in = ready_reg || valid_out; - // fixed-latency model + // fully pipelined; always ready + // assign ready_in = 1'b1; + + // wire dpu_valid; + // wire [31:0] dpu_data; + // TensorDotProductUnit dpu_pipe ( + // .clock (clk), + // .reset (reset), + // .io_in_valid (valid_in && ready_in), + // .io_in_bits_a_0 (32'h40000000), + // .io_in_bits_a_1 (32'h40000000), + // .io_in_bits_a_2 (32'h40000000), + // .io_in_bits_a_3 (32'h40000000), + // .io_in_bits_b_0 (32'h40000000), + // .io_in_bits_b_1 (32'h40000000), + // .io_in_bits_b_2 (32'h40000000), + // .io_in_bits_b_3 (32'h40000000), + // .io_in_bits_c (32'h3f800000), + // .io_out_valid (dpu_valid), + // .io_out_bits_data (dpu_data) + // ); + + logic [1:0] threadgroup_valids; + // B_tile is shared across the two threadgroups; see Figure 13 + VX_tensor_threadgroup #( + ) threadgroup_0 ( + 
.clk (clk), + .reset (reset), + .valid_in (valid_in && ready_in), + .stall (stall), + .A_frag (A_tile[1:0]), + .B_frag (B_tile), + .C_frag (C_tile[1:0]), + .valid_out (threadgroup_valids[0]), + .D_frag (D_tile[1:0]) + ); + VX_tensor_threadgroup #( + ) threadgroup_1 ( + .clk (clk), + .reset (reset), + .valid_in (valid_in && ready_in), + .stall (stall), + .A_frag (A_tile[3:2]), + .B_frag (B_tile), + .C_frag (C_tile[3:2]), + .valid_out (threadgroup_valids[1]), + .D_frag (D_tile[3:2]) + ); + + // fixed-latency queue VX_shift_register #( - .DATAW (1 + $bits(wid) + $bits(D_tile)), - .DEPTH (`LATENCY_HMMA), + .DATAW (1 + $bits(wid)/* + $bits(D_tile)*/), + // .DEPTH (`LATENCY_HMMA), + .DEPTH (2), .RESETW (1) ) shift_reg ( .clk (clk), .reset (reset), .enable (~stall), - .data_in ({valid_in && ready_in, wid, result_hmma}), - .data_out ({valid_out, D_wid, D_tile}) + .data_in ({valid_in && ready_in, wid /*, result_hmma*/}), + .data_out ({valid_out, D_wid/*, D_tile */}) ); + + // FIXME: breaks when stall is on! + `RUNTIME_ASSERT(reset || (&(threadgroup_valids) == valid_out), + ("FEDP and metadata queue went out of sync!")) endmodule + +// does (m,n,k) = (2,4,2) matmul compute over 2 cycles. +// matches Figure 10(b) of the paper. 
+module VX_tensor_threadgroup #( +) ( + input clk, + input reset, + + input valid_in, + input stall, + input [1:0][1:0][31:0] A_frag, + input [1:0][3:0][31:0] B_frag, + input [1:0][3:0][31:0] C_frag, + + output valid_out, + output [1:0][3:0][31:0] D_frag + +); + // 4 FEDPs per threadgroup + // FIXME: experimenting with 8 FEDPs first + logic [1:0][3:0] valids; + for (genvar D_row = 0; D_row < 2; ++D_row) begin + for (genvar D_col = 0; D_col < 4; ++D_col) begin + // four-element dot product (FEDP) unit + TensorDotProductUnit fedp ( + .clock (clk), + .reset (reset), + .io_in_valid (valid_in), + .io_in_bits_a_0 (A_frag[D_row][0]), + .io_in_bits_a_1 (A_frag[D_row][1]), + .io_in_bits_a_2 (32'h0), + .io_in_bits_a_3 (32'h0), + .io_in_bits_b_0 (B_frag[0][D_col]), + .io_in_bits_b_1 (B_frag[1][D_col]), + .io_in_bits_b_2 (32'h0), + .io_in_bits_b_3 (32'h0), + .io_in_bits_c (C_frag[D_row][D_col]), + .io_stall (1'b0), // FIXME + .io_out_valid (valids[D_row][D_col]), + .io_out_bits_data (D_frag[D_row][D_col]) + ); + end + end + + assign valid_out = (&(valids[0])) && (&(valids[1])); + + `RUNTIME_ASSERT(reset || !stall, ("stall not supported yet in tensor dpu!")) +endmodule + `endif From f5a9ca5bf31fc4ddc70a81b5c7a5e6d8bc751697 Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Wed, 29 May 2024 14:47:25 -0700 Subject: [PATCH 13/31] tensor: Enqueue both insts in pair to issue queue Otherwise the first-in-pair instructions can run ahead, latching their inputs for the next pair before the second-in-pair insts finish compute on the current one. 
This might introduce more frontend stalls; more experimentation is needed. --- hw/rtl/core/VX_tensor_core.sv | 27 +++++++++++++++++---------- hw/rtl/fpu/VX_tensor_dpu.sv | 2 +- 2 files changed, 18 insertions(+), 11 deletions(-) diff --git a/hw/rtl/core/VX_tensor_core.sv b/hw/rtl/core/VX_tensor_core.sv index 5f32f504..2fc54fc5 100644 --- a/hw/rtl/core/VX_tensor_core.sv +++ b/hw/rtl/core/VX_tensor_core.sv @@ -219,7 +219,7 @@ module VX_tensor_core_warp import VX_gpu_pkg::*; #( VX_fifo_queue #( .DATAW(DATAW), - .DEPTH(4 /* FIXME: arbitrary */) + .DEPTH(8 /* FIXME: arbitrary */) ) pending_uops ( .clk(clk), .reset(reset), @@ -335,7 +335,8 @@ module VX_tensor_octet #( assign operands_ready = inbuf_ready_in; assign operands_valid_buf = !inbuf_empty; - wire inbuf_enq = operands_ready && operands_valid && operands_last_in_pair; + // wire inbuf_enq = operands_ready && operands_valid && operands_last_in_pair; + wire inbuf_enq = operands_ready && operands_valid; wire inbuf_deq = operands_valid_buf && operands_ready_buf; // the 'issue queue' for the dpu. 
@@ -350,7 +351,7 @@ module VX_tensor_octet #( VX_fifo_queue #( .DATAW ($bits(A_in) + $bits(B_in) + $bits(C_in) + $bits(operands_wid) + $bits(operands_step) + $bits(operands_last_in_pair)), - .DEPTH (4 /* FIXME: arbitrary */) + .DEPTH (8 /* FIXME: arbitrary */) ) input_buffer ( .clk (clk), .reset (reset), @@ -365,6 +366,9 @@ module VX_tensor_octet #( `UNUSED_PIN(size) ); + // FIXME: this shouldn't be necessary + `RUNTIME_ASSERT(reset || !inbuf_full, ("dpu issue queue is full!")) + typedef struct { logic [3:0][31:0] A_half; logic [3:0][31:0] B_half; @@ -411,8 +415,8 @@ module VX_tensor_octet #( assign halves_buf = get_operand_half(operands_step_buf, A_in_buf, B_in_buf, C_in_buf); wire do_hmma = operands_ready_buf && operands_valid_buf && operands_last_in_pair_buf; - wire operands_first_in_pair_fire = operands_ready && operands_valid && (!operands_last_in_pair); - // wire operands_first_in_pair_fire = operands_ready && operands_valid; + // wire operands_first_in_pair_fire = operands_ready && operands_valid && (!operands_last_in_pair); + wire operands_first_in_pair_fire = operands_ready_buf && operands_valid_buf && (!operands_last_in_pair_buf); always @(*) begin A_buffer_n = A_buffer; @@ -421,10 +425,10 @@ module VX_tensor_octet #( substeps_n = substeps; if (operands_first_in_pair_fire) begin - substeps_n[operands_wid] = 1'b1; // ready for hmma - A_buffer_n[operands_wid] = halves.A_half; - B_buffer_n[operands_wid] = halves.B_half; - C_buffer_n[operands_wid] = halves.C_half; + substeps_n[operands_wid_buf] = 1'b1; // ready for hmma + A_buffer_n[operands_wid_buf] = halves_buf.A_half; + B_buffer_n[operands_wid_buf] = halves_buf.B_half; + C_buffer_n[operands_wid_buf] = halves_buf.C_half; end if (do_hmma) begin substeps_n[operands_wid_buf] = 1'b0; // finished hmma, ready for next operand @@ -521,7 +525,7 @@ module VX_tensor_octet #( // TODO: This is probably oversized. 
VX_fifo_queue #( .DATAW ($bits(D_wid) + $bits(D_out)), - .DEPTH (4 /* FIXME: arbitrary */) + .DEPTH (8 /* FIXME: arbitrary */) ) output_buffer ( .clk (clk), .reset (reset), @@ -536,6 +540,9 @@ module VX_tensor_octet #( `UNUSED_PIN(size) ); + // FIXME: this shouldn't be necessary + `RUNTIME_ASSERT(reset || !outbuf_full, ("dpu result queue is full!")) + `ifdef PERF_ENABLE logic [`PERF_CTR_BITS-1:0] perf_tensor_dpu_total; diff --git a/hw/rtl/fpu/VX_tensor_dpu.sv b/hw/rtl/fpu/VX_tensor_dpu.sv index 33529370..90c2c7ed 100644 --- a/hw/rtl/fpu/VX_tensor_dpu.sv +++ b/hw/rtl/fpu/VX_tensor_dpu.sv @@ -95,7 +95,7 @@ module VX_tensor_dpu #( VX_shift_register #( .DATAW (1 + $bits(wid)/* + $bits(D_tile)*/), // .DEPTH (`LATENCY_HMMA), - .DEPTH (2), + .DEPTH (4), .RESETW (1) ) shift_reg ( .clk (clk), From 5ed6041e33bf6c00000bdf322bd814f1c27b71e5 Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Wed, 29 May 2024 17:05:12 -0700 Subject: [PATCH 14/31] tensor: Properly stall dpu upon commit backpressure & better-reasoned queue depths --- hw/rtl/core/VX_tensor_core.sv | 29 ++++++++++++++--------------- hw/rtl/fpu/VX_tensor_dpu.sv | 28 ++++++++++++---------------- 2 files changed, 26 insertions(+), 31 deletions(-) diff --git a/hw/rtl/core/VX_tensor_core.sv b/hw/rtl/core/VX_tensor_core.sv index 2fc54fc5..71b17e08 100644 --- a/hw/rtl/core/VX_tensor_core.sv +++ b/hw/rtl/core/VX_tensor_core.sv @@ -77,6 +77,7 @@ module VX_tensor_core_warp import VX_gpu_pkg::*; #( // octet. E.g. two tgs map lane 0-3 and lane 16-19 -> 16 // FIXME: not sure this is the right logic. 
just filling in what works localparam LANE_OFFSET_THREADGROUP = (4 * NUM_OCTETS); + localparam REQ_QUEUE_DEPTH = 4; wire [1:0] step = 2'(execute_if.data.op_type); wire last_in_pair = (execute_if.data.op_mod == `INST_MOD_BITS'(1)); @@ -219,7 +220,7 @@ module VX_tensor_core_warp import VX_gpu_pkg::*; #( VX_fifo_queue #( .DATAW(DATAW), - .DEPTH(8 /* FIXME: arbitrary */) + .DEPTH(REQ_QUEUE_DEPTH) ) pending_uops ( .clk(clk), .reset(reset), @@ -234,6 +235,8 @@ module VX_tensor_core_warp import VX_gpu_pkg::*; #( `UNUSED_PIN(size) ); + // this shouldn't really happen unless there's a big contention over + // the commit stage `RUNTIME_ASSERT(!(!reset && full), ("tensor core uop queue is full!")); end @@ -300,6 +303,8 @@ module VX_tensor_octet #( output result_valid, input result_ready ); + localparam ISSUE_QUEUE_DEPTH = 4; + // 512 bits/octet * 4 octets per warp logic [`NUM_WARPS-1:0][3:0][31:0] A_buffer, A_buffer_n; logic [`NUM_WARPS-1:0][3:0][31:0] B_buffer, B_buffer_n; @@ -351,7 +356,7 @@ module VX_tensor_octet #( VX_fifo_queue #( .DATAW ($bits(A_in) + $bits(B_in) + $bits(C_in) + $bits(operands_wid) + $bits(operands_step) + $bits(operands_last_in_pair)), - .DEPTH (8 /* FIXME: arbitrary */) + .DEPTH (ISSUE_QUEUE_DEPTH) ) input_buffer ( .clk (clk), .reset (reset), @@ -451,17 +456,8 @@ module VX_tensor_octet #( end wire outbuf_ready_in; - // backpressure from commit - wire stall = ~outbuf_ready_in; wire hmma_ready; - - // assign operands_ready = ~stall; - // TODO: Below line is to only allow 1 warp to occupy the octet at a time; - // currently, dpu is fully-pipelined and allows concurrency between - // multiple warps. 
This seems to be not a problem though given that the - // RF operand read takes >=2 cycles, which should be the end-to-end - // latency of the DPU anyways - assign operands_ready_buf = hmma_ready && ~stall; + assign operands_ready_buf = hmma_ready; // A is 4x2 fp32 matrix wire [3:0][1:0][31:0] A_tile = { @@ -496,8 +492,6 @@ module VX_tensor_octet #( .clk(clk), .reset(reset), - .stall(stall), - .valid_in(do_hmma), .ready_in(hmma_ready), .A_tile(A_tile), @@ -506,12 +500,14 @@ module VX_tensor_octet #( .wid(operands_wid_buf), .valid_out(dpu_valid), + .ready_out(outbuf_ready_in), .D_tile(D_tile), .D_wid(D_wid_dpu) ); wire outbuf_empty; wire outbuf_full; + // backpressure from commit assign outbuf_ready_in = ~outbuf_full; assign result_valid = ~outbuf_empty; @@ -525,7 +521,10 @@ module VX_tensor_octet #( // TODO: This is probably oversized. VX_fifo_queue #( .DATAW ($bits(D_wid) + $bits(D_out)), - .DEPTH (8 /* FIXME: arbitrary */) + // depth of this queue should ideally be deeper than the dpu pipeline + // latency, since the dpu is fully-pipelined and it can output the + // latency-number of outputs in a burst-y way. 
+ .DEPTH (`LATENCY_HMMA + `LATENCY_HMMA) ) output_buffer ( .clk (clk), .reset (reset), diff --git a/hw/rtl/fpu/VX_tensor_dpu.sv b/hw/rtl/fpu/VX_tensor_dpu.sv index 90c2c7ed..51112c96 100644 --- a/hw/rtl/fpu/VX_tensor_dpu.sv +++ b/hw/rtl/fpu/VX_tensor_dpu.sv @@ -8,8 +8,6 @@ module VX_tensor_dpu #( input clk, input reset, - input stall, - input valid_in, output ready_in, input [3:0][1:0][31:0] A_tile, @@ -18,6 +16,7 @@ module VX_tensor_dpu #( input [`NW_WIDTH-1:0] wid, output valid_out, + input ready_out, output [3:0][3:0][31:0] D_tile, output [`NW_WIDTH-1:0] D_wid ); @@ -40,10 +39,11 @@ module VX_tensor_dpu #( end // ready as soon as valid_out - assign ready_in = ready_reg || valid_out; + // assign ready_in = ready_reg || valid_out; - // fully pipelined; always ready - // assign ready_in = 1'b1; + // fully pipelined; ready_in is coupled to ready_out by immediately + // stalling + assign ready_in = ready_out; // wire dpu_valid; // wire [31:0] dpu_data; @@ -70,8 +70,8 @@ module VX_tensor_dpu #( ) threadgroup_0 ( .clk (clk), .reset (reset), - .valid_in (valid_in && ready_in), - .stall (stall), + .valid_in (valid_in), + .stall (!ready_out), .A_frag (A_tile[1:0]), .B_frag (B_tile), .C_frag (C_tile[1:0]), @@ -82,8 +82,8 @@ module VX_tensor_dpu #( ) threadgroup_1 ( .clk (clk), .reset (reset), - .valid_in (valid_in && ready_in), - .stall (stall), + .valid_in (valid_in), + .stall (!ready_out), .A_frag (A_tile[3:2]), .B_frag (B_tile), .C_frag (C_tile[3:2]), @@ -94,18 +94,16 @@ module VX_tensor_dpu #( // fixed-latency queue VX_shift_register #( .DATAW (1 + $bits(wid)/* + $bits(D_tile)*/), - // .DEPTH (`LATENCY_HMMA), - .DEPTH (4), + .DEPTH (`LATENCY_HMMA), .RESETW (1) ) shift_reg ( .clk (clk), .reset (reset), - .enable (~stall), + .enable (ready_out), .data_in ({valid_in && ready_in, wid /*, result_hmma*/}), .data_out ({valid_out, D_wid/*, D_tile */}) ); - // FIXME: breaks when stall is on! 
`RUNTIME_ASSERT(reset || (&(threadgroup_valids) == valid_out), ("FEDP and metadata queue went out of sync!")) endmodule @@ -146,7 +144,7 @@ module VX_tensor_threadgroup #( .io_in_bits_b_2 (32'h0), .io_in_bits_b_3 (32'h0), .io_in_bits_c (C_frag[D_row][D_col]), - .io_stall (1'b0), // FIXME + .io_stall (stall), .io_out_valid (valids[D_row][D_col]), .io_out_bits_data (D_frag[D_row][D_col]) ); @@ -154,8 +152,6 @@ module VX_tensor_threadgroup #( end assign valid_out = (&(valids[0])) && (&(valids[1])); - - `RUNTIME_ASSERT(reset || !stall, ("stall not supported yet in tensor dpu!")) endmodule `endif From 35273b3d742b4391a4edc1c689b1acf85b75f4d6 Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Wed, 29 May 2024 17:14:54 -0700 Subject: [PATCH 15/31] Set correct dpu hmma latency --- hw/rtl/VX_config.vh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hw/rtl/VX_config.vh b/hw/rtl/VX_config.vh index 5ef71794..8905bd3d 100644 --- a/hw/rtl/VX_config.vh +++ b/hw/rtl/VX_config.vh @@ -391,7 +391,7 @@ // Tensor Core Latency `ifndef LATENCY_HMMA -`define LATENCY_HMMA 2 +`define LATENCY_HMMA 4 `endif // Icache Configurable Knobs ////////////////////////////////////////////////// From 73a2f5781e242746f076eadaed36a391bcf34951 Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Wed, 29 May 2024 22:01:03 -0700 Subject: [PATCH 16/31] Do two-cycle compute with 1 FEDP per lane --- hw/rtl/fpu/VX_tensor_dpu.sv | 199 +++++++++++++++++++++++++++--------- 1 file changed, 150 insertions(+), 49 deletions(-) diff --git a/hw/rtl/fpu/VX_tensor_dpu.sv b/hw/rtl/fpu/VX_tensor_dpu.sv index 51112c96..faace3f0 100644 --- a/hw/rtl/fpu/VX_tensor_dpu.sv +++ b/hw/rtl/fpu/VX_tensor_dpu.sv @@ -30,47 +30,43 @@ module VX_tensor_dpu #( always @(posedge clk) begin if (reset) begin ready_reg <= '1; - end else if (valid_in) begin + end else if (valid_in && ready_in) begin ready_reg <= '0; dpi_print_results(int'(ISW), int'(OCTET), A_tile, B_tile, C_tile, result_hmma); - end else if (valid_out) begin + end 
else if (valid_out && ready_out) begin ready_reg <= '1; end end // ready as soon as valid_out - // assign ready_in = ready_reg || valid_out; + // assign ready_in = ready_reg; // fully pipelined; ready_in is coupled to ready_out by immediately // stalling - assign ready_in = ready_out; + // assign ready_in = ready_out; - // wire dpu_valid; - // wire [31:0] dpu_data; - // TensorDotProductUnit dpu_pipe ( - // .clock (clk), - // .reset (reset), - // .io_in_valid (valid_in && ready_in), - // .io_in_bits_a_0 (32'h40000000), - // .io_in_bits_a_1 (32'h40000000), - // .io_in_bits_a_2 (32'h40000000), - // .io_in_bits_a_3 (32'h40000000), - // .io_in_bits_b_0 (32'h40000000), - // .io_in_bits_b_1 (32'h40000000), - // .io_in_bits_b_2 (32'h40000000), - // .io_in_bits_b_3 (32'h40000000), - // .io_in_bits_c (32'h3f800000), - // .io_out_valid (dpu_valid), - // .io_out_bits_data (dpu_data) + // // fixed-latency queue + // VX_shift_register #( + // .DATAW (1 + $bits(wid)/* + $bits(D_tile)*/), + // .DEPTH (`LATENCY_HMMA + 1), + // .RESETW (1) + // ) shift_reg ( + // .clk (clk), + // .reset (reset), + // .enable (ready_out), + // .data_in ({valid_in && ready_in, wid /*, result_hmma*/}), + // .data_out ({valid_out, D_wid/*, D_tile */}) // ); logic [1:0] threadgroup_valids; + logic [1:0] threadgroup_readys; // B_tile is shared across the two threadgroups; see Figure 13 VX_tensor_threadgroup #( ) threadgroup_0 ( .clk (clk), .reset (reset), .valid_in (valid_in), + .ready_in (threadgroup_readys[0]), .stall (!ready_out), .A_frag (A_tile[1:0]), .B_frag (B_tile), @@ -83,6 +79,7 @@ module VX_tensor_dpu #( .clk (clk), .reset (reset), .valid_in (valid_in), + .ready_in (threadgroup_readys[1]), .stall (!ready_out), .A_frag (A_tile[3:2]), .B_frag (B_tile), @@ -91,21 +88,36 @@ module VX_tensor_dpu #( .D_frag (D_tile[3:2]) ); - // fixed-latency queue - VX_shift_register #( - .DATAW (1 + $bits(wid)/* + $bits(D_tile)*/), - .DEPTH (`LATENCY_HMMA), - .RESETW (1) - ) shift_reg ( - .clk (clk), - .reset 
(reset), - .enable (ready_out), - .data_in ({valid_in && ready_in, wid /*, result_hmma*/}), - .data_out ({valid_out, D_wid/*, D_tile */}) + wire empty; + wire full; + wire enq = valid_in && ready_in; + wire deq = valid_out && ready_out; + + assign ready_in = &(threadgroup_readys); + assign valid_out = &(threadgroup_valids); + + // need to pass along warp id's to do multithreading + VX_fifo_queue #( + .DATAW ($bits(wid)), + .DEPTH (`LATENCY_HMMA + `LATENCY_HMMA) + ) wid_queue ( + .clk (clk), + .reset (reset), + .push (enq), + .pop (deq), + .data_in (wid), + .data_out (D_wid), + .empty (empty), + `UNUSED_PIN(alm_empty), + .full (full), // should be impossible to overflow + `UNUSED_PIN(alm_full), + `UNUSED_PIN(size) ); - `RUNTIME_ASSERT(reset || (&(threadgroup_valids) == valid_out), - ("FEDP and metadata queue went out of sync!")) + `RUNTIME_ASSERT(reset || !full, ("dpu wid queue is full!")) + + // `RUNTIME_ASSERT(reset || (&(threadgroup_valids) == valid_out), + // ("FEDP and metadata queue went out of sync!")) endmodule // does (m,n,k) = (2,4,2) matmul compute over 2 cycles. 
@@ -116,6 +128,7 @@ module VX_tensor_threadgroup #( input reset, input valid_in, + output ready_in, input stall, input [1:0][1:0][31:0] A_frag, input [1:0][3:0][31:0] B_frag, @@ -123,35 +136,123 @@ module VX_tensor_threadgroup #( output valid_out, output [1:0][3:0][31:0] D_frag - ); + wire [1:0][1:0][31:0] A_frag_buf; + wire [1:0][3:0][31:0] B_frag_buf; + wire [1:0][3:0][31:0] C_frag_buf; + + wire valid_buf; + wire ready_buf; + + wire enq = valid_in && ready_in; + wire deq = valid_buf && ready_buf; + wire empty; + wire full; + assign ready_in = !full; + assign valid_buf = !empty; + + VX_fifo_queue #( + .DATAW ($bits(A_frag) + $bits(B_frag) + $bits(C_frag)), + .DEPTH (4) + ) input_buffer ( + .clk (clk), + .reset (reset), + .push (enq), + .pop (deq), + .data_in ({A_frag, B_frag, C_frag}), + .data_out ({A_frag_buf, B_frag_buf, C_frag_buf}), + .empty (empty), + `UNUSED_PIN(alm_empty), + .full (full), + `UNUSED_PIN(alm_full), + `UNUSED_PIN(size) + ); + + logic [3:0] fedp_valids; + wire fedp_valid_out = &(fedp_valids); + wire fedp_ready_out = !stall; + wire fedp_fire_out = fedp_valid_out && fedp_ready_out; + + wire fedp_valid_in = valid_buf; + wire fedp_ready_in = fedp_ready_out; // coupled + wire fedp_fire_in = fedp_valid_in && fedp_ready_in; + + // 0: FEDP uses first half from input_buffer + // 1: FEDP uses last half and pops input_buffer + logic step_in; + // 0: FEDP produces first half of D_frag + // 1: FEDP produces last half of D_frag and asserts valid_out + logic step_out; + assign ready_buf = fedp_fire_in && (step_in == 1'b1); + + // FIXME shrink size + logic [1:0][3:0][31:0] D_reg, D_reg_n; + wire [3:0][31:0] D_half; + always @(*) begin + D_reg_n = D_reg; + + if (fedp_fire_out) begin + if (step_out == 1'b0) begin + D_reg_n[0][0] = D_half[0]; + D_reg_n[0][2] = D_half[1]; + D_reg_n[1][0] = D_half[2]; + D_reg_n[1][2] = D_half[3]; + end + end + end + + always @(posedge clk) begin + if (reset) begin + step_in <= '0; + step_out <= '0; + + D_reg <= '0; + end else begin 
+ if (fedp_fire_in) begin + step_in <= ~step_in; + end + if (fedp_fire_out) begin + step_out <= ~step_out; + end + + D_reg <= D_reg_n; + end + end + + assign D_frag[0][0] = D_reg[0][0]; + assign D_frag[0][2] = D_reg[0][2]; + assign D_frag[1][0] = D_reg[1][0]; + assign D_frag[1][2] = D_reg[1][2]; + assign D_frag[0][1] = D_half[0]; + assign D_frag[0][3] = D_half[1]; + assign D_frag[1][1] = D_half[2]; + assign D_frag[1][3] = D_half[3]; + // 4 FEDPs per threadgroup - // FIXME: experimenting with 8 FEDPs first - logic [1:0][3:0] valids; - for (genvar D_row = 0; D_row < 2; ++D_row) begin - for (genvar D_col = 0; D_col < 4; ++D_col) begin + for (genvar i = 0; i < 4; ++i) begin + localparam int d_row = i / 2; + localparam int d_col = (i % 2) * 2; // four-element dot product (FEDP) unit TensorDotProductUnit fedp ( .clock (clk), .reset (reset), - .io_in_valid (valid_in), - .io_in_bits_a_0 (A_frag[D_row][0]), - .io_in_bits_a_1 (A_frag[D_row][1]), + .io_in_valid (fedp_fire_in), + .io_in_bits_a_0 (A_frag_buf[d_row][0]), + .io_in_bits_a_1 (A_frag_buf[d_row][1]), .io_in_bits_a_2 (32'h0), .io_in_bits_a_3 (32'h0), - .io_in_bits_b_0 (B_frag[0][D_col]), - .io_in_bits_b_1 (B_frag[1][D_col]), + .io_in_bits_b_0 (step_in == 1'b0 ? B_frag_buf[0][d_col] : B_frag_buf[0][d_col + 1]), + .io_in_bits_b_1 (step_in == 1'b0 ? B_frag_buf[1][d_col] : B_frag_buf[1][d_col + 1]), .io_in_bits_b_2 (32'h0), .io_in_bits_b_3 (32'h0), - .io_in_bits_c (C_frag[D_row][D_col]), + .io_in_bits_c (step_in == 1'b0 ? 
C_frag_buf[d_row][d_col] : C_frag_buf[d_row][d_col + 1]), .io_stall (stall), - .io_out_valid (valids[D_row][D_col]), - .io_out_bits_data (D_frag[D_row][D_col]) + .io_out_valid (fedp_valids[i]), + .io_out_bits_data (D_half[i]) ); - end end - assign valid_out = (&(valids[0])) && (&(valids[1])); + assign valid_out = fedp_valid_out && (step_out == 1'b1); endmodule `endif From 2e2decc8b6fa2877b0d844b4a0395589d5e146e9 Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Thu, 30 May 2024 12:46:45 -0700 Subject: [PATCH 17/31] Shrink size of D_half latch --- hw/rtl/fpu/VX_tensor_dpu.sv | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/hw/rtl/fpu/VX_tensor_dpu.sv b/hw/rtl/fpu/VX_tensor_dpu.sv index faace3f0..49d2418d 100644 --- a/hw/rtl/fpu/VX_tensor_dpu.sv +++ b/hw/rtl/fpu/VX_tensor_dpu.sv @@ -185,18 +185,14 @@ module VX_tensor_threadgroup #( logic step_out; assign ready_buf = fedp_fire_in && (step_in == 1'b1); - // FIXME shrink size - logic [1:0][3:0][31:0] D_reg, D_reg_n; + // latch the first-half result of D_frag + logic [3:0][31:0] D_reg, D_reg_n; wire [3:0][31:0] D_half; always @(*) begin D_reg_n = D_reg; - if (fedp_fire_out) begin if (step_out == 1'b0) begin - D_reg_n[0][0] = D_half[0]; - D_reg_n[0][2] = D_half[1]; - D_reg_n[1][0] = D_half[2]; - D_reg_n[1][2] = D_half[3]; + D_reg_n = D_half; end end end @@ -219,10 +215,10 @@ module VX_tensor_threadgroup #( end end - assign D_frag[0][0] = D_reg[0][0]; - assign D_frag[0][2] = D_reg[0][2]; - assign D_frag[1][0] = D_reg[1][0]; - assign D_frag[1][2] = D_reg[1][2]; + assign D_frag[0][0] = D_reg[0]; + assign D_frag[0][2] = D_reg[1]; + assign D_frag[1][0] = D_reg[2]; + assign D_frag[1][2] = D_reg[3]; assign D_frag[0][1] = D_half[0]; assign D_frag[0][3] = D_half[1]; assign D_frag[1][1] = D_half[2]; From 2743d32bd2658b362656088f45736942a6e699bc Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Thu, 30 May 2024 15:25:00 -0700 Subject: [PATCH 18/31] tensor: Handle wid queue backpressure in dpu --- 
hw/rtl/core/VX_tensor_core.sv | 4 ++-- hw/rtl/fpu/VX_tensor_dpu.sv | 6 ++---- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/hw/rtl/core/VX_tensor_core.sv b/hw/rtl/core/VX_tensor_core.sv index 71b17e08..2ddd6a70 100644 --- a/hw/rtl/core/VX_tensor_core.sv +++ b/hw/rtl/core/VX_tensor_core.sv @@ -77,7 +77,7 @@ module VX_tensor_core_warp import VX_gpu_pkg::*; #( // octet. E.g. two tgs map lane 0-3 and lane 16-19 -> 16 // FIXME: not sure this is the right logic. just filling in what works localparam LANE_OFFSET_THREADGROUP = (4 * NUM_OCTETS); - localparam REQ_QUEUE_DEPTH = 4; + localparam METADATA_QUEUE_DEPTH = 4; wire [1:0] step = 2'(execute_if.data.op_type); wire last_in_pair = (execute_if.data.op_mod == `INST_MOD_BITS'(1)); @@ -220,7 +220,7 @@ module VX_tensor_core_warp import VX_gpu_pkg::*; #( VX_fifo_queue #( .DATAW(DATAW), - .DEPTH(REQ_QUEUE_DEPTH) + .DEPTH(METADATA_QUEUE_DEPTH) ) pending_uops ( .clk(clk), .reset(reset), diff --git a/hw/rtl/fpu/VX_tensor_dpu.sv b/hw/rtl/fpu/VX_tensor_dpu.sv index 49d2418d..870f6870 100644 --- a/hw/rtl/fpu/VX_tensor_dpu.sv +++ b/hw/rtl/fpu/VX_tensor_dpu.sv @@ -93,7 +93,7 @@ module VX_tensor_dpu #( wire enq = valid_in && ready_in; wire deq = valid_out && ready_out; - assign ready_in = &(threadgroup_readys); + assign ready_in = &(threadgroup_readys) && !full; assign valid_out = &(threadgroup_valids); // need to pass along warp id's to do multithreading @@ -109,13 +109,11 @@ module VX_tensor_dpu #( .data_out (D_wid), .empty (empty), `UNUSED_PIN(alm_empty), - .full (full), // should be impossible to overflow + .full (full), `UNUSED_PIN(alm_full), `UNUSED_PIN(size) ); - `RUNTIME_ASSERT(reset || !full, ("dpu wid queue is full!")) - // `RUNTIME_ASSERT(reset || (&(threadgroup_valids) == valid_out), // ("FEDP and metadata queue went out of sync!")) endmodule From dfb2276657c8ba4f7fcafc18ee0f091ce6e1481d Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Thu, 30 May 2024 17:29:59 -0700 Subject: [PATCH 19/31] tensor: Remove 
redundant issue queue outside pdu --- hw/rtl/core/VX_tensor_core.sv | 83 +++++++++++++++++++---------------- 1 file changed, 46 insertions(+), 37 deletions(-) diff --git a/hw/rtl/core/VX_tensor_core.sv b/hw/rtl/core/VX_tensor_core.sv index 2ddd6a70..bedf8245 100644 --- a/hw/rtl/core/VX_tensor_core.sv +++ b/hw/rtl/core/VX_tensor_core.sv @@ -333,46 +333,55 @@ module VX_tensor_octet #( wire operands_last_in_pair_buf; wire [1:0] operands_step_buf; - wire inbuf_empty; - wire inbuf_full; - wire inbuf_ready_in; - assign inbuf_ready_in = !inbuf_full; - assign operands_ready = inbuf_ready_in; - assign operands_valid_buf = !inbuf_empty; + // wire inbuf_empty; + // wire inbuf_full; + // wire inbuf_ready_in; + // assign inbuf_ready_in = !inbuf_full; + // assign operands_ready = inbuf_ready_in; + // assign operands_valid_buf = !inbuf_empty; - // wire inbuf_enq = operands_ready && operands_valid && operands_last_in_pair; - wire inbuf_enq = operands_ready && operands_valid; - wire inbuf_deq = operands_valid_buf && operands_ready_buf; + // // wire inbuf_enq = operands_ready && operands_valid && operands_last_in_pair; + // wire inbuf_enq = operands_ready && operands_valid; + // wire inbuf_deq = operands_valid_buf && operands_ready_buf; - // the 'issue queue' for the dpu. - // This exists to decouple the input of the dot-product unit from - // execute_if.ready. execute_if can arrive intermittently according to - // the frontend's behavior, and since the dpu can also stall for a fixed - // initiation latency, we need to decouple the two to efficiently feed the - // dpu. - // This only applies to the last instruction in a pair, since the first - // instruction only acts to buffer the operands and can execute - // immediately without backpressure. So we don't enqueue them. 
- VX_fifo_queue #( - .DATAW ($bits(A_in) + $bits(B_in) + $bits(C_in) + - $bits(operands_wid) + $bits(operands_step) + $bits(operands_last_in_pair)), - .DEPTH (ISSUE_QUEUE_DEPTH) - ) input_buffer ( - .clk (clk), - .reset (reset), - .push (inbuf_enq), - .pop (inbuf_deq), - .data_in ({A_in, B_in, C_in, operands_wid, operands_step, operands_last_in_pair}), - .data_out ({A_in_buf, B_in_buf, C_in_buf, operands_wid_buf, operands_step_buf, operands_last_in_pair_buf}), - .empty (inbuf_empty), - `UNUSED_PIN(alm_empty), - .full (inbuf_full), - `UNUSED_PIN(alm_full), - `UNUSED_PIN(size) - ); + // // the 'issue queue' for the dpu. + // // This exists to decouple the input of the dot-product unit from + // // execute_if.ready. execute_if can arrive intermittently according to + // // the frontend's behavior, and since the dpu can also stall for a fixed + // // initiation latency, we need to decouple the two to efficiently feed the + // // dpu. + // // This only applies to the last instruction in a pair, since the first + // // instruction only acts to buffer the operands and can execute + // // immediately without backpressure. So we don't enqueue them. 
+ // VX_fifo_queue #( + // .DATAW ($bits(A_in) + $bits(B_in) + $bits(C_in) + + // $bits(operands_wid) + $bits(operands_step) + $bits(operands_last_in_pair)), + // .DEPTH (ISSUE_QUEUE_DEPTH) + // ) input_buffer ( + // .clk (clk), + // .reset (reset), + // .push (inbuf_enq), + // .pop (inbuf_deq), + // .data_in ({A_in, B_in, C_in, operands_wid, operands_step, operands_last_in_pair}), + // .data_out ({A_in_buf, B_in_buf, C_in_buf, operands_wid_buf, operands_step_buf, operands_last_in_pair_buf}), + // .empty (inbuf_empty), + // `UNUSED_PIN(alm_empty), + // .full (inbuf_full), + // `UNUSED_PIN(alm_full), + // `UNUSED_PIN(size) + // ); - // FIXME: this shouldn't be necessary - `RUNTIME_ASSERT(reset || !inbuf_full, ("dpu issue queue is full!")) + // // FIXME: this shouldn't be necessary + // `RUNTIME_ASSERT(reset || !inbuf_full, ("dpu issue queue is full!")) + + assign A_in_buf = A_in; + assign B_in_buf = B_in; + assign C_in_buf = C_in; + assign operands_step_buf = operands_step; + assign operands_wid_buf = operands_wid; + assign operands_last_in_pair_buf = operands_last_in_pair; + assign operands_valid_buf = operands_valid; + assign operands_ready = operands_ready_buf; typedef struct { logic [3:0][31:0] A_half; From 06e0f901ff44cad582a8247457956106890a9eab Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Thu, 30 May 2024 17:34:39 -0700 Subject: [PATCH 20/31] tensor: Handle backpressure from metadata queue --- hw/rtl/core/VX_tensor_core.sv | 31 +++++++++++++++++-------------- 1 file changed, 17 insertions(+), 14 deletions(-) diff --git a/hw/rtl/core/VX_tensor_core.sv b/hw/rtl/core/VX_tensor_core.sv index bedf8245..5d4c02a4 100644 --- a/hw/rtl/core/VX_tensor_core.sv +++ b/hw/rtl/core/VX_tensor_core.sv @@ -90,8 +90,6 @@ module VX_tensor_core_warp import VX_gpu_pkg::*; #( logic [`NUM_THREADS-1:0][`XLEN-1:0] wb_data_1; wire [`NW_WIDTH-1:0] wb_wid; - assign execute_if.ready = &octet_operands_ready; - `ifdef EXT_T_ENABLE for (genvar i = 0; i < NUM_OCTETS; ++i) begin `else @@ 
-207,16 +205,23 @@ module VX_tensor_core_warp import VX_gpu_pkg::*; #( wire [`NUM_WARPS-1:0][DATAW-1:0] execute_if_data_deq; + wire [`NUM_WARPS-1:0] metadata_queue_fulls; + // OR not AND, we don't want any warp full + wire metadata_queue_full = |(metadata_queue_fulls); + + assign execute_if.ready = &(octet_operands_ready) && !metadata_queue_full; + for (genvar i = 0; i < `NUM_WARPS; i++) begin - // execute_if request queue. + // Metadata queue for commit_if. This simply copies execute_if's + // metadata and pops them in conjunction with commit fire. + // // This has to be separated per-warp, as otherwise requests from // multiple warps can be enqueued interleaved, which makes it hard to // ensure two consecutive dequeues are associated with the same warp for - // commit. + // commit. (FIXME: this is not strictly necessary though.) wire enq = execute_if_fire && (execute_if.data.wid == `NW_WIDTH'(i)); wire deq = commit_if_fire && ( wb_wid == `NW_WIDTH'(i)); - wire full; VX_fifo_queue #( .DATAW(DATAW), @@ -230,16 +235,16 @@ module VX_tensor_core_warp import VX_gpu_pkg::*; #( .data_out(execute_if_data_deq[i]), `UNUSED_PIN(empty), `UNUSED_PIN(alm_empty), - .full(full), // should be impossible to overflow + .full(metadata_queue_fulls[i]), `UNUSED_PIN(alm_full), `UNUSED_PIN(size) ); - - // this shouldn't really happen unless there's a big contention over - // the commit stage - `RUNTIME_ASSERT(!(!reset && full), ("tensor core uop queue is full!")); end + // this shouldn't really happen unless there's a big contention over + // the commit stage + `RUNTIME_ASSERT(!(!reset && metadata_queue_full), ("tensor core uop queue is full!")); + // unlike execute which can be interleaved between warps, commit is // serialized and completed one-warp-by-warp, therefore we only need to // keep one subcommit state bit unlike for `substeps` @@ -527,13 +532,11 @@ module VX_tensor_octet #( // is complete. 
This decouples the irregular dpu output traffic from the // regular, every-2-cycle commit traffic to ensure the commit pipeline is // used more efficiently. + // FIXME: unnecessary? // TODO: This is probably oversized. VX_fifo_queue #( .DATAW ($bits(D_wid) + $bits(D_out)), - // depth of this queue should ideally be deeper than the dpu pipeline - // latency, since the dpu is fully-pipelined and it can output the - // latency-number of outputs in a burst-y way. - .DEPTH (`LATENCY_HMMA + `LATENCY_HMMA) + .DEPTH (`LATENCY_HMMA) ) output_buffer ( .clk (clk), .reset (reset), From 97f37b1c75d4efbca80cd8c9bde639c3500f4e8c Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Thu, 30 May 2024 18:00:26 -0700 Subject: [PATCH 21/31] tensor: Add commit stall injection for debugging --- hw/rtl/core/VX_tensor_core.sv | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/hw/rtl/core/VX_tensor_core.sv b/hw/rtl/core/VX_tensor_core.sv index 5d4c02a4..b00d0a46 100644 --- a/hw/rtl/core/VX_tensor_core.sv +++ b/hw/rtl/core/VX_tensor_core.sv @@ -251,7 +251,27 @@ module VX_tensor_core_warp import VX_gpu_pkg::*; #( logic subcommit, subcommit_n; wire all_valid = (& octet_results_valid); + +// define this to inject artificial commit backpressure for debugging +`define INJECT_COMMIT_BACKPRESSURE +`ifndef INJECT_COMMIT_BACKPRESSURE assign commit_if.valid = all_valid; + assign commit_if_ready_override = commit_if.ready; +`else + logic [1:0] counter; + always @(posedge clk) begin + if (reset) begin + counter <= '0; + end else begin + if (all_valid) begin + counter <= counter + 1'b1; + end + end + end + + assign commit_if.valid = all_valid && (counter == 2'b0); + assign commit_if_ready_override = commit_if.ready && (counter == 2'b0); +`endif localparam COMMIT_DATAW = `UUID_WIDTH + `NW_WIDTH + `NUM_THREADS + `XLEN + 1 + `NR_BITS + (`NUM_THREADS * `XLEN) + 1 + 1 + 1; wire [COMMIT_DATAW-1:0] commit_if_data = { From 0a032ab400ae94d525d40dc8673e7b3b8d56e89b Mon Sep 17 00:00:00 2001 From: 
Hansung Kim Date: Thu, 30 May 2024 18:03:04 -0700 Subject: [PATCH 22/31] tensor: Fix out-of-sync enqueue to dpu and metadata queue --- hw/rtl/core/VX_tensor_core.sv | 92 ++++++++--------------------------- hw/rtl/fpu/VX_tensor_dpu.sv | 19 ++++++-- 2 files changed, 37 insertions(+), 74 deletions(-) diff --git a/hw/rtl/core/VX_tensor_core.sv b/hw/rtl/core/VX_tensor_core.sv index b00d0a46..44485ccb 100644 --- a/hw/rtl/core/VX_tensor_core.sv +++ b/hw/rtl/core/VX_tensor_core.sv @@ -77,7 +77,8 @@ module VX_tensor_core_warp import VX_gpu_pkg::*; #( // octet. E.g. two tgs map lane 0-3 and lane 16-19 -> 16 // FIXME: not sure this is the right logic. just filling in what works localparam LANE_OFFSET_THREADGROUP = (4 * NUM_OCTETS); - localparam METADATA_QUEUE_DEPTH = 4; + // this is only a rule of thumb + localparam METADATA_QUEUE_DEPTH = `LATENCY_HMMA; wire [1:0] step = 2'(execute_if.data.op_type); wire last_in_pair = (execute_if.data.op_mod == `INST_MOD_BITS'(1)); @@ -89,7 +90,11 @@ module VX_tensor_core_warp import VX_gpu_pkg::*; #( logic [`NUM_THREADS-1:0][`XLEN-1:0] wb_data_0; logic [`NUM_THREADS-1:0][`XLEN-1:0] wb_data_1; wire [`NW_WIDTH-1:0] wb_wid; - + + // valid signal synced between the functional units (octet) and the + // metadata queue + wire operands_valid_synced; + `ifdef EXT_T_ENABLE for (genvar i = 0; i < NUM_OCTETS; ++i) begin `else @@ -121,7 +126,7 @@ module VX_tensor_core_warp import VX_gpu_pkg::*; #( .A_in(octet_A), .B_in(octet_B), .C_in(octet_C), - .operands_valid(execute_if.valid), + .operands_valid(operands_valid_synced), .operands_wid(execute_if.data.wid), .operands_last_in_pair(last_in_pair), .operands_step(step), @@ -172,8 +177,10 @@ module VX_tensor_core_warp import VX_gpu_pkg::*; #( localparam DATAW = `UUID_WIDTH + `NW_WIDTH + `NUM_THREADS + `XLEN + 1 + `NR_BITS; - wire execute_if_fire = execute_if.valid && execute_if.ready; - wire commit_if_fire = commit_if.valid && commit_if.ready; + wire commit_if_ready_override; + + wire operand_enq_fire = 
operands_valid_synced && execute_if.ready; + wire commit_if_fire = commit_if.valid && commit_if_ready_override; wire [DATAW-1:0] execute_if_data_enq = { execute_if.data.uuid, execute_if.data.wid, @@ -184,31 +191,14 @@ module VX_tensor_core_warp import VX_gpu_pkg::*; #( // pid/sop/eop set later }; - // wire [DATAW-1:0] execute_if_data_deq; - - // VX_fifo_queue #( - // .DATAW(DATAW), - // .DEPTH(4 /* FIXME: arbitrary */) - // ) pending_uops ( - // .clk(clk), - // .reset(reset), - // .push(execute_if_fire), - // .pop(commit_if_fire), - // .data_in(execute_if_data_enq), - // .data_out(execute_if_data_deq), - // `UNUSED_PIN(empty), - // `UNUSED_PIN(alm_empty), - // `UNUSED_PIN(full), // should be impossible to overflow - // `UNUSED_PIN(alm_full), - // `UNUSED_PIN(size) - // ); - wire [`NUM_WARPS-1:0][DATAW-1:0] execute_if_data_deq; wire [`NUM_WARPS-1:0] metadata_queue_fulls; // OR not AND, we don't want any warp full wire metadata_queue_full = |(metadata_queue_fulls); + // need to make sure both metadata and octet issue queues are in sync + assign operands_valid_synced = execute_if.valid && !metadata_queue_full; assign execute_if.ready = &(octet_operands_ready) && !metadata_queue_full; for (genvar i = 0; i < `NUM_WARPS; i++) begin @@ -220,8 +210,8 @@ module VX_tensor_core_warp import VX_gpu_pkg::*; #( // ensure two consecutive dequeues are associated with the same warp for // commit. (FIXME: this is not strictly necessary though.) 
- wire enq = execute_if_fire && (execute_if.data.wid == `NW_WIDTH'(i)); - wire deq = commit_if_fire && ( wb_wid == `NW_WIDTH'(i)); + wire enq = operand_enq_fire && (execute_if.data.wid == `NW_WIDTH'(i)); + wire deq = commit_if_fire && ( wb_wid == `NW_WIDTH'(i)); VX_fifo_queue #( .DATAW(DATAW), @@ -253,8 +243,8 @@ module VX_tensor_core_warp import VX_gpu_pkg::*; #( wire all_valid = (& octet_results_valid); // define this to inject artificial commit backpressure for debugging -`define INJECT_COMMIT_BACKPRESSURE -`ifndef INJECT_COMMIT_BACKPRESSURE +// `define TENSOR_INJECT_COMMIT_BACKPRESSURE +`ifndef TENSOR_INJECT_COMMIT_BACKPRESSURE assign commit_if.valid = all_valid; assign commit_if_ready_override = commit_if.ready; `else @@ -358,47 +348,6 @@ module VX_tensor_octet #( wire operands_last_in_pair_buf; wire [1:0] operands_step_buf; - // wire inbuf_empty; - // wire inbuf_full; - // wire inbuf_ready_in; - // assign inbuf_ready_in = !inbuf_full; - // assign operands_ready = inbuf_ready_in; - // assign operands_valid_buf = !inbuf_empty; - - // // wire inbuf_enq = operands_ready && operands_valid && operands_last_in_pair; - // wire inbuf_enq = operands_ready && operands_valid; - // wire inbuf_deq = operands_valid_buf && operands_ready_buf; - - // // the 'issue queue' for the dpu. - // // This exists to decouple the input of the dot-product unit from - // // execute_if.ready. execute_if can arrive intermittently according to - // // the frontend's behavior, and since the dpu can also stall for a fixed - // // initiation latency, we need to decouple the two to efficiently feed the - // // dpu. - // // This only applies to the last instruction in a pair, since the first - // // instruction only acts to buffer the operands and can execute - // // immediately without backpressure. So we don't enqueue them. 
- // VX_fifo_queue #( - // .DATAW ($bits(A_in) + $bits(B_in) + $bits(C_in) + - // $bits(operands_wid) + $bits(operands_step) + $bits(operands_last_in_pair)), - // .DEPTH (ISSUE_QUEUE_DEPTH) - // ) input_buffer ( - // .clk (clk), - // .reset (reset), - // .push (inbuf_enq), - // .pop (inbuf_deq), - // .data_in ({A_in, B_in, C_in, operands_wid, operands_step, operands_last_in_pair}), - // .data_out ({A_in_buf, B_in_buf, C_in_buf, operands_wid_buf, operands_step_buf, operands_last_in_pair_buf}), - // .empty (inbuf_empty), - // `UNUSED_PIN(alm_empty), - // .full (inbuf_full), - // `UNUSED_PIN(alm_full), - // `UNUSED_PIN(size) - // ); - - // // FIXME: this shouldn't be necessary - // `RUNTIME_ASSERT(reset || !inbuf_full, ("dpu issue queue is full!")) - assign A_in_buf = A_in; assign B_in_buf = B_in; assign C_in_buf = C_in; @@ -521,7 +470,8 @@ module VX_tensor_octet #( // this does (m,n,k)=(4,4,2) matmul, modeling compute of a single octet VX_tensor_dpu #( .ISW(ISW), - .OCTET(OCTET) + .OCTET(OCTET), + .ISSUE_QUEUE_DEPTH(2) ) dpu ( .clk(clk), .reset(reset), @@ -556,7 +506,7 @@ module VX_tensor_octet #( // TODO: This is probably oversized. 
VX_fifo_queue #( .DATAW ($bits(D_wid) + $bits(D_out)), - .DEPTH (`LATENCY_HMMA) + .DEPTH (2 /*`LATENCY_HMMA*/) ) output_buffer ( .clk (clk), .reset (reset), diff --git a/hw/rtl/fpu/VX_tensor_dpu.sv b/hw/rtl/fpu/VX_tensor_dpu.sv index 870f6870..694af4ae 100644 --- a/hw/rtl/fpu/VX_tensor_dpu.sv +++ b/hw/rtl/fpu/VX_tensor_dpu.sv @@ -3,7 +3,8 @@ module VX_tensor_dpu #( parameter ISW, - parameter OCTET + parameter OCTET, + parameter ISSUE_QUEUE_DEPTH = `LATENCY_HMMA ) ( input clk, input reset, @@ -62,6 +63,7 @@ module VX_tensor_dpu #( logic [1:0] threadgroup_readys; // B_tile is shared across the two threadgroups; see Figure 13 VX_tensor_threadgroup #( + .ISSUE_QUEUE_DEPTH(ISSUE_QUEUE_DEPTH) ) threadgroup_0 ( .clk (clk), .reset (reset), @@ -75,6 +77,7 @@ module VX_tensor_dpu #( .D_frag (D_tile[1:0]) ); VX_tensor_threadgroup #( + .ISSUE_QUEUE_DEPTH(ISSUE_QUEUE_DEPTH) ) threadgroup_1 ( .clk (clk), .reset (reset), @@ -99,7 +102,7 @@ module VX_tensor_dpu #( // need to pass along warp id's to do multithreading VX_fifo_queue #( .DATAW ($bits(wid)), - .DEPTH (`LATENCY_HMMA + `LATENCY_HMMA) + .DEPTH (ISSUE_QUEUE_DEPTH) ) wid_queue ( .clk (clk), .reset (reset), @@ -121,6 +124,7 @@ endmodule // does (m,n,k) = (2,4,2) matmul compute over 2 cycles. // matches Figure 10(b) of the paper. module VX_tensor_threadgroup #( + parameter ISSUE_QUEUE_DEPTH ) ( input clk, input reset, @@ -149,9 +153,18 @@ module VX_tensor_threadgroup #( assign ready_in = !full; assign valid_buf = !empty; + // 'Issue queue' for the FEDP units. + // This exists to decouple the execution of the dot-product unit from + // the operand arrival. Operands from execute_if can arrive + // intermittently according to the frontend's behavior, and since the dpu + // can also stall for a fixed initiation latency, we need to decouple the + // two to efficiently feed the dpu. + // + // TODO: better queue design possible; e.g. 
B_frag is shared by two + // threadgroups, so we need only 1 queue per octet for B VX_fifo_queue #( .DATAW ($bits(A_frag) + $bits(B_frag) + $bits(C_frag)), - .DEPTH (4) + .DEPTH (ISSUE_QUEUE_DEPTH) ) input_buffer ( .clk (clk), .reset (reset), From 83f9f6d84fc3f662d257bdc899682176d4be0cff Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Thu, 30 May 2024 18:22:36 -0700 Subject: [PATCH 23/31] tensor: Fix sync for dpu warp queue as well --- hw/rtl/core/VX_tensor_core.sv | 2 +- hw/rtl/fpu/VX_tensor_dpu.sv | 27 +++++++++++++++------------ 2 files changed, 16 insertions(+), 13 deletions(-) diff --git a/hw/rtl/core/VX_tensor_core.sv b/hw/rtl/core/VX_tensor_core.sv index 44485ccb..1f363f45 100644 --- a/hw/rtl/core/VX_tensor_core.sv +++ b/hw/rtl/core/VX_tensor_core.sv @@ -471,7 +471,7 @@ module VX_tensor_octet #( VX_tensor_dpu #( .ISW(ISW), .OCTET(OCTET), - .ISSUE_QUEUE_DEPTH(2) + .ISSUE_QUEUE_DEPTH(4) ) dpu ( .clk(clk), .reset(reset), diff --git a/hw/rtl/fpu/VX_tensor_dpu.sv b/hw/rtl/fpu/VX_tensor_dpu.sv index 694af4ae..08e37cfa 100644 --- a/hw/rtl/fpu/VX_tensor_dpu.sv +++ b/hw/rtl/fpu/VX_tensor_dpu.sv @@ -39,13 +39,6 @@ module VX_tensor_dpu #( end end - // ready as soon as valid_out - // assign ready_in = ready_reg; - - // fully pipelined; ready_in is coupled to ready_out by immediately - // stalling - // assign ready_in = ready_out; - // // fixed-latency queue // VX_shift_register #( // .DATAW (1 + $bits(wid)/* + $bits(D_tile)*/), @@ -59,6 +52,16 @@ module VX_tensor_dpu #( // .data_out ({valid_out, D_wid/*, D_tile */}) // ); + // ready as soon as valid_out + // assign ready_in = ready_reg || valid_out; + + // fully pipelined; ready_in is coupled to ready_out by immediately + // stalling + // assign ready_in = ready_out; + + logic synced_fire; + assign synced_fire = valid_in && ready_in; + logic [1:0] threadgroup_valids; logic [1:0] threadgroup_readys; // B_tile is shared across the two threadgroups; see Figure 13 @@ -67,7 +70,7 @@ module VX_tensor_dpu #( ) threadgroup_0 ( 
.clk (clk), .reset (reset), - .valid_in (valid_in), + .valid_in (synced_fire), .ready_in (threadgroup_readys[0]), .stall (!ready_out), .A_frag (A_tile[1:0]), @@ -81,7 +84,7 @@ module VX_tensor_dpu #( ) threadgroup_1 ( .clk (clk), .reset (reset), - .valid_in (valid_in), + .valid_in (synced_fire), .ready_in (threadgroup_readys[1]), .stall (!ready_out), .A_frag (A_tile[3:2]), @@ -102,7 +105,7 @@ module VX_tensor_dpu #( // need to pass along warp id's to do multithreading VX_fifo_queue #( .DATAW ($bits(wid)), - .DEPTH (ISSUE_QUEUE_DEPTH) + .DEPTH (ISSUE_QUEUE_DEPTH + ISSUE_QUEUE_DEPTH) ) wid_queue ( .clk (clk), .reset (reset), @@ -117,8 +120,8 @@ module VX_tensor_dpu #( `UNUSED_PIN(size) ); - // `RUNTIME_ASSERT(reset || (&(threadgroup_valids) == valid_out), - // ("FEDP and metadata queue went out of sync!")) + `RUNTIME_ASSERT(reset || !(deq && empty), + ("dequeueing from empty warp id queue!")) endmodule // does (m,n,k) = (2,4,2) matmul compute over 2 cycles. From 574cc0e5f035826745d281820c625cbe678c5bfb Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Thu, 30 May 2024 18:32:27 -0700 Subject: [PATCH 24/31] tensor: Document configuring queue depths --- hw/rtl/core/VX_tensor_core.sv | 7 ++----- hw/rtl/fpu/VX_tensor_dpu.sv | 9 +++++++-- 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/hw/rtl/core/VX_tensor_core.sv b/hw/rtl/core/VX_tensor_core.sv index 1f363f45..a5128272 100644 --- a/hw/rtl/core/VX_tensor_core.sv +++ b/hw/rtl/core/VX_tensor_core.sv @@ -318,8 +318,6 @@ module VX_tensor_octet #( output result_valid, input result_ready ); - localparam ISSUE_QUEUE_DEPTH = 4; - // 512 bits/octet * 4 octets per warp logic [`NUM_WARPS-1:0][3:0][31:0] A_buffer, A_buffer_n; logic [`NUM_WARPS-1:0][3:0][31:0] B_buffer, B_buffer_n; @@ -471,7 +469,7 @@ module VX_tensor_octet #( VX_tensor_dpu #( .ISW(ISW), .OCTET(OCTET), - .ISSUE_QUEUE_DEPTH(4) + .ISSUE_QUEUE_DEPTH(4 /*@perf: arbtirary*/) ) dpu ( .clk(clk), .reset(reset), @@ -503,10 +501,9 @@ module VX_tensor_octet #( // 
regular, every-2-cycle commit traffic to ensure the commit pipeline is // used more efficiently. // FIXME: unnecessary? - // TODO: This is probably oversized. VX_fifo_queue #( .DATAW ($bits(D_wid) + $bits(D_out)), - .DEPTH (2 /*`LATENCY_HMMA*/) + .DEPTH (2 /* arbitrary */) ) output_buffer ( .clk (clk), .reset (reset), diff --git a/hw/rtl/fpu/VX_tensor_dpu.sv b/hw/rtl/fpu/VX_tensor_dpu.sv index 08e37cfa..79ee5757 100644 --- a/hw/rtl/fpu/VX_tensor_dpu.sv +++ b/hw/rtl/fpu/VX_tensor_dpu.sv @@ -4,6 +4,9 @@ module VX_tensor_dpu #( parameter ISW, parameter OCTET, + // @perf: has big impact on throughput. A rule of thumb is to set it to + // the pipeline length of FEDPs in order to make sure there are enough + // entries to fully saturate the pipeline, but this is still rough parameter ISSUE_QUEUE_DEPTH = `LATENCY_HMMA ) ( input clk, @@ -105,7 +108,9 @@ module VX_tensor_dpu #( // need to pass along warp id's to do multithreading VX_fifo_queue #( .DATAW ($bits(wid)), - .DEPTH (ISSUE_QUEUE_DEPTH + ISSUE_QUEUE_DEPTH) + // @perf: seems to require deeper depth than the FEDP issue queues to + // not cause stalls. + .DEPTH (2 * ISSUE_QUEUE_DEPTH) ) wid_queue ( .clk (clk), .reset (reset), @@ -167,7 +172,7 @@ module VX_tensor_threadgroup #( // threadgroups, so we need only 1 queue per octet for B VX_fifo_queue #( .DATAW ($bits(A_frag) + $bits(B_frag) + $bits(C_frag)), - .DEPTH (ISSUE_QUEUE_DEPTH) + .DEPTH (ISSUE_QUEUE_DEPTH) ) input_buffer ( .clk (clk), .reset (reset), From a02773eb922c02fe516cf96756bd5a2b18b58149 Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Thu, 30 May 2024 21:55:42 -0700 Subject: [PATCH 25/31] Add more efficient dispatch_unit Instead of having a single candidate to be considered for dispatch (designated by 'batch_idx' counter), add a dispatch_unit variant that considers all `ISSUE_WIDTH dispatch signals and picks a valid one in a round-robin manner. This increases core utilization significantly due to better overlapping of smem/tensor ops.
--- hw/rtl/core/VX_dispatch_unit_sane.sv | 163 +++++++++++++++++++++++++++ hw/rtl/core/VX_lsu_unit.sv | 2 +- hw/rtl/core/VX_tensor_core.sv | 5 +- 3 files changed, 166 insertions(+), 4 deletions(-) create mode 100644 hw/rtl/core/VX_dispatch_unit_sane.sv diff --git a/hw/rtl/core/VX_dispatch_unit_sane.sv b/hw/rtl/core/VX_dispatch_unit_sane.sv new file mode 100644 index 00000000..26d2800b --- /dev/null +++ b/hw/rtl/core/VX_dispatch_unit_sane.sv @@ -0,0 +1,163 @@ +// Copyright © 2019-2023 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +`include "VX_define.vh" + +module VX_dispatch_unit_sane import VX_gpu_pkg::*; #( + parameter BLOCK_SIZE = 1, + parameter NUM_LANES = 1, + parameter OUT_REG = 0, + parameter MAX_FANOUT = `MAX_FANOUT +) ( + input wire clk, + input wire reset, + + // inputs + VX_dispatch_if.slave dispatch_if [`ISSUE_WIDTH], + + // outputs + VX_execute_if.master execute_if [BLOCK_SIZE] + +); + `STATIC_ASSERT ((`NUM_THREADS == NUM_LANES * (`NUM_THREADS / NUM_LANES)), ("invalid parameter")) + localparam BLOCK_SIZE_W = `LOG2UP(BLOCK_SIZE); + localparam NUM_PACKETS = `NUM_THREADS / NUM_LANES; + localparam PID_BITS = `CLOG2(NUM_PACKETS); + localparam PID_WIDTH = `UP(PID_BITS); + localparam BATCH_COUNT = `ISSUE_WIDTH / BLOCK_SIZE; + localparam BATCH_COUNT_W= `LOG2UP(BATCH_COUNT); + localparam ISSUE_W = `LOG2UP(`ISSUE_WIDTH); + localparam IN_DATAW = `UUID_WIDTH + ISSUE_WIS_W + `NUM_THREADS + `INST_OP_BITS + `INST_MOD_BITS + 1 + 1 + 1 + `XLEN + `XLEN + `NR_BITS + `NT_WIDTH + (3 * `NUM_THREADS * `XLEN); + localparam OUT_DATAW = `UUID_WIDTH + `NW_WIDTH + NUM_LANES + `INST_OP_BITS + `INST_MOD_BITS + 1 + 1 + 1 + `XLEN + `XLEN + `NR_BITS + `NT_WIDTH + (3 * NUM_LANES * `XLEN) + PID_WIDTH + 1 + 1; + localparam FANOUT_ENABLE= (`NUM_THREADS > (MAX_FANOUT + MAX_FANOUT/2)); + + localparam DATA_TMASK_OFF = IN_DATAW - (`UUID_WIDTH + ISSUE_WIS_W + `NUM_THREADS); + localparam DATA_REGS_OFF = 0; + + wire [`ISSUE_WIDTH-1:0] dispatch_valid; + wire [`ISSUE_WIDTH-1:0][IN_DATAW-1:0] dispatch_data; + wire [`ISSUE_WIDTH-1:0] dispatch_ready; + + for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin + assign dispatch_valid[i] = dispatch_if[i].valid; + assign dispatch_data[i] = dispatch_if[i].data; + assign dispatch_if[i].ready = dispatch_ready[i]; + end + + wire [BLOCK_SIZE-1:0][ISSUE_W-1:0] issue_indices; + wire [BLOCK_SIZE-1:0] block_ready; + wire [BLOCK_SIZE-1:0][NUM_LANES-1:0] block_tmask; + wire [BLOCK_SIZE-1:0][2:0][NUM_LANES-1:0][`XLEN-1:0] block_regs; + wire [BLOCK_SIZE-1:0][PID_WIDTH-1:0] block_pid; + wire 
[BLOCK_SIZE-1:0] block_sop; + wire [BLOCK_SIZE-1:0] block_eop; + wire [BLOCK_SIZE-1:0] block_done; + + wire batch_done = (& block_done); + + logic [BATCH_COUNT_W-1:0] batch_idx; + // if (BATCH_COUNT != 1) begin + // always @(posedge clk) begin + // if (reset) begin + // batch_idx <= '0; + // end else begin + // batch_idx <= batch_idx + BATCH_COUNT_W'(batch_done); + // end + // end + // end else begin + // assign batch_idx = 0; + // `UNUSED_VAR(batch_done) + // end + + wire dispatch_any_valid; + VX_lzc_rr #( + .N (`ISSUE_WIDTH) + ) batch_select ( + .clk (clk), + .reset (reset), + .data_in (dispatch_valid), + .data_out (batch_idx), + .valid_out (dispatch_any_valid) + ); + + `STATIC_ASSERT ((BLOCK_SIZE == 1), ("dispatch_unit_sane only supports BLOCK_SIZE == 1 for now")) + + for (genvar block_idx = 0; block_idx < BLOCK_SIZE; ++block_idx) begin + + wire [ISSUE_W-1:0] issue_idx = ISSUE_W'(batch_idx * BLOCK_SIZE) + ISSUE_W'(block_idx); + assign issue_indices[block_idx] = issue_idx; + + wire valid_p, ready_p; + + assign valid_p = dispatch_valid[issue_idx]; + assign block_tmask[block_idx] = dispatch_data[issue_idx][DATA_TMASK_OFF +: `NUM_THREADS]; + assign block_regs[block_idx][0] = dispatch_data[issue_idx][DATA_REGS_OFF + 2 * `NUM_THREADS * `XLEN +: `NUM_THREADS * `XLEN]; + assign block_regs[block_idx][1] = dispatch_data[issue_idx][DATA_REGS_OFF + 1 * `NUM_THREADS * `XLEN +: `NUM_THREADS * `XLEN]; + assign block_regs[block_idx][2] = dispatch_data[issue_idx][DATA_REGS_OFF + 0 * `NUM_THREADS * `XLEN +: `NUM_THREADS * `XLEN]; + assign block_pid[block_idx] = '0; + assign block_sop[block_idx] = 1'b1; + assign block_eop[block_idx] = 1'b1; + assign block_ready[block_idx] = ready_p; + assign block_done[block_idx] = ~valid_p || ready_p; + + wire [ISSUE_ISW_W-1:0] isw; + if (BATCH_COUNT != 1) begin + if (BLOCK_SIZE != 1) begin + assign isw = {batch_idx, BLOCK_SIZE_W'(block_idx)}; + end else begin + assign isw = batch_idx; + end + end else begin + assign isw = block_idx; + end + + 
`RESET_RELAY(buf_out_reset, reset); + + wire [`NW_WIDTH-1:0] block_wid = wis_to_wid(dispatch_data[issue_idx][DATA_TMASK_OFF+`NUM_THREADS +: ISSUE_WIS_W], isw); + + VX_elastic_buffer #( + .DATAW (OUT_DATAW), + .SIZE (`OUT_REG_TO_EB_SIZE(OUT_REG)), + .OUT_REG (`OUT_REG_TO_EB_REG(OUT_REG)) + ) buf_out ( + .clk (clk), + .reset (buf_out_reset), + .valid_in (valid_p), + .ready_in (ready_p), + .data_in ({ + dispatch_data[issue_idx][IN_DATAW-1 : DATA_TMASK_OFF+`NUM_THREADS+ISSUE_WIS_W], + block_wid, + block_tmask[block_idx], + dispatch_data[issue_idx][DATA_TMASK_OFF-1 : DATA_REGS_OFF + 3 * `NUM_THREADS * `XLEN], + block_regs[block_idx][0], + block_regs[block_idx][1], + block_regs[block_idx][2], + block_pid[block_idx], + block_sop[block_idx], + block_eop[block_idx]}), + .data_out (execute_if[block_idx].data), + .valid_out (execute_if[block_idx].valid), + .ready_out (execute_if[block_idx].ready) + ); + end + + reg [`ISSUE_WIDTH-1:0] ready_in; + always @(*) begin + ready_in = 0; + for (integer i = 0; i < BLOCK_SIZE; ++i) begin + ready_in[issue_indices[i]] = block_ready[i] && block_eop[i]; + end + end + assign dispatch_ready = ready_in; + +endmodule diff --git a/hw/rtl/core/VX_lsu_unit.sv b/hw/rtl/core/VX_lsu_unit.sv index 63f1d4c6..20fac1d1 100644 --- a/hw/rtl/core/VX_lsu_unit.sv +++ b/hw/rtl/core/VX_lsu_unit.sv @@ -49,7 +49,7 @@ module VX_lsu_unit import VX_gpu_pkg::*; #( `RESET_RELAY (dispatch_reset, reset); - VX_dispatch_unit #( + VX_dispatch_unit_sane #( .BLOCK_SIZE (BLOCK_SIZE), .NUM_LANES (NUM_LANES), .OUT_REG (1) diff --git a/hw/rtl/core/VX_tensor_core.sv b/hw/rtl/core/VX_tensor_core.sv index a5128272..6c9d9f6b 100644 --- a/hw/rtl/core/VX_tensor_core.sv +++ b/hw/rtl/core/VX_tensor_core.sv @@ -21,7 +21,7 @@ module VX_tensor_core import VX_gpu_pkg::*; #( `RESET_RELAY (dispatch_reset, reset); - VX_dispatch_unit #( + VX_dispatch_unit_sane #( .BLOCK_SIZE (BLOCK_SIZE), .NUM_LANES (NUM_LANES), .OUT_REG (PARTIAL_BW ? 
1 : 0) @@ -177,9 +177,8 @@ module VX_tensor_core_warp import VX_gpu_pkg::*; #( localparam DATAW = `UUID_WIDTH + `NW_WIDTH + `NUM_THREADS + `XLEN + 1 + `NR_BITS; - wire commit_if_ready_override; - wire operand_enq_fire = operands_valid_synced && execute_if.ready; + wire commit_if_ready_override; wire commit_if_fire = commit_if.valid && commit_if_ready_override; wire [DATAW-1:0] execute_if_data_enq = { execute_if.data.uuid, From 52bb827a4665bdddbc8968b3e2eefdc06947db2f Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Thu, 30 May 2024 23:20:21 -0700 Subject: [PATCH 26/31] Handle BLOCK_SIZE != 1 in dispatch_unit + change ALU and FPU unit to use it as well --- hw/rtl/core/VX_alu_unit.sv | 2 +- hw/rtl/core/VX_dispatch_unit_sane.sv | 141 ++++++++++++++++++++++++--- hw/rtl/core/VX_fpu_unit.sv | 2 +- 3 files changed, 128 insertions(+), 17 deletions(-) diff --git a/hw/rtl/core/VX_alu_unit.sv b/hw/rtl/core/VX_alu_unit.sv index 7546f4b3..c1724360 100644 --- a/hw/rtl/core/VX_alu_unit.sv +++ b/hw/rtl/core/VX_alu_unit.sv @@ -42,7 +42,7 @@ module VX_alu_unit #( `RESET_RELAY (dispatch_reset, reset); - VX_dispatch_unit #( + VX_dispatch_unit_sane #( .BLOCK_SIZE (BLOCK_SIZE), .NUM_LANES (NUM_LANES), .OUT_REG (PARTIAL_BW ? 
1 : 0) diff --git a/hw/rtl/core/VX_dispatch_unit_sane.sv b/hw/rtl/core/VX_dispatch_unit_sane.sv index 26d2800b..3e31ced2 100644 --- a/hw/rtl/core/VX_dispatch_unit_sane.sv +++ b/hw/rtl/core/VX_dispatch_unit_sane.sv @@ -78,20 +78,25 @@ module VX_dispatch_unit_sane import VX_gpu_pkg::*; #( // assign batch_idx = 0; // `UNUSED_VAR(batch_done) // end - + + // group dispatch_valid by blocks + wire [BATCH_COUNT-1:0] batch_valids; + for (genvar i = 0; i < BATCH_COUNT; ++i) begin + assign batch_valids[i] = |(dispatch_valid[(BLOCK_SIZE * i) +: BLOCK_SIZE]); + end + + // elect the leftmost-valid batch for the dispatch wire dispatch_any_valid; VX_lzc_rr #( - .N (`ISSUE_WIDTH) + .N (BATCH_COUNT) ) batch_select ( .clk (clk), .reset (reset), - .data_in (dispatch_valid), + .data_in (batch_valids), .data_out (batch_idx), .valid_out (dispatch_any_valid) ); - `STATIC_ASSERT ((BLOCK_SIZE == 1), ("dispatch_unit_sane only supports BLOCK_SIZE == 1 for now")) - for (genvar block_idx = 0; block_idx < BLOCK_SIZE; ++block_idx) begin wire [ISSUE_W-1:0] issue_idx = ISSUE_W'(batch_idx * BLOCK_SIZE) + ISSUE_W'(block_idx); @@ -99,16 +104,122 @@ module VX_dispatch_unit_sane import VX_gpu_pkg::*; #( wire valid_p, ready_p; - assign valid_p = dispatch_valid[issue_idx]; - assign block_tmask[block_idx] = dispatch_data[issue_idx][DATA_TMASK_OFF +: `NUM_THREADS]; - assign block_regs[block_idx][0] = dispatch_data[issue_idx][DATA_REGS_OFF + 2 * `NUM_THREADS * `XLEN +: `NUM_THREADS * `XLEN]; - assign block_regs[block_idx][1] = dispatch_data[issue_idx][DATA_REGS_OFF + 1 * `NUM_THREADS * `XLEN +: `NUM_THREADS * `XLEN]; - assign block_regs[block_idx][2] = dispatch_data[issue_idx][DATA_REGS_OFF + 0 * `NUM_THREADS * `XLEN +: `NUM_THREADS * `XLEN]; - assign block_pid[block_idx] = '0; - assign block_sop[block_idx] = 1'b1; - assign block_eop[block_idx] = 1'b1; - assign block_ready[block_idx] = ready_p; - assign block_done[block_idx] = ~valid_p || ready_p; + if (`NUM_THREADS != NUM_LANES) begin + reg 
[NUM_PACKETS-1:0] sent_mask_p; + wire [PID_WIDTH-1:0] start_p_n, start_p, end_p; + wire dispatch_valid_r; + reg is_first_p; + + wire fire_p = valid_p && ready_p; + + wire is_last_p = (start_p == end_p); + + wire fire_eop = fire_p && is_last_p; + + always @(posedge clk) begin + if (reset) begin + sent_mask_p <= '0; + is_first_p <= 1; + end else begin + if ((BATCH_COUNT != 1) ? batch_done : fire_eop) begin + sent_mask_p <= '0; + is_first_p <= 1; + end else if (fire_p) begin + sent_mask_p[start_p] <= 1; + is_first_p <= 0; + end + end + end + + wire [NUM_PACKETS-1:0][NUM_LANES-1:0] per_packet_tmask; + wire [NUM_PACKETS-1:0][2:0][NUM_LANES-1:0][`XLEN-1:0] per_packet_regs; + + wire [`NUM_THREADS-1:0] dispatch_tmask = dispatch_data[issue_idx][DATA_TMASK_OFF +: `NUM_THREADS]; + wire [`NUM_THREADS-1:0][`XLEN-1:0] dispatch_rs1_data = dispatch_data[issue_idx][DATA_REGS_OFF + 2 * `NUM_THREADS * `XLEN +: `NUM_THREADS * `XLEN]; + wire [`NUM_THREADS-1:0][`XLEN-1:0] dispatch_rs2_data = dispatch_data[issue_idx][DATA_REGS_OFF + 1 * `NUM_THREADS * `XLEN +: `NUM_THREADS * `XLEN]; + wire [`NUM_THREADS-1:0][`XLEN-1:0] dispatch_rs3_data = dispatch_data[issue_idx][DATA_REGS_OFF + 0 * `NUM_THREADS * `XLEN +: `NUM_THREADS * `XLEN]; + + for (genvar i = 0; i < NUM_PACKETS; ++i) begin + for (genvar j = 0; j < NUM_LANES; ++j) begin + localparam k = i * NUM_LANES + j; + assign per_packet_tmask[i][j] = dispatch_tmask[k]; + assign per_packet_regs[i][0][j] = dispatch_rs1_data[k]; + assign per_packet_regs[i][1][j] = dispatch_rs2_data[k]; + assign per_packet_regs[i][2][j] = dispatch_rs3_data[k]; + end + end + + wire [NUM_PACKETS-1:0] packet_valids; + wire [NUM_PACKETS-1:0][PID_WIDTH-1:0] packet_ids; + + for (genvar i = 0; i < NUM_PACKETS; ++i) begin + assign packet_valids[i] = (| per_packet_tmask[i]); + assign packet_ids[i] = PID_WIDTH'(i); + end + + VX_find_first #( + .N (NUM_PACKETS), + .DATAW (PID_WIDTH), + .REVERSE (0) + ) find_first ( + .valid_in (packet_valids & ~sent_mask_p), + .data_in 
(packet_ids), + .data_out (start_p_n), + `UNUSED_PIN (valid_out) + ); + + VX_find_first #( + .N (NUM_PACKETS), + .DATAW (PID_WIDTH), + .REVERSE (1) + ) find_last ( + .valid_in (packet_valids), + .data_in (packet_ids), + .data_out (end_p), + `UNUSED_PIN (valid_out) + ); + + VX_pipe_register #( + .DATAW (1 + PID_WIDTH), + .RESETW (1), + .DEPTH (FANOUT_ENABLE ? 1 : 0) + ) pipe_reg ( + .clk (clk), + .reset (reset || fire_p), // should flush on fire + .enable (1'b1), + .data_in ({dispatch_valid[issue_idx], start_p_n}), + .data_out ({dispatch_valid_r, start_p}) + ); + + wire [NUM_LANES-1:0] tmask_p = per_packet_tmask[start_p]; + wire [2:0][NUM_LANES-1:0][`XLEN-1:0] regs_p = per_packet_regs[start_p]; + + wire block_enable = (BATCH_COUNT == 1 || ~(& sent_mask_p)); + + assign valid_p = dispatch_valid_r && block_enable; + assign block_tmask[block_idx] = tmask_p; + assign block_regs[block_idx] = regs_p; + assign block_pid[block_idx] = start_p; + assign block_sop[block_idx] = is_first_p; + assign block_eop[block_idx] = is_last_p; + if (FANOUT_ENABLE) begin + assign block_ready[block_idx] = dispatch_valid_r && ready_p && block_enable; + end else begin + assign block_ready[block_idx] = ready_p && block_enable; + end + assign block_done[block_idx] = ~dispatch_valid[issue_idx] || fire_eop; + end else begin + assign valid_p = dispatch_valid[issue_idx]; + assign block_tmask[block_idx] = dispatch_data[issue_idx][DATA_TMASK_OFF +: `NUM_THREADS]; + assign block_regs[block_idx][0] = dispatch_data[issue_idx][DATA_REGS_OFF + 2 * `NUM_THREADS * `XLEN +: `NUM_THREADS * `XLEN]; + assign block_regs[block_idx][1] = dispatch_data[issue_idx][DATA_REGS_OFF + 1 * `NUM_THREADS * `XLEN +: `NUM_THREADS * `XLEN]; + assign block_regs[block_idx][2] = dispatch_data[issue_idx][DATA_REGS_OFF + 0 * `NUM_THREADS * `XLEN +: `NUM_THREADS * `XLEN]; + assign block_pid[block_idx] = '0; + assign block_sop[block_idx] = 1'b1; + assign block_eop[block_idx] = 1'b1; + assign block_ready[block_idx] = ready_p; + assign 
block_done[block_idx] = ~valid_p || ready_p; + end wire [ISSUE_ISW_W-1:0] isw; if (BATCH_COUNT != 1) begin diff --git a/hw/rtl/core/VX_fpu_unit.sv b/hw/rtl/core/VX_fpu_unit.sv index 26956213..7e0875ba 100644 --- a/hw/rtl/core/VX_fpu_unit.sv +++ b/hw/rtl/core/VX_fpu_unit.sv @@ -39,7 +39,7 @@ module VX_fpu_unit import VX_fpu_pkg::*; #( `RESET_RELAY (dispatch_reset, reset); - VX_dispatch_unit #( + VX_dispatch_unit_sane #( .BLOCK_SIZE (BLOCK_SIZE), .NUM_LANES (NUM_LANES), .OUT_REG (PARTIAL_BW ? 1 : 0) From 73293061ea6887f9926c34fd6ef2f169045eb73b Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Thu, 30 May 2024 23:21:23 -0700 Subject: [PATCH 27/31] tensor: Enlarge metadata queue --- hw/rtl/core/VX_tensor_core.sv | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hw/rtl/core/VX_tensor_core.sv b/hw/rtl/core/VX_tensor_core.sv index 6c9d9f6b..105fab2f 100644 --- a/hw/rtl/core/VX_tensor_core.sv +++ b/hw/rtl/core/VX_tensor_core.sv @@ -78,7 +78,7 @@ module VX_tensor_core_warp import VX_gpu_pkg::*; #( // FIXME: not sure this is the right logic. 
just filling in what works localparam LANE_OFFSET_THREADGROUP = (4 * NUM_OCTETS); // this is only a rule of thumb - localparam METADATA_QUEUE_DEPTH = `LATENCY_HMMA; + localparam METADATA_QUEUE_DEPTH = 2 * `LATENCY_HMMA; wire [1:0] step = 2'(execute_if.data.op_type); wire last_in_pair = (execute_if.data.op_mod == `INST_MOD_BITS'(1)); From 0ebbb8e2238c90542824b6dad124af0adbb3ec55 Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Fri, 31 May 2024 00:32:32 -0700 Subject: [PATCH 28/31] tensor: Fix perf counter; comment out dpi --- hw/rtl/core/VX_tensor_core.sv | 2 +- hw/rtl/fpu/VX_tensor_dpu.sv | 30 +++++++++++++++--------------- 2 files changed, 16 insertions(+), 16 deletions(-) diff --git a/hw/rtl/core/VX_tensor_core.sv b/hw/rtl/core/VX_tensor_core.sv index 105fab2f..ca0d1064 100644 --- a/hw/rtl/core/VX_tensor_core.sv +++ b/hw/rtl/core/VX_tensor_core.sv @@ -528,7 +528,7 @@ module VX_tensor_octet #( perf_tensor_dpu_total <= '0; end else begin if (do_hmma) begin - perf_tensor_dpu_total <= perf_tensor_dpu_total + 1'b1; + perf_tensor_dpu_total <= perf_tensor_dpu_total + 2'd2; end end end diff --git a/hw/rtl/fpu/VX_tensor_dpu.sv b/hw/rtl/fpu/VX_tensor_dpu.sv index 79ee5757..0155417b 100644 --- a/hw/rtl/fpu/VX_tensor_dpu.sv +++ b/hw/rtl/fpu/VX_tensor_dpu.sv @@ -24,23 +24,23 @@ module VX_tensor_dpu #( output [3:0][3:0][31:0] D_tile, output [`NW_WIDTH-1:0] D_wid ); - logic [3:0][3:0][31:0] result_hmma; + // logic [3:0][3:0][31:0] result_hmma; - always @(*) begin - dpi_hmma(valid_in, A_tile, B_tile, C_tile, result_hmma); - end + // always @(*) begin + // dpi_hmma(valid_in, A_tile, B_tile, C_tile, result_hmma); + // end - logic ready_reg; - always @(posedge clk) begin - if (reset) begin - ready_reg <= '1; - end else if (valid_in && ready_in) begin - ready_reg <= '0; - dpi_print_results(int'(ISW), int'(OCTET), A_tile, B_tile, C_tile, result_hmma); - end else if (valid_out && ready_out) begin - ready_reg <= '1; - end - end + // logic ready_reg; + // always @(posedge clk) begin + 
// if (reset) begin + // ready_reg <= '1; + // end else if (valid_in && ready_in) begin + // ready_reg <= '0; + // dpi_print_results(int'(ISW), int'(OCTET), A_tile, B_tile, C_tile, result_hmma); + // end else if (valid_out && ready_out) begin + // ready_reg <= '1; + // end + // end // // fixed-latency queue // VX_shift_register #( From 9caafb2d8a153f84e88c8134bb5e6423c6fbd044 Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Fri, 31 May 2024 19:17:56 -0700 Subject: [PATCH 29/31] tensor: Decode rd of macro-op to designate additional accumulator This is useful when you want to have the tensor core output to multiple accumulator registers, e.g. when doing outer product within the RF. --- hw/rtl/core/VX_decode.sv | 6 ++++++ hw/rtl/core/VX_uop_sequencer.sv | 18 ++++++++++++++---- 2 files changed, 20 insertions(+), 4 deletions(-) diff --git a/hw/rtl/core/VX_decode.sv b/hw/rtl/core/VX_decode.sv index 6f4539e7..2ca414cd 100644 --- a/hw/rtl/core/VX_decode.sv +++ b/hw/rtl/core/VX_decode.sv @@ -545,6 +545,12 @@ module VX_decode #( `INST_EXT4: begin ex_type = `EX_TENSOR; op_type = `INST_TENSOR_HMMA; + // tensor core macroop is encoded as r-type + use_rd = 1; + `USED_IREG (rd); + `USED_IREG (rs1); + `USED_IREG (rs2); + `USED_IREG (rs3); end `endif default:; diff --git a/hw/rtl/core/VX_uop_sequencer.sv b/hw/rtl/core/VX_uop_sequencer.sv index 24b5af3c..130866de 100644 --- a/hw/rtl/core/VX_uop_sequencer.sv +++ b/hw/rtl/core/VX_uop_sequencer.sv @@ -14,10 +14,9 @@ module VX_uop_sequencer import VX_gpu_pkg::*; ( localparam UOP_TABLE_SIZE = 64; localparam UPC_BITS = `CLOG2(UOP_TABLE_SIZE); - localparam NEXT = 2'b00; - localparam FINISH = 2'b01; - localparam UBR_BITS = 2; + localparam NEXT = UBR_BITS'(2'b00); + localparam FINISH = UBR_BITS'(2'b01); // uop metadata (sequencing, next state), execution metadata (EX_TYPE, OP_TYPE, OP_MOD), wb, use pc, use imm, pc, imm, rd, rs1, rs2, rs3 localparam UOP_TABLE_WIDTH = UBR_BITS + UPC_BITS + `EX_BITS + `INST_OP_BITS + `INST_MOD_BITS + 1 + 1 + 1 
+ `XLEN + `XLEN + (`NR_BITS * 4); @@ -122,7 +121,18 @@ module VX_uop_sequencer import VX_gpu_pkg::*; ( // passthrough when !use_uop assign ibuffer_if.valid = use_uop ? 1'b1 : uop_sequencer_if.valid; assign uop_sequencer_if.ready = use_uop ? (uop_fire && ubr == FINISH) : ibuffer_if.ready; - assign ibuffer_if.data = use_uop ? ibuffer_output : uop_sequencer_if.data; + + always @(*) begin + ibuffer_if.data = use_uop ? ibuffer_output : uop_sequencer_if.data; + + if (uop_sequencer_if.valid && use_uop && + uop_sequencer_if.data.rd == `NR_BITS'(1)) begin + // a little sketchy? but shouldn't create any loop + ibuffer_if.data.rd = ibuffer_if.data.rd + `NR_BITS'(8); + ibuffer_if.data.rs3 = ibuffer_if.data.rs3 + `NR_BITS'(8); + $display("yoooooooo! uop rd=%d\n", ibuffer_if.data.rd); + end + end always @(posedge clk) begin if (uop_start) begin From 12f8722dd5b9505bbe22a0ac62dfd79df49d9f56 Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Mon, 3 Jun 2024 13:04:09 -0700 Subject: [PATCH 30/31] Shush display --- hw/rtl/core/VX_tensor_core.sv | 2 +- hw/rtl/core/VX_uop_sequencer.sv | 3 +-- hw/rtl/fpu/VX_tensor_dpu.sv | 3 +++ 3 files changed, 5 insertions(+), 3 deletions(-) diff --git a/hw/rtl/core/VX_tensor_core.sv b/hw/rtl/core/VX_tensor_core.sv index ca0d1064..d1c14588 100644 --- a/hw/rtl/core/VX_tensor_core.sv +++ b/hw/rtl/core/VX_tensor_core.sv @@ -81,6 +81,7 @@ module VX_tensor_core_warp import VX_gpu_pkg::*; #( localparam METADATA_QUEUE_DEPTH = 2 * `LATENCY_HMMA; wire [1:0] step = 2'(execute_if.data.op_type); + // op_mod is reused to indicate instruction's id in pair wire last_in_pair = (execute_if.data.op_mod == `INST_MOD_BITS'(1)); logic [NUM_OCTETS-1:0] octet_results_valid; @@ -115,7 +116,6 @@ module VX_tensor_core_warp import VX_gpu_pkg::*; #( logic result_valid; logic result_ready; - // op_mod is reused to indicate instruction's id in pair VX_tensor_octet #( .ISW(ISW), .OCTET(i) diff --git a/hw/rtl/core/VX_uop_sequencer.sv b/hw/rtl/core/VX_uop_sequencer.sv index 
130866de..26817b8d 100644 --- a/hw/rtl/core/VX_uop_sequencer.sv +++ b/hw/rtl/core/VX_uop_sequencer.sv @@ -128,9 +128,8 @@ module VX_uop_sequencer import VX_gpu_pkg::*; ( if (uop_sequencer_if.valid && use_uop && uop_sequencer_if.data.rd == `NR_BITS'(1)) begin // a little sketchy? but shouldn't create any loop - ibuffer_if.data.rd = ibuffer_if.data.rd + `NR_BITS'(8); + ibuffer_if.data.rd = ibuffer_if.data.rd + `NR_BITS'(8); // FIXME: 8 is hardcoded ibuffer_if.data.rs3 = ibuffer_if.data.rs3 + `NR_BITS'(8); - $display("yoooooooo! uop rd=%d\n", ibuffer_if.data.rd); end end diff --git a/hw/rtl/fpu/VX_tensor_dpu.sv b/hw/rtl/fpu/VX_tensor_dpu.sv index 0155417b..8b7a1c26 100644 --- a/hw/rtl/fpu/VX_tensor_dpu.sv +++ b/hw/rtl/fpu/VX_tensor_dpu.sv @@ -234,6 +234,9 @@ module VX_tensor_threadgroup #( end end + // TODO: Instead of latching half-result and constructing a full D tile, + // we should be able to send these half fragments down to commit stage + // immediately, saving flop space assign D_frag[0][0] = D_reg[0]; assign D_frag[0][2] = D_reg[1]; assign D_frag[1][0] = D_reg[2]; From 874a3bf1945f773951ce54eae73c9903e08f9737 Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Sun, 9 Jun 2024 13:41:00 -0700 Subject: [PATCH 31/31] Doc changes --- hw/rtl/VX_platform.vh | 2 +- hw/rtl/core/VX_smem_unit.sv | 1 + hw/rtl/core/VX_tensor_core.sv | 2 +- 3 files changed, 3 insertions(+), 2 deletions(-) diff --git a/hw/rtl/VX_platform.vh b/hw/rtl/VX_platform.vh index 65cbd0bf..282018b8 100644 --- a/hw/rtl/VX_platform.vh +++ b/hw/rtl/VX_platform.vh @@ -14,7 +14,7 @@ `ifndef VX_PLATFORM_VH `define VX_PLATFORM_VH -// synthesis only +// enable synthesizable build if SIMULATION not explicitly defined `ifndef SIMULATION `define SYNTHESIS `define NDEBUG diff --git a/hw/rtl/core/VX_smem_unit.sv b/hw/rtl/core/VX_smem_unit.sv index 91587b2f..532dba55 100644 --- a/hw/rtl/core/VX_smem_unit.sv +++ b/hw/rtl/core/VX_smem_unit.sv @@ -66,6 +66,7 @@ module VX_smem_unit import VX_gpu_pkg::*; #( .req_valid 
(smem_req_valid), .req_rw (smem_req_rw), .req_byteen (smem_req_byteen), + // FIXME: synthesis complains undriven when USE_EXTERNAL_SMEM .req_addr (smem_req_addr), .req_data (smem_req_data), .req_tag (smem_req_tag), diff --git a/hw/rtl/core/VX_tensor_core.sv b/hw/rtl/core/VX_tensor_core.sv index d1c14588..efa74afd 100644 --- a/hw/rtl/core/VX_tensor_core.sv +++ b/hw/rtl/core/VX_tensor_core.sv @@ -232,7 +232,7 @@ module VX_tensor_core_warp import VX_gpu_pkg::*; #( // this shouldn't really happen unless there's a big contention over // the commit stage - `RUNTIME_ASSERT(!(!reset && metadata_queue_full), ("tensor core uop queue is full!")); + `RUNTIME_ASSERT(!(!reset && metadata_queue_full), ("tensor core uop queue is full!")) // unlike execute which can be interleaved between warps, commit is // serialized and completed one-warp-by-warp, therefore we only need to