From 0a032ab400ae94d525d40dc8673e7b3b8d56e89b Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Thu, 30 May 2024 18:03:04 -0700 Subject: [PATCH] tensor: Fix out-of-sync enqueue to dpu and metadata queue --- hw/rtl/core/VX_tensor_core.sv | 92 ++++++++--------------------------- hw/rtl/fpu/VX_tensor_dpu.sv | 19 ++++++-- 2 files changed, 37 insertions(+), 74 deletions(-) diff --git a/hw/rtl/core/VX_tensor_core.sv b/hw/rtl/core/VX_tensor_core.sv index b00d0a46..44485ccb 100644 --- a/hw/rtl/core/VX_tensor_core.sv +++ b/hw/rtl/core/VX_tensor_core.sv @@ -77,7 +77,8 @@ module VX_tensor_core_warp import VX_gpu_pkg::*; #( // octet. E.g. two tgs map lane 0-3 and lane 16-19 -> 16 // FIXME: not sure this is the right logic. just filling in what works localparam LANE_OFFSET_THREADGROUP = (4 * NUM_OCTETS); - localparam METADATA_QUEUE_DEPTH = 4; + // this is only a rule of thumb + localparam METADATA_QUEUE_DEPTH = `LATENCY_HMMA; wire [1:0] step = 2'(execute_if.data.op_type); wire last_in_pair = (execute_if.data.op_mod == `INST_MOD_BITS'(1)); @@ -89,7 +90,11 @@ module VX_tensor_core_warp import VX_gpu_pkg::*; #( logic [`NUM_THREADS-1:0][`XLEN-1:0] wb_data_0; logic [`NUM_THREADS-1:0][`XLEN-1:0] wb_data_1; wire [`NW_WIDTH-1:0] wb_wid; - + + // valid signal synced between the functional units (octet) and the + // metadata queue + wire operands_valid_synced; + `ifdef EXT_T_ENABLE for (genvar i = 0; i < NUM_OCTETS; ++i) begin `else @@ -121,7 +126,7 @@ module VX_tensor_core_warp import VX_gpu_pkg::*; #( .A_in(octet_A), .B_in(octet_B), .C_in(octet_C), - .operands_valid(execute_if.valid), + .operands_valid(operands_valid_synced), .operands_wid(execute_if.data.wid), .operands_last_in_pair(last_in_pair), .operands_step(step), @@ -172,8 +177,10 @@ module VX_tensor_core_warp import VX_gpu_pkg::*; #( localparam DATAW = `UUID_WIDTH + `NW_WIDTH + `NUM_THREADS + `XLEN + 1 + `NR_BITS; - wire execute_if_fire = execute_if.valid && execute_if.ready; - wire commit_if_fire = commit_if.valid && commit_if.ready; + wire commit_if_ready_override; + + wire operand_enq_fire = operands_valid_synced && execute_if.ready; + wire commit_if_fire = commit_if.valid && commit_if_ready_override; wire [DATAW-1:0] execute_if_data_enq = { execute_if.data.uuid, execute_if.data.wid, @@ -184,31 +191,14 @@ module VX_tensor_core_warp import VX_gpu_pkg::*; #( // pid/sop/eop set later }; - // wire [DATAW-1:0] execute_if_data_deq; - - // VX_fifo_queue #( - // .DATAW(DATAW), - // .DEPTH(4 /* FIXME: arbitrary */) - // ) pending_uops ( - // .clk(clk), - // .reset(reset), - // .push(execute_if_fire), - // .pop(commit_if_fire), - // .data_in(execute_if_data_enq), - // .data_out(execute_if_data_deq), - // `UNUSED_PIN(empty), - // `UNUSED_PIN(alm_empty), - // `UNUSED_PIN(full), // should be impossible to overflow - // `UNUSED_PIN(alm_full), - // `UNUSED_PIN(size) - // ); - wire [`NUM_WARPS-1:0][DATAW-1:0] execute_if_data_deq; wire [`NUM_WARPS-1:0] metadata_queue_fulls; // OR not AND, we don't want any warp full wire metadata_queue_full = |(metadata_queue_fulls); + // need to make sure both metadata and octet issue queues are in sync + assign operands_valid_synced = execute_if.valid && !metadata_queue_full; assign execute_if.ready = &(octet_operands_ready) && !metadata_queue_full; for (genvar i = 0; i < `NUM_WARPS; i++) begin @@ -220,8 +210,8 @@ module VX_tensor_core_warp import VX_gpu_pkg::*; #( // ensure two consecutive dequeues are associated with the same warp for // commit. (FIXME: this is not strictly necessary though.) - wire enq = execute_if_fire && (execute_if.data.wid == `NW_WIDTH'(i)); - wire deq = commit_if_fire && ( wb_wid == `NW_WIDTH'(i)); + wire enq = operand_enq_fire && (execute_if.data.wid == `NW_WIDTH'(i)); + wire deq = commit_if_fire && ( wb_wid == `NW_WIDTH'(i)); VX_fifo_queue #( .DATAW(DATAW), @@ -253,8 +243,8 @@ module VX_tensor_core_warp import VX_gpu_pkg::*; #( wire all_valid = (& octet_results_valid); // define this to inject artificial commit backpressure for debugging -`define INJECT_COMMIT_BACKPRESSURE -`ifndef INJECT_COMMIT_BACKPRESSURE +// `define TENSOR_INJECT_COMMIT_BACKPRESSURE +`ifndef TENSOR_INJECT_COMMIT_BACKPRESSURE assign commit_if.valid = all_valid; assign commit_if_ready_override = commit_if.ready; `else @@ -358,47 +348,6 @@ module VX_tensor_octet #( wire operands_last_in_pair_buf; wire [1:0] operands_step_buf; - // wire inbuf_empty; - // wire inbuf_full; - // wire inbuf_ready_in; - // assign inbuf_ready_in = !inbuf_full; - // assign operands_ready = inbuf_ready_in; - // assign operands_valid_buf = !inbuf_empty; - - // // wire inbuf_enq = operands_ready && operands_valid && operands_last_in_pair; - // wire inbuf_enq = operands_ready && operands_valid; - // wire inbuf_deq = operands_valid_buf && operands_ready_buf; - - // // the 'issue queue' for the dpu. - // // This exists to decouple the input of the dot-product unit from - // // execute_if.ready. execute_if can arrive intermittently according to - // // the frontend's behavior, and since the dpu can also stall for a fixed - // // initiation latency, we need to decouple the two to efficiently feed the - // // dpu. - // // This only applies to the last instruction in a pair, since the first - // // instruction only acts to buffer the operands and can execute - // // immediately without backpressure. So we don't enqueue them. - // VX_fifo_queue #( - // .DATAW ($bits(A_in) + $bits(B_in) + $bits(C_in) + - // $bits(operands_wid) + $bits(operands_step) + $bits(operands_last_in_pair)), - // .DEPTH (ISSUE_QUEUE_DEPTH) - // ) input_buffer ( - // .clk (clk), - // .reset (reset), - // .push (inbuf_enq), - // .pop (inbuf_deq), - // .data_in ({A_in, B_in, C_in, operands_wid, operands_step, operands_last_in_pair}), - // .data_out ({A_in_buf, B_in_buf, C_in_buf, operands_wid_buf, operands_step_buf, operands_last_in_pair_buf}), - // .empty (inbuf_empty), - // `UNUSED_PIN(alm_empty), - // .full (inbuf_full), - // `UNUSED_PIN(alm_full), - // `UNUSED_PIN(size) - // ); - - // // FIXME: this shouldn't be necessary - // `RUNTIME_ASSERT(reset || !inbuf_full, ("dpu issue queue is full!")) - assign A_in_buf = A_in; assign B_in_buf = B_in; assign C_in_buf = C_in; @@ -521,7 +470,8 @@ module VX_tensor_octet #( // this does (m,n,k)=(4,4,2) matmul, modeling compute of a single octet VX_tensor_dpu #( .ISW(ISW), - .OCTET(OCTET) + .OCTET(OCTET), + .ISSUE_QUEUE_DEPTH(2) ) dpu ( .clk(clk), .reset(reset), @@ -556,7 +506,7 @@ module VX_tensor_octet #( // TODO: This is probably oversized. VX_fifo_queue #( .DATAW ($bits(D_wid) + $bits(D_out)), - .DEPTH (`LATENCY_HMMA) + .DEPTH (2 /*`LATENCY_HMMA*/) ) output_buffer ( .clk (clk), .reset (reset), diff --git a/hw/rtl/fpu/VX_tensor_dpu.sv b/hw/rtl/fpu/VX_tensor_dpu.sv index 870f6870..694af4ae 100644 --- a/hw/rtl/fpu/VX_tensor_dpu.sv +++ b/hw/rtl/fpu/VX_tensor_dpu.sv @@ -3,7 +3,8 @@ module VX_tensor_dpu #( parameter ISW, - parameter OCTET + parameter OCTET, + parameter ISSUE_QUEUE_DEPTH = `LATENCY_HMMA ) ( input clk, input reset, @@ -62,6 +63,7 @@ module VX_tensor_dpu #( logic [1:0] threadgroup_readys; // B_tile is shared across the two threadgroups; see Figure 13 VX_tensor_threadgroup #( + .ISSUE_QUEUE_DEPTH(ISSUE_QUEUE_DEPTH) ) threadgroup_0 ( .clk (clk), .reset (reset), @@ -75,6 +77,7 @@ module VX_tensor_dpu #( .D_frag (D_tile[1:0]) ); VX_tensor_threadgroup #( + .ISSUE_QUEUE_DEPTH(ISSUE_QUEUE_DEPTH) ) threadgroup_1 ( .clk (clk), .reset (reset), @@ -99,7 +102,7 @@ module VX_tensor_dpu #( // need to pass along warp id's to do multithreading VX_fifo_queue #( .DATAW ($bits(wid)), - .DEPTH (`LATENCY_HMMA + `LATENCY_HMMA) + .DEPTH (ISSUE_QUEUE_DEPTH) ) wid_queue ( .clk (clk), .reset (reset), @@ -121,6 +124,7 @@ endmodule // does (m,n,k) = (2,4,2) matmul compute over 2 cycles. // matches Figure 10(b) of the paper. module VX_tensor_threadgroup #( + parameter ISSUE_QUEUE_DEPTH ) ( input clk, input reset, @@ -149,9 +153,18 @@ module VX_tensor_threadgroup #( assign ready_in = !full; assign valid_buf = !empty; + // 'Issue queue' for the FEDP units. + // This exists to decouple the execution of the dot-product unit from + // the operand arrival. Operands from execute_if can arrive + // intermittently according to the frontend's behavior, and since the dpu + // can also stall for a fixed initiation latency, we need to decouple the + // two to efficiently feed the dpu. + // + // TODO: better queue design possible; e.g. B_frag is shared by two + // threadgroups, so we need only 1 queue per octet for B VX_fifo_queue #( .DATAW ($bits(A_frag) + $bits(B_frag) + $bits(C_frag)), - .DEPTH (4) + .DEPTH (ISSUE_QUEUE_DEPTH) ) input_buffer ( .clk (clk), .reset (reset),