From 06e0f901ff44cad582a8247457956106890a9eab Mon Sep 17 00:00:00 2001 From: Hansung Kim Date: Thu, 30 May 2024 17:34:39 -0700 Subject: [PATCH] tensor: Handle backpressure from metadata queue --- hw/rtl/core/VX_tensor_core.sv | 31 +++++++++++++++++-------------- 1 file changed, 17 insertions(+), 14 deletions(-) diff --git a/hw/rtl/core/VX_tensor_core.sv b/hw/rtl/core/VX_tensor_core.sv index bedf8245..5d4c02a4 100644 --- a/hw/rtl/core/VX_tensor_core.sv +++ b/hw/rtl/core/VX_tensor_core.sv @@ -90,8 +90,6 @@ module VX_tensor_core_warp import VX_gpu_pkg::*; #( logic [`NUM_THREADS-1:0][`XLEN-1:0] wb_data_1; wire [`NW_WIDTH-1:0] wb_wid; - assign execute_if.ready = &octet_operands_ready; - `ifdef EXT_T_ENABLE for (genvar i = 0; i < NUM_OCTETS; ++i) begin `else @@ -207,16 +205,23 @@ module VX_tensor_core_warp import VX_gpu_pkg::*; #( wire [`NUM_WARPS-1:0][DATAW-1:0] execute_if_data_deq; + wire [`NUM_WARPS-1:0] metadata_queue_fulls; + // OR not AND, we don't want any warp full + wire metadata_queue_full = |(metadata_queue_fulls); + + assign execute_if.ready = &(octet_operands_ready) && !metadata_queue_full; + for (genvar i = 0; i < `NUM_WARPS; i++) begin - // execute_if request queue. + // Metadata queue for commit_if. This simply copies execute_if's + // metadata and pops them in conjunction with commit fire. + // // This has to be separated per-warp, as otherwise requests from // multiple warps can be enqueued interleaved, which makes it hard to // ensure two consecutive dequeues are associated with the same warp for - // commit. + // commit. (FIXME: this is not strictly necessary though.) wire enq = execute_if_fire && (execute_if.data.wid == `NW_WIDTH'(i)); wire deq = commit_if_fire && ( wb_wid == `NW_WIDTH'(i)); - wire full; VX_fifo_queue #( .DATAW(DATAW), @@ -230,16 +235,16 @@ module VX_tensor_core_warp import VX_gpu_pkg::*; #( .data_out(execute_if_data_deq[i]), `UNUSED_PIN(empty), `UNUSED_PIN(alm_empty), - .full(full), // should be impossible to overflow + .full(metadata_queue_fulls[i]), `UNUSED_PIN(alm_full), `UNUSED_PIN(size) ); - - // this shouldn't really happen unless there's a big contention over - // the commit stage - `RUNTIME_ASSERT(!(!reset && full), ("tensor core uop queue is full!")); end + // this shouldn't really happen unless there's a big contention over + // the commit stage + `RUNTIME_ASSERT(!(!reset && metadata_queue_full), ("tensor core uop queue is full!")); + // unlike execute which can be interleaved between warps, commit is // serialized and completed one-warp-by-warp, therefore we only need to // keep one subcommit state bit unlike for `substeps` @@ -527,13 +532,11 @@ module VX_tensor_octet #( // is complete. This decouples the irregular dpu output traffic from the // regular, every-2-cycle commit traffic to ensure the commit pipeline is // used more efficiently. + // FIXME: unnecessary? // TODO: This is probably oversized. VX_fifo_queue #( .DATAW ($bits(D_wid) + $bits(D_out)), - // depth of this queue should ideally be deeper than the dpu pipeline - // latency, since the dpu is fully-pipelined and it can output the - // latency-number of outputs in a burst-y way. - .DEPTH (`LATENCY_HMMA + `LATENCY_HMMA) + .DEPTH (`LATENCY_HMMA) ) output_buffer ( .clk (clk), .reset (reset),