tensor: Handle backpressure from metadata queue

2024-05-30 17:34:39 -07:00
parent dfb2276657
commit 06e0f901ff
1 changed files with 17 additions and 14 deletions
--- a/hw/rtl/core/VX_tensor_core.sv
+++ b/hw/rtl/core/VX_tensor_core.sv
@@ -90,8 +90,6 @@ module VX_tensor_core_warp import VX_gpu_pkg::*; #(
    logic [`NUM_THREADS-1:0][`XLEN-1:0] wb_data_1;
    wire [`NW_WIDTH-1:0] wb_wid;
    
-    assign execute_if.ready = &octet_operands_ready;
-
 `ifdef EXT_T_ENABLE
    for (genvar i = 0; i < NUM_OCTETS; ++i) begin
 `else
@@ -207,16 +205,23 @@ module VX_tensor_core_warp import VX_gpu_pkg::*; #(

    wire [`NUM_WARPS-1:0][DATAW-1:0] execute_if_data_deq;

+    wire [`NUM_WARPS-1:0] metadata_queue_fulls;
+    // OR not AND, we don't want any warp full
+    wire metadata_queue_full = |(metadata_queue_fulls);
+
+    assign execute_if.ready = &(octet_operands_ready) && !metadata_queue_full;
+
    for (genvar i = 0; i < `NUM_WARPS; i++) begin
-        // execute_if request queue.
+        // Metadata queue for commit_if.  This simply copies execute_if's
+        // metadata and pops them in conjunction with commit fire.
+        //
        // This has to be separated per-warp, as otherwise requests from
        // multiple warps can be enqueued interleaved, which makes it hard to
        // ensure two consecutive dequeues are associated with the same warp for
-        // commit.
+        // commit. (FIXME: this is not strictly necessary though.)

        wire enq = execute_if_fire && (execute_if.data.wid == `NW_WIDTH'(i));
        wire deq =  commit_if_fire && (             wb_wid == `NW_WIDTH'(i));
-        wire full;

        VX_fifo_queue #(
            .DATAW(DATAW),
@@ -230,16 +235,16 @@ module VX_tensor_core_warp import VX_gpu_pkg::*; #(
            .data_out(execute_if_data_deq[i]),
            `UNUSED_PIN(empty),
            `UNUSED_PIN(alm_empty),
-            .full(full), // should be impossible to overflow
+            .full(metadata_queue_fulls[i]),
            `UNUSED_PIN(alm_full),
            `UNUSED_PIN(size)
        );
-
-        // this shouldn't really happen unless there's a big contention over
-        // the commit stage
-        `RUNTIME_ASSERT(!(!reset && full), ("tensor core uop queue is full!"));
    end

+    // this shouldn't really happen unless there's a big contention over
+    // the commit stage
+    `RUNTIME_ASSERT(!(!reset && metadata_queue_full), ("tensor core uop queue is full!"));
+
    // unlike execute which can be interleaved between warps, commit is
    // serialized and completed one-warp-by-warp, therefore we only need to
    // keep one subcommit state bit unlike for `substeps`
@@ -527,13 +532,11 @@ module VX_tensor_octet #(
    // is complete.  This decouples the irregular dpu output traffic from the
    // regular, every-2-cycle commit traffic to ensure the commit pipeline is
    // used more efficiently.
+    // FIXME: unnecessary?
    // TODO: This is probably oversized.
    VX_fifo_queue #(
        .DATAW   ($bits(D_wid) + $bits(D_out)),
-        // depth of this queue should ideally be deeper than the dpu pipeline
-        // latency, since the dpu is fully-pipelined and it can output the
-        // latency-number of outputs in a burst-y way.
-        .DEPTH   (`LATENCY_HMMA + `LATENCY_HMMA)
+        .DEPTH   (`LATENCY_HMMA)
    ) output_buffer (
        .clk   (clk),
        .reset (reset),