From 4dcbc31a88915fff35ccefd00c6e753fa5ef135a Mon Sep 17 00:00:00 2001
From: Hansung Kim <hansung_kim@berkeley.edu>
Date: Fri, 11 Oct 2024 21:32:20 -0700
Subject: [PATCH] tensor: Separate async commit from tensor commit

This lets us prioritize committing the asynchronous HGMMA instructions over
the "ghost" commits generated by the tensor core (TC).
---
 hw/rtl/core/VX_commit.sv             |   9 ++-
 hw/rtl/core/VX_tensor_hopper_core.sv | 107 +++++++++++++++++++--------
 2 files changed, 83 insertions(+), 33 deletions(-)

diff --git a/hw/rtl/core/VX_commit.sv b/hw/rtl/core/VX_commit.sv
index 9b930818..faca0a2a 100644
--- a/hw/rtl/core/VX_commit.sv
+++ b/hw/rtl/core/VX_commit.sv
@@ -177,17 +177,22 @@ module VX_commit import VX_gpu_pkg::*; #(
     // probably want to change this at some point
     // (i.e. pass a "don't count this towards pending instructions" signal down the pipeline)
     wire [`ISSUE_WIDTH-1:0] final_hmma;
+    // if this is a "ghost" commit generated by the tensor core, don't count
+    // it toward committed instructions
+    wire [`ISSUE_WIDTH-1:0] tensor_ghost;
 `ifdef EXT_T_ENABLE
     for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin
         // if PC is 0, this means it is not final step of a wmma, shouldn't be committed
         assign final_hmma[i] = (commit_if[i].data.PC != 32'b0); 
+        // handle 'x' with === (note: the assign below uses ==).  FIXME fix uninitialization
+        assign tensor_ghost[i] = (commit_if[i].data.tensor == 1'b1);
     end
 `else
     assign final_hmma = '1;
+    assign tensor_ghost = '0;
 `endif
 
-
-    wire [`ISSUE_WIDTH-1:0] committed = (commit_fire & commit_eop) & final_hmma;
+    wire [`ISSUE_WIDTH-1:0] committed = (commit_fire & commit_eop) & final_hmma & (~tensor_ghost);
 
     VX_pipe_register #(
         .DATAW  (`ISSUE_WIDTH * (1 + `NW_WIDTH)),
diff --git a/hw/rtl/core/VX_tensor_hopper_core.sv b/hw/rtl/core/VX_tensor_hopper_core.sv
index 21fad57c..2b2136b6 100644
--- a/hw/rtl/core/VX_tensor_hopper_core.sv
+++ b/hw/rtl/core/VX_tensor_hopper_core.sv
@@ -22,12 +22,13 @@ module VX_tensor_hopper_core_block import VX_gpu_pkg::*; #(
         - wb
         - rd
     */
-    wire [`NUM_WARPS-1:0][`UUID_WIDTH-1:0] execute_if_data_uuid;
-    wire [`NUM_WARPS-1:0][`NW_WIDTH-1:0]   execute_if_data_wid;
-    wire [`NUM_WARPS-1:0][NUM_LANES-1:0]   execute_if_data_tmask;
-    wire [`NUM_WARPS-1:0][`XLEN-1:0]       execute_if_data_PC;
-    wire [`NUM_WARPS-1:0]                  execute_if_data_wb;
-    wire [`NUM_WARPS-1:0][`NR_BITS-1:0]    execute_if_data_rd;
+    wire [`NUM_WARPS-1:0][`UUID_WIDTH-1:0]    execute_if_data_uuid;
+    wire [`NUM_WARPS-1:0][`NW_WIDTH-1:0]      execute_if_data_wid;
+    wire [`NUM_WARPS-1:0][NUM_LANES-1:0]      execute_if_data_tmask;
+    wire [`NUM_WARPS-1:0][`INST_ALU_BITS-1:0] execute_if_data_op_type;
+    wire [`NUM_WARPS-1:0][`XLEN-1:0]          execute_if_data_PC;
+    wire [`NUM_WARPS-1:0]                     execute_if_data_wb;
+    wire [`NUM_WARPS-1:0][`NR_BITS-1:0]       execute_if_data_rd;
 
     wire [`NUM_WARPS-1:0] metadata_queue_fulls;
     wire [`NUM_WARPS-1:0] metadata_queue_emptys;
@@ -38,7 +39,7 @@ module VX_tensor_hopper_core_block import VX_gpu_pkg::*; #(
     `RUNTIME_ASSERT((!execute_if.valid || execute_if.data.wid == `NW_WIDTH'(0)),
         ("runtime error: WGMMA execute not supported for warps other than 0!"))
 
-    wire metadata_deq;
+    logic metadata_deq;
 
     for (genvar i = 0; i < `NUM_WARPS; i++) begin
         // Metadata queue for commit_if.  This simply copies execute_if's
@@ -54,7 +55,7 @@ module VX_tensor_hopper_core_block import VX_gpu_pkg::*; #(
         // FIXME: commit only warp 0
         wire deq = metadata_deq && (`NW_WIDTH'(i) == `NW_WIDTH'(0));
 
-        localparam DATAW = `UUID_WIDTH + `NW_WIDTH + `NUM_THREADS + `XLEN + 1 + `NR_BITS;
+        localparam DATAW = `UUID_WIDTH + `NW_WIDTH + `NUM_THREADS + `INST_ALU_BITS + `XLEN + 1 + `NR_BITS;
         VX_fifo_queue #(
             .DATAW(DATAW),
             .DEPTH(METADATA_QUEUE_DEPTH)
@@ -64,10 +65,10 @@ module VX_tensor_hopper_core_block import VX_gpu_pkg::*; #(
             .push(enq),
             .pop(deq),
             .data_in({execute_if.data.uuid,  execute_if.data.wid,
-                      execute_if.data.tmask, execute_if.data.PC,
+                      execute_if.data.tmask, execute_if.data.op_type, execute_if.data.PC,
                       execute_if.data.wb,    execute_if.data.rd}),
             .data_out({execute_if_data_uuid[i],  execute_if_data_wid[i],
-                       execute_if_data_tmask[i], execute_if_data_PC[i],
+                       execute_if_data_tmask[i], execute_if_data_op_type[i], execute_if_data_PC[i],
                        execute_if_data_wb[i],    execute_if_data_rd[i]}),
             .empty(metadata_queue_emptys[i]),
             `UNUSED_PIN(alm_empty),
@@ -81,47 +82,91 @@ module VX_tensor_hopper_core_block import VX_gpu_pkg::*; #(
     // the commit stage
     `RUNTIME_ASSERT(!(!reset && metadata_queue_full), ("tensor core uop queue is full!"))
 
-    wire initiate_ready; // FIXME: unused
+    wire initiate_ready;
     wire writeback_valid;
     wire writeback_last;
+    logic writeback_ready;
 
     wire metadata_valid = ~metadata_queue_emptys[0/*FIXME*/];
-    // dequeue metadata at the last writeback
-    assign metadata_deq = metadata_valid && writeback_valid && writeback_last;
+    wire not_wait = metadata_valid && (execute_if_data_op_type[0] != `INST_TENSOR_HGMMA_WAIT);
+    // skip HGMMA_WAIT for kickoff
+    wire initiate_valid = metadata_valid && not_wait;
+
+    // we're recycling execute_if.op_type as operands_if.op_type which might
+    // have a different width; let's be safe
+    `STATIC_ASSERT((`INST_ALU_BITS == `INST_OP_BITS),
+        ("static assertion failed: `INST_ALU_BITS != `INST_OP_BITS"))
 
     VX_tensor_hopper_core #(
     ) tensor_hopper_core (
         .clk(clk),
         .reset(reset),
 
-        .initiate_valid(metadata_valid),
+        .initiate_valid(initiate_valid),
         .initiate_wid(`NW_WIDTH'(0)/*FIXME*/),
         .initiate_ready(initiate_ready),
 
         .writeback_valid(writeback_valid),
         `UNUSED_PIN(writeback_wid),
         .writeback_last(writeback_last),
-        .writeback_ready(commit_if.ready)
+        .writeback_ready(writeback_ready)
     );
 
     wire [`NUM_THREADS-1:0][`XLEN-1:0] wb_data = '0;
 
-    assign commit_if.valid = writeback_valid;
-    assign commit_if.data.uuid   = execute_if_data_uuid[0];
-    assign commit_if.data.wid    = execute_if_data_wid[0];
-    assign commit_if.data.tmask  = execute_if_data_tmask[0];
-    assign commit_if.data.PC     = execute_if_data_PC[0];
-    assign commit_if.data.wb     = writeback_last;
-    // custom rd
-    assign commit_if.data.rd     = (`NR_BITS'(`NUM_IREGS) + `NR_BITS'(4'd3/*FIXME*/));
-    assign commit_if.data.data   = wb_data;
-    assign commit_if.data.tensor = writeback_last;
-    assign commit_if.data.pid    = 1'b0;
-    assign commit_if.data.sop    = 1'b1;
-    // eop is deliberately set so that we don't underflow the pending_instr
-    // buffer in VX_schedule.  An instruction is considered committed only
-    // when the eop bit is set to one (see VX_commit).
-    assign commit_if.data.eop    = writeback_last;
+    always @(*) begin
+        metadata_deq = 1'b0;
+
+        // if there's something in the meta queue, give it priority for commit,
+        // since all HGMMA instructions are asynchronous and should not
+        // block
+        if (metadata_valid) begin
+            // block tensor core writeback
+            writeback_ready = 1'b0;
+
+            commit_if.valid       = metadata_valid;
+            commit_if.data.uuid   = execute_if_data_uuid[0];
+            commit_if.data.wid    = execute_if_data_wid[0];
+            commit_if.data.tmask  = execute_if_data_tmask[0];
+            commit_if.data.PC     = execute_if_data_PC[0];
+            commit_if.data.wb     = execute_if_data_wb[0];
+            commit_if.data.rd     = execute_if_data_rd[0];
+            commit_if.data.data   = wb_data; // FIXME ?
+            commit_if.data.tensor = 1'b0;
+            commit_if.data.pid    = 1'b0;
+            commit_if.data.sop    = 1'b1;
+            commit_if.data.eop    = 1'b1;
+
+            // block meta queue until tensor core is ready.  This will
+            // effectively stall further issue of async HGMMA when tensor core
+            // is busy with too many outstanding requests (depth of meta queue).
+            // Be careful not to miss the commit backpressure.
+            metadata_deq = metadata_valid && commit_if.ready && initiate_ready;
+        end else begin
+            // allow tensor core writeback, provided there's no commit
+            // backpressure
+            writeback_ready = commit_if.ready;
+
+            commit_if.valid       = writeback_valid;
+            commit_if.data.uuid   = '0;
+            commit_if.data.wid    = '0; // FIXME
+            commit_if.data.tmask  = {NUM_LANES{1'b1}};
+            commit_if.data.PC     = '0;
+            commit_if.data.wb     = writeback_last;
+            commit_if.data.rd     = (`NR_BITS'(`NUM_IREGS) + `NR_BITS'(4'd3/*FIXME*/));
+            commit_if.data.data   = wb_data;
+            // mark as "ghost" commit.  This prevents the commit from
+            // decrementing the pending_instr buffer
+            commit_if.data.tensor = 1'b1;
+            // eop is deliberately set so that we don't underflow the pending_instr
+            // buffer in VX_schedule.  An instruction is considered committed only
+            // when the eop bit is set to one (see VX_commit).
+            // only the last ghost commit has eop set, which triggers the
+            // scoreboard to clear the busy bit.
+            commit_if.data.eop    = writeback_last;
+        end
+    end
+
 endmodule