From 42b9d23f832d2192231e66dd55a012193ecdc860 Mon Sep 17 00:00:00 2001
From: Hansung Kim <hansung_kim@berkeley.edu>
Date: Fri, 11 Oct 2024 17:27:51 -0700
Subject: [PATCH] tensor: Write release logic for hgmma

Upon completion of an op, VX_tensor_hopper_core sends a "ghost" commit
signal down the pipeline with the `wb` and `tensor` bits set in
commit_if.  The scoreboard receives this signal via writeback_if and
resets the inuse_tensor status bit back to zero, which unblocks the
HGMMA_WAIT instruction.
---
 hw/rtl/core/VX_scoreboard.sv         |  7 +++-
 hw/rtl/core/VX_tensor_hopper_core.sv | 56 +++++++++++++---------------
 2 files changed, 30 insertions(+), 33 deletions(-)

diff --git a/hw/rtl/core/VX_scoreboard.sv b/hw/rtl/core/VX_scoreboard.sv
index 59886098..2a39c058 100644
--- a/hw/rtl/core/VX_scoreboard.sv
+++ b/hw/rtl/core/VX_scoreboard.sv
@@ -146,7 +146,7 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
         // have an explicit destination register, use a separate status bit.
         reg [`UP(ISSUE_RATIO)-1:0] inuse_tensor;
 
-        wire hgmma_start = (ibuffer_if[i].data.ex_type == `EX_TENSOR) &&
+        wire hgmma_start = (ibuffer_if[i].data.ex_type == `EX_BITS'(`EX_TENSOR)) &&
             (ibuffer_if[i].data.op_type == `INST_TENSOR_HGMMA);
 
         wire writeback_fire = writeback_if[i].valid && writeback_if[i].data.eop;
@@ -213,7 +213,7 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
         wire [3:0] operands_busy = {inuse_rd, inuse_rs1, inuse_rs2, inuse_rs3};
     `ifdef EXT_T_HOPPER
         wire hgmma_wait = ibuffer_if[i].valid &&
-            (ibuffer_if[i].data.ex_type == `EX_TENSOR) &&
+            (ibuffer_if[i].data.ex_type == `EX_BITS'(`EX_TENSOR)) &&
             (ibuffer_if[i].data.op_type == `INST_TENSOR_HGMMA_WAIT);
         wire hgmma_ready = ~(hgmma_wait && inuse_tensor[ibuffer_if[i].data.wis]);
         wire operands_ready = (~(| operands_busy)) && hgmma_ready;
@@ -250,6 +250,9 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
                     inuse_regs[ibuffer_if[i].data.wis][ibuffer_if[i].data.rd] <= 1;
                 end
             `ifdef EXT_T_HOPPER
+                if (writeback_fire && writeback_if[i].data.tensor) begin // tensor-flagged writeback: release
+                    inuse_tensor[writeback_if[i].data.wis] <= 1'b0; // index by the committing warp, not the ibuffer head
+                end
                 if (ibuffer_if[i].valid && ibuffer_if[i].ready && hgmma_start) begin
                     inuse_tensor[ibuffer_if[i].data.wis] <= 1'b1;
                 end
diff --git a/hw/rtl/core/VX_tensor_hopper_core.sv b/hw/rtl/core/VX_tensor_hopper_core.sv
index 2ecbea70..dc763d48 100644
--- a/hw/rtl/core/VX_tensor_hopper_core.sv
+++ b/hw/rtl/core/VX_tensor_hopper_core.sv
@@ -11,6 +11,7 @@ module VX_tensor_hopper_core_block import VX_gpu_pkg::*; #(
     VX_execute_if.slave execute_if,
     VX_commit_if.master commit_if
 );
+    localparam NUM_LANES = `NUM_THREADS;
     localparam METADATA_QUEUE_DEPTH = 16; // FIXME: arbitrary
 
     /* commit_if.data_t parts that we need to keep around:
@@ -21,22 +22,17 @@ module VX_tensor_hopper_core_block import VX_gpu_pkg::*; #(
         - wb
         - rd
     */
-
     localparam DATAW = `UUID_WIDTH + `NW_WIDTH + `NUM_THREADS + `XLEN + 1 + `NR_BITS;
 
     wire operand_enq_fire = execute_if.valid && execute_if.ready;
     wire commit_if_fire = commit_if.valid && commit_if.ready;
-    wire [DATAW-1:0] execute_if_data_enq = {
-        execute_if.data.uuid,
-        execute_if.data.wid,
-        execute_if.data.tmask,
-        execute_if.data.PC,
-        execute_if.data.wb,
-        execute_if.data.rd
-        // pid/sop/eop set later
-    };
 
-    wire [`NUM_WARPS-1:0][DATAW-1:0] execute_if_data_deq;
+    wire [`NUM_WARPS-1:0][`UUID_WIDTH-1:0] execute_if_data_uuid;
+    wire [`NUM_WARPS-1:0][`NW_WIDTH-1:0]   execute_if_data_wid;
+    wire [`NUM_WARPS-1:0][NUM_LANES-1:0]   execute_if_data_tmask;
+    wire [`NUM_WARPS-1:0][`XLEN-1:0]       execute_if_data_PC;
+    wire [`NUM_WARPS-1:0]                  execute_if_data_wb;
+    wire [`NUM_WARPS-1:0][`NR_BITS-1:0]    execute_if_data_rd;
     logic [DATAW-1:0] execute_if_data_new_rd;
 
     wire [`NUM_WARPS-1:0] metadata_queue_fulls;
@@ -71,8 +67,12 @@ module VX_tensor_hopper_core_block import VX_gpu_pkg::*; #(
             .reset(reset),
             .push(enq),
             .pop(deq),
-            .data_in(execute_if_data_enq),
-            .data_out(execute_if_data_deq[i]),
+            .data_in({execute_if.data.uuid,  execute_if.data.wid,
+                      execute_if.data.tmask, execute_if.data.PC,
+                      execute_if.data.wb,    execute_if.data.rd}),
+            .data_out({execute_if_data_uuid[i],  execute_if_data_wid[i],
+                       execute_if_data_tmask[i], execute_if_data_PC[i],
+                       execute_if_data_wb[i],    execute_if_data_rd[i]}),
             .empty(metadata_queue_emptys[i]),
             `UNUSED_PIN(alm_empty),
             .full(metadata_queue_fulls[i]),
@@ -108,11 +108,6 @@ module VX_tensor_hopper_core_block import VX_gpu_pkg::*; #(
         if ((state != STATE_IDLE) && (state_n == STATE_IDLE)) begin
             metadata_deq = 1'b1;
         end
-
-        // change rd of the commit data according to state
-        execute_if_data_new_rd =
-            {execute_if_data_deq[0/*FIXME*/][DATAW-1:`NR_BITS],
-             (`NR_BITS'(`NUM_IREGS) + `NR_BITS'(state))};
     end
 
     always @(posedge clk) begin
@@ -128,19 +123,18 @@ module VX_tensor_hopper_core_block import VX_gpu_pkg::*; #(
 
     wire [`NUM_THREADS-1:0][`XLEN-1:0] wb_data = '0;
 
-    localparam COMMIT_DATAW = `UUID_WIDTH + `NW_WIDTH + `NUM_THREADS + `XLEN + 1 + `NR_BITS + (`NUM_THREADS * `XLEN) + 1 + 1 + 1 + 1;
-    wire [COMMIT_DATAW-1:0] commit_if_data = {
-        // write-back to the correct rd only when eop
-        ((state == 2'b11) ? execute_if_data_deq[0/*FIXME*/] : execute_if_data_new_rd), /* uuid ~ rd */
-        wb_data, /* data */
-        1'b0, /* tensor */
-        1'b0, /* pid */
-        1'b1, /* sop */
-        (state == 2'b11)  /* eop */
-        // 1'b1  /* eop */
-    };
-
-    assign commit_if.data = commit_if_data;
+    assign commit_if.data.uuid   = execute_if_data_uuid[0];
+    assign commit_if.data.wid    = execute_if_data_wid[0];
+    assign commit_if.data.tmask  = execute_if_data_tmask[0];
+    assign commit_if.data.PC     = execute_if_data_PC[0];
+    assign commit_if.data.wb     = (state == 2'b11);
+    // custom rd
+    assign commit_if.data.rd     = (`NR_BITS'(`NUM_IREGS) + `NR_BITS'(state));
+    assign commit_if.data.data   = wb_data;
+    assign commit_if.data.tensor = (state == 2'b11);
+    assign commit_if.data.pid    = 1'b0;
+    assign commit_if.data.sop    = 1'b1;
+    assign commit_if.data.eop    = (state == 2'b11);
 endmodule
 
 `endif