From 9f9ec109604ad6d21c366015d74538cd318c987a Mon Sep 17 00:00:00 2001
From: Hansung Kim <hansung_kim@berkeley.edu>
Date: Wed, 8 May 2024 11:26:09 -0700
Subject: [PATCH 01/31] tensor: Enable scaling NUM_THREADS by octets

TODO: the lane-to-octet mapping is arbitrary at the moment.
---
 hw/rtl/core/VX_tensor_core.sv         | 38 +++++++++++----------
 hw/rtl/core/VX_tensor_ucode_8lanes.vh | 49 +++++++++++++++++++++++++++
 2 files changed, 70 insertions(+), 17 deletions(-)
 create mode 100644 hw/rtl/core/VX_tensor_ucode_8lanes.vh

diff --git a/hw/rtl/core/VX_tensor_core.sv b/hw/rtl/core/VX_tensor_core.sv
index 9971d619..71ed8538 100644
--- a/hw/rtl/core/VX_tensor_core.sv
+++ b/hw/rtl/core/VX_tensor_core.sv
@@ -10,8 +10,6 @@ module VX_tensor_core #(
     VX_dispatch_if.slave dispatch_if [`ISSUE_WIDTH],
     VX_commit_if.master commit_if [`ISSUE_WIDTH]
 );
-    `STATIC_ASSERT(`NUM_THREADS == 32, ("tensor core requires # of threads in a warp to be 32 (try running w/ CONFIGS=\"-DNUM_THREADS=32\")"));
-    
     for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin
         VX_tensor_core_warp #(
             .ISW(i)
@@ -35,29 +33,35 @@ module VX_tensor_core_warp import VX_gpu_pkg::*; #(
     VX_dispatch_if.slave dispatch_if,
     VX_commit_if.master commit_if
 );
+    localparam NUM_OCTETS = (`NUM_THREADS / 8);
+    // offet in the lane numbers that get mapped to the two threadgroups in an
+    // octet. E.g. two tgs map lane 0-3 and lane 16-19 -> 16
+    // FIXME: not sure this is the right logic.  just filling in what works
+    localparam LANE_OFFSET_THREADGROUP = (4 * NUM_OCTETS);
+
     wire [1:0] step = 2'(dispatch_if.data.op_type);
-    logic [3:0] octet_results_valid;
-    logic [3:0] octet_results_ready;
-    logic [3:0] octet_operands_ready;
+    logic [NUM_OCTETS-1:0] octet_results_valid;
+    logic [NUM_OCTETS-1:0] octet_results_ready;
+    logic [NUM_OCTETS-1:0] octet_operands_ready;
     logic [`NUM_THREADS-1:0][`XLEN-1:0] wb_data_0;
     logic [`NUM_THREADS-1:0][`XLEN-1:0] wb_data_1;
     
     assign dispatch_if.ready = &octet_operands_ready;
 
 `ifdef EXT_T_ENABLE
-    for (genvar i = 0; i < 4/*octets*/; ++i) begin
+    for (genvar i = 0; i < NUM_OCTETS; ++i) begin
 `else
     for (genvar i = 0; i < 0; ++i) begin
 `endif
         // lane-to-octet mapping; see figure 13 of the paper
         wire [7:0][31:0] octet_A = {
-            dispatch_if.data.rs1_data[16+4*i +: 4], dispatch_if.data.rs1_data[4*i +: 4]
+            dispatch_if.data.rs1_data[LANE_OFFSET_THREADGROUP + 4*i +: 4], dispatch_if.data.rs1_data[4*i +: 4]
         };
         wire [7:0][31:0] octet_B = {
-            dispatch_if.data.rs2_data[16+4*i +: 4], dispatch_if.data.rs2_data[4*i +: 4]
+            dispatch_if.data.rs2_data[LANE_OFFSET_THREADGROUP + 4*i +: 4], dispatch_if.data.rs2_data[4*i +: 4]
         };
         wire [7:0][31:0] octet_C = {
-            dispatch_if.data.rs3_data[16+4*i +: 4], dispatch_if.data.rs3_data[4*i +: 4]
+            dispatch_if.data.rs3_data[LANE_OFFSET_THREADGROUP + 4*i +: 4], dispatch_if.data.rs3_data[4*i +: 4]
         };
 
         logic [3:0][3:0][31:0] octet_D;
@@ -100,15 +104,15 @@ module VX_tensor_core_warp import VX_gpu_pkg::*; #(
         assign wb_data_1[4*i+2] = octet_D[0][3];
         assign wb_data_1[4*i+3] = octet_D[1][3];
 
-        assign wb_data_0[4*i+16+0] = octet_D[2][0];
-        assign wb_data_0[4*i+16+1] = octet_D[3][0];
-        assign wb_data_0[4*i+16+2] = octet_D[2][2];
-        assign wb_data_0[4*i+16+3] = octet_D[3][2];
+        assign wb_data_0[4*i+LANE_OFFSET_THREADGROUP+0] = octet_D[2][0];
+        assign wb_data_0[4*i+LANE_OFFSET_THREADGROUP+1] = octet_D[3][0];
+        assign wb_data_0[4*i+LANE_OFFSET_THREADGROUP+2] = octet_D[2][2];
+        assign wb_data_0[4*i+LANE_OFFSET_THREADGROUP+3] = octet_D[3][2];
 
-        assign wb_data_1[4*i+16+0] = octet_D[2][1];
-        assign wb_data_1[4*i+16+1] = octet_D[3][1];
-        assign wb_data_1[4*i+16+2] = octet_D[2][3];
-        assign wb_data_1[4*i+16+3] = octet_D[3][3];
+        assign wb_data_1[4*i+LANE_OFFSET_THREADGROUP+0] = octet_D[2][1];
+        assign wb_data_1[4*i+LANE_OFFSET_THREADGROUP+1] = octet_D[3][1];
+        assign wb_data_1[4*i+LANE_OFFSET_THREADGROUP+2] = octet_D[2][3];
+        assign wb_data_1[4*i+LANE_OFFSET_THREADGROUP+3] = octet_D[3][3];
     end
     
     /* commit_if.data_t parts that we need to keep around:
diff --git a/hw/rtl/core/VX_tensor_ucode_8lanes.vh b/hw/rtl/core/VX_tensor_ucode_8lanes.vh
new file mode 100644
index 00000000..41ec857e
--- /dev/null
+++ b/hw/rtl/core/VX_tensor_ucode_8lanes.vh
@@ -0,0 +1,49 @@
+// uop metadata (sequencing, next state), execution metadata (EX_TYPE, OP_TYPE, OP_MOD), wb, use pc, use imm, pc, imm, rd, rs1, rs2, rs3
+HMMA_SET0_STEP0_0: begin 
+	uop = {NEXT, HMMA_SET0_STEP0_1, `EX_BITS'(`EX_TENSOR), `INST_OP_BITS'(0), `INST_MOD_BITS'(0), 1'b1, 1'b0, 1'b0, 32'b0, 32'b0, `FREG(16), `FREG(0), `FREG(8), `FREG(16)}; 
+end 
+HMMA_SET0_STEP0_1: begin 
+	uop = {NEXT, HMMA_SET0_STEP1_0, `EX_BITS'(`EX_TENSOR), `INST_OP_BITS'(0), `INST_MOD_BITS'(1), 1'b1, 1'b0, 1'b0, 32'b0, 32'b0, `FREG(17), `FREG(1), `FREG(9), `FREG(17)}; 
+end 
+HMMA_SET0_STEP1_0: begin 
+	uop = {NEXT, HMMA_SET0_STEP1_1, `EX_BITS'(`EX_TENSOR), `INST_OP_BITS'(1), `INST_MOD_BITS'(0), 1'b1, 1'b0, 1'b0, 32'b0, 32'b0, `FREG(18), `FREG(0), `FREG(8), `FREG(18)}; 
+end 
+HMMA_SET0_STEP1_1: begin 
+	uop = {NEXT, HMMA_SET0_STEP2_0, `EX_BITS'(`EX_TENSOR), `INST_OP_BITS'(1), `INST_MOD_BITS'(1), 1'b1, 1'b0, 1'b0, 32'b0, 32'b0, `FREG(19), `FREG(1), `FREG(9), `FREG(19)}; 
+end 
+HMMA_SET0_STEP2_0: begin 
+	uop = {NEXT, HMMA_SET0_STEP2_1, `EX_BITS'(`EX_TENSOR), `INST_OP_BITS'(2), `INST_MOD_BITS'(0), 1'b1, 1'b0, 1'b0, 32'b0, 32'b0, `FREG(20), `FREG(0), `FREG(8), `FREG(20)}; 
+end 
+HMMA_SET0_STEP2_1: begin 
+	uop = {NEXT, HMMA_SET0_STEP3_0, `EX_BITS'(`EX_TENSOR), `INST_OP_BITS'(2), `INST_MOD_BITS'(1), 1'b1, 1'b0, 1'b0, 32'b0, 32'b0, `FREG(21), `FREG(1), `FREG(9), `FREG(21)}; 
+end 
+HMMA_SET0_STEP3_0: begin 
+	uop = {NEXT, HMMA_SET0_STEP3_1, `EX_BITS'(`EX_TENSOR), `INST_OP_BITS'(3), `INST_MOD_BITS'(0), 1'b1, 1'b0, 1'b0, 32'b0, 32'b0, `FREG(22), `FREG(0), `FREG(8), `FREG(22)}; 
+end 
+HMMA_SET0_STEP3_1: begin 
+	uop = {NEXT, HMMA_SET1_STEP0_0, `EX_BITS'(`EX_TENSOR), `INST_OP_BITS'(3), `INST_MOD_BITS'(1), 1'b1, 1'b0, 1'b0, 32'b0, 32'b0, `FREG(23), `FREG(1), `FREG(9), `FREG(23)}; 
+end 
+HMMA_SET1_STEP0_0: begin 
+	uop = {NEXT, HMMA_SET1_STEP0_1, `EX_BITS'(`EX_TENSOR), `INST_OP_BITS'(0), `INST_MOD_BITS'(0), 1'b1, 1'b0, 1'b0, 32'b0, 32'b0, `FREG(16), `FREG(2), `FREG(10), `FREG(16)}; 
+end 
+HMMA_SET1_STEP0_1: begin 
+	uop = {NEXT, HMMA_SET1_STEP1_0, `EX_BITS'(`EX_TENSOR), `INST_OP_BITS'(0), `INST_MOD_BITS'(1), 1'b1, 1'b0, 1'b0, 32'b0, 32'b0, `FREG(17), `FREG(3), `FREG(11), `FREG(17)}; 
+end 
+HMMA_SET1_STEP1_0: begin 
+	uop = {NEXT, HMMA_SET1_STEP1_1, `EX_BITS'(`EX_TENSOR), `INST_OP_BITS'(1), `INST_MOD_BITS'(0), 1'b1, 1'b0, 1'b0, 32'b0, 32'b0, `FREG(18), `FREG(2), `FREG(10), `FREG(18)}; 
+end 
+HMMA_SET1_STEP1_1: begin 
+	uop = {NEXT, HMMA_SET1_STEP2_0, `EX_BITS'(`EX_TENSOR), `INST_OP_BITS'(1), `INST_MOD_BITS'(1), 1'b1, 1'b0, 1'b0, 32'b0, 32'b0, `FREG(19), `FREG(3), `FREG(11), `FREG(19)}; 
+end 
+HMMA_SET1_STEP2_0: begin 
+	uop = {NEXT, HMMA_SET1_STEP2_1, `EX_BITS'(`EX_TENSOR), `INST_OP_BITS'(2), `INST_MOD_BITS'(0), 1'b1, 1'b0, 1'b0, 32'b0, 32'b0, `FREG(20), `FREG(2), `FREG(10), `FREG(20)}; 
+end 
+HMMA_SET1_STEP2_1: begin 
+	uop = {NEXT, HMMA_SET1_STEP3_0, `EX_BITS'(`EX_TENSOR), `INST_OP_BITS'(2), `INST_MOD_BITS'(1), 1'b1, 1'b0, 1'b0, 32'b0, 32'b0, `FREG(21), `FREG(3), `FREG(11), `FREG(21)}; 
+end 
+HMMA_SET1_STEP3_0: begin 
+	uop = {NEXT, HMMA_SET1_STEP3_1, `EX_BITS'(`EX_TENSOR), `INST_OP_BITS'(3), `INST_MOD_BITS'(0), 1'b1, 1'b0, 1'b0, 32'b0, 32'b0, `FREG(22), `FREG(2), `FREG(10), `FREG(22)}; 
+end 
+HMMA_SET1_STEP3_1: begin 
+	uop = {FINISH, HMMA_SET0_STEP0_0, `EX_BITS'(`EX_TENSOR), `INST_OP_BITS'(3), `INST_MOD_BITS'(1), 1'b1, 1'b0, 1'b0, 32'b1, 32'b1, `FREG(23), `FREG(3), `FREG(11), `FREG(23)}; 
+end 

From 1a1094b2bb6b6b986ca4add4deb731e1cd6e5a1c Mon Sep 17 00:00:00 2001
From: Hansung Kim <hansung_kim@berkeley.edu>
Date: Wed, 15 May 2024 15:34:26 -0700
Subject: [PATCH 02/31] tensor: Add dispatch unit to narrow to BLOCK_SIZE=1

---
 hw/rtl/core/VX_tensor_core.sv | 94 +++++++++++++++++++++++++----------
 1 file changed, 69 insertions(+), 25 deletions(-)

diff --git a/hw/rtl/core/VX_tensor_core.sv b/hw/rtl/core/VX_tensor_core.sv
index 71ed8538..14d8175b 100644
--- a/hw/rtl/core/VX_tensor_core.sv
+++ b/hw/rtl/core/VX_tensor_core.sv
@@ -1,7 +1,7 @@
 `ifdef EXT_T_ENABLE
 `include "VX_fpu_define.vh"
 
-module VX_tensor_core #(
+module VX_tensor_core import VX_gpu_pkg::*; #(
 
 ) (
     input clk,
@@ -10,15 +10,54 @@ module VX_tensor_core #(
     VX_dispatch_if.slave dispatch_if [`ISSUE_WIDTH],
     VX_commit_if.master commit_if [`ISSUE_WIDTH]
 );
-    for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin
+    localparam BLOCK_SIZE = 1;
+    localparam NUM_LANES  = `NUM_THREADS;
+    // localparam PARTIAL_BW = (BLOCK_SIZE != `ISSUE_WIDTH) || (NUM_LANES != `NUM_THREADS);
+    localparam PARTIAL_BW = 1;
+
+    VX_execute_if #(
+        .NUM_LANES (NUM_LANES)
+    ) execute_if[BLOCK_SIZE]();
+
+    `RESET_RELAY (dispatch_reset, reset);
+
+    VX_dispatch_unit #(
+        .BLOCK_SIZE (BLOCK_SIZE),
+        .NUM_LANES  (NUM_LANES),
+        .OUT_REG    (PARTIAL_BW ? 1 : 0)
+    ) dispatch_unit (
+        .clk        (clk),
+        .reset      (dispatch_reset),
+        .dispatch_if(dispatch_if),
+        .execute_if (execute_if)
+    );
+
+    VX_commit_if #(
+        .NUM_LANES (NUM_LANES)
+    ) commit_block_if[BLOCK_SIZE]();
+
+    `RESET_RELAY (commit_reset, reset);
+
+    VX_gather_unit #(
+        .BLOCK_SIZE (BLOCK_SIZE),
+        .NUM_LANES  (NUM_LANES),
+        .OUT_REG    (PARTIAL_BW ? 3 : 0) // FIXME: why 3?
+    ) gather_unit (
+        .clk           (clk),
+        .reset         (commit_reset),
+        .commit_in_if  (commit_block_if),
+        .commit_out_if (commit_if)
+    );
+
+    for (genvar block_idx = 0; block_idx < BLOCK_SIZE; ++block_idx) begin
         VX_tensor_core_warp #(
-            .ISW(i)
+            .ISW(1) // FIXME: not block_idx
         ) tensor_core (
             .clk(clk),
             .reset(reset),
 
-            .dispatch_if(dispatch_if[i]),
-            .commit_if(commit_if[i])
+            .execute_if(execute_if[block_idx]),
+            .commit_if(commit_block_if[block_idx])
         );
     end
     
@@ -30,7 +69,7 @@ module VX_tensor_core_warp import VX_gpu_pkg::*; #(
     input clk,
     input reset,
 
-    VX_dispatch_if.slave dispatch_if,
+    VX_execute_if.slave execute_if,
     VX_commit_if.master commit_if
 );
     localparam NUM_OCTETS = (`NUM_THREADS / 8);
@@ -39,14 +78,15 @@ module VX_tensor_core_warp import VX_gpu_pkg::*; #(
     // FIXME: not sure this is the right logic.  just filling in what works
     localparam LANE_OFFSET_THREADGROUP = (4 * NUM_OCTETS);
 
-    wire [1:0] step = 2'(dispatch_if.data.op_type);
+    wire [1:0] step = 2'(execute_if.data.op_type);
     logic [NUM_OCTETS-1:0] octet_results_valid;
     logic [NUM_OCTETS-1:0] octet_results_ready;
     logic [NUM_OCTETS-1:0] octet_operands_ready;
+    // FIXME: should be NUM_LANES?
     logic [`NUM_THREADS-1:0][`XLEN-1:0] wb_data_0;
     logic [`NUM_THREADS-1:0][`XLEN-1:0] wb_data_1;
     
-    assign dispatch_if.ready = &octet_operands_ready;
+    assign execute_if.ready = &octet_operands_ready;
 
 `ifdef EXT_T_ENABLE
     for (genvar i = 0; i < NUM_OCTETS; ++i) begin
@@ -55,13 +95,13 @@ module VX_tensor_core_warp import VX_gpu_pkg::*; #(
 `endif
         // lane-to-octet mapping; see figure 13 of the paper
         wire [7:0][31:0] octet_A = {
-            dispatch_if.data.rs1_data[LANE_OFFSET_THREADGROUP + 4*i +: 4], dispatch_if.data.rs1_data[4*i +: 4]
+            execute_if.data.rs1_data[LANE_OFFSET_THREADGROUP + 4*i +: 4], execute_if.data.rs1_data[4*i +: 4]
         };
         wire [7:0][31:0] octet_B = {
-            dispatch_if.data.rs2_data[LANE_OFFSET_THREADGROUP + 4*i +: 4], dispatch_if.data.rs2_data[4*i +: 4]
+            execute_if.data.rs2_data[LANE_OFFSET_THREADGROUP + 4*i +: 4], execute_if.data.rs2_data[4*i +: 4]
         };
         wire [7:0][31:0] octet_C = {
-            dispatch_if.data.rs3_data[LANE_OFFSET_THREADGROUP + 4*i +: 4], dispatch_if.data.rs3_data[4*i +: 4]
+            execute_if.data.rs3_data[LANE_OFFSET_THREADGROUP + 4*i +: 4], execute_if.data.rs3_data[4*i +: 4]
         };
 
         logic [3:0][3:0][31:0] octet_D;
@@ -77,7 +117,7 @@ module VX_tensor_core_warp import VX_gpu_pkg::*; #(
             .A_in(octet_A),
             .B_in(octet_B),
             .C_in(octet_C),
-            .operands_valid(dispatch_if.valid),
+            .operands_valid(execute_if.valid),
             .operands_ready(octet_operands_ready[i]),
 
             .step(step),
@@ -126,18 +166,18 @@ module VX_tensor_core_warp import VX_gpu_pkg::*; #(
 
     localparam DATAW = `UUID_WIDTH + `NW_WIDTH + `NUM_THREADS + `XLEN + 1 + `NR_BITS;
     
-    wire dispatch_if_fire = dispatch_if.valid && dispatch_if.ready;
+    wire execute_if_fire = execute_if.valid && execute_if.ready;
     wire commit_if_fire = commit_if.valid && commit_if.ready;
-    wire [DATAW-1:0] dispatch_if_data_enq = {
-        dispatch_if.data.uuid, 
-        wis_to_wid(dispatch_if.data.wis, ISW), 
-        dispatch_if.data.tmask, 
-        dispatch_if.data.PC, 
-        dispatch_if.data.wb, 
-        dispatch_if.data.rd
+    wire [DATAW-1:0] execute_if_data_enq = {
+        execute_if.data.uuid, 
+        execute_if.data.wid,
+        execute_if.data.tmask, 
+        execute_if.data.PC, 
+        execute_if.data.wb, 
+        execute_if.data.rd
     };
 
-    wire [DATAW-1:0] dispatch_if_data_deq;
+    wire [DATAW-1:0] execute_if_data_deq;
 
     // this is probably a little oversized
     VX_fifo_queue #(
@@ -146,10 +186,10 @@ module VX_tensor_core_warp import VX_gpu_pkg::*; #(
     ) pending_uops (
         .clk(clk),
         .reset(reset),    
-        .push(dispatch_if_fire),
+        .push(execute_if_fire),
         .pop(commit_if_fire),
-        .data_in(dispatch_if_data_enq),
-        .data_out(dispatch_if_data_deq),
+        .data_in(execute_if_data_enq),
+        .data_out(execute_if_data_deq),
         `UNUSED_PIN(empty),      
         `UNUSED_PIN(alm_empty),
         `UNUSED_PIN(full), // should be impossible to overflow            
@@ -163,7 +203,7 @@ module VX_tensor_core_warp import VX_gpu_pkg::*; #(
 
     localparam COMMIT_DATAW = `UUID_WIDTH + `NW_WIDTH + `NUM_THREADS + `XLEN + 1 + `NR_BITS + (`NUM_THREADS * `XLEN) + 1 + 1 + 1;
     wire [COMMIT_DATAW-1:0] commit_if_data = {
-        dispatch_if_data_deq, /* uuid ~ rd */
+        execute_if_data_deq, /* uuid ~ rd */
         subcommit == 1'b0 ? wb_data_0 : wb_data_1, /* data */
         1'b0, /* pid */
         1'b1, /* sop */
@@ -227,6 +267,10 @@ module VX_tensor_octet #(
         // note that not all lanes participate at every step
         case (step)
             2'b00: begin
+                // Two A_in segments correspond to two 2x2 subtiles of A read
+                // by two threadgroups: [0:2,0:2] and [4:6,0:2] in Step 0 of
+                // Figure 10(b).  B_in OTOH is shared by two threadgroups.
+                // Note k-dimension is shrunk from 4 to 2.
                 A_half = { A_in[5:4], A_in[1:0] };
                 B_half = B_in[3:0];
             end

From 89e7d65926db97f5f4d8a422b39f30ed0609c9e2 Mon Sep 17 00:00:00 2001
From: Hansung Kim <hansung_kim@berkeley.edu>
Date: Thu, 16 May 2024 12:49:15 -0700
Subject: [PATCH 03/31] tensor: Add ready signal to enforce 1 warp occupancy

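The octet's operands_ready does not use the new signal yet (the gating
assignment is left commented out), as the timing behavior is already
approximately accurate without it.

Also lowers the default LATENCY_HMMA from 8 to 2 and makes
dpi_print_results report every 32 steps instead of 64.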
---
 hw/dpi/float_dpi.cpp          |  4 ++--
 hw/rtl/VX_config.vh           |  2 +-
 hw/rtl/core/VX_tensor_core.sv | 14 ++++++++++++++
 hw/rtl/fpu/VX_tensor_dpu.sv   | 13 +++++++++++--
 4 files changed, 28 insertions(+), 5 deletions(-)

diff --git a/hw/dpi/float_dpi.cpp b/hw/dpi/float_dpi.cpp
index 29ca22df..6a810555 100644
--- a/hw/dpi/float_dpi.cpp
+++ b/hw/dpi/float_dpi.cpp
@@ -347,7 +347,7 @@ void dpi_fmax(bool enable, int dst_fmt, int64_t a, int64_t b, int64_t* result, s
 
 // A is M * K, B is K * M, C is M * M, D is M * M
 #define M 4
-#define K 2
+#define K 2 // FIXME: 4x4x1 / cycle / octet!
 
 // all row major
 float c_A_tile[M][K];
@@ -551,7 +551,7 @@ void dpi_print_results(int wid, int octet, const svBitVecVal* A_tile, const svBi
   }
 
   steps[wid] += 1;
-  if (steps[wid] % 64 == 0) {
+  if (steps[wid] % 32 == 0) {
     steps[wid] = 0;
     std::cout << "warp " << wid << " finished wmma\n";
     std::cout << "A tile" << "\n";
diff --git a/hw/rtl/VX_config.vh b/hw/rtl/VX_config.vh
index 65d56e8a..5ef71794 100644
--- a/hw/rtl/VX_config.vh
+++ b/hw/rtl/VX_config.vh
@@ -391,7 +391,7 @@
 
 // Tensor Core Latency
 `ifndef LATENCY_HMMA
-`define LATENCY_HMMA 8
+`define LATENCY_HMMA 2
 `endif
 
 // Icache Configurable Knobs //////////////////////////////////////////////////
diff --git a/hw/rtl/core/VX_tensor_core.sv b/hw/rtl/core/VX_tensor_core.sv
index 14d8175b..185218fc 100644
--- a/hw/rtl/core/VX_tensor_core.sv
+++ b/hw/rtl/core/VX_tensor_core.sv
@@ -32,6 +32,10 @@ module VX_tensor_core import VX_gpu_pkg::*; #(
         .execute_if (execute_if)
     );
 
+    // FIXME: when multiple warps are running, step0_0 from multiple warps can
+    // get interleaved before the first warp advances to step0_1, fucking
+    // everything up
+
     VX_commit_if #(
         .NUM_LANES (NUM_LANES)
     ) commit_block_if[BLOCK_SIZE]();
@@ -175,6 +179,7 @@ module VX_tensor_core_warp import VX_gpu_pkg::*; #(
         execute_if.data.PC, 
         execute_if.data.wb, 
         execute_if.data.rd
+        // pid/sop/eop set later
     };
 
     wire [DATAW-1:0] execute_if_data_deq;
@@ -320,8 +325,16 @@ module VX_tensor_octet #(
         end
     end
 
+    wire hmma_ready;
     wire stall = result_valid && ~result_ready;
+    // backpressure from commit
     assign operands_ready = ~stall;
+    // TODO: Below line is to only allow 1 warp to occupy the octet at a time;
+    // currently, dpu is fully-pipelined and allows concurrency between
+    // multiple warps.  This seems to be not a problem though given that the
+    // RF operand read takes >=2 cycles, which should be the end-to-end
+    // latency of the DPU anyways
+    // assign operands_ready = hmma_ready && ~stall;
 
     // A is 4x2 fp32 matrix
     wire [3:0][1:0][31:0] A_tile = {
@@ -359,6 +372,7 @@ module VX_tensor_octet #(
         .stall(stall),
         
         .valid_in(do_hmma),
+        .ready_in(hmma_ready),
         .A_tile(A_tile),
         .B_tile(B_tile),
         .C_tile(C_tile),
diff --git a/hw/rtl/fpu/VX_tensor_dpu.sv b/hw/rtl/fpu/VX_tensor_dpu.sv
index cfc5f507..63d35ae7 100644
--- a/hw/rtl/fpu/VX_tensor_dpu.sv
+++ b/hw/rtl/fpu/VX_tensor_dpu.sv
@@ -11,6 +11,7 @@ module VX_tensor_dpu #(
     input stall,
 
     input valid_in,
+    output ready_in,
     input [3:0][1:0][31:0] A_tile,
     input [1:0][3:0][31:0] B_tile,
     input [3:0][3:0][31:0] C_tile,
@@ -24,12 +25,20 @@ module VX_tensor_dpu #(
         dpi_hmma(valid_in, A_tile, B_tile, C_tile, result_hmma);
     end
 
+    logic ready_reg;
     always @(posedge clk) begin
-        if (~reset && valid_in) begin
+        if (reset) begin
+            ready_reg <= '1;
+        end else if (valid_in) begin
+            ready_reg <= '0;
             dpi_print_results(int'(ISW), int'(OCTET), A_tile, B_tile, C_tile, result_hmma);
+        end else if (valid_out) begin
+            ready_reg <= '1;
         end
     end
-    
+
+    // ready as soon as valid_out
+    assign ready_in = ready_reg || valid_out;
 
     VX_shift_register #(
         .DATAW  (1 + $bits(D_tile)),

From 317695a8d0b8ca69ad30d6b38236a0d6bfa7f90f Mon Sep 17 00:00:00 2001
From: Hansung Kim <hansung_kim@berkeley.edu>
Date: Thu, 16 May 2024 15:32:46 -0700
Subject: [PATCH 04/31] Add perf counters on LSU resp valid tmasks

---
 hw/rtl/core/VX_lsu_unit.sv | 25 +++++++++++++++++++++++++
 1 file changed, 25 insertions(+)

diff --git a/hw/rtl/core/VX_lsu_unit.sv b/hw/rtl/core/VX_lsu_unit.sv
index b4fd6ee1..63f1d4c6 100644
--- a/hw/rtl/core/VX_lsu_unit.sv
+++ b/hw/rtl/core/VX_lsu_unit.sv
@@ -596,6 +596,31 @@ module VX_lsu_unit import VX_gpu_pkg::*; #(
         .commit_out_if (commit_if)
     );
 
+`ifdef PERF_ENABLE
+    wire [`CLOG2(NUM_LANES+1)-1:0] perf_rsp_tmask_valids_per_cycle;
+    wire [`CLOG2(NUM_LANES+1)-1:0] perf_rsp_tmask_total_per_cycle;
+    reg [`PERF_CTR_BITS-1:0] perf_rsp_tmask_valids;
+    reg [`PERF_CTR_BITS-1:0] perf_rsp_tmask_total;
+    reg [`PERF_CTR_BITS-1:0] perf_rsp_fires;
+
+    `POP_COUNT(perf_rsp_tmask_valids_per_cycle, rsp_tmask);
+    assign perf_rsp_tmask_total_per_cycle = NUM_LANES;
+
+    always @(posedge clk) begin
+        if (reset) begin
+            perf_rsp_tmask_valids <= '0;
+            perf_rsp_tmask_total  <= '0;
+            perf_rsp_fires        <= '0;
+        end else begin
+            if (mem_rsp_fire) begin
+                perf_rsp_tmask_valids <= perf_rsp_tmask_valids + perf_rsp_tmask_valids_per_cycle;
+                perf_rsp_tmask_total  <= perf_rsp_tmask_total  + perf_rsp_tmask_total_per_cycle;
+                perf_rsp_fires        <= perf_rsp_fires + 1'b1;
+            end
+        end
+    end
+`endif
+
 `ifdef DBG_SCOPE_LSU
     if (CORE_ID == 0) begin
     `ifdef SCOPE

From 5034d8d14b548fd42bf983666ac1ad0d1d00c091 Mon Sep 17 00:00:00 2001
From: Hansung Kim <hansung_kim@berkeley.edu>
Date: Thu, 16 May 2024 20:07:30 -0700
Subject: [PATCH 05/31] tensor: Add buffer to hide 2cyc commit latency

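Since operands arrive and results commit at the same rate (one every 2
cycles), it is unnecessary to stall the dpu during the multi-cycle
commit.  A stream buffer now stages the result tile instead, which
enables the dpu to operate at its full throughput of 1 operand every 2
cycles.

Also restores the default LATENCY_HMMA to 8.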
---
 hw/rtl/VX_config.vh           |  2 +-
 hw/rtl/core/VX_tensor_core.sv | 26 +++++++++++++++++++++++---
 hw/rtl/fpu/VX_tensor_dpu.sv   |  1 +
 3 files changed, 25 insertions(+), 4 deletions(-)

diff --git a/hw/rtl/VX_config.vh b/hw/rtl/VX_config.vh
index 5ef71794..65d56e8a 100644
--- a/hw/rtl/VX_config.vh
+++ b/hw/rtl/VX_config.vh
@@ -391,7 +391,7 @@
 
 // Tensor Core Latency
 `ifndef LATENCY_HMMA
-`define LATENCY_HMMA 2
+`define LATENCY_HMMA 8
 `endif
 
 // Icache Configurable Knobs //////////////////////////////////////////////////
diff --git a/hw/rtl/core/VX_tensor_core.sv b/hw/rtl/core/VX_tensor_core.sv
index 185218fc..29bfb98c 100644
--- a/hw/rtl/core/VX_tensor_core.sv
+++ b/hw/rtl/core/VX_tensor_core.sv
@@ -326,8 +326,10 @@ module VX_tensor_octet #(
     end
 
     wire hmma_ready;
-    wire stall = result_valid && ~result_ready;
+    wire outbuf_ready_in;
+    // wire stall = result_valid && ~result_ready;
     // backpressure from commit
+    wire stall = ~outbuf_ready_in;
     assign operands_ready = ~stall;
     // TODO: Below line is to only allow 1 warp to occupy the octet at a time;
     // currently, dpu is fully-pipelined and allows concurrency between
@@ -349,6 +351,7 @@ module VX_tensor_octet #(
     };
     // C is 4x4 fp32 matrix
     logic [3:0][3:0][31:0] C_tile;
+    logic [3:0][3:0][31:0] D_tile;
     
     always @(*) begin
         C_tile = {
@@ -360,6 +363,7 @@ module VX_tensor_octet #(
     end 
 
     wire do_hmma = (substep == 1'b1 && operands_valid && operands_ready);
+    wire dpu_valid;
 
     // this does (m,n,k)=(4,4,2) matmul, modeling compute of a single octet
     VX_tensor_dpu #(
@@ -377,8 +381,24 @@ module VX_tensor_octet #(
         .B_tile(B_tile),
         .C_tile(C_tile),
 
-        .valid_out(result_valid),
-        .D_tile(D_out)
+        .valid_out(dpu_valid),
+        .D_tile(D_tile)
+    );
+
+    // buffer to stage the result tile for 2 cycles until commit/writeback is
+    // complete
+    VX_stream_buffer #(
+        .DATAW   ($bits(D_out)),
+        .OUT_REG (1) // not sure this is necessary
+    ) output_buffer (
+        .clk (clk),
+        .reset (reset),
+        .valid_in  (dpu_valid),
+        .ready_in  (outbuf_ready_in),
+        .data_in   (D_tile),
+        .data_out  (D_out),
+        .ready_out (result_ready),
+        .valid_out (result_valid)
     );
 endmodule
 `endif
diff --git a/hw/rtl/fpu/VX_tensor_dpu.sv b/hw/rtl/fpu/VX_tensor_dpu.sv
index 63d35ae7..4130fb98 100644
--- a/hw/rtl/fpu/VX_tensor_dpu.sv
+++ b/hw/rtl/fpu/VX_tensor_dpu.sv
@@ -40,6 +40,7 @@ module VX_tensor_dpu #(
     // ready as soon as valid_out
     assign ready_in = ready_reg || valid_out;
 
+    // fixed-latency model
     VX_shift_register #(
         .DATAW  (1 + $bits(D_tile)),
         .DEPTH  (`LATENCY_HMMA),

From 45d86b26a2d32f6fdec33ec9a9be5df1f850f057 Mon Sep 17 00:00:00 2001
From: Hansung Kim <hansung_kim@berkeley.edu>
Date: Thu, 16 May 2024 22:15:01 -0700
Subject: [PATCH 06/31] tensor: Add counter for dpu operations

---
 hw/rtl/core/VX_tensor_core.sv | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/hw/rtl/core/VX_tensor_core.sv b/hw/rtl/core/VX_tensor_core.sv
index 29bfb98c..e37f5016 100644
--- a/hw/rtl/core/VX_tensor_core.sv
+++ b/hw/rtl/core/VX_tensor_core.sv
@@ -400,5 +400,19 @@ module VX_tensor_octet #(
         .ready_out (result_ready),
         .valid_out (result_valid)
     );
+
+`ifdef PERF_ENABLE
+    logic [`PERF_CTR_BITS-1:0] perf_tensor_dpu_total;
+
+    always @(posedge clk) begin
+        if (reset) begin
+            perf_tensor_dpu_total <= '0;
+        end else begin
+            if (do_hmma) begin
+                perf_tensor_dpu_total <= perf_tensor_dpu_total + 1'b1;
+            end
+        end
+    end
+`endif
 endmodule
 `endif

From 8775458a8fcab23d45edcbf81cfb2b2ff26aa18e Mon Sep 17 00:00:00 2001
From: Hansung Kim <hansung_kim@berkeley.edu>
Date: Sat, 25 May 2024 19:08:17 -0700
Subject: [PATCH 07/31] Stage half-operands per warp

Handle multiple concurrent warp operations by staging each warp's
half-operands in its own per-warp register.  This is an easy solution,
but it might increase the area requirement by quite a bit.

TODO: Commit is not being handled correctly yet
---
 hw/rtl/core/VX_tensor_core.sv | 81 +++++++++++++++++++++++------------
 1 file changed, 54 insertions(+), 27 deletions(-)

diff --git a/hw/rtl/core/VX_tensor_core.sv b/hw/rtl/core/VX_tensor_core.sv
index e37f5016..b6b11754 100644
--- a/hw/rtl/core/VX_tensor_core.sv
+++ b/hw/rtl/core/VX_tensor_core.sv
@@ -83,6 +83,8 @@ module VX_tensor_core_warp import VX_gpu_pkg::*; #(
     localparam LANE_OFFSET_THREADGROUP = (4 * NUM_OCTETS);
 
     wire [1:0] step = 2'(execute_if.data.op_type);
+    wire operands_last_in_pair = (execute_if.data.op_mod == `INST_MOD_BITS'(1));
+
     logic [NUM_OCTETS-1:0] octet_results_valid;
     logic [NUM_OCTETS-1:0] octet_results_ready;
     logic [NUM_OCTETS-1:0] octet_operands_ready;
@@ -111,6 +113,8 @@ module VX_tensor_core_warp import VX_gpu_pkg::*; #(
         logic [3:0][3:0][31:0] octet_D;
         logic result_valid;
         logic result_ready;
+
+        // op_mod is reused to indicate instruction's id in pair
         VX_tensor_octet #(
             .ISW(ISW),
             .OCTET(i)
@@ -122,6 +126,8 @@ module VX_tensor_core_warp import VX_gpu_pkg::*; #(
             .B_in(octet_B),
             .C_in(octet_C),
             .operands_valid(execute_if.valid),
+            .operands_wid(execute_if.data.wid),
+            .operands_last_in_pair(operands_last_in_pair),
             .operands_ready(octet_operands_ready[i]),
 
             .step(step),
@@ -245,11 +251,14 @@ module VX_tensor_octet #(
     input clk,
     input reset,
 
-    input [7:0][31:0] A_in,
-    input [7:0][31:0] B_in,
-    input [7:0][31:0] C_in,
-    input operands_valid, // we have to backpressure due to there potentially being contention over commit
-    output operands_ready,
+    input [7:0][31:0]     A_in,
+    input [7:0][31:0]     B_in,
+    input [7:0][31:0]     C_in,
+    input                 operands_valid,
+    input [`NW_WIDTH-1:0] operands_wid,
+    input                 operands_last_in_pair,
+    // we have to backpressure due to there potentially being contention over commit
+    output                operands_ready,
 
     input [1:0] step,
 
@@ -258,9 +267,9 @@ module VX_tensor_octet #(
     input result_ready
 );
     // 512 bits/octet * 4 octets per warp
-    logic [3:0][31:0] A_buffer, A_buffer_n;
-    logic [3:0][31:0] B_buffer, B_buffer_n;
-    logic [7:0][31:0] C_buffer, C_buffer_n;
+    logic [`NUM_WARPS-1:0][3:0][31:0] A_buffer, A_buffer_n;
+    logic [`NUM_WARPS-1:0][3:0][31:0] B_buffer, B_buffer_n;
+    logic [`NUM_WARPS-1:0][7:0][31:0] C_buffer, C_buffer_n;
 
     // half the inputs are buffered, half are not (instead coming straight
     // from operand bus) unlike the real tensor core.
@@ -268,6 +277,10 @@ module VX_tensor_octet #(
     logic [3:0][31:0] A_half;
     logic [3:0][31:0] B_half;
     logic [7:0][31:0] C_half;
+
+    logic [`NUM_WARPS-1:0] substeps;
+    logic [`NUM_WARPS-1:0] substeps_n;
+
     always @(*) begin
         // note that not all lanes participate at every step
         case (step)
@@ -296,18 +309,29 @@ module VX_tensor_octet #(
     end
 
     logic substep;
-    wire substep_n = (operands_ready && operands_valid) ? ~substep : substep;
+    wire operands_fire = operands_ready && operands_valid;
+    wire substep_n = operands_fire && operands_last_in_pair;
 
     always @(*) begin
         A_buffer_n = A_buffer;
         B_buffer_n = B_buffer;
         C_buffer_n = C_buffer;
+        substeps_n = substeps;
         
-        if (substep == 1'b0) begin
-            A_buffer_n = A_half;
-            B_buffer_n = B_half;
-            C_buffer_n = C_half;
+        if (operands_fire) begin
+          substeps_n[operands_wid] = ~substeps[operands_wid];
+          if (!operands_last_in_pair) begin
+            A_buffer_n[operands_wid] = A_half;
+            B_buffer_n[operands_wid] = B_half;
+            C_buffer_n[operands_wid] = C_half;
+          end
         end
+
+        // if (operands_fire && (substep == 1'b0)) begin
+        //     A_buffer_n[operands_wid] = A_half;
+        //     B_buffer_n[operands_wid] = B_half;
+        //     C_buffer_n[operands_wid] = C_half;
+        // end
     end
 
     always @(posedge clk) begin
@@ -315,13 +339,17 @@ module VX_tensor_octet #(
             A_buffer <= '0;
             B_buffer <= '0;
             C_buffer <= '0;
+
             substep <= '0;
+            substeps <= '0;
         end
         else begin
             A_buffer <= A_buffer_n;
             B_buffer <= B_buffer_n;
             C_buffer <= C_buffer_n;
+
             substep <= substep_n;
+            substeps <= substeps_n;
         end
     end
 
@@ -330,39 +358,38 @@ module VX_tensor_octet #(
     // wire stall = result_valid && ~result_ready;
     // backpressure from commit
     wire stall = ~outbuf_ready_in;
-    assign operands_ready = ~stall;
+    // assign operands_ready = ~stall;
     // TODO: Below line is to only allow 1 warp to occupy the octet at a time;
     // currently, dpu is fully-pipelined and allows concurrency between
     // multiple warps.  This seems to be not a problem though given that the
     // RF operand read takes >=2 cycles, which should be the end-to-end
     // latency of the DPU anyways
-    // assign operands_ready = hmma_ready && ~stall;
+    assign operands_ready = hmma_ready && ~stall;
 
     // A is 4x2 fp32 matrix
     wire [3:0][1:0][31:0] A_tile = {
-        { A_half[3], A_buffer[3] },
-        { A_half[2], A_buffer[2] },
-        { A_half[1], A_buffer[1] },
-        { A_half[0], A_buffer[0] }
+        { A_half[3], A_buffer[operands_wid][3] },
+        { A_half[2], A_buffer[operands_wid][2] },
+        { A_half[1], A_buffer[operands_wid][1] },
+        { A_half[0], A_buffer[operands_wid][0] }
     };
     // B is 2x4 fp32 matrix
     wire [1:0][3:0][31:0] B_tile = {
-        B_half, B_buffer
+        B_half, B_buffer[operands_wid]
     };
     // C is 4x4 fp32 matrix
     logic [3:0][3:0][31:0] C_tile;
     logic [3:0][3:0][31:0] D_tile;
     
     always @(*) begin
-        C_tile = {
-            C_half[7], C_buffer[7], C_half[5], C_buffer[5],
-            C_half[6], C_buffer[6], C_half[4], C_buffer[4],
-            C_half[3], C_buffer[3], C_half[1], C_buffer[1],
-            C_half[2], C_buffer[2], C_half[0], C_buffer[0]
-        };
+        C_tile[3] = { C_half[7], C_buffer[operands_wid][7], C_half[5], C_buffer[operands_wid][5] };
+        C_tile[2] = { C_half[6], C_buffer[operands_wid][6], C_half[4], C_buffer[operands_wid][4] };
+        C_tile[1] = { C_half[3], C_buffer[operands_wid][3], C_half[1], C_buffer[operands_wid][1] };
+        C_tile[0] = { C_half[2], C_buffer[operands_wid][2], C_half[0], C_buffer[operands_wid][0] };
     end 
 
-    wire do_hmma = (substep == 1'b1 && operands_valid && operands_ready);
+    // wire do_hmma = operands_fire && (substeps[operands_wid] == 1'b1);
+    wire do_hmma = operands_fire && operands_last_in_pair;
     wire dpu_valid;
 
     // this does (m,n,k)=(4,4,2) matmul, modeling compute of a single octet

From 5a95eba1f5424629be64dc2c927aae2b02662dec Mon Sep 17 00:00:00 2001
From: Hansung Kim <hansung_kim@berkeley.edu>
Date: Sat, 25 May 2024 19:54:03 -0700
Subject: [PATCH 08/31] tensor: Clear c_*_tile before compute

Leaving the tiles uncleared didn't actually cause any problems; this is just to be safe.
---
 hw/dpi/float_dpi.cpp | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/hw/dpi/float_dpi.cpp b/hw/dpi/float_dpi.cpp
index 6a810555..570d6bf2 100644
--- a/hw/dpi/float_dpi.cpp
+++ b/hw/dpi/float_dpi.cpp
@@ -358,6 +358,15 @@ float c_D_tile[M][M];
 // code assumes that svBitVecVal is basically a uint32_t
 static_assert(sizeof(svBitVecVal) == 4);
 
+void clear_float_array(float* c_tile, int rows, int cols) {
+  for (int i = 0; i < rows; i += 1) {
+    for (int j = 0; j < cols; j += 1) {
+      int index = i * cols + j;
+      c_tile[index] = 0.0f;
+    }
+  }
+}
+
 void fill_float_array(const svBitVecVal* sv_tile, float* c_tile, int rows, int cols) {
   
   for (int i = 0; i < rows; i += 1) {
@@ -396,6 +405,11 @@ void dpi_hmma(bool enable, const svBitVecVal* A_tile, const svBitVecVal* B_tile,
   if (!enable) {
     return;
   }
+  clear_float_array(&c_A_tile[0][0], M, K);
+  clear_float_array(&c_B_tile[0][0], K, M);
+  clear_float_array(&c_C_tile[0][0], M, M);
+  clear_float_array(&c_D_tile[0][0], M, M);
+
   // std::cout << "A: " << std::endl;
   fill_float_array(A_tile, &c_A_tile[0][0], M, K);
   // std::cout << "B: " << std::endl;

From 864265bda5ee5115d0de15939ea59ba92145295b Mon Sep 17 00:00:00 2001
From: Hansung Kim <hansung_kim@berkeley.edu>
Date: Sat, 25 May 2024 20:04:31 -0700
Subject: [PATCH 09/31] tensor: Fix consecutive commits to write to same warp

... by splitting the pending_uops queue across warps.
---
 hw/rtl/core/VX_tensor_core.sv | 76 ++++++++++++++++++++++-------------
 hw/rtl/fpu/VX_tensor_dpu.sv   | 10 +++--
 2 files changed, 53 insertions(+), 33 deletions(-)

diff --git a/hw/rtl/core/VX_tensor_core.sv b/hw/rtl/core/VX_tensor_core.sv
index b6b11754..d1ee3b38 100644
--- a/hw/rtl/core/VX_tensor_core.sv
+++ b/hw/rtl/core/VX_tensor_core.sv
@@ -32,10 +32,6 @@ module VX_tensor_core import VX_gpu_pkg::*; #(
         .execute_if (execute_if)
     );
 
-    // FIXME: when multiple warps are running, step0_0 from multiple warps can
-    // get interleaved before the first warp advances to step0_1, fucking
-    // everything up
-
     VX_commit_if #(
         .NUM_LANES (NUM_LANES)
     ) commit_block_if[BLOCK_SIZE]();
@@ -83,7 +79,7 @@ module VX_tensor_core_warp import VX_gpu_pkg::*; #(
     localparam LANE_OFFSET_THREADGROUP = (4 * NUM_OCTETS);
 
     wire [1:0] step = 2'(execute_if.data.op_type);
-    wire operands_last_in_pair = (execute_if.data.op_mod == `INST_MOD_BITS'(1));
+    wire last_in_pair = (execute_if.data.op_mod == `INST_MOD_BITS'(1));
 
     logic [NUM_OCTETS-1:0] octet_results_valid;
     logic [NUM_OCTETS-1:0] octet_results_ready;
@@ -91,6 +87,7 @@ module VX_tensor_core_warp import VX_gpu_pkg::*; #(
     // FIXME: should be NUM_LANES?
     logic [`NUM_THREADS-1:0][`XLEN-1:0] wb_data_0;
     logic [`NUM_THREADS-1:0][`XLEN-1:0] wb_data_1;
+    wire [`NW_WIDTH-1:0] wb_wid;
     
     assign execute_if.ready = &octet_operands_ready;
 
@@ -127,12 +124,13 @@ module VX_tensor_core_warp import VX_gpu_pkg::*; #(
             .C_in(octet_C),
             .operands_valid(execute_if.valid),
             .operands_wid(execute_if.data.wid),
-            .operands_last_in_pair(operands_last_in_pair),
+            .operands_last_in_pair(last_in_pair),
             .operands_ready(octet_operands_ready[i]),
 
             .step(step),
 
             .D_out(octet_D),
+            .D_wid(wb_wid),
             .result_valid(result_valid),
             .result_ready(result_ready)
         );
@@ -188,33 +186,49 @@ module VX_tensor_core_warp import VX_gpu_pkg::*; #(
         // pid/sop/eop set later
     };
 
-    wire [DATAW-1:0] execute_if_data_deq;
+    wire [`NUM_WARPS-1:0][DATAW-1:0] execute_if_data_deq;
 
-    // this is probably a little oversized
-    VX_fifo_queue #(
-        .DATAW(DATAW),
-        .DEPTH(16)
-    ) pending_uops (
-        .clk(clk),
-        .reset(reset),    
-        .push(execute_if_fire),
-        .pop(commit_if_fire),
-        .data_in(execute_if_data_enq),
-        .data_out(execute_if_data_deq),
-        `UNUSED_PIN(empty),      
-        `UNUSED_PIN(alm_empty),
-        `UNUSED_PIN(full), // should be impossible to overflow            
-        `UNUSED_PIN(alm_full),
-        `UNUSED_PIN(size)
-    );
+    for (genvar i = 0; i < `NUM_WARPS; i++) begin
+        wire enq = execute_if_fire && (execute_if.data.wid == i);
+        wire deq = commit_if_fire && (wb_wid == i);
+        logic full;
 
+        // execute_if request queue.
+        // This has to be separated per-warp, as otherwise requests from
+        // multiple warps can be enqueued interleaved, which makes it hard to
+        // ensure two consecutive dequeues are associated to the same warp for
+        // commit.
+        VX_fifo_queue #(
+            .DATAW(DATAW),
+            .DEPTH(4 /* FIXME: arbitrary */)
+        ) pending_uops (
+            .clk(clk),
+            .reset(reset),
+            .push(enq),
+            .pop(deq),
+            .data_in(execute_if_data_enq),
+            .data_out(execute_if_data_deq[i]),
+            `UNUSED_PIN(empty),
+            `UNUSED_PIN(alm_empty),
+            .full(full), // should be impossible to overflow
+            `UNUSED_PIN(alm_full),
+            `UNUSED_PIN(size)
+        );
+
+        `RUNTIME_ASSERT(!full, ("tensor core uop queue is full!"));
+    end
+
+    // unlike execute which can be interleaved between warps, commit is
+    // serialized and completed one-warp-by-warp, therefore we only need to
+    // keep one subcommit state bit unlike for `substeps`
     logic subcommit, subcommit_n;
+
     wire all_valid = (& octet_results_valid);
     assign commit_if.valid = all_valid;
 
     localparam COMMIT_DATAW = `UUID_WIDTH + `NW_WIDTH + `NUM_THREADS + `XLEN + 1 + `NR_BITS + (`NUM_THREADS * `XLEN) + 1 + 1 + 1;
     wire [COMMIT_DATAW-1:0] commit_if_data = {
-        execute_if_data_deq, /* uuid ~ rd */
+        execute_if_data_deq[wb_wid], /* uuid ~ rd */
         subcommit == 1'b0 ? wb_data_0 : wb_data_1, /* data */
         1'b0, /* pid */
         1'b1, /* sop */
@@ -263,6 +277,7 @@ module VX_tensor_octet #(
     input [1:0] step,
 
     output [3:0][3:0][31:0] D_out,
+    output [`NW_WIDTH-1:0]  D_wid,
     output result_valid,
     input result_ready
 );
@@ -380,6 +395,7 @@ module VX_tensor_octet #(
     // C is 4x4 fp32 matrix
     logic [3:0][3:0][31:0] C_tile;
     logic [3:0][3:0][31:0] D_tile;
+    logic [`NW_WIDTH-1:0]  D_warp_id;
     
     always @(*) begin
         C_tile[3] = { C_half[7], C_buffer[operands_wid][7], C_half[5], C_buffer[operands_wid][5] };
@@ -407,23 +423,25 @@ module VX_tensor_octet #(
         .A_tile(A_tile),
         .B_tile(B_tile),
         .C_tile(C_tile),
+        .warp_id(operands_wid),
 
         .valid_out(dpu_valid),
-        .D_tile(D_tile)
+        .D_tile(D_tile),
+        .D_warp_id(D_warp_id)
     );
 
     // buffer to stage the result tile for 2 cycles until commit/writeback is
     // complete
     VX_stream_buffer #(
-        .DATAW   ($bits(D_out)),
+        .DATAW   ($bits(D_wid) + $bits(D_out)),
         .OUT_REG (1) // not sure this is necessary
     ) output_buffer (
         .clk (clk),
         .reset (reset),
         .valid_in  (dpu_valid),
         .ready_in  (outbuf_ready_in),
-        .data_in   (D_tile),
-        .data_out  (D_out),
+        .data_in   ({D_warp_id, D_tile}),
+        .data_out  ({D_wid,     D_out}),
         .ready_out (result_ready),
         .valid_out (result_valid)
     );
diff --git a/hw/rtl/fpu/VX_tensor_dpu.sv b/hw/rtl/fpu/VX_tensor_dpu.sv
index 4130fb98..1ffbb6d3 100644
--- a/hw/rtl/fpu/VX_tensor_dpu.sv
+++ b/hw/rtl/fpu/VX_tensor_dpu.sv
@@ -15,9 +15,11 @@ module VX_tensor_dpu #(
     input [3:0][1:0][31:0] A_tile,
     input [1:0][3:0][31:0] B_tile,
     input [3:0][3:0][31:0] C_tile,
+    input [`NW_WIDTH-1:0]  warp_id,
 
     output valid_out,
-    output [3:0][3:0][31:0] D_tile
+    output [3:0][3:0][31:0] D_tile,
+    output [`NW_WIDTH-1:0]  D_warp_id
 );
     logic [3:0][3:0][31:0] result_hmma;
 
@@ -42,15 +44,15 @@ module VX_tensor_dpu #(
 
     // fixed-latency model
     VX_shift_register #(
-        .DATAW  (1 + $bits(D_tile)),
+        .DATAW  (1 + $bits(warp_id) + $bits(D_tile)),
         .DEPTH  (`LATENCY_HMMA),
         .RESETW (1)
     ) shift_reg (
         .clk      (clk),
         .reset    (reset),
         .enable   (~stall),
-        .data_in  ({valid_in, result_hmma}),
-        .data_out ({valid_out, D_tile})
+        .data_in  ({valid_in,  warp_id,   result_hmma}),
+        .data_out ({valid_out, D_warp_id, D_tile})
     );
 endmodule
 `endif

From 28f6cd59b5dcfc8885827d33fd2e881a2c33e96e Mon Sep 17 00:00:00 2001
From: Hansung Kim <hansung_kim@berkeley.edu>
Date: Sun, 26 May 2024 21:59:25 -0700
Subject: [PATCH 10/31] tensor: Improve commit efficiency by decoupling dpu
 with fifo

---
 hw/rtl/VX_config.vh           |  2 +-
 hw/rtl/core/VX_tensor_core.sv | 42 +++++++++++++++++++++++------------
 hw/rtl/fpu/VX_tensor_dpu.sv   | 10 ++++-----
 3 files changed, 34 insertions(+), 20 deletions(-)

diff --git a/hw/rtl/VX_config.vh b/hw/rtl/VX_config.vh
index 65d56e8a..5ef71794 100644
--- a/hw/rtl/VX_config.vh
+++ b/hw/rtl/VX_config.vh
@@ -391,7 +391,7 @@
 
 // Tensor Core Latency
 `ifndef LATENCY_HMMA
-`define LATENCY_HMMA 8
+`define LATENCY_HMMA 2
 `endif
 
 // Icache Configurable Knobs //////////////////////////////////////////////////
diff --git a/hw/rtl/core/VX_tensor_core.sv b/hw/rtl/core/VX_tensor_core.sv
index d1ee3b38..0612ca12 100644
--- a/hw/rtl/core/VX_tensor_core.sv
+++ b/hw/rtl/core/VX_tensor_core.sv
@@ -189,8 +189,8 @@ module VX_tensor_core_warp import VX_gpu_pkg::*; #(
     wire [`NUM_WARPS-1:0][DATAW-1:0] execute_if_data_deq;
 
     for (genvar i = 0; i < `NUM_WARPS; i++) begin
-        wire enq = execute_if_fire && (execute_if.data.wid == i);
-        wire deq = commit_if_fire && (wb_wid == i);
+        wire enq = execute_if_fire && (execute_if.data.wid == `NW_WIDTH'(i));
+        wire deq =  commit_if_fire && (             wb_wid == `NW_WIDTH'(i));
         logic full;
 
         // execute_if request queue.
@@ -395,7 +395,7 @@ module VX_tensor_octet #(
     // C is 4x4 fp32 matrix
     logic [3:0][3:0][31:0] C_tile;
     logic [3:0][3:0][31:0] D_tile;
-    logic [`NW_WIDTH-1:0]  D_warp_id;
+    logic [`NW_WIDTH-1:0]  D_wid_dpu;
     
     always @(*) begin
         C_tile[3] = { C_half[7], C_buffer[operands_wid][7], C_half[5], C_buffer[operands_wid][5] };
@@ -423,27 +423,41 @@ module VX_tensor_octet #(
         .A_tile(A_tile),
         .B_tile(B_tile),
         .C_tile(C_tile),
-        .warp_id(operands_wid),
+        .wid(operands_wid),
 
         .valid_out(dpu_valid),
         .D_tile(D_tile),
-        .D_warp_id(D_warp_id)
+        .D_wid(D_wid_dpu)
     );
 
+    wire outbuf_empty;
+    wire outbuf_full;
+    assign outbuf_ready_in = ~outbuf_full;
+    assign result_valid    = ~outbuf_empty;
+
+    wire outbuf_enq = outbuf_ready_in && dpu_valid;
+    wire outbuf_deq = result_valid && result_ready;
+
     // buffer to stage the result tile for 2 cycles until commit/writeback is
-    // complete
-    VX_stream_buffer #(
+    // complete.  This decouples the irregular dpu output traffic from the
+    // regular, every-2-cycle commit traffic and thereby ensures the commit
+    // pipeline is used more efficiently.
+    // TODO: This is probably oversized.
+    VX_fifo_queue #(
         .DATAW   ($bits(D_wid) + $bits(D_out)),
-        .OUT_REG (1) // not sure this is necessary
+        .DEPTH   (8 /* FIXME: arbitrary */)
     ) output_buffer (
-        .clk (clk),
+        .clk   (clk),
         .reset (reset),
-        .valid_in  (dpu_valid),
-        .ready_in  (outbuf_ready_in),
-        .data_in   ({D_warp_id, D_tile}),
+        .push      (outbuf_enq),
+        .pop       (outbuf_deq),
+        .data_in   ({D_wid_dpu, D_tile}),
         .data_out  ({D_wid,     D_out}),
-        .ready_out (result_ready),
-        .valid_out (result_valid)
+        .empty     (outbuf_empty),
+        `UNUSED_PIN(alm_empty),
+        .full      (outbuf_full), // should be impossible to overflow
+        `UNUSED_PIN(alm_full),
+        `UNUSED_PIN(size)
     );
 
 `ifdef PERF_ENABLE
diff --git a/hw/rtl/fpu/VX_tensor_dpu.sv b/hw/rtl/fpu/VX_tensor_dpu.sv
index 1ffbb6d3..7a3ee41d 100644
--- a/hw/rtl/fpu/VX_tensor_dpu.sv
+++ b/hw/rtl/fpu/VX_tensor_dpu.sv
@@ -15,11 +15,11 @@ module VX_tensor_dpu #(
     input [3:0][1:0][31:0] A_tile,
     input [1:0][3:0][31:0] B_tile,
     input [3:0][3:0][31:0] C_tile,
-    input [`NW_WIDTH-1:0]  warp_id,
+    input [`NW_WIDTH-1:0]  wid,
 
     output valid_out,
     output [3:0][3:0][31:0] D_tile,
-    output [`NW_WIDTH-1:0]  D_warp_id
+    output [`NW_WIDTH-1:0]  D_wid
 );
     logic [3:0][3:0][31:0] result_hmma;
 
@@ -44,15 +44,15 @@ module VX_tensor_dpu #(
 
     // fixed-latency model
     VX_shift_register #(
-        .DATAW  (1 + $bits(warp_id) + $bits(D_tile)),
+        .DATAW  (1 + $bits(wid) + $bits(D_tile)),
         .DEPTH  (`LATENCY_HMMA),
         .RESETW (1)
     ) shift_reg (
         .clk      (clk),
         .reset    (reset),
         .enable   (~stall),
-        .data_in  ({valid_in,  warp_id,   result_hmma}),
-        .data_out ({valid_out, D_warp_id, D_tile})
+        .data_in  ({valid_in,  wid,   result_hmma}),
+        .data_out ({valid_out, D_wid, D_tile})
     );
 endmodule
 `endif

From c03a5b070c4046b5a708c1799ea880249c85d2d5 Mon Sep 17 00:00:00 2001
From: Hansung Kim <hansung_kim@berkeley.edu>
Date: Mon, 27 May 2024 18:24:24 -0700
Subject: [PATCH 11/31] tensor: Issue queue for dpu to improve utilization

---
 hw/rtl/core/VX_tensor_core.sv | 198 +++++++++++++++++++++++-----------
 hw/rtl/fpu/VX_tensor_dpu.sv   |   2 +-
 2 files changed, 138 insertions(+), 62 deletions(-)

diff --git a/hw/rtl/core/VX_tensor_core.sv b/hw/rtl/core/VX_tensor_core.sv
index 0612ca12..5f32f504 100644
--- a/hw/rtl/core/VX_tensor_core.sv
+++ b/hw/rtl/core/VX_tensor_core.sv
@@ -125,10 +125,9 @@ module VX_tensor_core_warp import VX_gpu_pkg::*; #(
             .operands_valid(execute_if.valid),
             .operands_wid(execute_if.data.wid),
             .operands_last_in_pair(last_in_pair),
+            .operands_step(step),
             .operands_ready(octet_operands_ready[i]),
 
-            .step(step),
-
             .D_out(octet_D),
             .D_wid(wb_wid),
             .result_valid(result_valid),
@@ -186,18 +185,38 @@ module VX_tensor_core_warp import VX_gpu_pkg::*; #(
         // pid/sop/eop set later
     };
 
+    // wire [DATAW-1:0] execute_if_data_deq;
+
+    // VX_fifo_queue #(
+    //     .DATAW(DATAW),
+    //     .DEPTH(4 /* FIXME: arbitrary */)
+    // ) pending_uops (
+    //     .clk(clk),
+    //     .reset(reset),
+    //     .push(execute_if_fire),
+    //     .pop(commit_if_fire),
+    //     .data_in(execute_if_data_enq),
+    //     .data_out(execute_if_data_deq),
+    //     `UNUSED_PIN(empty),
+    //     `UNUSED_PIN(alm_empty),
+    //     `UNUSED_PIN(full), // should be impossible to overflow
+    //     `UNUSED_PIN(alm_full),
+    //     `UNUSED_PIN(size)
+    // );
+
     wire [`NUM_WARPS-1:0][DATAW-1:0] execute_if_data_deq;
 
     for (genvar i = 0; i < `NUM_WARPS; i++) begin
-        wire enq = execute_if_fire && (execute_if.data.wid == `NW_WIDTH'(i));
-        wire deq =  commit_if_fire && (             wb_wid == `NW_WIDTH'(i));
-        logic full;
-
         // execute_if request queue.
         // This has to be separated per-warp, as otherwise requests from
         // multiple warps can be enqueued interleaved, which makes it hard to
-        // ensure two consecutive dequeues are associated to the same warp for
+        // ensure two consecutive dequeues are associated with the same warp for
         // commit.
+
+        wire enq = execute_if_fire && (execute_if.data.wid == `NW_WIDTH'(i));
+        wire deq =  commit_if_fire && (             wb_wid == `NW_WIDTH'(i));
+        wire full;
+
         VX_fifo_queue #(
             .DATAW(DATAW),
             .DEPTH(4 /* FIXME: arbitrary */)
@@ -215,7 +234,7 @@ module VX_tensor_core_warp import VX_gpu_pkg::*; #(
             `UNUSED_PIN(size)
         );
 
-        `RUNTIME_ASSERT(!full, ("tensor core uop queue is full!"));
+        `RUNTIME_ASSERT(!(!reset && full), ("tensor core uop queue is full!"));
     end
 
     // unlike execute which can be interleaved between warps, commit is
@@ -229,6 +248,7 @@ module VX_tensor_core_warp import VX_gpu_pkg::*; #(
     localparam COMMIT_DATAW = `UUID_WIDTH + `NW_WIDTH + `NUM_THREADS + `XLEN + 1 + `NR_BITS + (`NUM_THREADS * `XLEN) + 1 + 1 + 1;
     wire [COMMIT_DATAW-1:0] commit_if_data = {
         execute_if_data_deq[wb_wid], /* uuid ~ rd */
+        // execute_if_data_deq, /* uuid ~ rd */
         subcommit == 1'b0 ? wb_data_0 : wb_data_1, /* data */
         1'b0, /* pid */
         1'b1, /* sop */
@@ -271,11 +291,10 @@ module VX_tensor_octet #(
     input                 operands_valid,
     input [`NW_WIDTH-1:0] operands_wid,
     input                 operands_last_in_pair,
+    input [1:0]           operands_step,
     // we have to backpressure due to there potentially being contention over commit
     output                operands_ready,
 
-    input [1:0] step,
-
     output [3:0][3:0][31:0] D_out,
     output [`NW_WIDTH-1:0]  D_wid,
     output result_valid,
@@ -292,11 +311,73 @@ module VX_tensor_octet #(
     logic [3:0][31:0] A_half;
     logic [3:0][31:0] B_half;
     logic [7:0][31:0] C_half;
+    logic [3:0][31:0] A_half_buf;
+    logic [3:0][31:0] B_half_buf;
+    logic [7:0][31:0] C_half_buf;
+
 
     logic [`NUM_WARPS-1:0] substeps;
     logic [`NUM_WARPS-1:0] substeps_n;
 
-    always @(*) begin
+    wire [7:0][31:0]     A_in_buf;
+    wire [7:0][31:0]     B_in_buf;
+    wire [7:0][31:0]     C_in_buf;
+    wire                 operands_valid_buf;
+    wire                 operands_ready_buf;
+    wire [`NW_WIDTH-1:0] operands_wid_buf;
+    wire                 operands_last_in_pair_buf;
+    wire [1:0]           operands_step_buf;
+
+    wire inbuf_empty;
+    wire inbuf_full;
+    wire inbuf_ready_in;
+    assign inbuf_ready_in     = !inbuf_full;
+    assign operands_ready     = inbuf_ready_in;
+    assign operands_valid_buf = !inbuf_empty;
+
+    wire inbuf_enq = operands_ready && operands_valid && operands_last_in_pair;
+    wire inbuf_deq = operands_valid_buf && operands_ready_buf;
+
+    // the 'issue queue' for the dpu.
+    // This exists to decouple the input of the dot-product unit from
+    // execute_if.ready.  execute_if can arrive intermittently according to
+    // the frontend's behavior, and since the dpu can also stall for a fixed
+    // initiation latency, we need to decouple the two to efficiently feed the
+    // dpu.
+    // This only applies to the last instruction in a pair, since the first
+    // instruction only acts to buffer the operands and can execute
+    // immediately without backpressure.  So we don't enqueue them.
+    VX_fifo_queue #(
+        .DATAW   ($bits(A_in) + $bits(B_in) + $bits(C_in) +
+                  $bits(operands_wid) + $bits(operands_step) + $bits(operands_last_in_pair)),
+        .DEPTH   (4 /* FIXME: arbitrary */)
+    ) input_buffer (
+        .clk   (clk),
+        .reset (reset),
+        .push      (inbuf_enq),
+        .pop       (inbuf_deq),
+        .data_in   ({A_in,     B_in,     C_in,     operands_wid,     operands_step,     operands_last_in_pair}),
+        .data_out  ({A_in_buf, B_in_buf, C_in_buf, operands_wid_buf, operands_step_buf, operands_last_in_pair_buf}),
+        .empty     (inbuf_empty),
+        `UNUSED_PIN(alm_empty),
+        .full      (inbuf_full),
+        `UNUSED_PIN(alm_full),
+        `UNUSED_PIN(size)
+    );
+
+    typedef struct {
+      logic [3:0][31:0] A_half;
+      logic [3:0][31:0] B_half;
+      logic [7:0][31:0] C_half;
+    } half_t;
+
+    function half_t get_operand_half(
+      input logic [1:0] step,
+      input logic [7:0][31:0] A_in,
+      input logic [7:0][31:0] B_in,
+      input logic [7:0][31:0] C_in
+    );
+        half_t half;
         // note that not all lanes participate at every step
         case (step)
             2'b00: begin
@@ -304,28 +385,34 @@ module VX_tensor_octet #(
                 // by two threadgroups: [0:2,0:2] and [4:6,0:2] in Step 0 of
                 // Figure 10(b).  B_in OTOH is shared by two threadgroups.
                 // Note k-dimension is shrunk from 4 to 2.
-                A_half = { A_in[5:4], A_in[1:0] };
-                B_half = B_in[3:0];
+                half.A_half = { A_in[5:4], A_in[1:0] };
+                half.B_half = B_in[3:0];
             end
             2'b01: begin
-                A_half = { A_in[7:6], A_in[3:2] };
-                B_half = B_in[3:0];
+                half.A_half = { A_in[7:6], A_in[3:2] };
+                half.B_half = B_in[3:0];
             end
             2'b10: begin
-                A_half = { A_in[5:4], A_in[1:0] };
-                B_half = B_in[7:4];
+                half.A_half = { A_in[5:4], A_in[1:0] };
+                half.B_half = B_in[7:4];
             end
             2'b11: begin
-                A_half = { A_in[7:6], A_in[3:2] };
-                B_half = B_in[7:4];
+                half.A_half = { A_in[7:6], A_in[3:2] };
+                half.B_half = B_in[7:4];
             end
         endcase
-        C_half = C_in;
-    end
+        half.C_half = C_in;
+        return half;
+    endfunction
 
-    logic substep;
-    wire operands_fire = operands_ready && operands_valid;
-    wire substep_n = operands_fire && operands_last_in_pair;
+    half_t halves;
+    half_t halves_buf;
+    assign halves     = get_operand_half(operands_step, A_in, B_in, C_in);
+    assign halves_buf = get_operand_half(operands_step_buf, A_in_buf, B_in_buf, C_in_buf);
+
+    wire do_hmma = operands_ready_buf && operands_valid_buf && operands_last_in_pair_buf;
+    wire operands_first_in_pair_fire = operands_ready && operands_valid && (!operands_last_in_pair);
+    // wire operands_first_in_pair_fire = operands_ready && operands_valid;
 
     always @(*) begin
         A_buffer_n = A_buffer;
@@ -333,20 +420,15 @@ module VX_tensor_octet #(
         C_buffer_n = C_buffer;
         substeps_n = substeps;
         
-        if (operands_fire) begin
-          substeps_n[operands_wid] = ~substeps[operands_wid];
-          if (!operands_last_in_pair) begin
-            A_buffer_n[operands_wid] = A_half;
-            B_buffer_n[operands_wid] = B_half;
-            C_buffer_n[operands_wid] = C_half;
-          end
+        if (operands_first_in_pair_fire) begin
+          substeps_n[operands_wid] = 1'b1; // ready for hmma
+          A_buffer_n[operands_wid] = halves.A_half;
+          B_buffer_n[operands_wid] = halves.B_half;
+          C_buffer_n[operands_wid] = halves.C_half;
+        end
+        if (do_hmma) begin
+          substeps_n[operands_wid_buf] = 1'b0; // finished hmma, ready for next operand
         end
-
-        // if (operands_fire && (substep == 1'b0)) begin
-        //     A_buffer_n[operands_wid] = A_half;
-        //     B_buffer_n[operands_wid] = B_half;
-        //     C_buffer_n[operands_wid] = C_half;
-        // end
     end
 
     always @(posedge clk) begin
@@ -354,43 +436,39 @@ module VX_tensor_octet #(
             A_buffer <= '0;
             B_buffer <= '0;
             C_buffer <= '0;
-
-            substep <= '0;
             substeps <= '0;
         end
         else begin
             A_buffer <= A_buffer_n;
             B_buffer <= B_buffer_n;
             C_buffer <= C_buffer_n;
-
-            substep <= substep_n;
             substeps <= substeps_n;
         end
     end
 
-    wire hmma_ready;
     wire outbuf_ready_in;
-    // wire stall = result_valid && ~result_ready;
     // backpressure from commit
     wire stall = ~outbuf_ready_in;
+    wire hmma_ready;
+
     // assign operands_ready = ~stall;
     // TODO: Below line is to only allow 1 warp to occupy the octet at a time;
     // currently, dpu is fully-pipelined and allows concurrency between
     // multiple warps.  This seems to be not a problem though given that the
     // RF operand read takes >=2 cycles, which should be the end-to-end
     // latency of the DPU anyways
-    assign operands_ready = hmma_ready && ~stall;
+    assign operands_ready_buf = hmma_ready && ~stall;
 
     // A is 4x2 fp32 matrix
     wire [3:0][1:0][31:0] A_tile = {
-        { A_half[3], A_buffer[operands_wid][3] },
-        { A_half[2], A_buffer[operands_wid][2] },
-        { A_half[1], A_buffer[operands_wid][1] },
-        { A_half[0], A_buffer[operands_wid][0] }
+        { halves_buf.A_half[3], A_buffer[operands_wid_buf][3] },
+        { halves_buf.A_half[2], A_buffer[operands_wid_buf][2] },
+        { halves_buf.A_half[1], A_buffer[operands_wid_buf][1] },
+        { halves_buf.A_half[0], A_buffer[operands_wid_buf][0] }
     };
     // B is 2x4 fp32 matrix
     wire [1:0][3:0][31:0] B_tile = {
-        B_half, B_buffer[operands_wid]
+        halves_buf.B_half, B_buffer[operands_wid_buf]
     };
     // C is 4x4 fp32 matrix
     logic [3:0][3:0][31:0] C_tile;
@@ -398,14 +476,12 @@ module VX_tensor_octet #(
     logic [`NW_WIDTH-1:0]  D_wid_dpu;
     
     always @(*) begin
-        C_tile[3] = { C_half[7], C_buffer[operands_wid][7], C_half[5], C_buffer[operands_wid][5] };
-        C_tile[2] = { C_half[6], C_buffer[operands_wid][6], C_half[4], C_buffer[operands_wid][4] };
-        C_tile[1] = { C_half[3], C_buffer[operands_wid][3], C_half[1], C_buffer[operands_wid][1] };
-        C_tile[0] = { C_half[2], C_buffer[operands_wid][2], C_half[0], C_buffer[operands_wid][0] };
+        C_tile[3] = { halves_buf.C_half[7], C_buffer[operands_wid_buf][7], halves_buf.C_half[5], C_buffer[operands_wid_buf][5] };
+        C_tile[2] = { halves_buf.C_half[6], C_buffer[operands_wid_buf][6], halves_buf.C_half[4], C_buffer[operands_wid_buf][4] };
+        C_tile[1] = { halves_buf.C_half[3], C_buffer[operands_wid_buf][3], halves_buf.C_half[1], C_buffer[operands_wid_buf][1] };
+        C_tile[0] = { halves_buf.C_half[2], C_buffer[operands_wid_buf][2], halves_buf.C_half[0], C_buffer[operands_wid_buf][0] };
     end 
 
-    // wire do_hmma = operands_fire && (substeps[operands_wid] == 1'b1);
-    wire do_hmma = operands_fire && operands_last_in_pair;
     wire dpu_valid;
 
     // this does (m,n,k)=(4,4,2) matmul, modeling compute of a single octet
@@ -423,7 +499,7 @@ module VX_tensor_octet #(
         .A_tile(A_tile),
         .B_tile(B_tile),
         .C_tile(C_tile),
-        .wid(operands_wid),
+        .wid(operands_wid_buf),
 
         .valid_out(dpu_valid),
         .D_tile(D_tile),
@@ -438,14 +514,14 @@ module VX_tensor_octet #(
     wire outbuf_enq = outbuf_ready_in && dpu_valid;
     wire outbuf_deq = result_valid && result_ready;
 
-    // buffer to stage the result tile for 2 cycles until commit/writeback is
-    // complete.  This decouples the irregular dpu output traffic from the
-    // regular, every-2-cycle commit traffic and thereby ensures the commit
-    // pipeline is used more efficiently.
+    // buffer to stage the result D tile for 2 cycles until commit/writeback
+    // is complete.  This decouples the irregular dpu output traffic from the
+    // regular, every-2-cycle commit traffic to ensure the commit pipeline is
+    // used more efficiently.
     // TODO: This is probably oversized.
     VX_fifo_queue #(
         .DATAW   ($bits(D_wid) + $bits(D_out)),
-        .DEPTH   (8 /* FIXME: arbitrary */)
+        .DEPTH   (4 /* FIXME: arbitrary */)
     ) output_buffer (
         .clk   (clk),
         .reset (reset),
diff --git a/hw/rtl/fpu/VX_tensor_dpu.sv b/hw/rtl/fpu/VX_tensor_dpu.sv
index 7a3ee41d..7e96a296 100644
--- a/hw/rtl/fpu/VX_tensor_dpu.sv
+++ b/hw/rtl/fpu/VX_tensor_dpu.sv
@@ -51,7 +51,7 @@ module VX_tensor_dpu #(
         .clk      (clk),
         .reset    (reset),
         .enable   (~stall),
-        .data_in  ({valid_in,  wid,   result_hmma}),
+        .data_in  ({valid_in && ready_in,  wid,   result_hmma}),
         .data_out ({valid_out, D_wid, D_tile})
     );
 endmodule

From e9df173745295d78acabf9613098186f2df5d164 Mon Sep 17 00:00:00 2001
From: Hansung Kim <hansung_kim@berkeley.edu>
Date: Wed, 29 May 2024 13:34:25 -0700
Subject: [PATCH 12/31] tensor: Use chisel-generated dpu module

---
 hw/rtl/fpu/VX_tensor_dpu.sv | 113 ++++++++++++++++++++++++++++++++++--
 1 file changed, 108 insertions(+), 5 deletions(-)

diff --git a/hw/rtl/fpu/VX_tensor_dpu.sv b/hw/rtl/fpu/VX_tensor_dpu.sv
index 7e96a296..33529370 100644
--- a/hw/rtl/fpu/VX_tensor_dpu.sv
+++ b/hw/rtl/fpu/VX_tensor_dpu.sv
@@ -42,17 +42,120 @@ module VX_tensor_dpu #(
     // ready as soon as valid_out
     assign ready_in = ready_reg || valid_out;
 
-    // fixed-latency model
+    // fully pipelined; always ready
+    // assign ready_in = 1'b1;
+
+    // wire        dpu_valid;
+    // wire [31:0] dpu_data;
+    // TensorDotProductUnit dpu_pipe (
+    //   .clock (clk),
+    //   .reset (reset),
+    //   .io_in_valid  (valid_in && ready_in),
+    //   .io_in_bits_a_0 (32'h40000000),
+    //   .io_in_bits_a_1 (32'h40000000),
+    //   .io_in_bits_a_2 (32'h40000000),
+    //   .io_in_bits_a_3 (32'h40000000),
+    //   .io_in_bits_b_0 (32'h40000000),
+    //   .io_in_bits_b_1 (32'h40000000),
+    //   .io_in_bits_b_2 (32'h40000000),
+    //   .io_in_bits_b_3 (32'h40000000),
+    //   .io_in_bits_c   (32'h3f800000),
+    //   .io_out_valid (dpu_valid),
+    //   .io_out_bits_data (dpu_data)
+    // );
+
+    logic [1:0] threadgroup_valids;
+    // B_tile is shared across the two threadgroups; see Figure 13
+    VX_tensor_threadgroup #(
+    ) threadgroup_0 (
+        .clk   (clk),
+        .reset (reset),
+        .valid_in  (valid_in && ready_in),
+        .stall     (stall),
+        .A_frag    (A_tile[1:0]),
+        .B_frag    (B_tile),
+        .C_frag    (C_tile[1:0]),
+        .valid_out (threadgroup_valids[0]),
+        .D_frag    (D_tile[1:0])
+    );
+    VX_tensor_threadgroup #(
+    ) threadgroup_1 (
+        .clk   (clk),
+        .reset (reset),
+        .valid_in  (valid_in && ready_in),
+        .stall     (stall),
+        .A_frag    (A_tile[3:2]),
+        .B_frag    (B_tile),
+        .C_frag    (C_tile[3:2]),
+        .valid_out (threadgroup_valids[1]),
+        .D_frag    (D_tile[3:2])
+    );
+
+    // fixed-latency queue
     VX_shift_register #(
-        .DATAW  (1 + $bits(wid) + $bits(D_tile)),
-        .DEPTH  (`LATENCY_HMMA),
+        .DATAW  (1 + $bits(wid)/* + $bits(D_tile)*/),
+        // .DEPTH  (`LATENCY_HMMA),
+        .DEPTH  (2),
         .RESETW (1)
     ) shift_reg (
         .clk      (clk),
         .reset    (reset),
         .enable   (~stall),
-        .data_in  ({valid_in && ready_in,  wid,   result_hmma}),
-        .data_out ({valid_out, D_wid, D_tile})
+        .data_in  ({valid_in && ready_in, wid  /*, result_hmma*/}),
+        .data_out ({valid_out,            D_wid/*, D_tile     */})
     );
+
+    // FIXME: breaks when stall is on!
+    `RUNTIME_ASSERT(reset || (&(threadgroup_valids) == valid_out),
+                    ("FEDP and metadata queue went out of sync!"))
 endmodule
+
+// does (m,n,k) = (2,4,2) matmul compute over 2 cycles.
+// matches Figure 10(b) of the paper.
+module VX_tensor_threadgroup #(
+) (
+    input clk,
+    input reset,
+
+    input valid_in,
+    input stall,
+    input [1:0][1:0][31:0] A_frag,
+    input [1:0][3:0][31:0] B_frag,
+    input [1:0][3:0][31:0] C_frag,
+
+    output valid_out,
+    output [1:0][3:0][31:0] D_frag
+
+);
+    // 4 FEDPs per threadgroup
+    // FIXME: experimenting with 8 FEDPs first
+    logic [1:0][3:0] valids;
+    for (genvar D_row = 0; D_row < 2; ++D_row) begin
+      for (genvar D_col = 0; D_col < 4; ++D_col) begin
+        // four-element dot product (FEDP) unit
+        TensorDotProductUnit fedp (
+          .clock (clk),
+          .reset (reset),
+          .io_in_valid      (valid_in),
+          .io_in_bits_a_0   (A_frag[D_row][0]),
+          .io_in_bits_a_1   (A_frag[D_row][1]),
+          .io_in_bits_a_2   (32'h0),
+          .io_in_bits_a_3   (32'h0),
+          .io_in_bits_b_0   (B_frag[0][D_col]),
+          .io_in_bits_b_1   (B_frag[1][D_col]),
+          .io_in_bits_b_2   (32'h0),
+          .io_in_bits_b_3   (32'h0),
+          .io_in_bits_c     (C_frag[D_row][D_col]),
+          .io_stall         (1'b0), // FIXME
+          .io_out_valid     (valids[D_row][D_col]),
+          .io_out_bits_data (D_frag[D_row][D_col])
+        );
+      end
+    end
+
+    assign valid_out = (&(valids[0])) && (&(valids[1]));
+
+    `RUNTIME_ASSERT(reset || !stall, ("stall not supported yet in tensor dpu!"))
+endmodule
+
 `endif

From f5a9ca5bf31fc4ddc70a81b5c7a5e6d8bc751697 Mon Sep 17 00:00:00 2001
From: Hansung Kim <hansung_kim@berkeley.edu>
Date: Wed, 29 May 2024 14:47:25 -0700
Subject: [PATCH 13/31] tensor: Enqueue both insts in pair to issue queue

Otherwise the first-in-pair instructions can run ahead, latching their
inputs for the next pair before the second-in-pair instructions finish
computing on the current one.  This might introduce more frontend
stalls; more experimentation is needed.
---
 hw/rtl/core/VX_tensor_core.sv | 27 +++++++++++++++++----------
 hw/rtl/fpu/VX_tensor_dpu.sv   |  2 +-
 2 files changed, 18 insertions(+), 11 deletions(-)

diff --git a/hw/rtl/core/VX_tensor_core.sv b/hw/rtl/core/VX_tensor_core.sv
index 5f32f504..2fc54fc5 100644
--- a/hw/rtl/core/VX_tensor_core.sv
+++ b/hw/rtl/core/VX_tensor_core.sv
@@ -219,7 +219,7 @@ module VX_tensor_core_warp import VX_gpu_pkg::*; #(
 
         VX_fifo_queue #(
             .DATAW(DATAW),
-            .DEPTH(4 /* FIXME: arbitrary */)
+            .DEPTH(8 /* FIXME: arbitrary */)
         ) pending_uops (
             .clk(clk),
             .reset(reset),
@@ -335,7 +335,8 @@ module VX_tensor_octet #(
     assign operands_ready     = inbuf_ready_in;
     assign operands_valid_buf = !inbuf_empty;
 
-    wire inbuf_enq = operands_ready && operands_valid && operands_last_in_pair;
+    // wire inbuf_enq = operands_ready && operands_valid && operands_last_in_pair;
+    wire inbuf_enq = operands_ready && operands_valid;
     wire inbuf_deq = operands_valid_buf && operands_ready_buf;
 
     // the 'issue queue' for the dpu.
@@ -350,7 +351,7 @@ module VX_tensor_octet #(
     VX_fifo_queue #(
         .DATAW   ($bits(A_in) + $bits(B_in) + $bits(C_in) +
                   $bits(operands_wid) + $bits(operands_step) + $bits(operands_last_in_pair)),
-        .DEPTH   (4 /* FIXME: arbitrary */)
+        .DEPTH   (8 /* FIXME: arbitrary */)
     ) input_buffer (
         .clk   (clk),
         .reset (reset),
@@ -365,6 +366,9 @@ module VX_tensor_octet #(
         `UNUSED_PIN(size)
     );
 
+    // FIXME: this shouldn't be necessary
+    `RUNTIME_ASSERT(reset || !inbuf_full, ("dpu issue queue is full!"))
+
     typedef struct {
       logic [3:0][31:0] A_half;
       logic [3:0][31:0] B_half;
@@ -411,8 +415,8 @@ module VX_tensor_octet #(
     assign halves_buf = get_operand_half(operands_step_buf, A_in_buf, B_in_buf, C_in_buf);
 
     wire do_hmma = operands_ready_buf && operands_valid_buf && operands_last_in_pair_buf;
-    wire operands_first_in_pair_fire = operands_ready && operands_valid && (!operands_last_in_pair);
-    // wire operands_first_in_pair_fire = operands_ready && operands_valid;
+    // wire operands_first_in_pair_fire = operands_ready && operands_valid && (!operands_last_in_pair);
+    wire operands_first_in_pair_fire = operands_ready_buf && operands_valid_buf && (!operands_last_in_pair_buf);
 
     always @(*) begin
         A_buffer_n = A_buffer;
@@ -421,10 +425,10 @@ module VX_tensor_octet #(
         substeps_n = substeps;
         
         if (operands_first_in_pair_fire) begin
-          substeps_n[operands_wid] = 1'b1; // ready for hmma
-          A_buffer_n[operands_wid] = halves.A_half;
-          B_buffer_n[operands_wid] = halves.B_half;
-          C_buffer_n[operands_wid] = halves.C_half;
+          substeps_n[operands_wid_buf] = 1'b1; // ready for hmma
+          A_buffer_n[operands_wid_buf] = halves_buf.A_half;
+          B_buffer_n[operands_wid_buf] = halves_buf.B_half;
+          C_buffer_n[operands_wid_buf] = halves_buf.C_half;
         end
         if (do_hmma) begin
           substeps_n[operands_wid_buf] = 1'b0; // finished hmma, ready for next operand
@@ -521,7 +525,7 @@ module VX_tensor_octet #(
     // TODO: This is probably oversized.
     VX_fifo_queue #(
         .DATAW   ($bits(D_wid) + $bits(D_out)),
-        .DEPTH   (4 /* FIXME: arbitrary */)
+        .DEPTH   (8 /* FIXME: arbitrary */)
     ) output_buffer (
         .clk   (clk),
         .reset (reset),
@@ -536,6 +540,9 @@ module VX_tensor_octet #(
         `UNUSED_PIN(size)
     );
 
+    // FIXME: this shouldn't be necessary
+    `RUNTIME_ASSERT(reset || !outbuf_full, ("dpu result queue is full!"))
+
 `ifdef PERF_ENABLE
     logic [`PERF_CTR_BITS-1:0] perf_tensor_dpu_total;
 
diff --git a/hw/rtl/fpu/VX_tensor_dpu.sv b/hw/rtl/fpu/VX_tensor_dpu.sv
index 33529370..90c2c7ed 100644
--- a/hw/rtl/fpu/VX_tensor_dpu.sv
+++ b/hw/rtl/fpu/VX_tensor_dpu.sv
@@ -95,7 +95,7 @@ module VX_tensor_dpu #(
     VX_shift_register #(
         .DATAW  (1 + $bits(wid)/* + $bits(D_tile)*/),
         // .DEPTH  (`LATENCY_HMMA),
-        .DEPTH  (2),
+        .DEPTH  (4),
         .RESETW (1)
     ) shift_reg (
         .clk      (clk),

From 5ed6041e33bf6c00000bdf322bd814f1c27b71e5 Mon Sep 17 00:00:00 2001
From: Hansung Kim <hansung_kim@berkeley.edu>
Date: Wed, 29 May 2024 17:05:12 -0700
Subject: [PATCH 14/31] tensor: Properly stall dpu upon commit backpressure

& better-reasoned queue depths
---
 hw/rtl/core/VX_tensor_core.sv | 29 ++++++++++++++---------------
 hw/rtl/fpu/VX_tensor_dpu.sv   | 28 ++++++++++++----------------
 2 files changed, 26 insertions(+), 31 deletions(-)

diff --git a/hw/rtl/core/VX_tensor_core.sv b/hw/rtl/core/VX_tensor_core.sv
index 2fc54fc5..71b17e08 100644
--- a/hw/rtl/core/VX_tensor_core.sv
+++ b/hw/rtl/core/VX_tensor_core.sv
@@ -77,6 +77,7 @@ module VX_tensor_core_warp import VX_gpu_pkg::*; #(
     // octet. E.g. two tgs map lane 0-3 and lane 16-19 -> 16
     // FIXME: not sure this is the right logic.  just filling in what works
     localparam LANE_OFFSET_THREADGROUP = (4 * NUM_OCTETS);
+    localparam REQ_QUEUE_DEPTH = 4;
 
     wire [1:0] step = 2'(execute_if.data.op_type);
     wire last_in_pair = (execute_if.data.op_mod == `INST_MOD_BITS'(1));
@@ -219,7 +220,7 @@ module VX_tensor_core_warp import VX_gpu_pkg::*; #(
 
         VX_fifo_queue #(
             .DATAW(DATAW),
-            .DEPTH(8 /* FIXME: arbitrary */)
+            .DEPTH(REQ_QUEUE_DEPTH)
         ) pending_uops (
             .clk(clk),
             .reset(reset),
@@ -234,6 +235,8 @@ module VX_tensor_core_warp import VX_gpu_pkg::*; #(
             `UNUSED_PIN(size)
         );
 
+        // this shouldn't really happen unless there's a big contention over
+        // the commit stage
         `RUNTIME_ASSERT(!(!reset && full), ("tensor core uop queue is full!"));
     end
 
@@ -300,6 +303,8 @@ module VX_tensor_octet #(
     output result_valid,
     input result_ready
 );
+    localparam ISSUE_QUEUE_DEPTH = 4;
+
     // 512 bits/octet * 4 octets per warp
     logic [`NUM_WARPS-1:0][3:0][31:0] A_buffer, A_buffer_n;
     logic [`NUM_WARPS-1:0][3:0][31:0] B_buffer, B_buffer_n;
@@ -351,7 +356,7 @@ module VX_tensor_octet #(
     VX_fifo_queue #(
         .DATAW   ($bits(A_in) + $bits(B_in) + $bits(C_in) +
                   $bits(operands_wid) + $bits(operands_step) + $bits(operands_last_in_pair)),
-        .DEPTH   (8 /* FIXME: arbitrary */)
+        .DEPTH   (ISSUE_QUEUE_DEPTH)
     ) input_buffer (
         .clk   (clk),
         .reset (reset),
@@ -451,17 +456,8 @@ module VX_tensor_octet #(
     end
 
     wire outbuf_ready_in;
-    // backpressure from commit
-    wire stall = ~outbuf_ready_in;
     wire hmma_ready;
-
-    // assign operands_ready = ~stall;
-    // TODO: Below line is to only allow 1 warp to occupy the octet at a time;
-    // currently, dpu is fully-pipelined and allows concurrency between
-    // multiple warps.  This seems to be not a problem though given that the
-    // RF operand read takes >=2 cycles, which should be the end-to-end
-    // latency of the DPU anyways
-    assign operands_ready_buf = hmma_ready && ~stall;
+    assign operands_ready_buf = hmma_ready;
 
     // A is 4x2 fp32 matrix
     wire [3:0][1:0][31:0] A_tile = {
@@ -496,8 +492,6 @@ module VX_tensor_octet #(
         .clk(clk),
         .reset(reset),
 
-        .stall(stall),
-        
         .valid_in(do_hmma),
         .ready_in(hmma_ready),
         .A_tile(A_tile),
@@ -506,12 +500,14 @@ module VX_tensor_octet #(
         .wid(operands_wid_buf),
 
         .valid_out(dpu_valid),
+        .ready_out(outbuf_ready_in),
         .D_tile(D_tile),
         .D_wid(D_wid_dpu)
     );
 
     wire outbuf_empty;
     wire outbuf_full;
+    // backpressure from commit
     assign outbuf_ready_in = ~outbuf_full;
     assign result_valid    = ~outbuf_empty;
 
@@ -525,7 +521,10 @@ module VX_tensor_octet #(
     // TODO: This is probably oversized.
     VX_fifo_queue #(
         .DATAW   ($bits(D_wid) + $bits(D_out)),
-        .DEPTH   (8 /* FIXME: arbitrary */)
+        // depth of this queue should ideally be deeper than the dpu pipeline
+        // latency, since the dpu is fully-pipelined and it can output the
+        // latency-number of outputs in a burst-y way.
+        .DEPTH   (`LATENCY_HMMA + `LATENCY_HMMA)
     ) output_buffer (
         .clk   (clk),
         .reset (reset),
diff --git a/hw/rtl/fpu/VX_tensor_dpu.sv b/hw/rtl/fpu/VX_tensor_dpu.sv
index 90c2c7ed..51112c96 100644
--- a/hw/rtl/fpu/VX_tensor_dpu.sv
+++ b/hw/rtl/fpu/VX_tensor_dpu.sv
@@ -8,8 +8,6 @@ module VX_tensor_dpu #(
     input clk,
     input reset,
 
-    input stall,
-
     input valid_in,
     output ready_in,
     input [3:0][1:0][31:0] A_tile,
@@ -18,6 +16,7 @@ module VX_tensor_dpu #(
     input [`NW_WIDTH-1:0]  wid,
 
     output valid_out,
+    input  ready_out,
     output [3:0][3:0][31:0] D_tile,
     output [`NW_WIDTH-1:0]  D_wid
 );
@@ -40,10 +39,11 @@ module VX_tensor_dpu #(
     end
 
     // ready as soon as valid_out
-    assign ready_in = ready_reg || valid_out;
+    // assign ready_in = ready_reg || valid_out;
 
-    // fully pipelined; always ready
-    // assign ready_in = 1'b1;
+    // fully pipelined; ready_in is coupled to ready_out by immediately
+    // stalling
+    assign ready_in = ready_out;
 
     // wire        dpu_valid;
     // wire [31:0] dpu_data;
@@ -70,8 +70,8 @@ module VX_tensor_dpu #(
     ) threadgroup_0 (
         .clk   (clk),
         .reset (reset),
-        .valid_in  (valid_in && ready_in),
-        .stall     (stall),
+        .valid_in  (valid_in),
+        .stall     (!ready_out),
         .A_frag    (A_tile[1:0]),
         .B_frag    (B_tile),
         .C_frag    (C_tile[1:0]),
@@ -82,8 +82,8 @@ module VX_tensor_dpu #(
     ) threadgroup_1 (
         .clk   (clk),
         .reset (reset),
-        .valid_in  (valid_in && ready_in),
-        .stall     (stall),
+        .valid_in  (valid_in),
+        .stall     (!ready_out),
         .A_frag    (A_tile[3:2]),
         .B_frag    (B_tile),
         .C_frag    (C_tile[3:2]),
@@ -94,18 +94,16 @@ module VX_tensor_dpu #(
     // fixed-latency queue
     VX_shift_register #(
         .DATAW  (1 + $bits(wid)/* + $bits(D_tile)*/),
-        // .DEPTH  (`LATENCY_HMMA),
-        .DEPTH  (4),
+        .DEPTH  (`LATENCY_HMMA),
         .RESETW (1)
     ) shift_reg (
         .clk      (clk),
         .reset    (reset),
-        .enable   (~stall),
+        .enable   (ready_out),
         .data_in  ({valid_in && ready_in, wid  /*, result_hmma*/}),
         .data_out ({valid_out,            D_wid/*, D_tile     */})
     );
 
-    // FIXME: breaks when stall is on!
     `RUNTIME_ASSERT(reset || (&(threadgroup_valids) == valid_out),
                     ("FEDP and metadata queue went out of sync!"))
 endmodule
@@ -146,7 +144,7 @@ module VX_tensor_threadgroup #(
           .io_in_bits_b_2   (32'h0),
           .io_in_bits_b_3   (32'h0),
           .io_in_bits_c     (C_frag[D_row][D_col]),
-          .io_stall         (1'b0), // FIXME
+          .io_stall         (stall),
           .io_out_valid     (valids[D_row][D_col]),
           .io_out_bits_data (D_frag[D_row][D_col])
         );
@@ -154,8 +152,6 @@ module VX_tensor_threadgroup #(
     end
 
     assign valid_out = (&(valids[0])) && (&(valids[1]));
-
-    `RUNTIME_ASSERT(reset || !stall, ("stall not supported yet in tensor dpu!"))
 endmodule
 
 `endif

From 35273b3d742b4391a4edc1c689b1acf85b75f4d6 Mon Sep 17 00:00:00 2001
From: Hansung Kim <hansung_kim@berkeley.edu>
Date: Wed, 29 May 2024 17:14:54 -0700
Subject: [PATCH 15/31] Set correct dpu hmma latency

---
 hw/rtl/VX_config.vh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/hw/rtl/VX_config.vh b/hw/rtl/VX_config.vh
index 5ef71794..8905bd3d 100644
--- a/hw/rtl/VX_config.vh
+++ b/hw/rtl/VX_config.vh
@@ -391,7 +391,7 @@
 
 // Tensor Core Latency
 `ifndef LATENCY_HMMA
-`define LATENCY_HMMA 2
+`define LATENCY_HMMA 4
 `endif
 
 // Icache Configurable Knobs //////////////////////////////////////////////////

From 73a2f5781e242746f076eadaed36a391bcf34951 Mon Sep 17 00:00:00 2001
From: Hansung Kim <hansung_kim@berkeley.edu>
Date: Wed, 29 May 2024 22:01:03 -0700
Subject: [PATCH 16/31] Do two-cycle compute with 1 FEDP per lane

---
 hw/rtl/fpu/VX_tensor_dpu.sv | 199 +++++++++++++++++++++++++++---------
 1 file changed, 150 insertions(+), 49 deletions(-)

diff --git a/hw/rtl/fpu/VX_tensor_dpu.sv b/hw/rtl/fpu/VX_tensor_dpu.sv
index 51112c96..faace3f0 100644
--- a/hw/rtl/fpu/VX_tensor_dpu.sv
+++ b/hw/rtl/fpu/VX_tensor_dpu.sv
@@ -30,47 +30,43 @@ module VX_tensor_dpu #(
     always @(posedge clk) begin
         if (reset) begin
             ready_reg <= '1;
-        end else if (valid_in) begin
+        end else if (valid_in && ready_in) begin
             ready_reg <= '0;
             dpi_print_results(int'(ISW), int'(OCTET), A_tile, B_tile, C_tile, result_hmma);
-        end else if (valid_out) begin
+        end else if (valid_out && ready_out) begin
             ready_reg <= '1;
         end
     end
 
     // ready as soon as valid_out
-    // assign ready_in = ready_reg || valid_out;
+    // assign ready_in = ready_reg;
 
     // fully pipelined; ready_in is coupled to ready_out by immediately
     // stalling
-    assign ready_in = ready_out;
+    // assign ready_in = ready_out;
 
-    // wire        dpu_valid;
-    // wire [31:0] dpu_data;
-    // TensorDotProductUnit dpu_pipe (
-    //   .clock (clk),
-    //   .reset (reset),
-    //   .io_in_valid  (valid_in && ready_in),
-    //   .io_in_bits_a_0 (32'h40000000),
-    //   .io_in_bits_a_1 (32'h40000000),
-    //   .io_in_bits_a_2 (32'h40000000),
-    //   .io_in_bits_a_3 (32'h40000000),
-    //   .io_in_bits_b_0 (32'h40000000),
-    //   .io_in_bits_b_1 (32'h40000000),
-    //   .io_in_bits_b_2 (32'h40000000),
-    //   .io_in_bits_b_3 (32'h40000000),
-    //   .io_in_bits_c   (32'h3f800000),
-    //   .io_out_valid (dpu_valid),
-    //   .io_out_bits_data (dpu_data)
+    // // fixed-latency queue
+    // VX_shift_register #(
+    //     .DATAW  (1 + $bits(wid)/* + $bits(D_tile)*/),
+    //     .DEPTH  (`LATENCY_HMMA + 1),
+    //     .RESETW (1)
+    // ) shift_reg (
+    //     .clk      (clk),
+    //     .reset    (reset),
+    //     .enable   (ready_out),
+    //     .data_in  ({valid_in && ready_in, wid  /*, result_hmma*/}),
+    //     .data_out ({valid_out,            D_wid/*, D_tile     */})
     // );
 
     logic [1:0] threadgroup_valids;
+    logic [1:0] threadgroup_readys;
     // B_tile is shared across the two threadgroups; see Figure 13
     VX_tensor_threadgroup #(
     ) threadgroup_0 (
         .clk   (clk),
         .reset (reset),
         .valid_in  (valid_in),
+        .ready_in  (threadgroup_readys[0]),
         .stall     (!ready_out),
         .A_frag    (A_tile[1:0]),
         .B_frag    (B_tile),
@@ -83,6 +79,7 @@ module VX_tensor_dpu #(
         .clk   (clk),
         .reset (reset),
         .valid_in  (valid_in),
+        .ready_in  (threadgroup_readys[1]),
         .stall     (!ready_out),
         .A_frag    (A_tile[3:2]),
         .B_frag    (B_tile),
@@ -91,21 +88,36 @@ module VX_tensor_dpu #(
         .D_frag    (D_tile[3:2])
     );
 
-    // fixed-latency queue
-    VX_shift_register #(
-        .DATAW  (1 + $bits(wid)/* + $bits(D_tile)*/),
-        .DEPTH  (`LATENCY_HMMA),
-        .RESETW (1)
-    ) shift_reg (
-        .clk      (clk),
-        .reset    (reset),
-        .enable   (ready_out),
-        .data_in  ({valid_in && ready_in, wid  /*, result_hmma*/}),
-        .data_out ({valid_out,            D_wid/*, D_tile     */})
+    wire empty;
+    wire full;
+    wire enq = valid_in && ready_in;
+    wire deq = valid_out && ready_out;
+
+    assign ready_in  = &(threadgroup_readys);
+    assign valid_out = &(threadgroup_valids);
+
+    // need to pass along warp id's to do multithreading
+    VX_fifo_queue #(
+        .DATAW   ($bits(wid)),
+        .DEPTH   (`LATENCY_HMMA + `LATENCY_HMMA)
+    ) wid_queue (
+        .clk   (clk),
+        .reset (reset),
+        .push      (enq),
+        .pop       (deq),
+        .data_in   (wid),
+        .data_out  (D_wid),
+        .empty     (empty),
+        `UNUSED_PIN(alm_empty),
+        .full      (full), // should be impossible to overflow
+        `UNUSED_PIN(alm_full),
+        `UNUSED_PIN(size)
     );
 
-    `RUNTIME_ASSERT(reset || (&(threadgroup_valids) == valid_out),
-                    ("FEDP and metadata queue went out of sync!"))
+    `RUNTIME_ASSERT(reset || !full, ("dpu wid queue is full!"))
+
+    // `RUNTIME_ASSERT(reset || (&(threadgroup_valids) == valid_out),
+    //                 ("FEDP and metadata queue went out of sync!"))
 endmodule
 
 // does (m,n,k) = (2,4,2) matmul compute over 2 cycles.
@@ -116,6 +128,7 @@ module VX_tensor_threadgroup #(
     input reset,
 
     input valid_in,
+    output ready_in,
     input stall,
     input [1:0][1:0][31:0] A_frag,
     input [1:0][3:0][31:0] B_frag,
@@ -123,35 +136,123 @@ module VX_tensor_threadgroup #(
 
     output valid_out,
     output [1:0][3:0][31:0] D_frag
-
 );
+    wire [1:0][1:0][31:0] A_frag_buf;
+    wire [1:0][3:0][31:0] B_frag_buf;
+    wire [1:0][3:0][31:0] C_frag_buf;
+
+    wire valid_buf;
+    wire ready_buf;
+
+    wire enq = valid_in && ready_in;
+    wire deq = valid_buf && ready_buf;
+    wire empty;
+    wire full;
+    assign ready_in  = !full;
+    assign valid_buf = !empty;
+
+    VX_fifo_queue #(
+        .DATAW ($bits(A_frag) + $bits(B_frag) + $bits(C_frag)),
+        .DEPTH   (4)
+    ) input_buffer (
+        .clk       (clk),
+        .reset     (reset),
+        .push      (enq),
+        .pop       (deq),
+        .data_in   ({A_frag,     B_frag,     C_frag}),
+        .data_out  ({A_frag_buf, B_frag_buf, C_frag_buf}),
+        .empty     (empty),
+        `UNUSED_PIN(alm_empty),
+        .full      (full),
+        `UNUSED_PIN(alm_full),
+        `UNUSED_PIN(size)
+    );
+
+    logic [3:0] fedp_valids;
+    wire fedp_valid_out = &(fedp_valids);
+    wire fedp_ready_out = !stall;
+    wire fedp_fire_out  = fedp_valid_out && fedp_ready_out;
+
+    wire fedp_valid_in = valid_buf;
+    wire fedp_ready_in = fedp_ready_out; // coupled
+    wire fedp_fire_in  = fedp_valid_in && fedp_ready_in;
+
+    // 0: FEDP uses first half from input_buffer
+    // 1: FEDP uses last half and pops input_buffer
+    logic step_in;
+    // 0: FEDP produces first half of D_frag
+    // 1: FEDP produces last half of D_frag and asserts valid_out
+    logic step_out;
+    assign ready_buf = fedp_fire_in && (step_in == 1'b1);
+
+    // FIXME shrink size
+    logic [1:0][3:0][31:0] D_reg, D_reg_n;
+    wire  [3:0][31:0] D_half;
+    always @(*) begin
+        D_reg_n = D_reg;
+
+        if (fedp_fire_out) begin
+            if (step_out == 1'b0) begin
+                D_reg_n[0][0] = D_half[0];
+                D_reg_n[0][2] = D_half[1];
+                D_reg_n[1][0] = D_half[2];
+                D_reg_n[1][2] = D_half[3];
+            end
+        end
+    end
+
+    always @(posedge clk) begin
+        if (reset) begin
+            step_in <= '0;
+            step_out <= '0;
+
+            D_reg <= '0;
+        end else begin
+            if (fedp_fire_in) begin
+                step_in <= ~step_in;
+            end
+            if (fedp_fire_out) begin
+                step_out <= ~step_out;
+            end
+
+            D_reg <= D_reg_n;
+        end
+    end
+
+    assign D_frag[0][0] = D_reg[0][0];
+    assign D_frag[0][2] = D_reg[0][2];
+    assign D_frag[1][0] = D_reg[1][0];
+    assign D_frag[1][2] = D_reg[1][2];
+    assign D_frag[0][1] = D_half[0];
+    assign D_frag[0][3] = D_half[1];
+    assign D_frag[1][1] = D_half[2];
+    assign D_frag[1][3] = D_half[3];
+
     // 4 FEDPs per threadgroup
-    // FIXME: experimenting with 8 FEDPs first
-    logic [1:0][3:0] valids;
-    for (genvar D_row = 0; D_row < 2; ++D_row) begin
-      for (genvar D_col = 0; D_col < 4; ++D_col) begin
+    for (genvar i = 0; i < 4; ++i) begin
+        localparam int d_row = i / 2;
+        localparam int d_col = (i % 2) * 2;
         // four-element dot product (FEDP) unit
         TensorDotProductUnit fedp (
           .clock (clk),
           .reset (reset),
-          .io_in_valid      (valid_in),
-          .io_in_bits_a_0   (A_frag[D_row][0]),
-          .io_in_bits_a_1   (A_frag[D_row][1]),
+          .io_in_valid      (fedp_fire_in),
+          .io_in_bits_a_0   (A_frag_buf[d_row][0]),
+          .io_in_bits_a_1   (A_frag_buf[d_row][1]),
           .io_in_bits_a_2   (32'h0),
           .io_in_bits_a_3   (32'h0),
-          .io_in_bits_b_0   (B_frag[0][D_col]),
-          .io_in_bits_b_1   (B_frag[1][D_col]),
+          .io_in_bits_b_0   (step_in == 1'b0 ? B_frag_buf[0][d_col] : B_frag_buf[0][d_col + 1]),
+          .io_in_bits_b_1   (step_in == 1'b0 ? B_frag_buf[1][d_col] : B_frag_buf[1][d_col + 1]),
           .io_in_bits_b_2   (32'h0),
           .io_in_bits_b_3   (32'h0),
-          .io_in_bits_c     (C_frag[D_row][D_col]),
+          .io_in_bits_c     (step_in == 1'b0 ? C_frag_buf[d_row][d_col] : C_frag_buf[d_row][d_col + 1]),
           .io_stall         (stall),
-          .io_out_valid     (valids[D_row][D_col]),
-          .io_out_bits_data (D_frag[D_row][D_col])
+          .io_out_valid     (fedp_valids[i]),
+          .io_out_bits_data (D_half[i])
         );
-      end
     end
 
-    assign valid_out = (&(valids[0])) && (&(valids[1]));
+    assign valid_out = fedp_valid_out && (step_out == 1'b1);
 endmodule
 
 `endif

From 2e2decc8b6fa2877b0d844b4a0395589d5e146e9 Mon Sep 17 00:00:00 2001
From: Hansung Kim <hansung_kim@berkeley.edu>
Date: Thu, 30 May 2024 12:46:45 -0700
Subject: [PATCH 17/31] Shrink size of D_half latch

---
 hw/rtl/fpu/VX_tensor_dpu.sv | 18 +++++++-----------
 1 file changed, 7 insertions(+), 11 deletions(-)

diff --git a/hw/rtl/fpu/VX_tensor_dpu.sv b/hw/rtl/fpu/VX_tensor_dpu.sv
index faace3f0..49d2418d 100644
--- a/hw/rtl/fpu/VX_tensor_dpu.sv
+++ b/hw/rtl/fpu/VX_tensor_dpu.sv
@@ -185,18 +185,14 @@ module VX_tensor_threadgroup #(
     logic step_out;
     assign ready_buf = fedp_fire_in && (step_in == 1'b1);
 
-    // FIXME shrink size
-    logic [1:0][3:0][31:0] D_reg, D_reg_n;
+    // latch the first-half result of D_frag
+    logic [3:0][31:0] D_reg, D_reg_n;
     wire  [3:0][31:0] D_half;
     always @(*) begin
         D_reg_n = D_reg;
-
         if (fedp_fire_out) begin
             if (step_out == 1'b0) begin
-                D_reg_n[0][0] = D_half[0];
-                D_reg_n[0][2] = D_half[1];
-                D_reg_n[1][0] = D_half[2];
-                D_reg_n[1][2] = D_half[3];
+                D_reg_n = D_half;
             end
         end
     end
@@ -219,10 +215,10 @@ module VX_tensor_threadgroup #(
         end
     end
 
-    assign D_frag[0][0] = D_reg[0][0];
-    assign D_frag[0][2] = D_reg[0][2];
-    assign D_frag[1][0] = D_reg[1][0];
-    assign D_frag[1][2] = D_reg[1][2];
+    assign D_frag[0][0] = D_reg[0];
+    assign D_frag[0][2] = D_reg[1];
+    assign D_frag[1][0] = D_reg[2];
+    assign D_frag[1][2] = D_reg[3];
     assign D_frag[0][1] = D_half[0];
     assign D_frag[0][3] = D_half[1];
     assign D_frag[1][1] = D_half[2];

From 2743d32bd2658b362656088f45736942a6e699bc Mon Sep 17 00:00:00 2001
From: Hansung Kim <hansung_kim@berkeley.edu>
Date: Thu, 30 May 2024 15:25:00 -0700
Subject: [PATCH 18/31] tensor: Handle wid queue backpressure in dpu

---
 hw/rtl/core/VX_tensor_core.sv | 4 ++--
 hw/rtl/fpu/VX_tensor_dpu.sv   | 6 ++----
 2 files changed, 4 insertions(+), 6 deletions(-)

diff --git a/hw/rtl/core/VX_tensor_core.sv b/hw/rtl/core/VX_tensor_core.sv
index 71b17e08..2ddd6a70 100644
--- a/hw/rtl/core/VX_tensor_core.sv
+++ b/hw/rtl/core/VX_tensor_core.sv
@@ -77,7 +77,7 @@ module VX_tensor_core_warp import VX_gpu_pkg::*; #(
     // octet. E.g. two tgs map lane 0-3 and lane 16-19 -> 16
     // FIXME: not sure this is the right logic.  just filling in what works
     localparam LANE_OFFSET_THREADGROUP = (4 * NUM_OCTETS);
-    localparam REQ_QUEUE_DEPTH = 4;
+    localparam METADATA_QUEUE_DEPTH = 4;
 
     wire [1:0] step = 2'(execute_if.data.op_type);
     wire last_in_pair = (execute_if.data.op_mod == `INST_MOD_BITS'(1));
@@ -220,7 +220,7 @@ module VX_tensor_core_warp import VX_gpu_pkg::*; #(
 
         VX_fifo_queue #(
             .DATAW(DATAW),
-            .DEPTH(REQ_QUEUE_DEPTH)
+            .DEPTH(METADATA_QUEUE_DEPTH)
         ) pending_uops (
             .clk(clk),
             .reset(reset),
diff --git a/hw/rtl/fpu/VX_tensor_dpu.sv b/hw/rtl/fpu/VX_tensor_dpu.sv
index 49d2418d..870f6870 100644
--- a/hw/rtl/fpu/VX_tensor_dpu.sv
+++ b/hw/rtl/fpu/VX_tensor_dpu.sv
@@ -93,7 +93,7 @@ module VX_tensor_dpu #(
     wire enq = valid_in && ready_in;
     wire deq = valid_out && ready_out;
 
-    assign ready_in  = &(threadgroup_readys);
+    assign ready_in  = &(threadgroup_readys) && !full;
     assign valid_out = &(threadgroup_valids);
 
     // need to pass along warp id's to do multithreading
@@ -109,13 +109,11 @@ module VX_tensor_dpu #(
         .data_out  (D_wid),
         .empty     (empty),
         `UNUSED_PIN(alm_empty),
-        .full      (full), // should be impossible to overflow
+        .full      (full),
         `UNUSED_PIN(alm_full),
         `UNUSED_PIN(size)
     );
 
-    `RUNTIME_ASSERT(reset || !full, ("dpu wid queue is full!"))
-
     // `RUNTIME_ASSERT(reset || (&(threadgroup_valids) == valid_out),
     //                 ("FEDP and metadata queue went out of sync!"))
 endmodule

From dfb2276657c8ba4f7fcafc18ee0f091ce6e1481d Mon Sep 17 00:00:00 2001
From: Hansung Kim <hansung_kim@berkeley.edu>
Date: Thu, 30 May 2024 17:29:59 -0700
Subject: [PATCH 19/31] tensor: Remove redundant issue queue outside dpu

---
 hw/rtl/core/VX_tensor_core.sv | 83 +++++++++++++++++++----------------
 1 file changed, 46 insertions(+), 37 deletions(-)

diff --git a/hw/rtl/core/VX_tensor_core.sv b/hw/rtl/core/VX_tensor_core.sv
index 2ddd6a70..bedf8245 100644
--- a/hw/rtl/core/VX_tensor_core.sv
+++ b/hw/rtl/core/VX_tensor_core.sv
@@ -333,46 +333,55 @@ module VX_tensor_octet #(
     wire                 operands_last_in_pair_buf;
     wire [1:0]           operands_step_buf;
 
-    wire inbuf_empty;
-    wire inbuf_full;
-    wire inbuf_ready_in;
-    assign inbuf_ready_in     = !inbuf_full;
-    assign operands_ready     = inbuf_ready_in;
-    assign operands_valid_buf = !inbuf_empty;
+    // wire inbuf_empty;
+    // wire inbuf_full;
+    // wire inbuf_ready_in;
+    // assign inbuf_ready_in     = !inbuf_full;
+    // assign operands_ready     = inbuf_ready_in;
+    // assign operands_valid_buf = !inbuf_empty;
 
-    // wire inbuf_enq = operands_ready && operands_valid && operands_last_in_pair;
-    wire inbuf_enq = operands_ready && operands_valid;
-    wire inbuf_deq = operands_valid_buf && operands_ready_buf;
+    // // wire inbuf_enq = operands_ready && operands_valid && operands_last_in_pair;
+    // wire inbuf_enq = operands_ready && operands_valid;
+    // wire inbuf_deq = operands_valid_buf && operands_ready_buf;
 
-    // the 'issue queue' for the dpu.
-    // This exists to decouple the input of the dot-product unit from
-    // execute_if.ready.  execute_if can arrive intermittently according to
-    // the frontend's behavior, and since the dpu can also stall for a fixed
-    // initiation latency, we need to decouple the two to efficiently feed the
-    // dpu.
-    // This only applies to the last instruction in a pair, since the first
-    // instruction only acts to buffer the operands and can execute
-    // immediately without backpressure.  So we don't enqueue them.
-    VX_fifo_queue #(
-        .DATAW   ($bits(A_in) + $bits(B_in) + $bits(C_in) +
-                  $bits(operands_wid) + $bits(operands_step) + $bits(operands_last_in_pair)),
-        .DEPTH   (ISSUE_QUEUE_DEPTH)
-    ) input_buffer (
-        .clk   (clk),
-        .reset (reset),
-        .push      (inbuf_enq),
-        .pop       (inbuf_deq),
-        .data_in   ({A_in,     B_in,     C_in,     operands_wid,     operands_step,     operands_last_in_pair}),
-        .data_out  ({A_in_buf, B_in_buf, C_in_buf, operands_wid_buf, operands_step_buf, operands_last_in_pair_buf}),
-        .empty     (inbuf_empty),
-        `UNUSED_PIN(alm_empty),
-        .full      (inbuf_full),
-        `UNUSED_PIN(alm_full),
-        `UNUSED_PIN(size)
-    );
+    // // the 'issue queue' for the dpu.
+    // // This exists to decouple the input of the dot-product unit from
+    // // execute_if.ready.  execute_if can arrive intermittently according to
+    // // the frontend's behavior, and since the dpu can also stall for a fixed
+    // // initiation latency, we need to decouple the two to efficiently feed the
+    // // dpu.
+    // // This only applies to the last instruction in a pair, since the first
+    // // instruction only acts to buffer the operands and can execute
+    // // immediately without backpressure.  So we don't enqueue them.
+    // VX_fifo_queue #(
+    //     .DATAW   ($bits(A_in) + $bits(B_in) + $bits(C_in) +
+    //               $bits(operands_wid) + $bits(operands_step) + $bits(operands_last_in_pair)),
+    //     .DEPTH   (ISSUE_QUEUE_DEPTH)
+    // ) input_buffer (
+    //     .clk   (clk),
+    //     .reset (reset),
+    //     .push      (inbuf_enq),
+    //     .pop       (inbuf_deq),
+    //     .data_in   ({A_in,     B_in,     C_in,     operands_wid,     operands_step,     operands_last_in_pair}),
+    //     .data_out  ({A_in_buf, B_in_buf, C_in_buf, operands_wid_buf, operands_step_buf, operands_last_in_pair_buf}),
+    //     .empty     (inbuf_empty),
+    //     `UNUSED_PIN(alm_empty),
+    //     .full      (inbuf_full),
+    //     `UNUSED_PIN(alm_full),
+    //     `UNUSED_PIN(size)
+    // );
 
-    // FIXME: this shouldn't be necessary
-    `RUNTIME_ASSERT(reset || !inbuf_full, ("dpu issue queue is full!"))
+    // // FIXME: this shouldn't be necessary
+    // `RUNTIME_ASSERT(reset || !inbuf_full, ("dpu issue queue is full!"))
+
+    assign A_in_buf = A_in;
+    assign B_in_buf = B_in;
+    assign C_in_buf = C_in;
+    assign operands_step_buf         = operands_step;
+    assign operands_wid_buf          = operands_wid;
+    assign operands_last_in_pair_buf = operands_last_in_pair;
+    assign operands_valid_buf = operands_valid;
+    assign operands_ready = operands_ready_buf;
 
     typedef struct {
       logic [3:0][31:0] A_half;

From 06e0f901ff44cad582a8247457956106890a9eab Mon Sep 17 00:00:00 2001
From: Hansung Kim <hansung_kim@berkeley.edu>
Date: Thu, 30 May 2024 17:34:39 -0700
Subject: [PATCH 20/31] tensor: Handle backpressure from metadata queue

---
 hw/rtl/core/VX_tensor_core.sv | 31 +++++++++++++++++--------------
 1 file changed, 17 insertions(+), 14 deletions(-)

diff --git a/hw/rtl/core/VX_tensor_core.sv b/hw/rtl/core/VX_tensor_core.sv
index bedf8245..5d4c02a4 100644
--- a/hw/rtl/core/VX_tensor_core.sv
+++ b/hw/rtl/core/VX_tensor_core.sv
@@ -90,8 +90,6 @@ module VX_tensor_core_warp import VX_gpu_pkg::*; #(
     logic [`NUM_THREADS-1:0][`XLEN-1:0] wb_data_1;
     wire [`NW_WIDTH-1:0] wb_wid;
     
-    assign execute_if.ready = &octet_operands_ready;
-
 `ifdef EXT_T_ENABLE
     for (genvar i = 0; i < NUM_OCTETS; ++i) begin
 `else
@@ -207,16 +205,23 @@ module VX_tensor_core_warp import VX_gpu_pkg::*; #(
 
     wire [`NUM_WARPS-1:0][DATAW-1:0] execute_if_data_deq;
 
+    wire [`NUM_WARPS-1:0] metadata_queue_fulls;
+    // OR not AND, we don't want any warp full
+    wire metadata_queue_full = |(metadata_queue_fulls);
+
+    assign execute_if.ready = &(octet_operands_ready) && !metadata_queue_full;
+
     for (genvar i = 0; i < `NUM_WARPS; i++) begin
-        // execute_if request queue.
+        // Metadata queue for commit_if.  This simply copies execute_if's
+        // metadata and pops them in conjunction with commit fire.
+        //
         // This has to be separated per-warp, as otherwise requests from
         // multiple warps can be enqueued interleaved, which makes it hard to
         // ensure two consecutive dequeues are associated with the same warp for
-        // commit.
+        // commit. (FIXME: this is not strictly necessary though.)
 
         wire enq = execute_if_fire && (execute_if.data.wid == `NW_WIDTH'(i));
         wire deq =  commit_if_fire && (             wb_wid == `NW_WIDTH'(i));
-        wire full;
 
         VX_fifo_queue #(
             .DATAW(DATAW),
@@ -230,16 +235,16 @@ module VX_tensor_core_warp import VX_gpu_pkg::*; #(
             .data_out(execute_if_data_deq[i]),
             `UNUSED_PIN(empty),
             `UNUSED_PIN(alm_empty),
-            .full(full), // should be impossible to overflow
+            .full(metadata_queue_fulls[i]),
             `UNUSED_PIN(alm_full),
             `UNUSED_PIN(size)
         );
-
-        // this shouldn't really happen unless there's a big contention over
-        // the commit stage
-        `RUNTIME_ASSERT(!(!reset && full), ("tensor core uop queue is full!"));
     end
 
+    // this shouldn't really happen unless there's a big contention over
+    // the commit stage
+    `RUNTIME_ASSERT(!(!reset && metadata_queue_full), ("tensor core uop queue is full!"));
+
     // unlike execute which can be interleaved between warps, commit is
     // serialized and completed one-warp-by-warp, therefore we only need to
     // keep one subcommit state bit unlike for `substeps`
@@ -527,13 +532,11 @@ module VX_tensor_octet #(
     // is complete.  This decouples the irregular dpu output traffic from the
     // regular, every-2-cycle commit traffic to ensure the commit pipeline is
     // used more efficiently.
+    // FIXME: unnecessary?
     // TODO: This is probably oversized.
     VX_fifo_queue #(
         .DATAW   ($bits(D_wid) + $bits(D_out)),
-        // depth of this queue should ideally be deeper than the dpu pipeline
-        // latency, since the dpu is fully-pipelined and it can output the
-        // latency-number of outputs in a burst-y way.
-        .DEPTH   (`LATENCY_HMMA + `LATENCY_HMMA)
+        .DEPTH   (`LATENCY_HMMA)
     ) output_buffer (
         .clk   (clk),
         .reset (reset),

From 97f37b1c75d4efbca80cd8c9bde639c3500f4e8c Mon Sep 17 00:00:00 2001
From: Hansung Kim <hansung_kim@berkeley.edu>
Date: Thu, 30 May 2024 18:00:26 -0700
Subject: [PATCH 21/31] tensor: Add commit stall injection for debugging

---
 hw/rtl/core/VX_tensor_core.sv | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

diff --git a/hw/rtl/core/VX_tensor_core.sv b/hw/rtl/core/VX_tensor_core.sv
index 5d4c02a4..b00d0a46 100644
--- a/hw/rtl/core/VX_tensor_core.sv
+++ b/hw/rtl/core/VX_tensor_core.sv
@@ -251,7 +251,27 @@ module VX_tensor_core_warp import VX_gpu_pkg::*; #(
     logic subcommit, subcommit_n;
 
     wire all_valid = (& octet_results_valid);
+
+// define this to inject artificial commit backpressure for debugging
+`define INJECT_COMMIT_BACKPRESSURE
+`ifndef INJECT_COMMIT_BACKPRESSURE
     assign commit_if.valid = all_valid;
+    assign commit_if_ready_override = commit_if.ready;
+`else
+    logic [1:0] counter;
+    always @(posedge clk) begin
+        if (reset) begin
+            counter <= '0;
+        end else begin
+            if (all_valid) begin
+                counter <= counter + 1'b1;
+            end
+        end
+    end
+
+    assign commit_if.valid = all_valid && (counter == 2'b0);
+    assign commit_if_ready_override = commit_if.ready && (counter == 2'b0);
+`endif
 
     localparam COMMIT_DATAW = `UUID_WIDTH + `NW_WIDTH + `NUM_THREADS + `XLEN + 1 + `NR_BITS + (`NUM_THREADS * `XLEN) + 1 + 1 + 1;
     wire [COMMIT_DATAW-1:0] commit_if_data = {

From 0a032ab400ae94d525d40dc8673e7b3b8d56e89b Mon Sep 17 00:00:00 2001
From: Hansung Kim <hansung_kim@berkeley.edu>
Date: Thu, 30 May 2024 18:03:04 -0700
Subject: [PATCH 22/31] tensor: Fix out-of-sync enqueue to dpu and metadata
 queue

---
 hw/rtl/core/VX_tensor_core.sv | 92 ++++++++---------------------------
 hw/rtl/fpu/VX_tensor_dpu.sv   | 19 ++++++--
 2 files changed, 37 insertions(+), 74 deletions(-)

diff --git a/hw/rtl/core/VX_tensor_core.sv b/hw/rtl/core/VX_tensor_core.sv
index b00d0a46..44485ccb 100644
--- a/hw/rtl/core/VX_tensor_core.sv
+++ b/hw/rtl/core/VX_tensor_core.sv
@@ -77,7 +77,8 @@ module VX_tensor_core_warp import VX_gpu_pkg::*; #(
     // octet. E.g. two tgs map lane 0-3 and lane 16-19 -> 16
     // FIXME: not sure this is the right logic.  just filling in what works
     localparam LANE_OFFSET_THREADGROUP = (4 * NUM_OCTETS);
-    localparam METADATA_QUEUE_DEPTH = 4;
+    // this is only a rule of thumb
+    localparam METADATA_QUEUE_DEPTH = `LATENCY_HMMA;
 
     wire [1:0] step = 2'(execute_if.data.op_type);
     wire last_in_pair = (execute_if.data.op_mod == `INST_MOD_BITS'(1));
@@ -89,7 +90,11 @@ module VX_tensor_core_warp import VX_gpu_pkg::*; #(
     logic [`NUM_THREADS-1:0][`XLEN-1:0] wb_data_0;
     logic [`NUM_THREADS-1:0][`XLEN-1:0] wb_data_1;
     wire [`NW_WIDTH-1:0] wb_wid;
-    
+
+    // valid signal synced between the functional units (octet) and the
+    // metadata queue
+    wire operands_valid_synced;
+
 `ifdef EXT_T_ENABLE
     for (genvar i = 0; i < NUM_OCTETS; ++i) begin
 `else
@@ -121,7 +126,7 @@ module VX_tensor_core_warp import VX_gpu_pkg::*; #(
             .A_in(octet_A),
             .B_in(octet_B),
             .C_in(octet_C),
-            .operands_valid(execute_if.valid),
+            .operands_valid(operands_valid_synced),
             .operands_wid(execute_if.data.wid),
             .operands_last_in_pair(last_in_pair),
             .operands_step(step),
@@ -172,8 +177,10 @@ module VX_tensor_core_warp import VX_gpu_pkg::*; #(
 
     localparam DATAW = `UUID_WIDTH + `NW_WIDTH + `NUM_THREADS + `XLEN + 1 + `NR_BITS;
     
-    wire execute_if_fire = execute_if.valid && execute_if.ready;
-    wire commit_if_fire = commit_if.valid && commit_if.ready;
+    wire commit_if_ready_override;
+
+    wire operand_enq_fire = operands_valid_synced && execute_if.ready;
+    wire commit_if_fire = commit_if.valid && commit_if_ready_override;
     wire [DATAW-1:0] execute_if_data_enq = {
         execute_if.data.uuid, 
         execute_if.data.wid,
@@ -184,31 +191,14 @@ module VX_tensor_core_warp import VX_gpu_pkg::*; #(
         // pid/sop/eop set later
     };
 
-    // wire [DATAW-1:0] execute_if_data_deq;
-
-    // VX_fifo_queue #(
-    //     .DATAW(DATAW),
-    //     .DEPTH(4 /* FIXME: arbitrary */)
-    // ) pending_uops (
-    //     .clk(clk),
-    //     .reset(reset),
-    //     .push(execute_if_fire),
-    //     .pop(commit_if_fire),
-    //     .data_in(execute_if_data_enq),
-    //     .data_out(execute_if_data_deq),
-    //     `UNUSED_PIN(empty),
-    //     `UNUSED_PIN(alm_empty),
-    //     `UNUSED_PIN(full), // should be impossible to overflow
-    //     `UNUSED_PIN(alm_full),
-    //     `UNUSED_PIN(size)
-    // );
-
     wire [`NUM_WARPS-1:0][DATAW-1:0] execute_if_data_deq;
 
     wire [`NUM_WARPS-1:0] metadata_queue_fulls;
     // OR not AND, we don't want any warp full
     wire metadata_queue_full = |(metadata_queue_fulls);
 
+    // need to make sure both metadata and octet issue queues are in sync
+    assign operands_valid_synced = execute_if.valid && !metadata_queue_full;
     assign execute_if.ready = &(octet_operands_ready) && !metadata_queue_full;
 
     for (genvar i = 0; i < `NUM_WARPS; i++) begin
@@ -220,8 +210,8 @@ module VX_tensor_core_warp import VX_gpu_pkg::*; #(
         // ensure two consecutive dequeues are associated with the same warp for
         // commit. (FIXME: this is not strictly necessary though.)
 
-        wire enq = execute_if_fire && (execute_if.data.wid == `NW_WIDTH'(i));
-        wire deq =  commit_if_fire && (             wb_wid == `NW_WIDTH'(i));
+        wire enq = operand_enq_fire && (execute_if.data.wid == `NW_WIDTH'(i));
+        wire deq =   commit_if_fire && (             wb_wid == `NW_WIDTH'(i));
 
         VX_fifo_queue #(
             .DATAW(DATAW),
@@ -253,8 +243,8 @@ module VX_tensor_core_warp import VX_gpu_pkg::*; #(
     wire all_valid = (& octet_results_valid);
 
 // define this to inject artificial commit backpressure for debugging
-`define INJECT_COMMIT_BACKPRESSURE
-`ifndef INJECT_COMMIT_BACKPRESSURE
+// `define TENSOR_INJECT_COMMIT_BACKPRESSURE
+`ifndef TENSOR_INJECT_COMMIT_BACKPRESSURE
     assign commit_if.valid = all_valid;
     assign commit_if_ready_override = commit_if.ready;
 `else
@@ -358,47 +348,6 @@ module VX_tensor_octet #(
     wire                 operands_last_in_pair_buf;
     wire [1:0]           operands_step_buf;
 
-    // wire inbuf_empty;
-    // wire inbuf_full;
-    // wire inbuf_ready_in;
-    // assign inbuf_ready_in     = !inbuf_full;
-    // assign operands_ready     = inbuf_ready_in;
-    // assign operands_valid_buf = !inbuf_empty;
-
-    // // wire inbuf_enq = operands_ready && operands_valid && operands_last_in_pair;
-    // wire inbuf_enq = operands_ready && operands_valid;
-    // wire inbuf_deq = operands_valid_buf && operands_ready_buf;
-
-    // // the 'issue queue' for the dpu.
-    // // This exists to decouple the input of the dot-product unit from
-    // // execute_if.ready.  execute_if can arrive intermittently according to
-    // // the frontend's behavior, and since the dpu can also stall for a fixed
-    // // initiation latency, we need to decouple the two to efficiently feed the
-    // // dpu.
-    // // This only applies to the last instruction in a pair, since the first
-    // // instruction only acts to buffer the operands and can execute
-    // // immediately without backpressure.  So we don't enqueue them.
-    // VX_fifo_queue #(
-    //     .DATAW   ($bits(A_in) + $bits(B_in) + $bits(C_in) +
-    //               $bits(operands_wid) + $bits(operands_step) + $bits(operands_last_in_pair)),
-    //     .DEPTH   (ISSUE_QUEUE_DEPTH)
-    // ) input_buffer (
-    //     .clk   (clk),
-    //     .reset (reset),
-    //     .push      (inbuf_enq),
-    //     .pop       (inbuf_deq),
-    //     .data_in   ({A_in,     B_in,     C_in,     operands_wid,     operands_step,     operands_last_in_pair}),
-    //     .data_out  ({A_in_buf, B_in_buf, C_in_buf, operands_wid_buf, operands_step_buf, operands_last_in_pair_buf}),
-    //     .empty     (inbuf_empty),
-    //     `UNUSED_PIN(alm_empty),
-    //     .full      (inbuf_full),
-    //     `UNUSED_PIN(alm_full),
-    //     `UNUSED_PIN(size)
-    // );
-
-    // // FIXME: this shouldn't be necessary
-    // `RUNTIME_ASSERT(reset || !inbuf_full, ("dpu issue queue is full!"))
-
     assign A_in_buf = A_in;
     assign B_in_buf = B_in;
     assign C_in_buf = C_in;
@@ -521,7 +470,8 @@ module VX_tensor_octet #(
     // this does (m,n,k)=(4,4,2) matmul, modeling compute of a single octet
     VX_tensor_dpu #(
         .ISW(ISW),
-        .OCTET(OCTET)
+        .OCTET(OCTET),
+        .ISSUE_QUEUE_DEPTH(2)
     ) dpu (
         .clk(clk),
         .reset(reset),
@@ -556,7 +506,7 @@ module VX_tensor_octet #(
     // TODO: This is probably oversized.
     VX_fifo_queue #(
         .DATAW   ($bits(D_wid) + $bits(D_out)),
-        .DEPTH   (`LATENCY_HMMA)
+        .DEPTH   (2 /*`LATENCY_HMMA*/)
     ) output_buffer (
         .clk   (clk),
         .reset (reset),
diff --git a/hw/rtl/fpu/VX_tensor_dpu.sv b/hw/rtl/fpu/VX_tensor_dpu.sv
index 870f6870..694af4ae 100644
--- a/hw/rtl/fpu/VX_tensor_dpu.sv
+++ b/hw/rtl/fpu/VX_tensor_dpu.sv
@@ -3,7 +3,8 @@
 
 module VX_tensor_dpu #(
     parameter ISW,
-    parameter OCTET
+    parameter OCTET,
+    parameter ISSUE_QUEUE_DEPTH = `LATENCY_HMMA
 ) (
     input clk,
     input reset,
@@ -62,6 +63,7 @@ module VX_tensor_dpu #(
     logic [1:0] threadgroup_readys;
     // B_tile is shared across the two threadgroups; see Figure 13
     VX_tensor_threadgroup #(
+        .ISSUE_QUEUE_DEPTH(ISSUE_QUEUE_DEPTH)
     ) threadgroup_0 (
         .clk   (clk),
         .reset (reset),
@@ -75,6 +77,7 @@ module VX_tensor_dpu #(
         .D_frag    (D_tile[1:0])
     );
     VX_tensor_threadgroup #(
+        .ISSUE_QUEUE_DEPTH(ISSUE_QUEUE_DEPTH)
     ) threadgroup_1 (
         .clk   (clk),
         .reset (reset),
@@ -99,7 +102,7 @@ module VX_tensor_dpu #(
     // need to pass along warp id's to do multithreading
     VX_fifo_queue #(
         .DATAW   ($bits(wid)),
-        .DEPTH   (`LATENCY_HMMA + `LATENCY_HMMA)
+        .DEPTH   (ISSUE_QUEUE_DEPTH)
     ) wid_queue (
         .clk   (clk),
         .reset (reset),
@@ -121,6 +124,7 @@ endmodule
 // does (m,n,k) = (2,4,2) matmul compute over 2 cycles.
 // matches Figure 10(b) of the paper.
 module VX_tensor_threadgroup #(
+    parameter ISSUE_QUEUE_DEPTH
 ) (
     input clk,
     input reset,
@@ -149,9 +153,18 @@ module VX_tensor_threadgroup #(
     assign ready_in  = !full;
     assign valid_buf = !empty;
 
+    // 'Issue queue' for the FEDP units.
+    // This exists to decouple the execution of the dot-product unit from
+    // the operand arrival.  Operands from execute_if can arrive
+    // intermittently according to the frontend's behavior, and since the dpu
+    // can also stall for a fixed initiation latency, we need to decouple the
+    // two to efficiently feed the dpu.
+    //
+    // TODO: better queue design possible; e.g. B_frag is shared by two
+    // threadgroups, so we need only 1 queue per octet for B
     VX_fifo_queue #(
         .DATAW ($bits(A_frag) + $bits(B_frag) + $bits(C_frag)),
-        .DEPTH   (4)
+        .DEPTH   (ISSUE_QUEUE_DEPTH)
     ) input_buffer (
         .clk       (clk),
         .reset     (reset),

From 83f9f6d84fc3f662d257bdc899682176d4be0cff Mon Sep 17 00:00:00 2001
From: Hansung Kim <hansung_kim@berkeley.edu>
Date: Thu, 30 May 2024 18:22:36 -0700
Subject: [PATCH 23/31] tensor: Fix sync for dpu warp queue as well

---
 hw/rtl/core/VX_tensor_core.sv |  2 +-
 hw/rtl/fpu/VX_tensor_dpu.sv   | 27 +++++++++++++++------------
 2 files changed, 16 insertions(+), 13 deletions(-)

diff --git a/hw/rtl/core/VX_tensor_core.sv b/hw/rtl/core/VX_tensor_core.sv
index 44485ccb..1f363f45 100644
--- a/hw/rtl/core/VX_tensor_core.sv
+++ b/hw/rtl/core/VX_tensor_core.sv
@@ -471,7 +471,7 @@ module VX_tensor_octet #(
     VX_tensor_dpu #(
         .ISW(ISW),
         .OCTET(OCTET),
-        .ISSUE_QUEUE_DEPTH(2)
+        .ISSUE_QUEUE_DEPTH(4)
     ) dpu (
         .clk(clk),
         .reset(reset),
diff --git a/hw/rtl/fpu/VX_tensor_dpu.sv b/hw/rtl/fpu/VX_tensor_dpu.sv
index 694af4ae..08e37cfa 100644
--- a/hw/rtl/fpu/VX_tensor_dpu.sv
+++ b/hw/rtl/fpu/VX_tensor_dpu.sv
@@ -39,13 +39,6 @@ module VX_tensor_dpu #(
         end
     end
 
-    // ready as soon as valid_out
-    // assign ready_in = ready_reg;
-
-    // fully pipelined; ready_in is coupled to ready_out by immediately
-    // stalling
-    // assign ready_in = ready_out;
-
     // // fixed-latency queue
     // VX_shift_register #(
     //     .DATAW  (1 + $bits(wid)/* + $bits(D_tile)*/),
@@ -59,6 +52,16 @@ module VX_tensor_dpu #(
     //     .data_out ({valid_out,            D_wid/*, D_tile     */})
     // );
 
+    // ready as soon as valid_out
+    // assign ready_in = ready_reg || valid_out;
+
+    // fully pipelined; ready_in is coupled to ready_out by immediately
+    // stalling
+    // assign ready_in = ready_out;
+
+    logic synced_fire;
+    assign synced_fire = valid_in && ready_in;
+
     logic [1:0] threadgroup_valids;
     logic [1:0] threadgroup_readys;
     // B_tile is shared across the two threadgroups; see Figure 13
@@ -67,7 +70,7 @@ module VX_tensor_dpu #(
     ) threadgroup_0 (
         .clk   (clk),
         .reset (reset),
-        .valid_in  (valid_in),
+        .valid_in  (synced_fire),
         .ready_in  (threadgroup_readys[0]),
         .stall     (!ready_out),
         .A_frag    (A_tile[1:0]),
@@ -81,7 +84,7 @@ module VX_tensor_dpu #(
     ) threadgroup_1 (
         .clk   (clk),
         .reset (reset),
-        .valid_in  (valid_in),
+        .valid_in  (synced_fire),
         .ready_in  (threadgroup_readys[1]),
         .stall     (!ready_out),
         .A_frag    (A_tile[3:2]),
@@ -102,7 +105,7 @@ module VX_tensor_dpu #(
     // need to pass along warp id's to do multithreading
     VX_fifo_queue #(
         .DATAW   ($bits(wid)),
-        .DEPTH   (ISSUE_QUEUE_DEPTH)
+        .DEPTH   (ISSUE_QUEUE_DEPTH + ISSUE_QUEUE_DEPTH)
     ) wid_queue (
         .clk   (clk),
         .reset (reset),
@@ -117,8 +120,8 @@ module VX_tensor_dpu #(
         `UNUSED_PIN(size)
     );
 
-    // `RUNTIME_ASSERT(reset || (&(threadgroup_valids) == valid_out),
-    //                 ("FEDP and metadata queue went out of sync!"))
+    `RUNTIME_ASSERT(reset || !(deq && empty),
+                    ("dequeueing from empty warp id queue!"))
 endmodule
 
 // does (m,n,k) = (2,4,2) matmul compute over 2 cycles.

From 574cc0e5f035826745d281820c625cbe678c5bfb Mon Sep 17 00:00:00 2001
From: Hansung Kim <hansung_kim@berkeley.edu>
Date: Thu, 30 May 2024 18:32:27 -0700
Subject: [PATCH 24/31] tensor: Document configuring queue depths

---
 hw/rtl/core/VX_tensor_core.sv | 7 ++-----
 hw/rtl/fpu/VX_tensor_dpu.sv   | 9 +++++++--
 2 files changed, 9 insertions(+), 7 deletions(-)

diff --git a/hw/rtl/core/VX_tensor_core.sv b/hw/rtl/core/VX_tensor_core.sv
index 1f363f45..a5128272 100644
--- a/hw/rtl/core/VX_tensor_core.sv
+++ b/hw/rtl/core/VX_tensor_core.sv
@@ -318,8 +318,6 @@ module VX_tensor_octet #(
     output result_valid,
     input result_ready
 );
-    localparam ISSUE_QUEUE_DEPTH = 4;
-
     // 512 bits/octet * 4 octets per warp
     logic [`NUM_WARPS-1:0][3:0][31:0] A_buffer, A_buffer_n;
     logic [`NUM_WARPS-1:0][3:0][31:0] B_buffer, B_buffer_n;
@@ -471,7 +469,7 @@ module VX_tensor_octet #(
     VX_tensor_dpu #(
         .ISW(ISW),
         .OCTET(OCTET),
-        .ISSUE_QUEUE_DEPTH(4)
+        .ISSUE_QUEUE_DEPTH(4 /*@perf: arbtirary*/)
     ) dpu (
         .clk(clk),
         .reset(reset),
@@ -503,10 +501,9 @@ module VX_tensor_octet #(
     // regular, every-2-cycle commit traffic to ensure the commit pipeline is
     // used more efficiently.
     // FIXME: unnecessary?
-    // TODO: This is probably oversized.
     VX_fifo_queue #(
         .DATAW   ($bits(D_wid) + $bits(D_out)),
-        .DEPTH   (2 /*`LATENCY_HMMA*/)
+        .DEPTH   (2 /* arbitrary */)
     ) output_buffer (
         .clk   (clk),
         .reset (reset),
diff --git a/hw/rtl/fpu/VX_tensor_dpu.sv b/hw/rtl/fpu/VX_tensor_dpu.sv
index 08e37cfa..79ee5757 100644
--- a/hw/rtl/fpu/VX_tensor_dpu.sv
+++ b/hw/rtl/fpu/VX_tensor_dpu.sv
@@ -4,6 +4,9 @@
 module VX_tensor_dpu #(
     parameter ISW,
     parameter OCTET,
+    // @perf: has big impact on throughput.  A rule of thumb is to set it to
+    // the pipeline length of FEDPs in order to make sure there are enough
+    // entries to fully saturate the pipeline, but this is still rough
     parameter ISSUE_QUEUE_DEPTH = `LATENCY_HMMA
 ) (
     input clk,
@@ -105,7 +108,9 @@ module VX_tensor_dpu #(
     // need to pass along warp id's to do multithreading
     VX_fifo_queue #(
         .DATAW   ($bits(wid)),
-        .DEPTH   (ISSUE_QUEUE_DEPTH + ISSUE_QUEUE_DEPTH)
+        // @perf: seems to require deeper depth than the FEDP issue queues to
+        // not cause stalls.
+        .DEPTH   (2 * ISSUE_QUEUE_DEPTH)
     ) wid_queue (
         .clk   (clk),
         .reset (reset),
@@ -167,7 +172,7 @@ module VX_tensor_threadgroup #(
     // threadgroups, so we need only 1 queue per octet for B
     VX_fifo_queue #(
         .DATAW ($bits(A_frag) + $bits(B_frag) + $bits(C_frag)),
-        .DEPTH   (ISSUE_QUEUE_DEPTH)
+        .DEPTH (ISSUE_QUEUE_DEPTH)
     ) input_buffer (
         .clk       (clk),
         .reset     (reset),

From a02773eb922c02fe516cf96756bd5a2b18b58149 Mon Sep 17 00:00:00 2001
From: Hansung Kim <hansung_kim@berkeley.edu>
Date: Thu, 30 May 2024 21:55:42 -0700
Subject: [PATCH 25/31] Add more efficient dispatch_unit

Instead of having a single candidate to be considered for dispatch
(designated by 'batch_idx' counter), add a dispatch_unit variant that
considers all `ISSUE_WIDTH dispatch signals and picks a valid one in a
round-robin manner.

This increases core utilization significantly due to better overlapping
of smem/tensor ops.
---
 hw/rtl/core/VX_dispatch_unit_sane.sv | 163 +++++++++++++++++++++++++++
 hw/rtl/core/VX_lsu_unit.sv           |   2 +-
 hw/rtl/core/VX_tensor_core.sv        |   5 +-
 3 files changed, 166 insertions(+), 4 deletions(-)
 create mode 100644 hw/rtl/core/VX_dispatch_unit_sane.sv

diff --git a/hw/rtl/core/VX_dispatch_unit_sane.sv b/hw/rtl/core/VX_dispatch_unit_sane.sv
new file mode 100644
index 00000000..26d2800b
--- /dev/null
+++ b/hw/rtl/core/VX_dispatch_unit_sane.sv
@@ -0,0 +1,163 @@
+// Copyright © 2019-2023
+// 
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// 
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+`include "VX_define.vh"
+
+module VX_dispatch_unit_sane import VX_gpu_pkg::*; #(    
+    parameter BLOCK_SIZE = 1,
+    parameter NUM_LANES  = 1,
+    parameter OUT_REG    = 0,
+    parameter MAX_FANOUT = `MAX_FANOUT
+) ( 
+    input  wire             clk,
+    input  wire             reset,
+
+    // inputs    
+    VX_dispatch_if.slave    dispatch_if [`ISSUE_WIDTH],
+
+    // outputs
+    VX_execute_if.master    execute_if [BLOCK_SIZE]
+
+);
+    `STATIC_ASSERT ((`NUM_THREADS == NUM_LANES  * (`NUM_THREADS / NUM_LANES)), ("invalid parameter"))
+    localparam BLOCK_SIZE_W = `LOG2UP(BLOCK_SIZE);
+    localparam NUM_PACKETS  = `NUM_THREADS / NUM_LANES;
+    localparam PID_BITS     = `CLOG2(NUM_PACKETS);
+    localparam PID_WIDTH    = `UP(PID_BITS);
+    localparam BATCH_COUNT  = `ISSUE_WIDTH / BLOCK_SIZE;
+    localparam BATCH_COUNT_W= `LOG2UP(BATCH_COUNT);
+    localparam ISSUE_W      = `LOG2UP(`ISSUE_WIDTH);
+    localparam IN_DATAW     = `UUID_WIDTH + ISSUE_WIS_W + `NUM_THREADS + `INST_OP_BITS + `INST_MOD_BITS + 1 + 1 + 1 + `XLEN + `XLEN + `NR_BITS + `NT_WIDTH + (3 * `NUM_THREADS * `XLEN);
+    localparam OUT_DATAW    = `UUID_WIDTH + `NW_WIDTH + NUM_LANES + `INST_OP_BITS + `INST_MOD_BITS + 1 + 1 + 1 + `XLEN + `XLEN + `NR_BITS + `NT_WIDTH + (3 * NUM_LANES * `XLEN) + PID_WIDTH + 1 + 1;
+    localparam FANOUT_ENABLE= (`NUM_THREADS > (MAX_FANOUT + MAX_FANOUT/2));
+
+    localparam DATA_TMASK_OFF = IN_DATAW - (`UUID_WIDTH + ISSUE_WIS_W + `NUM_THREADS);
+    localparam DATA_REGS_OFF = 0;
+
+    wire [`ISSUE_WIDTH-1:0] dispatch_valid;
+    wire [`ISSUE_WIDTH-1:0][IN_DATAW-1:0] dispatch_data;
+    wire [`ISSUE_WIDTH-1:0] dispatch_ready;
+
+    for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin
+        assign dispatch_valid[i] = dispatch_if[i].valid;
+        assign dispatch_data[i] = dispatch_if[i].data;
+        assign dispatch_if[i].ready = dispatch_ready[i];
+    end
+    
+    wire [BLOCK_SIZE-1:0][ISSUE_W-1:0] issue_indices;
+    wire [BLOCK_SIZE-1:0] block_ready;
+    wire [BLOCK_SIZE-1:0][NUM_LANES-1:0] block_tmask;
+    wire [BLOCK_SIZE-1:0][2:0][NUM_LANES-1:0][`XLEN-1:0] block_regs;
+    wire [BLOCK_SIZE-1:0][PID_WIDTH-1:0] block_pid;
+    wire [BLOCK_SIZE-1:0] block_sop;
+    wire [BLOCK_SIZE-1:0] block_eop;
+    wire [BLOCK_SIZE-1:0] block_done;
+
+    wire batch_done = (& block_done);
+    
+    logic [BATCH_COUNT_W-1:0] batch_idx;
+    // if (BATCH_COUNT != 1) begin
+    //     always @(posedge clk) begin
+    //         if (reset) begin
+    //             batch_idx <= '0;
+    //         end else begin
+    //             batch_idx <= batch_idx + BATCH_COUNT_W'(batch_done);
+    //         end
+    //     end
+    // end else begin
+    //     assign batch_idx = 0;
+    //     `UNUSED_VAR(batch_done)
+    // end
+    
+    wire dispatch_any_valid;
+    VX_lzc_rr #(
+        .N       (`ISSUE_WIDTH)
+    ) batch_select (
+        .clk       (clk),
+        .reset     (reset),
+        .data_in   (dispatch_valid),
+        .data_out  (batch_idx),
+        .valid_out (dispatch_any_valid)
+    );
+
+    `STATIC_ASSERT ((BLOCK_SIZE == 1), ("dispatch_unit_sane only supports BLOCK_SIZE == 1 for now"))
+
+    for (genvar block_idx = 0; block_idx < BLOCK_SIZE; ++block_idx) begin
+
+        wire [ISSUE_W-1:0] issue_idx = ISSUE_W'(batch_idx * BLOCK_SIZE) + ISSUE_W'(block_idx);
+        assign issue_indices[block_idx] = issue_idx;
+
+        wire valid_p, ready_p;
+
+        assign valid_p = dispatch_valid[issue_idx];
+        assign block_tmask[block_idx] = dispatch_data[issue_idx][DATA_TMASK_OFF +: `NUM_THREADS];
+        assign block_regs[block_idx][0] = dispatch_data[issue_idx][DATA_REGS_OFF + 2 * `NUM_THREADS * `XLEN +: `NUM_THREADS * `XLEN];
+        assign block_regs[block_idx][1] = dispatch_data[issue_idx][DATA_REGS_OFF + 1 * `NUM_THREADS * `XLEN +: `NUM_THREADS * `XLEN];
+        assign block_regs[block_idx][2] = dispatch_data[issue_idx][DATA_REGS_OFF + 0 * `NUM_THREADS * `XLEN +: `NUM_THREADS * `XLEN];
+        assign block_pid[block_idx]   = '0;
+        assign block_sop[block_idx]   = 1'b1;
+        assign block_eop[block_idx]   = 1'b1;
+        assign block_ready[block_idx] = ready_p;
+        assign block_done[block_idx]  = ~valid_p || ready_p;
+
+        wire [ISSUE_ISW_W-1:0] isw;
+        if (BATCH_COUNT != 1) begin
+            if (BLOCK_SIZE != 1) begin
+                assign isw = {batch_idx, BLOCK_SIZE_W'(block_idx)};
+            end else begin
+                assign isw = batch_idx;
+            end
+        end else begin
+            assign isw = block_idx;
+        end
+
+        `RESET_RELAY(buf_out_reset, reset);
+
+        wire [`NW_WIDTH-1:0] block_wid = wis_to_wid(dispatch_data[issue_idx][DATA_TMASK_OFF+`NUM_THREADS +: ISSUE_WIS_W], isw);
+
+        VX_elastic_buffer #(
+            .DATAW   (OUT_DATAW),
+            .SIZE    (`OUT_REG_TO_EB_SIZE(OUT_REG)),
+            .OUT_REG (`OUT_REG_TO_EB_REG(OUT_REG))
+        ) buf_out (
+            .clk       (clk),
+            .reset     (buf_out_reset),
+            .valid_in  (valid_p),
+            .ready_in  (ready_p),
+            .data_in   ({                
+                dispatch_data[issue_idx][IN_DATAW-1 : DATA_TMASK_OFF+`NUM_THREADS+ISSUE_WIS_W],
+                block_wid,
+                block_tmask[block_idx],
+                dispatch_data[issue_idx][DATA_TMASK_OFF-1 : DATA_REGS_OFF + 3 * `NUM_THREADS * `XLEN],
+                block_regs[block_idx][0],
+                block_regs[block_idx][1],
+                block_regs[block_idx][2],     
+                block_pid[block_idx],
+                block_sop[block_idx],
+                block_eop[block_idx]}),
+            .data_out  (execute_if[block_idx].data),
+            .valid_out (execute_if[block_idx].valid),
+            .ready_out (execute_if[block_idx].ready)
+        );
+    end
+
+    reg [`ISSUE_WIDTH-1:0] ready_in;
+    always @(*) begin
+        ready_in = 0;
+        for (integer i = 0; i < BLOCK_SIZE; ++i) begin
+            ready_in[issue_indices[i]] = block_ready[i] && block_eop[i];
+        end
+    end
+    assign dispatch_ready = ready_in; 
+
+endmodule
diff --git a/hw/rtl/core/VX_lsu_unit.sv b/hw/rtl/core/VX_lsu_unit.sv
index 63f1d4c6..20fac1d1 100644
--- a/hw/rtl/core/VX_lsu_unit.sv
+++ b/hw/rtl/core/VX_lsu_unit.sv
@@ -49,7 +49,7 @@ module VX_lsu_unit import VX_gpu_pkg::*; #(
 
     `RESET_RELAY (dispatch_reset, reset);
 
-    VX_dispatch_unit #(
+    VX_dispatch_unit_sane #(
         .BLOCK_SIZE (BLOCK_SIZE),
         .NUM_LANES  (NUM_LANES),
         .OUT_REG    (1)
diff --git a/hw/rtl/core/VX_tensor_core.sv b/hw/rtl/core/VX_tensor_core.sv
index a5128272..6c9d9f6b 100644
--- a/hw/rtl/core/VX_tensor_core.sv
+++ b/hw/rtl/core/VX_tensor_core.sv
@@ -21,7 +21,7 @@ module VX_tensor_core import VX_gpu_pkg::*; #(
 
     `RESET_RELAY (dispatch_reset, reset);
 
-    VX_dispatch_unit #(
+    VX_dispatch_unit_sane #(
         .BLOCK_SIZE (BLOCK_SIZE),
         .NUM_LANES  (NUM_LANES),
         .OUT_REG    (PARTIAL_BW ? 1 : 0)
@@ -177,9 +177,8 @@ module VX_tensor_core_warp import VX_gpu_pkg::*; #(
 
     localparam DATAW = `UUID_WIDTH + `NW_WIDTH + `NUM_THREADS + `XLEN + 1 + `NR_BITS;
     
-    wire commit_if_ready_override;
-
     wire operand_enq_fire = operands_valid_synced && execute_if.ready;
+    wire commit_if_ready_override;
     wire commit_if_fire = commit_if.valid && commit_if_ready_override;
     wire [DATAW-1:0] execute_if_data_enq = {
         execute_if.data.uuid, 

From 52bb827a4665bdddbc8968b3e2eefdc06947db2f Mon Sep 17 00:00:00 2001
From: Hansung Kim <hansung_kim@berkeley.edu>
Date: Thu, 30 May 2024 23:20:21 -0700
Subject: [PATCH 26/31] Handle BLOCK_SIZE != 1 in dispatch_unit

+ change ALU and FPU unit to use it as well
---
 hw/rtl/core/VX_alu_unit.sv           |   2 +-
 hw/rtl/core/VX_dispatch_unit_sane.sv | 141 ++++++++++++++++++++++++---
 hw/rtl/core/VX_fpu_unit.sv           |   2 +-
 3 files changed, 128 insertions(+), 17 deletions(-)

diff --git a/hw/rtl/core/VX_alu_unit.sv b/hw/rtl/core/VX_alu_unit.sv
index 7546f4b3..c1724360 100644
--- a/hw/rtl/core/VX_alu_unit.sv
+++ b/hw/rtl/core/VX_alu_unit.sv
@@ -42,7 +42,7 @@ module VX_alu_unit #(
 
     `RESET_RELAY (dispatch_reset, reset);
 
-    VX_dispatch_unit #(
+    VX_dispatch_unit_sane #(
         .BLOCK_SIZE (BLOCK_SIZE),
         .NUM_LANES  (NUM_LANES),
         .OUT_REG    (PARTIAL_BW ? 1 : 0)
diff --git a/hw/rtl/core/VX_dispatch_unit_sane.sv b/hw/rtl/core/VX_dispatch_unit_sane.sv
index 26d2800b..3e31ced2 100644
--- a/hw/rtl/core/VX_dispatch_unit_sane.sv
+++ b/hw/rtl/core/VX_dispatch_unit_sane.sv
@@ -78,20 +78,25 @@ module VX_dispatch_unit_sane import VX_gpu_pkg::*; #(
     //     assign batch_idx = 0;
     //     `UNUSED_VAR(batch_done)
     // end
-    
+
+    // group dispatch_valid by batches: a batch is valid if any of its BLOCK_SIZE issue slots is valid
+    wire [BATCH_COUNT-1:0] batch_valids;
+    for (genvar i = 0; i < BATCH_COUNT; ++i) begin
+        assign batch_valids[i] = |(dispatch_valid[(BLOCK_SIZE * i) +: BLOCK_SIZE]);
+    end
+
+    // elect the leftmost-valid batch for the dispatch
     wire dispatch_any_valid;
     VX_lzc_rr #(
-        .N       (`ISSUE_WIDTH)
+        .N       (BATCH_COUNT)
     ) batch_select (
         .clk       (clk),
         .reset     (reset),
-        .data_in   (dispatch_valid),
+        .data_in   (batch_valids),
         .data_out  (batch_idx),
         .valid_out (dispatch_any_valid)
     );
 
-    `STATIC_ASSERT ((BLOCK_SIZE == 1), ("dispatch_unit_sane only supports BLOCK_SIZE == 1 for now"))
-
     for (genvar block_idx = 0; block_idx < BLOCK_SIZE; ++block_idx) begin
 
         wire [ISSUE_W-1:0] issue_idx = ISSUE_W'(batch_idx * BLOCK_SIZE) + ISSUE_W'(block_idx);
@@ -99,16 +104,122 @@ module VX_dispatch_unit_sane import VX_gpu_pkg::*; #(
 
         wire valid_p, ready_p;
 
-        assign valid_p = dispatch_valid[issue_idx];
-        assign block_tmask[block_idx] = dispatch_data[issue_idx][DATA_TMASK_OFF +: `NUM_THREADS];
-        assign block_regs[block_idx][0] = dispatch_data[issue_idx][DATA_REGS_OFF + 2 * `NUM_THREADS * `XLEN +: `NUM_THREADS * `XLEN];
-        assign block_regs[block_idx][1] = dispatch_data[issue_idx][DATA_REGS_OFF + 1 * `NUM_THREADS * `XLEN +: `NUM_THREADS * `XLEN];
-        assign block_regs[block_idx][2] = dispatch_data[issue_idx][DATA_REGS_OFF + 0 * `NUM_THREADS * `XLEN +: `NUM_THREADS * `XLEN];
-        assign block_pid[block_idx]   = '0;
-        assign block_sop[block_idx]   = 1'b1;
-        assign block_eop[block_idx]   = 1'b1;
-        assign block_ready[block_idx] = ready_p;
-        assign block_done[block_idx]  = ~valid_p || ready_p;
+        if (`NUM_THREADS != NUM_LANES) begin
+            reg [NUM_PACKETS-1:0] sent_mask_p;
+            wire [PID_WIDTH-1:0] start_p_n, start_p, end_p;
+            wire dispatch_valid_r;
+            reg is_first_p;
+
+            wire fire_p = valid_p && ready_p;
+
+            wire is_last_p = (start_p == end_p);
+
+            wire fire_eop = fire_p && is_last_p;
+
+            always @(posedge clk) begin
+                if (reset) begin
+                    sent_mask_p <= '0;
+                    is_first_p  <= 1;
+                end else begin
+                    if ((BATCH_COUNT != 1) ? batch_done : fire_eop) begin
+                        sent_mask_p <= '0;
+                        is_first_p <= 1;
+                    end else if (fire_p) begin
+                        sent_mask_p[start_p] <= 1;
+                        is_first_p <= 0;
+                    end
+                end
+            end
+
+            wire [NUM_PACKETS-1:0][NUM_LANES-1:0] per_packet_tmask;
+            wire [NUM_PACKETS-1:0][2:0][NUM_LANES-1:0][`XLEN-1:0] per_packet_regs; 
+
+            wire [`NUM_THREADS-1:0] dispatch_tmask = dispatch_data[issue_idx][DATA_TMASK_OFF +: `NUM_THREADS];
+            wire [`NUM_THREADS-1:0][`XLEN-1:0] dispatch_rs1_data = dispatch_data[issue_idx][DATA_REGS_OFF + 2 * `NUM_THREADS * `XLEN +: `NUM_THREADS * `XLEN];
+            wire [`NUM_THREADS-1:0][`XLEN-1:0] dispatch_rs2_data = dispatch_data[issue_idx][DATA_REGS_OFF + 1 * `NUM_THREADS * `XLEN +: `NUM_THREADS * `XLEN];
+            wire [`NUM_THREADS-1:0][`XLEN-1:0] dispatch_rs3_data = dispatch_data[issue_idx][DATA_REGS_OFF + 0 * `NUM_THREADS * `XLEN +: `NUM_THREADS * `XLEN];
+
+            for (genvar i = 0; i < NUM_PACKETS; ++i) begin
+                for (genvar j = 0; j < NUM_LANES; ++j) begin
+                    localparam k = i * NUM_LANES + j;
+                    assign per_packet_tmask[i][j]   = dispatch_tmask[k];
+                    assign per_packet_regs[i][0][j] = dispatch_rs1_data[k];
+                    assign per_packet_regs[i][1][j] = dispatch_rs2_data[k];
+                    assign per_packet_regs[i][2][j] = dispatch_rs3_data[k];
+                end
+            end
+
+            wire [NUM_PACKETS-1:0] packet_valids;
+            wire [NUM_PACKETS-1:0][PID_WIDTH-1:0] packet_ids;
+
+            for (genvar i = 0; i < NUM_PACKETS; ++i) begin                 
+                assign packet_valids[i] = (| per_packet_tmask[i]);
+                assign packet_ids[i] = PID_WIDTH'(i);
+            end
+
+            VX_find_first #(
+                .N       (NUM_PACKETS),
+                .DATAW   (PID_WIDTH),
+                .REVERSE (0)
+            ) find_first (                    
+                .valid_in  (packet_valids & ~sent_mask_p),
+                .data_in   (packet_ids),
+                .data_out  (start_p_n),
+                `UNUSED_PIN (valid_out)
+            );
+
+            VX_find_first #(
+                .N       (NUM_PACKETS),
+                .DATAW   (PID_WIDTH),
+                .REVERSE (1)
+            ) find_last (                    
+                .valid_in  (packet_valids),
+                .data_in   (packet_ids),
+                .data_out  (end_p),
+                `UNUSED_PIN (valid_out)
+            );   
+
+            VX_pipe_register #(
+                .DATAW  (1 + PID_WIDTH),
+                .RESETW (1),
+                .DEPTH  (FANOUT_ENABLE ? 1 : 0)
+            ) pipe_reg (
+                .clk      (clk),
+                .reset    (reset || fire_p), // should flush on fire
+                .enable   (1'b1),
+                .data_in  ({dispatch_valid[issue_idx], start_p_n}),
+                .data_out ({dispatch_valid_r, start_p})
+            );  
+
+            wire [NUM_LANES-1:0] tmask_p = per_packet_tmask[start_p];
+            wire [2:0][NUM_LANES-1:0][`XLEN-1:0] regs_p = per_packet_regs[start_p];
+
+            wire block_enable = (BATCH_COUNT == 1 || ~(& sent_mask_p));
+            
+            assign valid_p = dispatch_valid_r && block_enable;            
+            assign block_tmask[block_idx] = tmask_p;
+            assign block_regs[block_idx]  = regs_p;
+            assign block_pid[block_idx]   = start_p;
+            assign block_sop[block_idx]   = is_first_p;
+            assign block_eop[block_idx]   = is_last_p;
+            if (FANOUT_ENABLE) begin
+                assign block_ready[block_idx] = dispatch_valid_r && ready_p && block_enable;
+            end else begin
+                assign block_ready[block_idx] = ready_p && block_enable;
+            end
+            assign block_done[block_idx] = ~dispatch_valid[issue_idx] || fire_eop;
+        end else begin
+            assign valid_p = dispatch_valid[issue_idx];
+            assign block_tmask[block_idx] = dispatch_data[issue_idx][DATA_TMASK_OFF +: `NUM_THREADS];
+            assign block_regs[block_idx][0] = dispatch_data[issue_idx][DATA_REGS_OFF + 2 * `NUM_THREADS * `XLEN +: `NUM_THREADS * `XLEN];
+            assign block_regs[block_idx][1] = dispatch_data[issue_idx][DATA_REGS_OFF + 1 * `NUM_THREADS * `XLEN +: `NUM_THREADS * `XLEN];
+            assign block_regs[block_idx][2] = dispatch_data[issue_idx][DATA_REGS_OFF + 0 * `NUM_THREADS * `XLEN +: `NUM_THREADS * `XLEN];
+            assign block_pid[block_idx]   = '0;
+            assign block_sop[block_idx]   = 1'b1;
+            assign block_eop[block_idx]   = 1'b1;
+            assign block_ready[block_idx] = ready_p;
+            assign block_done[block_idx]  = ~valid_p || ready_p;
+        end
 
         wire [ISSUE_ISW_W-1:0] isw;
         if (BATCH_COUNT != 1) begin
diff --git a/hw/rtl/core/VX_fpu_unit.sv b/hw/rtl/core/VX_fpu_unit.sv
index 26956213..7e0875ba 100644
--- a/hw/rtl/core/VX_fpu_unit.sv
+++ b/hw/rtl/core/VX_fpu_unit.sv
@@ -39,7 +39,7 @@ module VX_fpu_unit import VX_fpu_pkg::*; #(
 
     `RESET_RELAY (dispatch_reset, reset);
 
-    VX_dispatch_unit #(
+    VX_dispatch_unit_sane #(
         .BLOCK_SIZE (BLOCK_SIZE),
         .NUM_LANES  (NUM_LANES),
         .OUT_REG    (PARTIAL_BW ? 1 : 0)

From 73293061ea6887f9926c34fd6ef2f169045eb73b Mon Sep 17 00:00:00 2001
From: Hansung Kim <hansung_kim@berkeley.edu>
Date: Thu, 30 May 2024 23:21:23 -0700
Subject: [PATCH 27/31] tensor: Enlarge metadata queue

---
 hw/rtl/core/VX_tensor_core.sv | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/hw/rtl/core/VX_tensor_core.sv b/hw/rtl/core/VX_tensor_core.sv
index 6c9d9f6b..105fab2f 100644
--- a/hw/rtl/core/VX_tensor_core.sv
+++ b/hw/rtl/core/VX_tensor_core.sv
@@ -78,7 +78,7 @@ module VX_tensor_core_warp import VX_gpu_pkg::*; #(
     // FIXME: not sure this is the right logic.  just filling in what works
     localparam LANE_OFFSET_THREADGROUP = (4 * NUM_OCTETS);
     // this is only a rule of thumb
-    localparam METADATA_QUEUE_DEPTH = `LATENCY_HMMA;
+    localparam METADATA_QUEUE_DEPTH = 2 * `LATENCY_HMMA;
 
     wire [1:0] step = 2'(execute_if.data.op_type);
     wire last_in_pair = (execute_if.data.op_mod == `INST_MOD_BITS'(1));

From 0ebbb8e2238c90542824b6dad124af0adbb3ec55 Mon Sep 17 00:00:00 2001
From: Hansung Kim <hansung_kim@berkeley.edu>
Date: Fri, 31 May 2024 00:32:32 -0700
Subject: [PATCH 28/31] tensor: Fix perf counter; comment out dpi

---
 hw/rtl/core/VX_tensor_core.sv |  2 +-
 hw/rtl/fpu/VX_tensor_dpu.sv   | 30 +++++++++++++++---------------
 2 files changed, 16 insertions(+), 16 deletions(-)

diff --git a/hw/rtl/core/VX_tensor_core.sv b/hw/rtl/core/VX_tensor_core.sv
index 105fab2f..ca0d1064 100644
--- a/hw/rtl/core/VX_tensor_core.sv
+++ b/hw/rtl/core/VX_tensor_core.sv
@@ -528,7 +528,7 @@ module VX_tensor_octet #(
             perf_tensor_dpu_total <= '0;
         end else begin
             if (do_hmma) begin
-                perf_tensor_dpu_total <= perf_tensor_dpu_total + 1'b1;
+                perf_tensor_dpu_total <= perf_tensor_dpu_total + 2'd2;
             end
         end
     end
diff --git a/hw/rtl/fpu/VX_tensor_dpu.sv b/hw/rtl/fpu/VX_tensor_dpu.sv
index 79ee5757..0155417b 100644
--- a/hw/rtl/fpu/VX_tensor_dpu.sv
+++ b/hw/rtl/fpu/VX_tensor_dpu.sv
@@ -24,23 +24,23 @@ module VX_tensor_dpu #(
     output [3:0][3:0][31:0] D_tile,
     output [`NW_WIDTH-1:0]  D_wid
 );
-    logic [3:0][3:0][31:0] result_hmma;
+    // logic [3:0][3:0][31:0] result_hmma;
 
-    always @(*) begin
-        dpi_hmma(valid_in, A_tile, B_tile, C_tile, result_hmma);
-    end
+    // always @(*) begin
+    //     dpi_hmma(valid_in, A_tile, B_tile, C_tile, result_hmma);
+    // end
 
-    logic ready_reg;
-    always @(posedge clk) begin
-        if (reset) begin
-            ready_reg <= '1;
-        end else if (valid_in && ready_in) begin
-            ready_reg <= '0;
-            dpi_print_results(int'(ISW), int'(OCTET), A_tile, B_tile, C_tile, result_hmma);
-        end else if (valid_out && ready_out) begin
-            ready_reg <= '1;
-        end
-    end
+    // logic ready_reg;
+    // always @(posedge clk) begin
+    //     if (reset) begin
+    //         ready_reg <= '1;
+    //     end else if (valid_in && ready_in) begin
+    //         ready_reg <= '0;
+    //         dpi_print_results(int'(ISW), int'(OCTET), A_tile, B_tile, C_tile, result_hmma);
+    //     end else if (valid_out && ready_out) begin
+    //         ready_reg <= '1;
+    //     end
+    // end
 
     // // fixed-latency queue
     // VX_shift_register #(

From 9caafb2d8a153f84e88c8134bb5e6423c6fbd044 Mon Sep 17 00:00:00 2001
From: Hansung Kim <hansung_kim@berkeley.edu>
Date: Fri, 31 May 2024 19:17:56 -0700
Subject: [PATCH 29/31] tensor: Decode rd of macro-op to designate additional
 accumulator

This is useful when you want to have the tensor core output to multiple
accumulator registers, e.g. when doing outer product within the RF.
---
 hw/rtl/core/VX_decode.sv        |  6 ++++++
 hw/rtl/core/VX_uop_sequencer.sv | 18 ++++++++++++++----
 2 files changed, 20 insertions(+), 4 deletions(-)

diff --git a/hw/rtl/core/VX_decode.sv b/hw/rtl/core/VX_decode.sv
index 6f4539e7..2ca414cd 100644
--- a/hw/rtl/core/VX_decode.sv
+++ b/hw/rtl/core/VX_decode.sv
@@ -545,6 +545,12 @@ module VX_decode  #(
             `INST_EXT4: begin
                 ex_type = `EX_TENSOR;
                 op_type = `INST_TENSOR_HMMA;
+                // tensor core macroop is encoded as r-type
+                use_rd = 1;
+                `USED_IREG (rd);
+                `USED_IREG (rs1);
+                `USED_IREG (rs2);
+                `USED_IREG (rs3);
             end
         `endif
             default:;
diff --git a/hw/rtl/core/VX_uop_sequencer.sv b/hw/rtl/core/VX_uop_sequencer.sv
index 24b5af3c..130866de 100644
--- a/hw/rtl/core/VX_uop_sequencer.sv
+++ b/hw/rtl/core/VX_uop_sequencer.sv
@@ -14,10 +14,9 @@ module VX_uop_sequencer import VX_gpu_pkg::*; (
     localparam UOP_TABLE_SIZE = 64;
     localparam UPC_BITS = `CLOG2(UOP_TABLE_SIZE);
 
-    localparam NEXT   = 2'b00;
-    localparam FINISH = 2'b01;
-
     localparam UBR_BITS = 2;
+    localparam NEXT   = UBR_BITS'(2'b00);
+    localparam FINISH = UBR_BITS'(2'b01);
 
     // uop metadata (sequencing, next state), execution metadata (EX_TYPE, OP_TYPE, OP_MOD), wb, use pc, use imm, pc, imm, rd, rs1, rs2, rs3
     localparam UOP_TABLE_WIDTH = UBR_BITS + UPC_BITS + `EX_BITS + `INST_OP_BITS + `INST_MOD_BITS + 1 + 1 + 1 + `XLEN + `XLEN + (`NR_BITS * 4);
@@ -122,7 +121,18 @@ module VX_uop_sequencer import VX_gpu_pkg::*; (
     // passthrough when !use_uop
     assign ibuffer_if.valid = use_uop ? 1'b1 : uop_sequencer_if.valid;
     assign uop_sequencer_if.ready = use_uop ? (uop_fire && ubr == FINISH) : ibuffer_if.ready;
-    assign ibuffer_if.data = use_uop ? ibuffer_output : uop_sequencer_if.data;
+
+    always @(*) begin
+        ibuffer_if.data = use_uop ? ibuffer_output : uop_sequencer_if.data;
+
+        if (uop_sequencer_if.valid && use_uop &&
+            uop_sequencer_if.data.rd  == `NR_BITS'(1)) begin
+            // a little sketchy? but shouldn't create any loop
+            ibuffer_if.data.rd  = ibuffer_if.data.rd  + `NR_BITS'(8);
+            ibuffer_if.data.rs3 = ibuffer_if.data.rs3 + `NR_BITS'(8);
+            $display("yoooooooo! uop rd=%d\n", ibuffer_if.data.rd);
+        end
+    end
 
     always @(posedge clk) begin
         if (uop_start) begin

From 12f8722dd5b9505bbe22a0ac62dfd79df49d9f56 Mon Sep 17 00:00:00 2001
From: Hansung Kim <hansung_kim@berkeley.edu>
Date: Mon, 3 Jun 2024 13:04:09 -0700
Subject: [PATCH 30/31] Shush display

---
 hw/rtl/core/VX_tensor_core.sv   | 2 +-
 hw/rtl/core/VX_uop_sequencer.sv | 3 +--
 hw/rtl/fpu/VX_tensor_dpu.sv     | 3 +++
 3 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/hw/rtl/core/VX_tensor_core.sv b/hw/rtl/core/VX_tensor_core.sv
index ca0d1064..d1c14588 100644
--- a/hw/rtl/core/VX_tensor_core.sv
+++ b/hw/rtl/core/VX_tensor_core.sv
@@ -81,6 +81,7 @@ module VX_tensor_core_warp import VX_gpu_pkg::*; #(
     localparam METADATA_QUEUE_DEPTH = 2 * `LATENCY_HMMA;
 
     wire [1:0] step = 2'(execute_if.data.op_type);
+    // op_mod is reused to indicate instruction's id in pair
     wire last_in_pair = (execute_if.data.op_mod == `INST_MOD_BITS'(1));
 
     logic [NUM_OCTETS-1:0] octet_results_valid;
@@ -115,7 +116,6 @@ module VX_tensor_core_warp import VX_gpu_pkg::*; #(
         logic result_valid;
         logic result_ready;
 
-        // op_mod is reused to indicate instruction's id in pair
         VX_tensor_octet #(
             .ISW(ISW),
             .OCTET(i)
diff --git a/hw/rtl/core/VX_uop_sequencer.sv b/hw/rtl/core/VX_uop_sequencer.sv
index 130866de..26817b8d 100644
--- a/hw/rtl/core/VX_uop_sequencer.sv
+++ b/hw/rtl/core/VX_uop_sequencer.sv
@@ -128,9 +128,8 @@ module VX_uop_sequencer import VX_gpu_pkg::*; (
         if (uop_sequencer_if.valid && use_uop &&
             uop_sequencer_if.data.rd  == `NR_BITS'(1)) begin
             // a little sketchy? but shouldn't create any loop
-            ibuffer_if.data.rd  = ibuffer_if.data.rd  + `NR_BITS'(8);
+            ibuffer_if.data.rd  = ibuffer_if.data.rd  + `NR_BITS'(8); // FIXME: 8 is hardcoded
             ibuffer_if.data.rs3 = ibuffer_if.data.rs3 + `NR_BITS'(8);
-            $display("yoooooooo! uop rd=%d\n", ibuffer_if.data.rd);
         end
     end
 
diff --git a/hw/rtl/fpu/VX_tensor_dpu.sv b/hw/rtl/fpu/VX_tensor_dpu.sv
index 0155417b..8b7a1c26 100644
--- a/hw/rtl/fpu/VX_tensor_dpu.sv
+++ b/hw/rtl/fpu/VX_tensor_dpu.sv
@@ -234,6 +234,9 @@ module VX_tensor_threadgroup #(
         end
     end
 
+    // TODO: Instead of latching half-result and constructing a full D tile,
+    // we should be able to send these half fragments down to commit stage
+    // immediately, saving flop space
     assign D_frag[0][0] = D_reg[0];
     assign D_frag[0][2] = D_reg[1];
     assign D_frag[1][0] = D_reg[2];

From 874a3bf1945f773951ce54eae73c9903e08f9737 Mon Sep 17 00:00:00 2001
From: Hansung Kim <hansung_kim@berkeley.edu>
Date: Sun, 9 Jun 2024 13:41:00 -0700
Subject: [PATCH 31/31] Doc changes

---
 hw/rtl/VX_platform.vh         | 2 +-
 hw/rtl/core/VX_smem_unit.sv   | 1 +
 hw/rtl/core/VX_tensor_core.sv | 2 +-
 3 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/hw/rtl/VX_platform.vh b/hw/rtl/VX_platform.vh
index 65cbd0bf..282018b8 100644
--- a/hw/rtl/VX_platform.vh
+++ b/hw/rtl/VX_platform.vh
@@ -14,7 +14,7 @@
 `ifndef VX_PLATFORM_VH
 `define VX_PLATFORM_VH
 
-// synthesis only
+// enable synthesizable build if SIMULATION not explicitly defined
 `ifndef SIMULATION
 `define SYNTHESIS
 `define NDEBUG
diff --git a/hw/rtl/core/VX_smem_unit.sv b/hw/rtl/core/VX_smem_unit.sv
index 91587b2f..532dba55 100644
--- a/hw/rtl/core/VX_smem_unit.sv
+++ b/hw/rtl/core/VX_smem_unit.sv
@@ -66,6 +66,7 @@ module VX_smem_unit import VX_gpu_pkg::*; #(
         .req_valid  (smem_req_valid),
         .req_rw     (smem_req_rw),
         .req_byteen (smem_req_byteen),
+        // FIXME: synthesis complains undriven when USE_EXTERNAL_SMEM
         .req_addr   (smem_req_addr),
         .req_data   (smem_req_data),        
         .req_tag    (smem_req_tag),
diff --git a/hw/rtl/core/VX_tensor_core.sv b/hw/rtl/core/VX_tensor_core.sv
index d1c14588..efa74afd 100644
--- a/hw/rtl/core/VX_tensor_core.sv
+++ b/hw/rtl/core/VX_tensor_core.sv
@@ -232,7 +232,7 @@ module VX_tensor_core_warp import VX_gpu_pkg::*; #(
 
     // this shouldn't really happen unless there's a big contention over
     // the commit stage
-    `RUNTIME_ASSERT(!(!reset && metadata_queue_full), ("tensor core uop queue is full!"));
+    `RUNTIME_ASSERT(!(!reset && metadata_queue_full), ("tensor core uop queue is full!"))
 
     // unlike execute which can be interleaved between warps, commit is
     // serialized and completed one-warp-by-warp, therefore we only need to