From 90949f488bda6c65f4e23cea496983ba5ec53923 Mon Sep 17 00:00:00 2001
From: Hansung Kim <hansung_kim@berkeley.edu>
Date: Mon, 14 Oct 2024 22:34:11 -0700
Subject: [PATCH] tensor: Add memory response queue

---
 .../radiance/core/TensorCoreDecoupled.scala   | 44 +++++++++++++++----
 1 file changed, 35 insertions(+), 9 deletions(-)

diff --git a/src/main/scala/radiance/core/TensorCoreDecoupled.scala b/src/main/scala/radiance/core/TensorCoreDecoupled.scala
index 05fe576..617659d 100644
--- a/src/main/scala/radiance/core/TensorCoreDecoupled.scala
+++ b/src/main/scala/radiance/core/TensorCoreDecoupled.scala
@@ -32,7 +32,7 @@ class TensorCoreDecoupled(
 ) extends Module {
   val numWarpBits = log2Ceil(numWarps)
   val wordSize = 4 // TODO FP16
-  val dataWidth = numLanes * wordSize // TODO FP16
+  val dataWidth = numLanes * wordSize * 8/*bits*/ // TODO FP16
   val sourceWidth = log2Ceil(numSourceIds)
 
   val io = IO(new Bundle {
@@ -40,8 +40,9 @@ class TensorCoreDecoupled(
       val wid = UInt(numWarpBits.W)
     }))
     val writeback = Decoupled(new Bundle {
-      val wid = UInt(numWarpBits.W)
       val last = Bool()
+      val wid = UInt(numWarpBits.W)
+      val data = Vec(numLanes, UInt(wordSize.W))
     })
     val respA = Flipped(Decoupled(new TensorMemResp(sourceWidth, dataWidth)))
     val respB = Flipped(Decoupled(new TensorMemResp(sourceWidth, dataWidth)))
@@ -95,7 +96,9 @@ class TensorCoreDecoupled(
     busy := false.B
   }
 
-  // memory traffic generation
+  // Memory traffic generation
+  // -------------------------
+  //
   val genReq = (state === TensorState.run)
 
   Seq((io.reqA, io.respA), (io.reqB, io.respB)).foreach {
@@ -127,9 +130,33 @@ class TensorCoreDecoupled(
     firedABReg := Seq(false.B, false.B)
   }
 
-  io.respA.ready := true.B
-  io.respB.ready := true.B
+  io.respA.ready := true.B // FIXME
+  io.respB.ready := true.B // FIXME
 
+  // Execute stage
+  // -------------
+  // Execute backend of the decoupled access/execute pipeline.
+  //
+  val respQueueDepth = 4 // FIXME: parameterize
+  val respQueueA = Queue(io.respA, respQueueDepth)
+  val respQueueB = Queue(io.respB, respQueueDepth)
+  respQueueA.ready := io.writeback.ready // FIXME
+  respQueueB.ready := io.writeback.ready // FIXME
+
+  require(respQueueA.bits.data.widthOption.get ==
+          io.writeback.bits.data.widthOption.get * numLanes,
+    "response data width does not match the writeback data width")
+
+  // FIXME: debug dummy: pipe A directly to writeback
+  io.writeback.valid := respQueueA.valid
+  val groupedRespA = respQueueA.bits.data.asBools.grouped(wordSize * 8/*bits*/)
+  (io.writeback.bits.data zip groupedRespA).foreach { case (wb, data) =>
+    wb := VecInit(data).asUInt
+  }
+
+  // State transition
+  // ----------------
+  //
   // set/step sequencing logic
   val lastSet = ((1 << setBits) - 1)
   val lastStep = ((1 << stepBits) - 1)
@@ -142,7 +169,6 @@ class TensorCoreDecoupled(
     }
   }
 
-  // state transition logic
   switch(state) {
     is(TensorState.idle) {
       when(io.initiate.fire) {
@@ -189,13 +215,13 @@ class TensorMemResp(
   dataWidth: Int
 ) extends Bundle {
   val source = UInt(sourceWidth.W)
-  val data = UInt(32.W)
+  val data = UInt(dataWidth.W)
 }
 
 // synthesizable unit tests
 
-// wraps TensorCoreDecoupled with TileLink client node for use in a Diplomacy
-// network.
+// wraps TensorCoreDecoupled with a TileLink client node for use in a Diplomacy
+// graph.
 class TensorCoreDecoupledTL(implicit p: Parameters) extends LazyModule {
   val numSrcIds = 4